I thought that the ABI of my compiler might be not so effective as
managing my own recursion with my own stack iteratively. So I wrote
an iterative solution, still with the buffer on the stack and if
the buffer is too large, the buffer on the heap:
#pragma once
#include "disarmed-iterators.h"
#include <iterator>
#include <type_traits>
#include <utility>
#include <vector>
#include <memory>
#include <new>
#include <memory>
#include <cstddef>
#include "ndi_t.h"
#include "xassert.h"
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable: 6255) // alloca may oveflow stack
#pragma warning(disable: 26812) // prefer uncsoped enums
#endif
template<typename ItType, typename IsLess, typename Alloc =
std::allocator<char>>
void xstable_sort( ItType begin, ItType end, IsLess const &isLess, Alloc
alloc = Alloc() )
{
using namespace std;
using T = typename std::iterator_traits<ItType>::value_type;
using frame_alloc = typename allocator_traits<Alloc>::template
rebind_alloc<char>;
static_assert(is_nothrow_move_constructible<T>::value, "*begin must be
nothrow move-contructrible");
static_assert(is_nothrow_move_assignable<T>::value, "*begin must be
nothrow move-assignable");
struct frame_t
{
frame_t *prevFrame,
*nextFrame;
T *inputBegin;
size_t nInputs;
enum : std::int8_t
{
CALL = 0,
SECOND_CALL = 1,
RET = 2
} state;
T buf[1];
};
auto alignUp = []( size_t size, size_t align ) -> size_t
{
return size + align - 1 & ~(align - 1);
};
size_t n = end - begin,
frameSpace = alignof(frame_t) - 1;
for( size_t nInputs = n; nInputs > 2; nInputs -= nInputs / 2 )
frameSpace += alignUp( offsetof(frame_t, buf) + nInputs * sizeof(T),
alignof(frame_t) );
frameSpace += sizeof(frame_t);
vector<ndi_t<char>> vecBuf;
void *bufBegin;
if( frameSpace <= 0x10000 )
bufBegin = (char *)alloca( frameSpace );
else
vecBuf.resize( frameSpace ),
bufBegin = &vecBuf[0];
frame_t *firstFrame = (frame_t *)alignUp( (size_t)bufBegin,
alignof(frame_t) );
frame_t *curFrame = firstFrame,
*prevFrame = nullptr;
size_t nInputs = n;
for( ; nInputs > 2; nInputs -= nInputs / 2 )
try
{
curFrame->prevFrame = prevFrame;
T *t = curFrame->buf;
try
{
if constexpr( !is_trivial<T>::value )
for( T *t = curFrame->buf; t != curFrame->buf + nInputs; ++t )
new( (void *)t )T();
}
catch( ... )
{
while( t-- > bufBegin )
t->~T();
throw;
}
prevFrame = curFrame;
(char *&)curFrame += alignUp( offsetof(frame_t, buf) + nInputs *
sizeof(T), alignof(frame_t) );
prevFrame->nextFrame = curFrame;
}
catch( ... )
{
while( (curFrame = prevFrame) != nullptr )
for( T *t = curFrame->buf; t != curFrame->buf + curFrame->nInputs; )
t->~T();
throw;
}
curFrame->prevFrame = prevFrame;
curFrame->nextFrame = nullptr;
firstFrame->inputBegin = &*begin;
firstFrame->nInputs = n;
firstFrame->state = frame_t::CALL;
curFrame = firstFrame;
for( ; curFrame; )
{
xassert(curFrame->state >= frame_t::CALL && curFrame->state <=
frame_t::RET);
switch( curFrame->state )
{
case frame_t::CALL:
{
call:
if( curFrame->nInputs <= 2 )
{
if( curFrame->nInputs == 2 && isLess( curFrame->inputBegin[1],
curFrame->inputBegin[0] ) )
swap( curFrame->inputBegin[1], curFrame->inputBegin[0] );
curFrame = curFrame->prevFrame;
break;
}
T *inputs = curFrame->inputBegin,
*buf = curFrame->buf;
for( size_t i = 0; i != curFrame->nInputs; ++i )
buf[i] = move( inputs[i] );
size_t n = curFrame->nInputs,
left = n - n / 2;
frame_t *nextFrame = curFrame->nextFrame;
nextFrame->inputBegin = curFrame->buf;
nextFrame->nInputs = left;
curFrame->state = frame_t::SECOND_CALL;
curFrame = nextFrame;
goto call;
}
case frame_t::SECOND_CALL:
{
size_t n = curFrame->nInputs,
right = n / 2,
left = n - right;
frame_t *nextFrame = curFrame->nextFrame;
nextFrame->inputBegin = curFrame->buf + left;
nextFrame->nInputs = right;
curFrame->state = frame_t::RET;
curFrame = nextFrame;
goto call;
}
break;
case frame_t::RET:
{
size_t n = curFrame->nInputs,
right = n / 2,
left = n - right;
T *pLeft = curFrame->buf,
*pRight = pLeft + left;
T *pLeftEnd = pRight,
*pRightEnd = pLeft + n;
for( T *moveBack = curFrame->inputBegin; ; )
{
if( isLess( *pLeft, *pRight ) )
{
*moveBack++ = move( *pLeft );
if( ++pLeft == pLeftEnd )
{
do
*moveBack++ = move( *pRight );
while( ++pRight != pRightEnd );
break;
}
}
else
{
*moveBack++ = move( *pRight );
if( ++pRight == pRightEnd )
{
do
*moveBack++ = move( *pLeft );
while( ++pLeft != pLeftEnd );
break;
}
}
}
curFrame = curFrame->prevFrame;
break;
}
}
}
for( curFrame = firstFrame; curFrame->nextFrame; curFrame =
curFrame->nextFrame )
{
T *buf = curFrame->buf;
for( size_t i = 0; i != curFrame->nInputs; ++i )
buf[i].~T();
}
}
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
// ndi_t:
#pragma once
#include <type_traits>
template<typename T>
union ndi_t
{
ndi_t();
ndi_t( T value );
~ndi_t();
operator T &();
private:
T m_t;
};
template<typename T>
inline
ndi_t<T>::ndi_t()
{
if constexpr( !std::is_trivially_copyable<T>::value )
new( (void *)&m_t )T();
}
template<typename T>
inline
ndi_t<T>::ndi_t( T value ) :
m_t( value )
{
}
template<typename T>
inline
ndi_t<T>::~ndi_t()
{
if constexpr( !std::is_trivially_copyable<T>::value )
m_t.~T();
}
template<typename T>
inline
ndi_t<T>::operator T &()
{
return m_t;
}
// xassert.h
#pragma once
#include <cassert>
#if !defined(assume)
#if defined(_MSC_VER)
#define assume(e) (__assume(e))
#elif defined(__GNUC__)
#define assume(e) \
{ \
if( !(e) ) \
__builtin_unreachable(); \
}
#else
#define assume(e) ((void)0)
#endif
#endif
#if !defined(xassert)
#if !defined(NDEBUG)
#define xassert(e) (assert(e))
#else
#if defined(_MSC_VER)
#define xassert(e) assume(e)
#elif defined(__GNUC__)
#define xassert(e) \
{ \
if( !(e) ) \
__builtin_unreachable(); \
}
#else
#define xassert(e) ((void)0)
#endif
#endif
#endif