<pseudo-code>
struct block_t
{
  struct block_t *next;
  struct per_thread_t *thread;
};
/* no aba counting needed */
struct block_anchor_t
{
  block_t *front;
};
struct per_thread_t
{
  block_anchor_t local;
  block_anchor_t shared;
  block_t *blocks;
  size_t block_sz;
  int refs; /* init to 1 */
};
void* slab_malloc( size_t );
void slab_free( void* );
block_t* slab_sys_local_pop( per_thread_t* );
void slab_sys_local_push( block_t*, int );
int slab_sys_shared_pop( per_thread_t*, int );
void slab_sys_shared_push( block_t* );
void slab_sys_tls_dtor( void* );
void* slab_malloc( size_t sz )
{
  per_thread_t *_this = pthread_getspecific(...);
  block_t *block = slab_sys_local_pop( _this );
  return ( block ) ? block + 1 : 0;
}
void slab_free( void *mem )
{
  block_t *block = ((block_t*)mem) - 1;
  per_thread_t *_this = pthread_getspecific(...);
  if ( block->thread == _this )
  {
    slab_sys_local_push( block, 0 );
  }
  else
  {
    slab_sys_shared_push( block );
  }
}
block_t* slab_sys_local_pop( per_thread_t *_this )
{
  block_t *block = _this->local.front;
  if ( ! block )
  {
    if ( ! slab_sys_shared_pop( _this, 0 ) )
    {
      return 0;
    }
    block = _this->local.front;
  }
  _this->local.front = block->next;
  ++_this->refs;
  return block;
}
void slab_sys_local_push( block_t *_this, int marked )
{
  _this->next = _this->thread->local.front;
  _this->thread->local.front = _this;
  if ( ! marked ) { --_this->refs; }
}
int slab_sys_shared_pop( per_thread_t *_this, int mark )
{
  int count = 0;
  block_t *nx, *block = XCHG( &_this->shared.front, mark );
  while ( block )
  {
    nx = block->next;
    slab_sys_local_push( block, mark );
    block = next;
    ++count;
  }
  return count;
}
void slab_sys_shared_push( block_t *_this )
{
  block_t *cmp;
  do
  {
    cmp = _this->thread->shared.front;
    _this->next = cmp & ~1;
  }
  while ( ! CAS( &_this->thread->shared.front,
                 cmp,
                 ( cmp & 1 ) ? _this | 1 : _this ) );
  if ( cmp & 1 && XADD( &_this->refs, -1 ) == 1 )
  {
    /* destroy _this->thread */
  }
}
void slab_sys_tls_dtor( void *state )
{
  per_thread_t *_this = state;
  int count = slab_sys_shared_pop( _this, 1 );
  if ( XADD( &_this->refs, -count ) == 1 )
  {
    /* destroy _this */
  }
}
This is an experimental version. Any questions?
;)
;(....
The slab_sys_tls_dtor function needs to be changed from this
> void slab_sys_tls_dtor( void *state )
> {
>  per_thread_t *_this = state;
>  int count = slab_sys_shared_pop( _this, 1 );
>
>  if ( XADD( &_this->refs, -count ) == 1 )
>  {
>    /* destroy _this */
>  }
> }
to this:
void slab_sys_tls_dtor( void *state )
{
  per_thread_t *_this = state;
int count = slab_sys_shared_pop( _this, 1 ) + 1;
  if ( XADD( &_this->refs, -count ) == 1 )
  {
    /* destroy _this */
  }
}
Sorry!
:O
>  if ( XADD( &_this->refs, -count ) == 1 )
^^^^^^^^^^^^^^^^
Needs to be:
> if ( XADD( &_this->refs, -count ) == count )
This is because XADD returns the old value! Humm... I think I should create 
some compliable code. This particular allocator design seems very promising.
:)