[RFC] atomic refcounting alternative for boost shared_ptr...

32 views
Skip to first unread message

Chris Thomasson

unread,
Oct 6, 2006, 6:09:38 PM10/6/06
to
I am presenting some source code for a sample implementation of my new
mostly lock-free reference counting algorithm that I first mention here:


http://groups.google.com/group/comp.programming.threads/browse_frm/thread/4e8717d3bcdedfe9


All of its "critical functionality", as of now, is created in pure MASM IA32
assembly language; a GAS compatible version is coming soon. I decided to go
ahead and do most of it in assembly language simple because I did not want
to deal with any compiler ordering. Atomic lock-free reference counting
algorithms need to be precisely executed, as is. The assembly language API
adheres to the C ABI; you can use it in your existing C programs.


I will post the source code in a couple of follow-up posts; there are a
total of 5 files:


refcount_ia32.asm
refcount_ia32.h
spinlock_ia32.h
helper.h
refcount.h


If all goes well, I think that my algorithm could be a great candidate for a
replacement for Boosts shared_ptr. As you may know by now, shared_ptr is not
atomically thread-safe. I advise you to take a look at this code if you are
interested in improving Boost shared_ptr functionality and performance...

My current algorithm does not use anything like GC or PDR. Instead, I have
developed a new double-checked method that is coupled with a hashed static
spinlock table scheme in order to get the job done, the right way. The
algorithm only has to access the hashed spinlock table during strong
increments and when the count drops to, or below, zero. It's a fairly
efficient setup...

I would be very interested to hear if anybody can assemble and compile it.
If you're having any trouble whatsoever, or if you have any questions, feel
free to fire away!

:)


Any thoughts?


Chris Thomasson

unread,
Oct 6, 2006, 6:19:52 PM10/6/06
to
refcount_ia32.asm
-------------------------------------------


; Copyright 2005 Chris Thomasson

.686
.XMM
.MODEL FLAT, C
.CODE

align 16
;-----------------------------------------------------------------------
; int refcount_ia32_init(
;   refcount_ia32_t volatile*,   ; [ESP + 4]  object to initialize
;   int,                         ; [ESP + 8]  initial strong count, must be > 0
;   refcount_ia32_fpdtor_t,      ; [ESP + 12] user destructor, may be null
;   void*)                       ; [ESP + 16] opaque state passed to the dtor
; Returns 1 on success, 0 when the initial count is not positive.
; ABI: cdecl; clobbers EAX, ECX, EDX, flags. Leaf function.
; Layout written: [this] = refs, [this+4] = fpdtor, [this+8] = state.
;-----------------------------------------------------------------------
refcount_ia32_init PROC
MOV EAX, [ESP + 8]              ; EAX = initial count
TEST EAX, EAX
JLE refcount_ia32_init_badcount ; reject counts <= 0
MOV EDX, [ESP + 4]              ; EDX = this
MOV ECX, [ESP + 12]             ; ECX = user destructor
MOV [EDX], EAX                  ; this->refs = count
MOV [EDX + 4], ECX              ; this->ustate.fpdtor = dtor
MOV ECX, [ESP + 16]             ; ECX = user state
MOV [EDX + 8], ECX              ; this->ustate.state = state
MOV EAX, 1                      ; report success
RET

refcount_ia32_init_badcount:
XOR EAX, EAX                    ; report failure
RET
refcount_ia32_init ENDP

align 16
;-----------------------------------------------------------------------
; refcount_ia32_t* refcount_ia32_add_load_strong(
;   refcount_ia32_t* volatile*,      ; [ESP + 12 after pushes] shared location
;   int,                             ; [ESP + 16] count to add, must be > 0
;   spinlock_ia32_table_t const*,    ; [ESP + 20] hashed spinlock table
;   size_t)                          ; [ESP + 24] sizeof one padded lock slot
; Atomically loads the pointer from the shared location and acquires
; 'count' strong references to it. Returns the acquired pointer, or null
; when the shared location holds null or count <= 0.
; ABI: cdecl; clobbers EAX, ECX, EDX, flags; ESI/EBX saved and restored.
;
; Fixes versus the originally posted version (confirmed by the author in
; a follow-up post):
;   - failure path now pops EBX/ESI before returning (stack was corrupted)
;   - null check on the loaded pointer
;   - LOCK prefix asserted on XADD (XCHG is implicitly locked, XADD is not)
;   - the increment value is reloaded from [ESP + 16] before the XADD; ESI
;     was clobbered by the lock acquire, so the old code added a pointer
;     value into the reference count ("adds in a corrupted value")
;-----------------------------------------------------------------------
refcount_ia32_add_load_strong PROC
PUSH ESI
PUSH EBX
MOV ECX, [ESP + 12]             ; ECX = shared location (refcount_t**)
MOV EDX, [ESP + 20]             ; EDX = spinlock table base
MOV ESI, [ESP + 16]             ; ESI = count to add
TEST ESI, ESI
JLE refcount_ia32_add_load_strong_failed ; count must be > 0

refcount_ia32_add_load_strong_retry:
MOV EAX, [ECX]                  ; EAX = current shared pointer
TEST EAX, EAX
JE refcount_ia32_add_load_strong_failed  ; null: nothing to acquire

; hash the pointer to its lock slot: EBX = &table[((ptr * 567) >> 8) & 0xFF]
MOV EBX, EAX
IMUL EBX, 567
SHR EBX, 8
AND EBX, 0FFh
IMUL EBX, [ESP + 24]            ; scale index by slot size
LEA EBX, [EDX + EBX]

refcount_ia32_add_load_strong_lock_retry:
MOV ESI, 1
XCHG [EBX], ESI                 ; try to take the spinlock (implicit LOCK)
TEST ESI, ESI
JNE refcount_ia32_add_load_strong_lock_pause_retry
MOV ESI, [ECX]                  ; double-check: did the pointer move?
CMP EAX, ESI
JNE refcount_ia32_add_load_strong_unlock_retry
MOV ESI, [ESP + 16]             ; reload count; ESI was clobbered above
LOCK XADD [EAX], ESI            ; refs += count; ESI = previous refs
TEST ESI, ESI
JLE refcount_ia32_add_load_strong_unlock_retry ; raced a drop-to-zero; retry
XOR ESI, ESI
MOV [EBX], ESI                  ; release the spinlock
POP EBX
POP ESI
RET                             ; EAX = acquired object

refcount_ia32_add_load_strong_failed:
XOR EAX, EAX                    ; return null
POP EBX                         ; restore callee-saved regs (was missing)
POP ESI
RET

refcount_ia32_add_load_strong_lock_pause_retry:
PAUSE                           ; be polite to the sibling hyperthread
JMP refcount_ia32_add_load_strong_lock_retry

refcount_ia32_add_load_strong_unlock_retry:
XOR ESI, ESI
MOV [EBX], ESI                  ; release the spinlock, then re-read
JMP refcount_ia32_add_load_strong_retry
refcount_ia32_add_load_strong ENDP

align 16
;-----------------------------------------------------------------------
; int refcount_ia32_sub_strong(
;   refcount_ia32_t* volatile,       ; [ESP + 4]  object, may be null
;   int,                             ; [ESP + 8]  count to release, must be > 0
;   spinlock_ia32_table_t const*,    ; [ESP + 12] hashed spinlock table
;   size_t)                          ; [ESP + 16] sizeof one padded lock slot
; Atomically subtracts 'count' from the strong reference count. When the
; count drops to (or below) zero, takes the object's hashed spinlock to
; serialize against in-flight add_load_strong callers, releases it, then
; invokes the user destructor (if any). Returns 1 when the caller may free
; the object, 0 otherwise.
; ABI: cdecl; clobbers EAX, ECX, EDX, flags (the user dtor may clobber more
; volatile state; it is called cdecl with one argument).
;
; Fixes versus the originally posted version (confirmed by the author):
;   - null check on the object pointer (was missing)
;   - explicit DWORD PTR size on the memory PUSH (MASM cannot infer a size
;     for an anonymous [EAX + 8] operand)
;-----------------------------------------------------------------------
refcount_ia32_sub_strong PROC
MOV EAX, [ESP + 4]              ; EAX = this
TEST EAX, EAX
JE refcount_ia32_sub_strong_skip    ; null object: nothing to do
MOV EDX, [ESP + 8]              ; EDX = count
TEST EDX, EDX
JLE refcount_ia32_sub_strong_skip   ; count must be > 0
NEG EDX
LOCK XADD [EAX], EDX            ; refs -= count; flags reflect the new value
JZ refcount_ia32_sub_strong_destroy ; dropped exactly to zero
JS refcount_ia32_sub_strong_destroy ; dropped below zero
refcount_ia32_sub_strong_skip:
XOR EAX, EAX                    ; caller must NOT free the object
RET

refcount_ia32_sub_strong_destroy:
MOV EDX, [ESP + 12]             ; EDX = spinlock table base
; hash this into its lock slot: ECX = &table[((this * 567) >> 8) & 0xFF]
; (must match the hash used by refcount_ia32_add_load_strong)
MOV ECX, EAX
IMUL ECX, 567
SHR ECX, 8
AND ECX, 0FFh
IMUL ECX, [ESP + 16]
LEA ECX, [EDX + ECX]

refcount_ia32_sub_strong_lock_retry:
MOV EDX, 1
XCHG [ECX], EDX                 ; take the spinlock (implicit LOCK)
TEST EDX, EDX
JNE refcount_ia32_sub_strong_lock_pause_retry
MOV EDX, [EAX + 4]              ; EDX = user destructor (may be null)
PUSH DWORD PTR [EAX + 8]        ; stage the dtor argument (user state)
XOR EAX, EAX
MOV [ECX], EAX                  ; release the lock; we are now alone
TEST EDX, EDX
JE refcount_ia32_sub_strong_done    ; no destructor registered
CALL EDX                        ; invoke user dtor(state), cdecl

refcount_ia32_sub_strong_done:
POP EDX                         ; discard the pushed dtor argument
MOV EAX, 1                      ; caller may free the object
RET

refcount_ia32_sub_strong_lock_pause_retry:
PAUSE
JMP refcount_ia32_sub_strong_lock_retry
refcount_ia32_sub_strong ENDP

align 16
;-----------------------------------------------------------------------
; void refcount_ia32_add_weak(
;   refcount_ia32_t volatile*,   ; [ESP + 4] object
;   int)                         ; [ESP + 8] count to add; <= 0 is a no-op
; Unchecked ("weak") increment: adds straight onto the counter with no
; spinlock and no validation of the current count. Per the surrounding
; discussion, the caller is expected to already hold a reference.
; ABI: cdecl; clobbers EAX, ECX, flags. Leaf function.
;-----------------------------------------------------------------------
refcount_ia32_add_weak PROC
MOV EAX, [ESP + 8]              ; EAX = count
MOV ECX, [ESP + 4]              ; ECX = this
TEST EAX, EAX
JLE refcount_ia32_add_weak_skip ; ignore non-positive counts
LOCK XADD [ECX], EAX            ; refs += count

refcount_ia32_add_weak_skip:
RET
refcount_ia32_add_weak ENDP

align 16
;-----------------------------------------------------------------------
; refcount_ia32_t* refcount_ia32_add_swap_weak(
;   refcount_ia32_t* volatile*,  ; [ESP + 4]  shared location
;   refcount_ia32_t*,            ; [ESP + 8]  new object, may be null
;   int)                         ; [ESP + 12] weak count to add to the new object
; Weakly adds 'count' references to the incoming object (skipped when the
; object is null or count <= 0), then atomically swaps it into the shared
; location. Returns the previous contents of the shared location.
; ABI: cdecl; clobbers EAX, ECX, EDX, flags. Lock-free: XCHG with a memory
; operand is implicitly LOCKed. Leaf function.
;-----------------------------------------------------------------------
refcount_ia32_add_swap_weak PROC
MOV EAX, [ESP + 8]              ; EAX = new object (may be null)
MOV ECX, [ESP + 4]              ; ECX = shared location
TEST EAX, EAX
JE refcount_ia32_add_swap_weak_swap  ; null: just swap it in
MOV EDX, [ESP + 12]             ; EDX = count
TEST EDX, EDX
JLE refcount_ia32_add_swap_weak_swap ; non-positive count: skip the add
LOCK XADD [EAX], EDX            ; weak add on the incoming object

refcount_ia32_add_swap_weak_swap:
XCHG [ECX], EAX                 ; atomic swap (implicit LOCK); EAX = old
RET
refcount_ia32_add_swap_weak ENDP

END


Chris Thomasson

unread,
Oct 6, 2006, 6:22:29 PM10/6/06
to
spinlock_ia32.h
---------------------------------------

/* Copyright 2005 Chris Thomasson */

/* Global hashed spinlock table used by the refcount slow paths (initial
   strong acquire and drop-to-zero). */

#if ! defined(SPINLOCK_IA32_H_INCLUDED)
#define SPINLOCK_IA32_H_INCLUDED
#if defined(__cplusplus)
extern "C" {
#endif
#include "helper.h"

#define IA32_L2CACHELINESZ 128

#define SPINLOCK_IA32_PADSZ IA32_L2CACHELINESZ
/* The assembly hashes a pointer to a slot index with AND 0FFh, producing
   values 0..255 inclusive, so the table needs 0x100 slots; the previous
   depth of 0xFF overran the table at index 255. */
#define SPINLOCK_IA32_TABLE_DEPTH 0x100
#define SPINLOCK_IA32_TABLE_ALIGNSZ IA32_L2CACHELINESZ

/* Raw storage size: the table itself plus worst-case alignment slack.
   (helper.h's MEMALIGNSZ takes only the alignment; the previous
   definition passed it two arguments and failed to preprocess.) */
#define SPINLOCK_IA32_TABLE_MEMALIGNSZ(mp_tabletype) \
(sizeof(mp_tabletype) + MEMALIGNSZ(SPINLOCK_IA32_TABLE_ALIGNSZ))

/* First TABLE_ALIGNSZ-aligned address inside the raw storage block.
   (Line continuations restored; the posted copy lost the
   backslash-newlines to the newsreader.) */
#define SPINLOCK_IA32_TABLE_MEMALIGNPTR(mp_tablemem) \
((spinlock_ia32_table_t*)MEMALIGNPTR((mp_tablemem).base, \
SPINLOCK_IA32_TABLE_ALIGNSZ))


typedef struct spinlock_ia32_s spinlock_ia32_t;
typedef struct spinlock_ia32_padded_s spinlock_ia32_padded_t;
typedef struct spinlock_ia32_table_s spinlock_ia32_table_t;
typedef struct spinlock_ia32_table_mem_s spinlock_ia32_table_mem_t;

/* One spinlock word; the assembly uses 0 = unlocked, non-zero = locked,
   acquired via XCHG and released with a plain store. */
struct spinlock_ia32_s {
void *state;
};

/* Lock padded out to a cache line so adjacent slots do not false-share. */
struct spinlock_ia32_padded_s {
spinlock_ia32_t _this;
char pad1[TYPEPADSZ(spinlock_ia32_t, SPINLOCK_IA32_PADSZ)];
};

struct spinlock_ia32_table_s {
spinlock_ia32_padded_t locks[SPINLOCK_IA32_TABLE_DEPTH];
};

/* Over-sized raw storage from which an aligned table pointer is carved. */
struct spinlock_ia32_table_mem_s {
char base[SPINLOCK_IA32_TABLE_MEMALIGNSZ(spinlock_ia32_table_t)];
};

/* NOTE(review): file-scope statics in a header give every translation unit
   that includes it a PRIVATE table; all users must share one table for the
   hashed locking to be global. Confirm this header is included from a
   single translation unit, or move these definitions into a .c file. */
static spinlock_ia32_table_mem_t g_spinlock_ia32_table_mem = {{0}};

/* The aligned-pointer computation is not a C constant expression, so it
   cannot initialize a static pointer ("initializer element is not
   constant"); expose it as a macro evaluated at each use instead. The
   name is still usable exactly like the old pointer variable. */
#define g_spinlock_ia32_table \
SPINLOCK_IA32_TABLE_MEMALIGNPTR(g_spinlock_ia32_table_mem)

#if defined(__cplusplus)
}
#endif
#endif


Chris Thomasson

unread,
Oct 6, 2006, 6:24:14 PM10/6/06
to
refcount_ia32.h
-----------------------------------------


/* Copyright 2005 Chris Thomasson */

/* C API for the IA-32 atomic reference-count object; the functions are
   implemented in refcount_ia32.asm (cdecl). */

#if ! defined(REFCOUNT_IA32_H_INCLUDED)
#define REFCOUNT_IA32_H_INCLUDED


#if defined(__cplusplus)
extern "C" {
#endif

#include "spinlock_ia32.h"

typedef struct refcount_ia32_s refcount_ia32_t;
typedef struct refcount_ia32_userstate_s refcount_ia32_userstate_t;
/* User destructor: called once, outside any spinlock, when the strong
   count drops to zero; receives the 'state' pointer given at init. */
typedef void (*refcount_ia32_fpdtor_t) (void*);

/* Destructor callback plus its opaque argument. */
struct refcount_ia32_userstate_s {
refcount_ia32_fpdtor_t fpdtor;
void *state;
};

/* The assembly relies on this exact layout: refs at offset 0, fpdtor at
   offset 4, state at offset 8. Do not reorder or insert fields. */
struct refcount_ia32_s {
int refs;
refcount_ia32_userstate_t ustate;
};

/* Initialize a refcount object; returns 1 on success, 0 if the initial
   count is not positive. */
extern int refcount_ia32_init(refcount_ia32_t*, int, refcount_ia32_fpdtor_t,
void*);
/* Load the pointer from a shared location and acquire strong references
   to it, using the hashed spinlock table for the slow path; returns null
   when the location holds null. */
extern refcount_ia32_t* refcount_ia32_add_load_strong(refcount_ia32_t*
volatile*, int, spinlock_ia32_table_t const*, size_t);
/* Release strong references; returns 1 when the count dropped to zero
   (the destructor has run and the caller may free the object), else 0. */
extern int refcount_ia32_sub_strong(refcount_ia32_t volatile*, int,
spinlock_ia32_table_t const*, size_t);
/* Weakly add references to the new object, then atomically swap it into
   the shared location; returns the previous occupant. */
extern refcount_ia32_t* refcount_ia32_add_swap_weak(refcount_ia32_t*
volatile*, refcount_ia32_t*, int);
/* Unchecked increment; per the accompanying discussion the caller is
   assumed to already hold a reference. */
extern void refcount_ia32_add_weak(refcount_ia32_t volatile*, int);

/* Architecture-neutral aliases. */
typedef refcount_ia32_t refcount_t;

#define refcount_init refcount_ia32_init
#define refcount_add_swap_weak refcount_ia32_add_swap_weak
#define refcount_add_weak refcount_ia32_add_weak

/* Convenience wrappers that supply the global hashed spinlock table and
   the per-slot stride expected by the assembly. */
#define refcount_add_load_strong(mp_pthis, mp_count) \
(refcount_ia32_add_load_strong( \
(mp_pthis), \
(mp_count), \
g_spinlock_ia32_table, \
sizeof(spinlock_ia32_padded_t) \
))

#define refcount_sub_strong(mp_this, mp_count) \
(refcount_ia32_sub_strong( \
(mp_this), \
(mp_count), \
g_spinlock_ia32_table, \
sizeof(spinlock_ia32_padded_t) \
))

#if defined(__cplusplus)
}
#endif
#endif


Chris Thomasson

unread,
Oct 6, 2006, 6:30:22 PM10/6/06
to
helper.h
-------------------------------------------


/* Copyright 2005 Chris Thomasson */

/* Small integer rounding / alignment / padding helpers. */

#if ! defined(HELPER_H_INCLUDED)
#define HELPER_H_INCLUDED


#if defined(__cplusplus)
extern "C" {
#endif

#include <stddef.h>


/* Round mp_val up to the next multiple of mp_rnd; values already on a
   multiple are unchanged. Integer arguments, mp_rnd > 0. */
#define ROUNDUP(mp_val, mp_rnd) \
(((mp_val) % (mp_rnd)) ? (mp_val) + ((mp_rnd) - ((mp_val) \
% (mp_rnd))) : (mp_val))

/* Round mp_val down to the previous multiple of mp_rnd.
   BUGFIX: the old form (ROUNDUP(v, r) - r) was wrong whenever v was
   already a multiple of r (ROUNDDOWN(8, 4) gave 4 instead of 8, and
   ROUNDDOWN(0, r) went negative). */
#define ROUNDDOWN(mp_val, mp_rnd) \
((mp_val) - ((mp_val) % (mp_rnd)))

/* Worst-case extra bytes needed to align a buffer to mp_align. */
#define MEMALIGNSZ(mp_align) \
(((size_t)(mp_align)) - 1)

/* Align mp_base up to mp_align. mp_align must be a power of two: the
   -align mask only clears the low bits in that case. */
#define MEMALIGNPTR(mp_base, mp_align) \
((void*)((((ptrdiff_t)(mp_base)) + MEMALIGNSZ(mp_align)) \
& (-((ptrdiff_t)(mp_align)))))

/* Bytes of padding to grow an object of mp_sz bytes to mp_pad bytes;
   assumes mp_sz <= mp_pad (size_t arithmetic would otherwise wrap). */
#define MEMPADSZ(mp_sz, mp_pad) \
(((size_t)(mp_pad)) - ((size_t)(mp_sz)))

/* Padding needed after mp_type so it occupies exactly mp_pad bytes. */
#define TYPEPADSZ(mp_type, mp_pad) \
(MEMPADSZ(sizeof(mp_type), (mp_pad)))

#if defined(__cplusplus)
}
#endif
#endif


Joe Seigh

unread,
Oct 6, 2006, 6:22:25 PM10/6/06
to
Chris Thomasson wrote:
> I am presenting some source code for a sample implementation of my new
> mostly lock-free reference counting algorithm that I first mention here:
>
...

>
> I will posted the source code in a couple of following posts; there are a
> total of 5 files:
>
>
> refcount_ia32.asm
> refcount_ia32.h
> spinlock_ia32.h
> helper.h
> refcount.h
>
>
> If all goes well, I think that my algorithm could be a great candidate for a
> replacement for Boosts shared_ptr. As you may know by now, shared_ptr is not
> atomically thread-safe. I advise you to take a look at this code if you are
> interested in improving Boost shared_ptr functionality and performance...
>
> My current algorithm does not use anything like GC or PDR. Instead, I have
> developed a new double-checked method that is coupled with a hashed static
> spinlock table scheme in order to get the job done, the right way. The
> algorithm only has to access the hashed spinlock table during strong
> increments and when the count drops to, or below, zero. Its a fairly
> efficient setup...
>
> I would be very interested to hear if anybody can assembly and compile it.
> If your having any trouble whatsoever, or if you have any questions, feel
> free to fire away!
>

Some pseudocode so we don't have to figure out the assembler would be nice.

--
Joe Seigh

When you get lemons, you make lemonade.
When you get hardware, you make software.

Chris Thomasson

unread,
Oct 6, 2006, 6:31:15 PM10/6/06
to
refcount.h
-----------------------------------------


/* Copyright 2005 Chris Thomasson */

/* Umbrella header: selects the architecture-specific refcount backend.
   Only the IA-32 implementation exists in this posting; other targets
   would presumably be dispatched here — TODO confirm intended layout. */

#if ! defined(REFCOUNT_H_INCLUDED)
#define REFCOUNT_H_INCLUDED


#if defined(__cplusplus)
extern "C" {
#endif

#include "refcount_ia32.h"

#if defined(__cplusplus)
}
#endif
#endif


Chris Thomasson

unread,
Oct 6, 2006, 6:41:51 PM10/6/06
to
Some of the code got a little munged when I was posting it; a couple of
"extra line-feeds" somehow got added. You can cut-and-paste it directly,
don't worry too much, your C compiler will direct you to the munged lines. I
am probably going to post the files to my AppCore website sometime today.
That way, there will be no possibility of Outlook or some other newsreader
messing with my code!

:)


The specific file that I noticed where the extra lines were added is:

spinlock_ia32.h


This is the exact macro where extra line-feeds were added:

SPINLOCK_IA32_TABLE_MEMALIGNPTR


I don't think it messed with the any of the others, fix that macro and it
should cut-paste-and-compile. Sorry for the minor inconvenience... Like I
said, I am going to post all of this stuff to my AppCore website.


Enjoy!


;)


Chris Thomasson

unread,
Oct 6, 2006, 7:42:23 PM10/6/06
to
DOH!!! I mixed some files up when I was making the post. I accidentally
posted an older version of this file. It has an error in the
refcount_ia32_add_load_strong function. Basically, the error goes like
this... It did not check the loaded refcount pointer from the shared
location for null. So, if it loads a null, it tries to process anyway!


If you have already downloaded it, please REPLACE the
refcount_ia32_add_load_strong that you have with the following one:


align 16
;-----------------------------------------------------------------------
; refcount_ia32_t* refcount_ia32_add_load_strong(
;   refcount_ia32_t* volatile*,      ; [ESP + 12 after pushes] shared location
;   int,                             ; [ESP + 16] count to add, must be > 0
;   spinlock_ia32_table_t const*,    ; [ESP + 20] hashed spinlock table
;   size_t)                          ; [ESP + 24] sizeof one padded lock slot
; Atomically loads the pointer from the shared location and acquires
; 'count' strong references to it. Returns the acquired pointer, or null
; when the shared location holds null or count <= 0.
; ABI: cdecl; clobbers EAX, ECX, EDX, flags; ESI/EBX saved and restored.
;
; This replacement fixes the remaining author-acknowledged bugs in the
; reposted version:
;   - the failed path now pops EBX/ESI before returning (it returned with
;     two words still on the stack)
;   - LOCK prefix asserted on XADD (XCHG is implicitly locked, XADD is not)
;   - the increment value is reloaded from [ESP + 16] before the XADD; ESI
;     was clobbered by the lock acquire and then reloaded with the shared
;     POINTER value, so the old code added a corrupted value to the count
;-----------------------------------------------------------------------
refcount_ia32_add_load_strong PROC
PUSH ESI
PUSH EBX
MOV ECX, [ESP + 12]             ; ECX = shared location (refcount_t**)
MOV EDX, [ESP + 20]             ; EDX = spinlock table base
MOV ESI, [ESP + 16]             ; ESI = count to add
TEST ESI, ESI
JLE refcount_ia32_add_load_strong_failed ; count must be > 0

refcount_ia32_add_load_strong_retry:
MOV EAX, [ECX]                  ; EAX = current shared pointer
TEST EAX, EAX
JE refcount_ia32_add_load_strong_failed  ; null: nothing to acquire

; hash the pointer to its lock slot: EBX = &table[((ptr * 567) >> 8) & 0xFF]
MOV EBX, EAX
IMUL EBX, 567
SHR EBX, 8
AND EBX, 0FFh
IMUL EBX, [ESP + 24]            ; scale index by slot size
LEA EBX, [EDX + EBX]

refcount_ia32_add_load_strong_lock_retry:
MOV ESI, 1
XCHG [EBX], ESI                 ; try to take the spinlock (implicit LOCK)
TEST ESI, ESI
JNE refcount_ia32_add_load_strong_lock_pause_retry
MOV ESI, [ECX]                  ; double-check: did the pointer move?
CMP EAX, ESI
JNE refcount_ia32_add_load_strong_unlock_retry
MOV ESI, [ESP + 16]             ; reload count; ESI was clobbered above
LOCK XADD [EAX], ESI            ; refs += count; ESI = previous refs
TEST ESI, ESI
JLE refcount_ia32_add_load_strong_unlock_retry ; raced a drop-to-zero; retry
XOR ESI, ESI
MOV [EBX], ESI                  ; release the spinlock
POP EBX
POP ESI
RET                             ; EAX = acquired object

refcount_ia32_add_load_strong_failed:
XOR EAX, EAX                    ; return null
POP EBX                         ; restore callee-saved regs (was missing)
POP ESI
RET

refcount_ia32_add_load_strong_lock_pause_retry:
PAUSE                           ; be polite to the sibling hyperthread
JMP refcount_ia32_add_load_strong_lock_retry

refcount_ia32_add_load_strong_unlock_retry:
XOR ESI, ESI
MOV [EBX], ESI                  ; release the spinlock, then re-read
JMP refcount_ia32_add_load_strong_retry
refcount_ia32_add_load_strong ENDP


I am going to post full working code and a Visual C++ 6.0 Workspace sometime
today.


I am sorry for making this stupid mistake and making you replace the
function...


;^(...


Chris Thomasson

unread,
Oct 6, 2006, 8:11:35 PM10/6/06
to
"Joe Seigh" <jsei...@xemaps.com> wrote in message
news:i5Wdnfd9U9f_S7vY...@comcast.com...

> Chris Thomasson wrote:
>> I am presenting some source code for a sample implementation of my new
>> mostly lock-free reference counting algorithm that I first mention here:

[...]


> Some pseudocode so we don't have to figure out the assembler would be
> nice.

Okay. ;)

Let me start off with getting the hashed spinlock layer (e.g.,
spinlock_ia32.h) out of the way:


static g_spinlock_ia32_table = ...;

spinlock_ia32_t* spinlock_hash_and_lock(void *ptr) {
spinlock_ia32_t *lock = g_spinlock_ia32_table[HASH(ptr)];
SPINLOCK_LOCK(lock);
return lock;
}

void spinlock_unlock(spinlock_ia32_t *lock) {
SPINLOCK_UNLOCK(lock);
}

That should be fairly self-explainable. Now, on to show how strong
subtractations/decrements are performed. The pseudo-code shows how the
following function works, error checking omitted:

int refcount_ia32_sub_strong(refcount_ia32_t volatile*, int,
spinlock_ia32_table_t const*, size_t);


bool sub_strong(refcount_t *_this, int count, /* lock table access */) {


// decrement the references by neg count.
if (XADD(&this->refs, -count) <= count) {


// the count dropped below 1; XADD returns old value.

// now, we need to lock the associated spinlock because
// there could be a strong incrementer/adder holding it.
// after we lock and unlock, we are guaranteed to be alone.

spinlock_ia32_t *lock = spinlock_hash_and_lock(_this);

// this is actually unnecessary; there is no need to read ahead of time.
// as i stated earlier, once we unlock, we are alone for good. well
// might as well do something in here! ;)

refcount_t copy = *_this;

spinlock_unlock(lock);

// okay; were quiescent! :)
// call the user-provided dtor, if any

if (copy.ustate.fpdtor) {
copy.ustate.fpdtor(copy.ustate.state);
}

// return true to let the caller know it can call free on _this.
return true;
}


// return false to let the caller know it cannot call free on _this.
return false;
}


Finally we get to how the strong additions/increments performed. The
pseudo-code shows how the following function works, error checking omitted:


refcount_ia32_t* refcount_ia32_add_load_strong(refcount_ia32_t* volatile*,
int, spinlock_ia32_table_t const*, size_t);


refcount_t* add_strong(refcount_t **_pthis, int count, /* lock table access
*/) {

// first things first; we need to make the initial load from the pointer
// to the shared location.

refcount_t *_this = *_pthis;

// only operate on NON-NULL values. I made a mistake and posted an older
// version of the strong increment function; it did not check for null!
// Stupid mistake. anyway, here is the check for null in the pseudo-code,
// and I posted a correction! :)


while(_this) {


// we now need to lock the associated spinlock. We need to be alone wrt
// the refcount structure that _this points to.

spinlock_ia32_t *lock = spinlock_hash_and_lock(_this);

// we now need to double-check our previous load. the pointer could
// have been swapped out, and might be in the middle of a concurrent
// reference count subtraction/decrement.

refcount_t *cmp = *_pthis;


// make sure they are the same; remember we are under the critical-section
// provided by the hashed spinlock from the pointer value of _this. ditto
// for the pointer value of cmp. if they are equal, then we have
// consistency...

if (_this == cmp ) {

// go ahead and make the increment, and record the previous value

int prev = XADD(&_this->refs, count);


// we need to make sure that we incremented a reference count value
// that was over at least over zero!

if (prev > 0) {

// we have successfully incremented the reference counter from a
// previous value that was greater than zero. we now own a reference.
// finally, we need to unlock the previously acquired spinlock, and
// return a pointer to the newly acquired refcount object.

spinlock_unlock(lock);

// that's all folks! ;)
return _this;
}


// we detected a drop-to-zero condition; the previous count was not
// greater than zero! this is safe because the decrement function takes
// the exact same hashed lock that we own, before it frees any memory.
// we need to unlock, and re-read. the shared location is not going to be
// the same pointer as the one we have now. it had to have been swapped
// out of this shared location before any decrements that would drop it
// to zero got applied...


} // if (_this == cmp ) {

// we need to unlock, and
spinlock_unlock(lock);


// make another read from the pointer to the shared location
// and try the increment all over again!

_this = *_pthis;


} // while(_this) {

// we detected a null pointer value was read from the shared location!

return 0;
}

I think I only needed to show how the strong dec's are performed alongside
strong inc's. After you replace the bugged version of the
refcount_ia32_add_load_strong I accordantly posted, it should work. Its
still experimental, however, I believe the algorithm is correct. I really
need to finely polish my code. Luckily, I don't have any algorithm logic
errors; just perhaps another stupid programming error wrt not checking for
null. I am scouring over the assembly language and other than that failure
to test a loaded pointer value to null, everything looks okay... Man, I
really hate this type of bug!


Let's have fun dereference a null pointers everybody, oh yeah!!!

;)


Any questions?


Eric Sosman

unread,
Oct 6, 2006, 9:39:09 PM10/6/06
to
Chris Thomasson wrote:

> I am presenting some source code for a sample implementation of my new
> mostly lock-free reference counting algorithm that I first mention here:

> [...]

Completely beside the point, but I find the phrase "mostly
lock-free" peculiar. Is it anything like "mostly virginal?"
If another algorithm that used only half as many locks came along,
would it be "really mostly lock-free," or perhaps "more mostly
lock-free?" And if another were invented that cut the number of
locks in half yet again, would it be "really more mostly lock-free
and this time I almost sort of mean it?"

And after all this Madison Avenue hyperbole, would any term
remain to describe an algorithm that used no locks at all? Or
have we "mostly cheapened" the language?

--
Eric "mostly harmless" Sosman
eso...@acm-dot-org.invalid

Chris Thomasson

unread,
Oct 6, 2006, 10:55:15 PM10/6/06
to
I found some more bugs in the assembly language. After I got it all
compiled through GCC, I found another bug in one of the C header files.


/* C bugs */

1. Pass macro too many parameters
2. Initializer not constant
3. Missing braces in intitalizer
4. None of these showed up in visual c++ 7


/* Assembly Bugs */

1. Forgot to assert the LOCK prefix on some atomic operations

2. Forgot to pop the stack before returning
- function: refcount_ia32_add_load_strong

3. Forgot to check for null pointer
- function: refcount_ia32_sub_strong

4. Adds in a corrupted value
- refcount_ia32_add_load_strong


I have fixed all of the god damn bugs, and am going to post them all to my
AppCore website. Before tomorrow.


Chris Thomasson

unread,
Oct 7, 2006, 3:06:10 AM10/7/06
to
"Eric Sosman" <eso...@acm-dot-org.invalid> wrote in message
news:U4qdnZXak_vemLrY...@comcast.com...

Let me see if I can clear some things up...

;^)


Here is an IA-32 based prototype of my idea; it's currently in "pre-alpha"
status:

http://appcore.home.comcast.net/vzoom/refcount/


Here is an example of how my reference counting implementation is "mostly"
lock-free... Take a look at the refcount_ia32_add_swap_weak function; it's
located at the bottom of the following file:


http://appcore.home.comcast.net/vzoom/refcount/refcount_ia32_gcc.asm
(in fact, browse the whole file; there's not a lot of locking going on at
all.)


This is a 100% pure lock-free swap function; I am also adding a 100%
lock-free cas function. That means that you can use cas/swap on the pointers
to the reference counters themselves, no locks at all. In fact, the "only
time my refcount implementation uses ANY locking at all" is when a thread
grabs its "first/initial" atomic reference to a shared object; this is the
slow-path. After the thread acquires its "first/initial" reference,
"everything wrt counter modifications or pointer swaps/cas's, stays 100%
lock-free, until the count drops to zero"; this is the last slow-path. So,
my entire counting algorithm only has 2 slow-paths:


1: Thread Acquiring Very First/Initial Reference
2: Reference Count Dropped To Zero


The locking that the slow-paths make use of is based on a global and static
hashed spinlock scheme:

http://appcore.home.comcast.net/vzoom/refcount/spinlock_ia32.h


So, multiple threads can indeed lock multiple reference counting objects in
parallel, and they never park in the kernel. The critical-sections that the
global spinlocks enforce are naturally short; they are made up of
handcrafted assembly language...

Does that address any of your concerns?

:)


Joe Seigh

unread,
Oct 7, 2006, 9:00:09 AM10/7/06
to
Chris Thomasson wrote:
[...]

>
>
>
> This is a 100% pure lock-free swap function; I am also adding a 100%
> lock-free cas function. That means that you can use cas/swap on the pointers
> to the reference counters themselves, no locks at all. In fact, the "only
> time my refcount implementation uses ANY locking at all" is when a thread
> grabs its "first/initial" atomic reference to a shared object; this is the
> slow-path. After the thread acquires its "first/initial" reference,
> "everything wrt counter modifications or pointer swaps/cas's, stays 100%
> lock-free, until the count drops to zero"; this is the last slow-path. So,
> my entire counting algorithm only has 2 slow-paths:
>
>
>
>
> 1: Thread Acquiring Very First/Initial Reference
> 2: Reference Count Dropped To Zero
>
>
>
>
> The locking that the slow-paths make use of is based on a global and static
> hashed spinlock scheme:
>
> http://appcore.home.comcast.net/vzoom/refcount/spinlock_ia32.h
>
>
> So, multiple threads can indeed lock multiple reference counting objects in
> parallel, and they never park in the kernel. The critical-sections that the
> global spinlocks enforce are naturally short; they are made up of
> handcrafted assembly language...
>

Let's see if I have this right.

You're replacing using a single global lock with hashed global
locks to reduce lock contention.

The rest of it look like how you'd do refcounting with a global
lock(s).

The problem with Boost is it's not that kind of thread safe, so
they're not likely to buy into the extra overhead. If you do
add thread safety to C++ it's going to be in the form of a new
library similar to what Java did with java.util.concurrent.

Also there's the issue of async safety since you have spin locks
in there. What would be the advantage compared to the refcount
algorithms that are async safe, apart from patent issues of course?

Richard

unread,
Oct 7, 2006, 12:35:19 PM10/7/06
to
"Chris Thomasson" <cri...@comcast.net> writes:

> DOH!!! I mixed some files up when I was making the post. I accidentally
> posted an older version of this file. It has an error in the
> refcount_ia32_add_load_strong function. Basically, the error goes like
> this... It did not check the loaded refcount pointer from the shared
> location for null. So, if it loads a null, it tries to process anyway!
>

Why don't you just stick all this on a maintained web site? Posted code
is soon out of date and google doesn't forget.

Chris Thomasson

unread,
Oct 7, 2006, 9:57:27 PM10/7/06
to
"Joe Seigh" <jsei...@xemaps.com> wrote in message
news:a4qdnQnD4qOIObrY...@comcast.com...
> Chris Thomasson wrote:
[...]


> Let's see if I have this right.
>
> You're replacing using a single global lock with hashed global
> locks to reduce lock contention.

Yes. I only use them when a thread grabs its "first/initial" reference, and when
the count drops to zero.


> The rest of it look like how you'd do refcounting with a global
> lock(s).

All of the pointer operations (e.g., the swaps and the cas on shared pointer
locations) are 100% lock-free. I only make use of the global hashed locking
scheme for initial inc, and drop-to-zero.


> The problem with Boost is it's not that kind of thread safe, so
> they're not likely to buy into the extra overhead.

Well, all of the lock-free pointer operations "currently" have less overhead
than shared_ptr...


> If you do
> add thread safety to C++ it's going to be in the form of a new
> library similar to what Java did with java.util.concurrent.


> Also there's the issue of async safety since you have spin locks
> in there. What would be the advantage compared to the refcount
> algorithms that are async safe, apart from patent issues of course?

Well, the spinlocks can easily be changed into POSIX mutexs:

static pthread_mutex_t g_spinlock_table[0x100];

The advantage is that after you grab initial reference everything is 100%
lock-free until drop-to-zero. The spinlock critical-sections are extremely
short. They can be replaced with different type of mutexs (e.g., POSIX
mutex). Humm. I need to go right now, but I will be thinking some more about
your well founded questions Joe...

Thank you.


Chris Thomasson

unread,
Oct 7, 2006, 9:58:12 PM10/7/06
to
"Richard" <rgr...@gmail.com> wrote in message
news:87ejtkk...@gmail.com...

> "Chris Thomasson" <cri...@comcast.net> writes:
>
>> DOH!!! I mixed some files up when I was making the post. I accidentally
>> posted an older version of this file. It has an error in the
>> refcount_ia32_add_load_strong function. Basically, the error goes like
>> this... It did not check the loaded refcount pointer from the shared
>> location for null. So, if it loads a null, it tries to process anyway!
>>
>
> Why don't you just stick all this on a maintained web site?

http://appcore.home.comcast.net/vzoom/refcount/


> Posted code
> is soon out of date and google doesn't forget.

Indeed! :O


Chris Thomasson

unread,
Oct 8, 2006, 12:46:57 AM10/8/06
to
"Joe Seigh" <jsei...@xemaps.com> wrote in message
news:a4qdnQnD4qOIObrY...@comcast.com...
> Chris Thomasson wrote:


> Also there's the issue of async safety since you have spin locks
> in there. What would be the advantage compared to the refcount
> algorithms that are async safe, apart from patent issues of course?

I think I misunderstood you earlier wrt spinlocks and "true" async-safety.
Okay. The only time you can safely use my refcount algorithm in a
signal-handlers context, for now, is when a thread has "already grabbed an
initial reference to a shared data-structure". You simply "cannot use it in
a signal-handler context if a thread tries to grab a reference to a
data-object that it has not previously acquired". So, it kind of breaks down
to sticking to "shared_ptr semantics" in a signal-handler; reference must be
owned prior to any new ones being acquired...

Bottom line, you can use refcount in signal-handler after first reference
acquired; this is on a per-refcount object basis. A thread that grabs a
pointer to an object X, can safely grab another reference in a 100%
lock-free manor (e.g., to pass to another thread perhaps, whatever...) in a
signal handler context.

Humm... Does that come close to addressing what you're getting at here Joe?


Chris Thomasson

unread,
Oct 8, 2006, 1:17:26 AM10/8/06
to
I forgot to mention that you CANNOT use my refcount algorithm in a
signal-handler if you are going to be performing any subtractions/decrements
at a reference count object. Technically, any subtraction/decrement could
possibly drop the count to zero and call the function pointer to a user
provided destructor. That would crash the living hell out of an application;
not to mention my algorithm takes a lock when the count drops to zero. It
takes the lock before calling the destructor, so in practice, if you
decremented the reference count to zero in a signal-handler, the result will
probably be a complete deadlock.

The only time I would use a decrement in a signal-handler is when you are
100% sure that the count will not drop to zero. A simple example:


static refcount_t* volatile g_loc = {...};
static ourapp_server_msg_ref_t* const g_appsrv_msg = {...};


/*==============================================\
\\//\\// void my_sighandler_add_and_queue(int);
\\//\\//________________________________________________
\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//
===============================================/
This signal handler increments the reference count of the object pointed to
by 'g_loc'. This *process has already acquired a reference to the object, so
it is okay to add to the reference count.

The function increments the reference count, and tries to enqueue a message
on a lock-free queue that exists in shared memory, and is monitored by our
applications server/manager process. If the server rejects the message, or
if its processing a self-maintenance cycle, the function will subsequently
decrement the counter. This is okay because we are 100% guaranteed that the
count will not drop to zero; the previous increment simply gets revoked.
_____________________________________________________
\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//
===============================================/

/* Signal handler: takes an extra reference on the object at g_loc and tries
 * to hand it to the server process through a lock-free queue.
 *
 * Safety argument (per the surrounding post): this process already holds at
 * least one reference to the object, so the increment is safe, and the
 * revoking decrement below can never drop the count to zero -- it only
 * undoes this handler's own increment.  The zero-count path (which takes a
 * spinlock and calls the user dtor) is therefore never reached from signal
 * context.
 */
void my_sighandler_add_and_queue(int sig) {
/* Weak increment: bump the count and load the current pointer in one step.
   NOTE(review): async-signal-safety of refcount_add_load_weak is asserted
   by the author but not visible here -- confirm against refcount_ia32.asm. */
refcount_t *local = refcount_add_load_weak(&g_loc, 1);

/* ourapp_server_msgpush is 100% lock-free; signal handler safe */
if (! ourapp_server_msgpush(g_appsrv_msg, local)) {
/* the server says no, revoke our previous increment */
/* Guaranteed not to hit zero (see header comment), so no lock/dtor here. */
(void)refcount_sub_strong(local, 1);
}
}


Any thoughts?


Chris Thomasson

unread,
Oct 8, 2006, 1:33:31 AM10/8/06
to
Forgot to define the asterisk!

[...]

> by 'g_loc'. This *process has already acquired a reference to the object,
> so it is okay to add to the reference count.

* I have a version of refcount that supports process-wide atomic reference
counting. A thread can grab a reference to a piece of shared memory that was
created by another process. The refcount implementation has a server process
that acts as an arbiter of sorts. If a process commits some shared memory,
and wants to only decommit it when its no longer being referenced by
'anything', it can register it with my server process via one of the
server's lock-free communication channels. An entity (e.g., a thread within a
process) can atomically acquire their first/initial reference to any
registered object by simply applying the refcount API to a pointer to
locations in shared memory.

I only did this shared memory "thing" because I knew it would be a fairly
good "learning experience and academic exercise". I am not sure if I will
make a submission to Boost that includes a version of refcount that can
operate on shared memory. Humm...

Also, I am not sure exactly how useful this would actually be. I guess it
would be good for so-called robust server applications that are made up of
multiple multi-threaded processes. Can anybody else think of practical ways
to make use of process-wide atomic reference counting in any of their
"existing" applications?


Joe Seigh

unread,
Oct 8, 2006, 8:59:57 AM10/8/06
to

Well, there's a number of solutions out there which have various trade offs.
You need to position it against them. I don't know how important async
safety is, but when you need it, you really need it.

Joe Seigh

unread,
Oct 8, 2006, 9:10:10 AM10/8/06
to
Shared memory with multiple processes has the same problem that multi-threaded
single process applications have. When one thread or process dies, the state
of memory is possibly undefined. Usually robust multi-process applications use
IPC rather than shared memory.

The possible exception to this is using read-only shared memory with process
level smart pointers. Using refcounting with read-only memory would be a
little tricky though.

Chris Thomasson

unread,
Oct 13, 2006, 9:50:49 PM10/13/06
to
"Joe Seigh" <jsei...@xemaps.com> wrote in message
news:wqadnSrgwfTda7XY...@comcast.com...

> Chris Thomasson wrote:
>> "Joe Seigh" <jsei...@xemaps.com> wrote in message
>> news:a4qdnQnD4qOIObrY...@comcast.com...
>>>Chris Thomasson wrote:

[...]

> Well, there's a number of solutions out there which have various trade
> offs.
> You need to position it against them. I don't know how important async
> safety is, but when you need it, you really need it.

How many of those solutions allow you to decrement the reference count in a
signal handlers context? This could drop to zero and call user dtor... I
have a work around for this. As a side-effect, it allows the thread that
drops to zero to not have to take a spinlock in my refcount code. The basic
idea is that the thread that drops an object to zero does not necessarily
have to be the thread that calls the user-provided dtor... Humm... I guess I
should prototype this in assembly asap! It would give my refcount algorithm
the ability to have weak increments, and decrements in a signal handler
context... Pretty cool huh? lol...

;)


Chris Thomasson

unread,
Oct 18, 2006, 3:32:39 AM10/18/06
to
"Joe Seigh" <jsei...@xemaps.com> wrote in message
news:a4qdnQnD4qOIObrY...@comcast.com...
> Chris Thomasson wrote:
> [...]

> You're replacing using a single global lock with hashed global


> locks to reduce lock contention.

[...]

I am thinking of augmenting my original algorithm with one of two tweaks
before I submit anything to Boost:


One is replacing the hashed spinlocks with a hashed rw-spinlock scheme...

http://appcore.home.comcast.net/appcore/src/ac_rwspinlock_algo1_c.html
http://appcore.home.comcast.net/appcore/include/ac_rwspinlock_algo1_h.html


The involves a PDR technique I posted earlier:

http://groups.google.com/group/comp.arch/browse_frm/thread/b2f751d5dad6c07b

I could replace the hashed spinlocks with hashed proxy collector anchors
(e.g., static my_proxy_anchor_t g_px[0x100])... I will probably have an
easier time explaining to Boost reviewers how the reference counting stays
atomic under solution one. Humm...


Chris Thomasson

unread,
Oct 18, 2006, 3:33:52 AM10/18/06
to

> The involves a PDR technique I posted earlier:
^

The other solution involves...


Reply all
Reply to author
Forward
0 new messages