Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

Is my atomic reference counting safe???

19 views
Skip to first unread message

SenderX

unread,
Mar 6, 2003, 2:14:39 AM3/6/03
to
I have implemented a quick atomic reference-counter based on the following
paper:

http://citeseer.nj.nec.com/cachedpage/481457/8

I was wondering if it is safe, or a ticking time bomb ;)


Heres the my code:


/* RefCount.h */

#ifndef REFCOUNT_H
#define REFCOUNT_H


#define _WIN32_WINNT 0x0500

#define WIN32_LEAN_AND_MEAN

#include <Windows.h>


/* Functions

InitObjRef
LoadObjRef - increment
UnloadObjRef - decrement

*/


/* 16-bit integer, addressable as a whole or as two 8-bit halves.
   (Anonymous struct members inside a union are a Microsoft extension.) */
typedef union U_Int16
{
volatile __int16 Value16;

struct
{
volatile __int8 Low8;
volatile __int8 High8;
};

} _INT16, *_LPINT16;

/* 32-bit integer, addressable as a whole or as two 16-bit halves. */
typedef union U_Int32
{
volatile __int32 Value32;

struct
{
volatile _INT16 Low16;
volatile _INT16 High16;
};

} _INT32, *_LPINT32;


/* 64-bit integer, addressable as a whole or as two 32-bit halves;
   cas_64 below uses the halves to load EDX:EAX / ECX:EBX. */
typedef union U_Int64
{
volatile __int64 Value64;

struct
{
volatile _INT32 Low32;
volatile _INT32 High32;
};

} _INT64, *_LPINT64;


/* 128-bit integer, addressable as raw bytes or as two 64-bit halves. */
typedef union U_Int128
{
volatile char Value128[sizeof(_INT64)*2];

struct
{
volatile _INT64 Low64;
volatile _INT64 High64;
};

/* BUGFIX: the second alias was "_LPINT128" (another plain union
   typedef) instead of the pointer typedef "*_LPINT128" that every
   other _INTxx union in this header declares. */
} _INT128, *_LPINT128;


/* Referenced object: an object pointer and its reference count packed
   into a single 64-bit word so both can be updated with one cas_64.
   NOTE(review): assumes 32-bit pointers and 32-bit long (Win32) so the
   struct exactly overlays Value64 -- not valid on Win64; confirm the
   target is x86 only. */
typedef union U_ObjRef
{
volatile __int64 Value64;

struct
{
volatile void* pObj;
volatile long lRefs;
};

} OBJECT, *LPOBJECT;


/* 64-bit compare and swap.
 *
 * Atomically compares *piDest with iComprand and, if they are equal,
 * stores iExchange into *piDest. Returns 1 on success, 0 on failure.
 *
 * Implemented with MSVC x86 inline assembly: LOCK CMPXCHG8B compares
 * EDX:EAX against the destination qword and writes ECX:EBX on match.
 * NOTE(review): on failure CMPXCHG8B loads the current value into
 * EDX:EAX, but that value is discarded here -- callers must re-read
 * the destination themselves before retrying.
 * NOTE(review): the asm block uses EBX and ESI; MSVC is expected to
 * preserve registers referenced in __asm blocks -- confirm for the
 * target compiler/options.
 */
inline int cas_64
(
volatile __int64* piDest,
volatile __int64 iComprand,
volatile __int64 iExchange
)

{
int iRet = 1;

volatile _INT64 iLocal,
iLocalXchg;

/* Stage the comparand and exchange where the asm can address the
   32-bit halves */
iLocal.Value64 = iComprand;

iLocalXchg.Value64 = iExchange;

_asm
{
/* Comprand -> EDX:EAX */
mov eax, [iLocal.Low32.Value32]
mov edx, [iLocal.High32.Value32]

/* Exchange -> ECX:EBX */
mov ebx, [iLocalXchg.Low32.Value32]
mov ecx, [iLocalXchg.High32.Value32]

/* Destination address */
mov esi, [piDest]

/* Execute; ZF is set iff the swap happened */
lock cmpxchg8b qword ptr [esi]
jnz cas_64_Fail
jmp cas_64_Success
}

/* Failure path: clear the result, then fall through to the common
   return below */
cas_64_Fail:

iRet = 0;

cas_64_Success:

return iRet;
}


/* Inits an object.
 *
 * Prepares an OBJECT cell: clears the packed 64-bit pointer/count
 * word, installs the stored pointer, and starts the reference count
 * at 1 (the caller holds the initial reference). Plain stores only --
 * run this before the cell becomes visible to other threads.
 */
inline void InitObjRef
(
LPOBJECT pObjRef,
LPVOID pStoredObject
)
{
    /* wipe both overlaid fields in one shot */
    pObjRef->Value64 = 0;

    /* the object this cell hands out */
    pObjRef->pObj = pStoredObject;

    /* the initial reference belongs to the caller */
    pObjRef->lRefs = 1;
}


/* Copies the object ( does not increment ).
 *
 * Takes a 64-bit snapshot of *pSource (pointer and count together)
 * into *pDest without changing any reference count.
 *
 * NOTE(review): the plain 64-bit assignment can tear on IA-32. The
 * cas_64 that follows compares *pDest against the value just written
 * to it, so when pDest is a thread-local temporary (as in LoadObjRef /
 * UnloadObjRef) the CAS always succeeds on the first pass and does not
 * re-validate against *pSource -- confirm this matches the intent of
 * the referenced paper.
 */
inline void CopyObjRef
(
volatile LPOBJECT pSource,
volatile LPOBJECT pDest
)

{
for(;;)
{
pDest->Value64 = pSource->Value64;

/* Try and read */
if ( cas_64( (volatile __int64*)pDest,
pDest->Value64,
pSource->Value64 ) )
{
break;
}
}
}


/* Loads a pointer to the object.
 *
 * Acquires a reference: snapshots the pointer/count pair, increments
 * the count, and publishes the pair with cas_64, retrying until the
 * pair is unchanged between snapshot and swap. Returns the stored
 * pointer, or NULL when the cell holds no object.
 *
 * NOTE(review): as confirmed in the replies below, this is NOT safe
 * against the count reaching zero between the snapshot and the CAS --
 * a thread that holds no reference can end up incrementing a count on
 * memory another thread has already destroyed.
 */
inline volatile void* LoadObjRef
(
volatile LPOBJECT pObjRef
)

{
OBJECT oLocal;

__int64 iOldVal;

/* Empty cell: nothing to reference */
if ( ! pObjRef->pObj )
{
return NULL;
}

for(;;)
{
/* Read the object */
CopyObjRef( pObjRef, &oLocal );

iOldVal = oLocal.Value64;

/* Increment the reference count */
oLocal.lRefs++;

/* Try and update the object; fails (and retries) if either the
   pointer or the count changed since the snapshot */
if ( cas_64( (volatile __int64*)pObjRef,
iOldVal,
oLocal.Value64 ) )
{
break;
}
}

return oLocal.pObj;
}


/* Unloads a pointer to the object.
 *
 * Releases one reference. Returns -1 if the cell is empty, 1 if this
 * call dropped the count to zero and destroyed the object, 0 otherwise.
 *
 * When the snapshot sees the count hit zero, InterlockedExchange swaps
 * the live count to -1 so exactly one thread wins the right to destroy;
 * the actual free is left commented out for the caller to supply.
 *
 * NOTE(review): subject to the same race as LoadObjRef -- a reader
 * holding no reference may still be touching the cell while the winner
 * clears Value64.
 */
inline int UnloadObjRef
(
volatile LPOBJECT pObjRef
)

{
OBJECT oLocal;

__int64 iOldVal;

/* Empty cell */
if ( ! pObjRef->pObj )
{
return -1;
}

for(;;)
{
/* Read the object */
CopyObjRef( pObjRef, &oLocal );

iOldVal = oLocal.Value64;

/* Decrement the reference count */
oLocal.lRefs--;

/* Try and update the object */
if ( cas_64( (volatile __int64*)pObjRef,
iOldVal,
oLocal.Value64 ) )
{
break;
}
}

/* Check for zero: our snapshot took the count to zero */
if ( oLocal.lRefs == 0 )
{
/* Only the thread that swaps 0 -> -1 performs destruction */
if ( InterlockedExchange
( &pObjRef->lRefs,
-1 ) == 0 )
{
/* Destroy the object */

/* free( oLocal.pObj ); */

pObjRef->Value64 = 0;

return 1;
}
}

return 0;
}


#endif


Joseph Seigh

unread,
Mar 6, 2003, 6:25:54 AM3/6/03
to

SenderX wrote:
>
> I have implemented a quick atomic reference-counter based on the following
> paper:
>
> http://citeseer.nj.nec.com/cachedpage/481457/8
>
> I was wondering if it is safe, or a ticking time bomb ;)
>
> Heres the my code:

(snip)


No. Probably not. I haven't had time to review it and it really depends on how you
use it.

It may be threadsafe but it's certainly not atomic. The article you referenced uses
DCAS to avoid the race condition where a reference count can go to zero after you've
loaded a pointer before you've had a chance to increment it, so you end up incrementing
freed (and possibly reused) memory. DCAS was only available on some M68000 processors and not on
any in use today. Most thread-safe only implementations require that you "own"
(i.e. ensure that they cannot be changed) the pointers while accessing them to ensure
that the reference count remains at least 1.

The only atomic implementation out there is the one I did a while back and posted to this
newsgroup. It doesn't require DCAS, just a doubleword compare and swap (e.g. 64 bit CAS on
32 bit systems). It may have more atomic ops than some people are comfortable with but it's
the only lock-free atomic smart pointer there is on existing hardware.

Joe Seigh

SenderX

unread,
Mar 6, 2003, 5:36:50 PM3/6/03
to
> No. Probably not. I haven't had time to review it and it really depends
on how you
> use it.

That's what I thought... Tests on the code show that it is alright to AddRef
before you pass an object to another thread. But... If a thread has access
to the shared object and has no reference to it, and tries to reference
it... Then there is a data race, the reference count drops to zero " before
" the thread had a chance to increment the object's references. Damn....

> It may be threadsafe but it's certainly not atomic.

Yep...

> The only atomic implementation out there is the one I did a while back and
posted to this
> newsgroup.

I searched google, and search some more only to find this...

" Patent number is 5,295,262 " this indeed leads to a breif description of
your alogrythm.

> It doesn't require DCAS, just a doubleword compare and swap (e.g. 64 bit
CAS on
> 32 bit systems). It may have more atomic ops than some people are
comfortable with but it's
> the only lock-free atomic smart pointer there is on existing hardware.

Do you think you could post the code one more time... Please ;)


Hillel Y. Sims

unread,
Mar 6, 2003, 10:29:12 PM3/6/03
to
"SenderX" <x...@xxx.com> wrote in message
news:54Q9a.375294$HN5.1...@rwcrnsc51.ops.asp.att.net...

> But... If a thread has access
> to the shared object and has no reference to it, and tries to
reference
> it... Then there is a data race, the reference count drops to
zero " before
> " the thread had a change to increment the objects references.
Damn....

I still do not understand why it is necessary to share
(external) pointers to stack-based objects across threads
without explicit synchronization of that operation anyhow in
order to be declared "safe"? If you forget about reference
counting for a moment, assume you are just sharing a plain old
int across threads, you will have the same "data race" problem
without external synchronization. There is nothing implicit or
hidden in this operation from the perspective of the application
programmer (as opposed to the internal reference counting, which
is hidden but does not really apply here). Am I missing
something?

hys

--
(c) 2003 Hillel Y. Sims
hsims AT factset.com


Alexander Terekhov

unread,
Mar 7, 2003, 8:34:24 AM3/7/03
to

"Hillel Y. Sims" wrote:
[...]

> I still do not understand why it is necessary to share
> (external) pointers to stack-based objects across threads
> without explicit synchronization of that operation anyhow in
> order to be declared "safe"? If you forget about reference
> counting for a moment, assume you are just sharing a plain old
> int across threads, you will have the same "data race" problem
> without external synchronization. There is nothing implicit or
> hidden in this operation from the perspective of the application
> programmer (as opposed to the internal reference counting, which
> is hidden but does not really apply here). Am I missing
> something?

Uhmm, perhaps:

http://www.hpl.hp.com/personal/Hans_Boehm/gc/example.html
("Expensive Explicit Deallocation: An Example". Note that even with
GC one would still need memory barriers to insure proper visibility
[Java's revised volatiles would work really good here], but GC makes
it possible to get rid of rather expensive locking/ref.counting.)

regards,
alexander.

--
"...<quote> IBM will put US $1 billion this year into Linux, the free
operating system...</quote>... That team of IBM programmers is
improperly extracting and using SCO's UNIX technology from the same
building that was previously the UNIX technology center."

-- http://www.sco.com/scosource/complaint3.06.03.html

"... As a result of IBM's unfair competition and the marketplace
injury sustained by SCO, SCO is requesting damages in an amount to
be proven at trial, but no less than $1 billion, together with
additional damages through and after the time of trial."

-- http://ir.sco.com/ReleaseDetail.cfm?ReleaseID=103273

SenderX

unread,
Mar 7, 2003, 3:07:48 PM3/7/03
to
> Am I missing something?

The question is how to delete the nodes that make up the fast and safe ( i
have been using them for a while ) lifo stack, or the fifo queue presented
here:

http://216.239.53.100/search?q=cache:Lkcf9uxfxlsC:perso.wanadoo.fr/gmem/even
ements/jim2002/articles/L17_Fober.pdf+lock+free+fifo+queues&hl=en&ie=UTF-8

These wait-free collections are indeed safe ( avoiding the aba problem using
sequence numbers ). There is a problem though... You CAN'T delete ANY of the
nodes from the stack or the queue. If you do, your system will crash for
sure. ( due to a race condition )

For example... Take a look at the lifo stack's pop operation:

node* pop( stack *ps )
{
node *n;

long lSeq;

for(;;)
{
/* Read head */
n = ps->pHead;

/* Read sequence */
lSeq = ps->lSeq;

/* Identify the operation: for aba */
lSeq++;

/* snip */
}
}

Now... Imagine that 6 threads concurrently enter the pop function...

All 6 read the SAME head.

Thread 1, pops the head and deletes it.

Threads 2 - 6 may or may not ( if they saw the aba, and loop around again )
have a deleted pointer, it depends if they all beat thread 1 in the deadly
data race that occurs when you free memory on wait-free objects...

I have been tinkering with another solution that seems to be working just
fine...

It involves reference counting, however you put up with the data race...

You do this by not deleting the node after the ref goes to zero. Instead you
store the node in a per-thread " hazard " list.

You know for sure that if the references dropped to zero, that there are
probably other threads looping around to the next head ( avoiding the aba
problem ). A few of them may still have pointers to the one whose references
dropped to zero. The other threads that may have references only need a
slight delay to loop around to the next head. So by storing them in a fifo
hazard list, and by only popping the back of the list when the hazard list
count gets to ( ( number of threads + number of hazard pointers ) * 2 ) you
provide a delay long enough to avoid the data race.

This paper describes a similar algorithm:

http://216.239.57.100/search?q=cache:Je1wmap_EyUC:www.research.ibm.com/peopl
e/m/michael/podc-2002.pdf+lock-free+stack&hl=en&ie=UTF-8


Joseph Seigh

unread,
Mar 7, 2003, 5:52:59 PM3/7/03
to

"Hillel Y. Sims" wrote:
> I still do not understand why it is necessary to share
> (external) pointers to stack-based objects across threads
> without explicit synchronization of that operation anyhow in
> order to be declared "safe"? If you forget about reference
> counting for a moment, assume you are just sharing a plain old
> int across threads, you will have the same "data race" problem
> without external synchronization. There is nothing implicit or
> hidden in this operation from the perspective of the application
> programmer (as opposed to the internal reference counting, which
> is hidden but does not really apply here). Am I missing
> something?
>

They're useful for implementing wait-free and lock-free algorithms.

Joe Seigh

Joseph Seigh

unread,
Mar 7, 2003, 6:08:16 PM3/7/03
to

SenderX wrote:
>
> > The only atomic implementation out there is the one I did a while back and
> posted to this
> > newsgroup.
>
> I searched google, and search some more only to find this...
>
> " Patent number is 5,295,262 " this indeed leads to a breif description of
> your alogrythm.
>
> > It doesn't require DCAS, just a doubleword compare and swap (e.g. 64 bit
> CAS on
> > 32 bit systems). It may have more atomic ops than some people are
> comfortable with but it's
> > the only lock-free atomic smart pointer there is on existing hardware.
>
> Do you think you could post the code one more time... Please ;)

I think I only posted the version with the atomic parts simulated with
mutexes here

http://groups.google.com/groups?hl=en&lr=&ie=UTF-8&selm=3DDBF8B8.AD1089BF%40xemaps.com

I'll post the version using compare and swap later.

Joe Seigh

Joseph Seigh

unread,
Mar 9, 2003, 12:24:35 PM3/9/03
to

SenderX wrote:
>
> Do you think you could post the code one more time... Please ;)


This is the cas64 version. No membars, just comments and one or two of
those aren't entirely correct.

Joe Seigh

-- atomic_ptr.h --
extern "C" {
extern int cas64(void *target, void *oldval, void *newval);
}

template<typename T> class atomic_ptr;
template<typename T> class local_ptr;
template<typename T> class atomic_ptr_ref;

// Packed pair of counts sized to be updated together with one cas64:
// ecount tracks in-flight (ephemeral) accesses, rcount long-term owners.
struct refcount {
long ecount; // ephemeral count
long rcount; // reference count
};

// An atomic_ptr's value: ephemeral count plus pointer to the shared
// count block, swapped as a single unit via cas64.
template<typename T> struct ref {
long ecount; // ephemeral count
atomic_ptr_ref<T> *ptr;
};

//=============================================================================
// atomic_ptr_ref
//
// Shared control block: holds the owned object pointer plus a packed
// (ephemeral, reference) count pair that is always updated with a
// single 64-bit CAS. Constructed only by atomic_ptr / local_ptr
// (all members private; both are friends).
//=============================================================================
template<typename T> class atomic_ptr_ref {
friend class atomic_ptr<T>;
friend class local_ptr<T>;

private:

// Starts with one long-term reference (the creating atomic_ptr)
// and no ephemeral references.
atomic_ptr_ref(T * p = 0) {
count.ecount = 0;
count.rcount = 1;
ptr = p;
};

// atomic
// Adds the two deltas to the packed counts in one CAS. Returns 0
// when both counts reached zero (caller must delete the object and
// this block), 1 otherwise.
// NOTE(review): the retry loop only terminates correctly if cas64
// refreshes *oldval with the observed value on failure -- cas64 is
// external, so confirm that contract.
int adjust(long xephemeralCount, long xreferenceCount) {
refcount oldval, newval;

oldval.ecount = count.ecount;
oldval.rcount = count.rcount;
do {
newval.ecount = oldval.ecount + xephemeralCount;
newval.rcount = oldval.rcount + xreferenceCount;

}
while (cas64(&count, &oldval, &newval) == 0);

return (newval.ecount == 0 && newval.rcount == 0) ? 0 : 1;
}

refcount count; // reference counts
T * ptr; // ptr to actual object

}; // class atomic_ptr_ref


//=============================================================================
// local_ptr
//
// Non-shared (stack-local) handle onto a reference-counted
// atomic_ptr_ref<T>. Construction takes an ephemeral reference;
// destruction drops it and deletes the object and its count block when
// both counts reach zero. Instances are auto-storage only: operator
// new is private and intentionally left undefined.
//=============================================================================
template<typename T> class local_ptr {
public:
    local_ptr() { refptr = 0; }

    // Copy: share the same control block and take one more ephemeral ref.
    local_ptr(const local_ptr<T> & src) {
        if ((refptr = src.refptr) != 0)
            refptr->adjust(+1, 0);
    }

    // Acquire from a shared atomic_ptr; getrefptr() atomically bumps
    // the ephemeral count while fetching the pointer.
    local_ptr(atomic_ptr<T> & src) {
        refptr = src.getrefptr();
        // fetch membar
    }

    // Drop our ephemeral reference; the last holder frees everything.
    ~local_ptr() {
        if (refptr != 0 && refptr->adjust(-1, 0) == 0) {
            delete refptr->ptr;
            delete refptr;
        }
    }

    // only valid argument is 0 (null). All other values silently ignored.
    local_ptr<T> & operator = (const float ** (atomic_ptr<T>:: **)()) {
        local_ptr<T> temp;
        swap(temp); // non-atomic
        return *this;
    }

    local_ptr<T> & operator = (local_ptr<T> & src) {
        local_ptr<T> temp(src);
        swap(temp); // non-atomic
        return *this;
    }

    local_ptr<T> & operator = (atomic_ptr<T> & src) {
        local_ptr<T> temp(src);
        swap(temp); // non-atomic
        return *this;
    }

    // Raw pointer to the managed object (0 when empty).
    T * get() { return (refptr != 0) ? refptr->ptr : 0; }

    T * operator -> () { return get(); }

    // BUGFIX: was "return get();", which hands back a T* as a T& and
    // fails to compile when instantiated. Dereference the pointer.
    // (Precondition: non-null, as with any smart pointer.)
    T & operator * () { return *get(); }

    bool operator == (T * rhd) { return (rhd == get() ); }
    bool operator != (T * rhd) { return (rhd != get() ); }

    // refptr == rhd.refptr iff refptr->ptr == rhd.refptr->ptr
    bool operator == (local_ptr<T> & rhd) { return (refptr == rhd.refptr);}
    bool operator != (local_ptr<T> & rhd) { return (refptr != rhd.refptr);}

    // BUGFIX: these compared against rhd.refptr, a member atomic_ptr
    // no longer has (it was replaced by the ref<T> xxx field); compare
    // against rhd.xxx.ptr, accessible because local_ptr is a friend.
    bool operator == (atomic_ptr<T> & rhd) { return (refptr == rhd.xxx.ptr);}
    bool operator != (atomic_ptr<T> & rhd) { return (refptr != rhd.xxx.ptr);}

private:
    // auto only: declared but never defined so any heap allocation
    // fails at link time. (Was defined with an empty body, which
    // falls off the end of a value-returning function -- undefined
    // behavior.)
    void * operator new (size_t);

    atomic_ptr_ref<T> * refptr;

    inline void swap(local_ptr & other) { // non-atomic swap
        atomic_ptr_ref<T> * temp;
        temp = refptr;
        refptr = other.refptr;
        other.refptr = temp;
    }

}; // class local_ptr

/* Free-standing comparisons so a raw T* may appear on the left-hand
   side; each simply forwards to the member operator on local_ptr. */
template<typename T> inline bool operator == (T * lhd, local_ptr<T> & rhd) {
    return (rhd == lhd);
}

template<typename T> inline bool operator != (T * lhd, local_ptr<T> & rhd) {
    return (rhd != lhd);
}


//=============================================================================
// atomic_ptr
//
// Shared (lock-free) smart pointer. Its entire state is one ref<T>
// (ephemeral count + control-block pointer) so reads and updates go
// through 64-bit cas64 loops. Dereferencing yields a local_ptr<T>
// temporary that keeps the object alive for the expression's lifetime.
// NOTE(review): the cas64 loops below rely on cas64 refreshing its
// oldval argument on failure -- cas64 is external; confirm that
// contract.
//=============================================================================
template<typename T> class atomic_ptr {
friend class local_ptr<T>;

public:

// Take ownership of obj (may be null): allocate the control block
// with rcount == 1.
atomic_ptr(T * obj = 0) {
xxx.ecount = 0;
if (obj != 0)
xxx.ptr = new atomic_ptr_ref<T>(obj);
else
xxx.ptr = 0;
// store membar
}

atomic_ptr(atomic_ptr & src) { // copy constructor

xxx.ecount = 0;
xxx.ptr = src.getrefptr(); // atomic

// adjust link count
// Convert the ephemeral ref taken by getrefptr into a long-term one.
if (xxx.ptr != 0)
xxx.ptr->adjust(-1, +1); // atomic
// store membar
}

~atomic_ptr() { // destructor
// Fold any ephemeral count accumulated in this pointer back into
// the control block while dropping our long-term reference.
if (xxx.ptr != 0 && xxx.ptr->adjust(xxx.ecount, -1) == 0) {
delete xxx.ptr->ptr;
delete xxx.ptr;
}
}

atomic_ptr & operator = (atomic_ptr & src) {
atomic_ptr temp(src); // copy
swap(temp); // atomic
return *this;
}

atomic_ptr & operator = (T * obj) {
atomic_ptr temp(obj);
swap(temp);
return *this;
}

// generate local temp ptr to guarantee validity of ptr
// during lifetime of expression. temp is dtor'd after
// expression has finished execution.
local_ptr<T> operator -> () { return local_ptr<T>(*this); }
local_ptr<T> operator * () { return local_ptr<T>(*this); }

bool operator == (T * rhd) {
if (rhd == 0)
return (xxx.ptr == 0);
else
return (local_ptr<T>(*this) == rhd);
}

bool operator != (T * rhd) {
if (rhd == 0)
return (xxx.ptr != 0);
else
return (local_ptr<T>(*this) != rhd);
}

bool operator == (local_ptr<T> & rhd) {return (local_ptr<T>(*this) == rhd); }
bool operator != (local_ptr<T> & rhd) {return (local_ptr<T>(*this) != rhd); }
bool operator == (atomic_ptr<T> & rhd) {return (local_ptr<T>(*this) == local_ptr<T>(rhd)); }
bool operator != (atomic_ptr<T> & rhd) {return (local_ptr<T>(*this) != local_ptr<T>(rhd)); }

private:

//long ephemeralCount; // ephemeral reference count
//atomic_ptr_ref<T> * refptr; // reference count object

ref<T> xxx;

// atomic
// Exchange our ref<T> with obj's; on return obj holds our old value
// (released when obj is destroyed).
void swap(atomic_ptr & obj) { // obj is local & non-shared
ref<T> temp;

temp.ecount = xxx.ecount;
temp.ptr = xxx.ptr;

while(cas64(&xxx, &temp, &obj.xxx) == 0);

obj.xxx.ecount = temp.ecount;
obj.xxx.ptr = temp.ptr;
}

// atomic
// Fetch the control-block pointer while atomically incrementing the
// ephemeral count in the same 64-bit word, so the block cannot be
// freed between the fetch and the caller's use.
atomic_ptr_ref<T> * getrefptr() {
ref<T> oldval, newval;

oldval.ecount = xxx.ecount;
oldval.ptr = xxx.ptr;

do {
newval.ecount = oldval.ecount + 1;
newval.ptr = oldval.ptr;
}
while (cas64(&xxx, &oldval, &newval) == 0);

return oldval.ptr;
}

}; // class atomic_ptr

/* Free-standing comparisons so a raw T* may appear on the left-hand
   side; each simply forwards to the member operator on atomic_ptr. */
template<typename T> inline bool operator == (T * lhd, atomic_ptr<T> & rhd) {
    return (rhd == lhd);
}

template<typename T> inline bool operator != (T * lhd, atomic_ptr<T> & rhd) {
    return (rhd != lhd);
}


/*-*/

SenderX

unread,
Mar 10, 2003, 7:31:52 AM3/10/03
to
Thank you so much for a stable smart pointer!

I am having a little problem using your wonderful smart pointer for
implementing my wait-free stack...

There is no way I can see that would allow me to avoid the ABA problem,
except a 128-bit cas...

Here is basically my current stack header and node structures:

/* Stack node carrying an embedded ABA sequence.
   NOTE(review): declared as a struct, so Value64 and the member block
   are laid out sequentially rather than overlaid -- the author posts a
   corrected union version ("Oops, i mean't this") in the follow-up
   message.
   NOTE(review): LPSTACK_NODE is used before its typedef completes; the
   corrected version uses "struct S_StackNode *" instead. */
typedef struct S_StackNode
{
volatile __int64 Value64;

struct
{
volatile LPSTACK_NODE pNext;
volatile __int16 SeqLow;
volatile __int16 SeqHigh;
};

} STACK_NODE, *LPSTACK_NODE;

/* Stack anchor: the head pointer overlaid with a 16-bit ABA sequence
   and a 16-bit element count so all three change under a single cas_64.
   NOTE(review): assumes 32-bit pointers so pFront plus the two __int16
   fields exactly fill Value64. */
typedef union U_Stack
{
volatile __int64 Value64;

struct
{
volatile LPSTACK_NODE pFront;

volatile __int16 Seq;
volatile __int16 Count;
};

} STACK, *LPSTACK;


The stack union provides a solution to the aba problem...

I think in order to use your smart pointers I would have to convert the
stack union to this:


typedef union U_Stack
{
volatile char Value128[sizeof( __int64 )*2];

struct
{
/* Contents of an atomic_ptr */
volatile __int32 Ephemeral;
volatile atomic_ptr_ref<T> *pFront;

volatile __int32 Seq;
volatile __int32 Count;
};

} STACK, *LPSTACK;

Do you have any ideas on the possible solutions?

( I would think that there is a logical soultion, hummm )


SenderX

unread,
Mar 10, 2003, 7:42:23 AM3/10/03
to
> typedef struct S_StackNode
> {
> volatile __int64 Value64;
>
> struct
> {
> volatile LPSTACK_NODE pNext;
> volatile __int16 SeqLow;
> volatile __int16 SeqHigh;
> };
>
> } STACK_NODE, *LPSTACK_NODE;

Oops, i mean't this: =)

typedef union U_StackNode
{
volatile __int64 Value64;

struct
{
volatile struct S_StackNode *pNext;

Joseph Seigh

unread,
Mar 10, 2003, 10:41:27 AM3/10/03
to

SenderX wrote:
>
> Thank you so much for a stable smart pointer!
>
> I am having a little problem using your wonderful smart pointer for
> implementing my wait-free stack...
>
> There is no way I can see that would allow me to avoid the ABA problem,
> except a 128-bit cas...

There is no 128 bit CAS except on IBM Z architecture.
...


> Do you have any ideas on the possible solutions?

You can't do CAS on the smart pointers themselves. I'd have to add a
CAS method to the smart pointer class for it to work correctly.

If all you are doing is pushing and popping from a stack then all you
have to do to avoid the ABA problem is add a version counter to the
stack head and increment the version counter on either the pop or on
the push operation. If you increment it on the pop, then you can get
away with a single word CAS for the push and only have to use the
double word CAS for the pop to change the version count and stack
pointer. You don't need smart pointers to do this.

Joe Seigh

SenderX

unread,
Mar 10, 2003, 4:14:20 PM3/10/03
to
> If all you are doing is pushing and popping from a stack then all you
> have to do to avoid the ABA problem is add a version counter to the
> stack head and increment the version counter on either the pop or on
> the push operation.

That's what I do now. Example pseudocode of the ABA solution:

/* Read head */
a1: STACK lLocal = GlobalStack;
a2: lOld = lLocal.Value64;
a3: lLocal.Seq++;
a4. lLocal.Count++;
a5. cas64( &GlobalStack, lOld, lLocal )
a6. if a5 false loop to a1

As you can see, this avoids the aba.

But the stack is not dynamic... I can't free nodes from this...

If a5 was popping the head read in a1 and deleted it. Other threads that
concurrently read the same head as a5 will have deleted pointers.

I solve this now by using unsafe-reference counting and hazard pointers.

I wanted to remove the need for the hazard pointers all together.

> You don't need smart pointers to do this.

The hazard pointer method I use now seems to be working. Your smart pointers
would be better solution.

How could I convert the previous pseudo-code to use your smart pointers?

I have come up with one idea, you can change you ref struct from this:

template<typename T> struct ref {
long ecount; // ephemeral count
atomic_ptr_ref<T> *ptr;
};

to this:

template<typename T> union ref {

__int64 Value64;

struct
{
__int16 ecount;
__int16 seq; /* For an aba safe compare and swap */
atomic_ptr_ref<T> *ptr;
};
};

That might do it.

Joseph Seigh

unread,
Mar 10, 2003, 7:14:34 PM3/10/03
to

SenderX wrote:
>
> > If all you are doing is pushing and popping from a stack then all you
> > have to do to avoid the ABA problem is add a version counter to the
> > stack head and increment the version counter on either the pop or on
> > the push operation.
>
> Thats what I do now. Example pseudocode of aba soultion:
>
> /* Read head */
> a1: STACK lLocal = GlobalStack;
> a2: lOld = lLocal.Value64;
> a3: lLocal.Seq++;
> a4. lLocal.Count++;
> a5. cas64( &GlobalStack, lOld, lLocal )
> a6. if a5 false loop to a1
>
> As you can see, this avoids the aba.
>
> Buy the stack is not dynamic... I can't free nodes from this...
>
> If a5 was popping the head read in a1 and deleted it. Other threads that
> concurrently read the same head as a5 will have deleted pointers.
>
> I solve this now by using unsafe-reference counting and hazard pointers.
>
> I wanted to remove the need for the hazard pointers all together.

Here's the z architecture Principles of Operation
http://publibz.boulder.ibm.com/epubs/pdf/dz9zr001.pdf

See Appendix A. Multiprogramming examples, free pool manipulation


>
> > You don't need smart pointers to do this.
>
> The hazard pointer method I use now seems to be working. Your smart pointers
> would be better solution.
>
> How could I convert the previous pseudo-code to use your smart pointers?
>
> I have come up with one idea, you can change you ref struct from this:
>
> template<typename T> struct ref {
> long ecount; // ephemeral count
> atomic_ptr_ref<T> *ptr;
> };
>
> to this:
>
> template<typename T> union ref {
>
> __int64 Value64;
>
> struct
> {
> __int16 ecount;
> __int16 seq; /* For an aba safe compare and swap */
> atomic_ptr_ref<T> *ptr;
> };
> };
>
> That might do it.

Smart pointers solve the ABA problem if you unreference the object instead of reusing
it. The reference counting will ensure that no outstanding queue operations are going
on when it is deleted. Which means when the memory is reused to create a new object,
there are no outstanding operations using the reused address. You could override the
delete operator to recycle objects. But you'd have to figure out how to recycle them
without locks.

Joe Seigh

SenderX

unread,
Mar 11, 2003, 4:49:32 AM3/11/03
to
> Smart pointers solve the ABA problem if you unreference the object instead
of reusing
> it. The reference counting will ensure that no outstanding queue
operations are going
> on when it is deleted.

So you saying I can use your smart pointers to implement the wait-free stack
cause it will avoid the aba problem?

If you mean something like the following quick sample will be safe using
your pointers, it won't:

// Example stack posted to show WHY atomic_ptr needs a CAS member: the
// author states just below that this version "is not thread safe at
// all" -- each "Needs atomic_ptr cas" site is a read-then-write window
// in which concurrent Push/Pop calls interleave.
// NOTE(review): "atomic_ref" is presumably a typo for atomic_ptr /
// local_ptr from the smart-pointer code posted upthread -- confirm.
class C_Stack
{
public:

class C_Node
{
public:

// Wraps one user object; m_Next links toward the bottom of the stack.
C_Node( LPVOID pObj = NULL )
{
m_pObj = pObj;
}

atomic_ptr< C_Node > m_Next;
LPVOID m_pObj;
};

VOID Push( LPVOID pObj )
{
// Alloc new node
atomic_ref< C_Node > NewNode( new C_Node( pObj ) );

// RACE: the front may change between this read...
NewNode->m_Next = m_Front;

// Needs atomic_ptr cas

// ...and this unconditional store.
m_Front = NewNode;
}

VOID Pop( LPVOID *ppObj )
{
*ppObj = NULL;

atomic_ref< C_Node > PopNode = m_Front;

if ( PopNode == NULL )
{
return;
}

// Needs atomic_ptr cas

// RACE: another thread may have popped the same node already.
m_Front = PopNode->m_Next;

*ppObj = PopNode->m_pObj;
}

private:

atomic_ptr< C_Node > m_Front;

};


This will not leak anything, but it is not thread safe at all.

If multiple threads concurrently pop and push, the stack will crash for
sure.

How can I possibly convert the sample stack using your smart pointers? I
know how to do it using per-thread hazard storage, but i think that solution
is crappy. I would rather use your pointers. How would you implement a
compare and swap of a atomic_ptr? If smart pointers could be cas'd, you
could implement wait-free stacks and queues with these pointers... Without
it, i don't see how you can use these pointers for wait-free collections (
multiple threads popping and pushing concurrently ).

Any ideas???


Joseph Seigh

unread,
Mar 11, 2003, 9:16:04 AM3/11/03
to

SenderX wrote:
>
> > Smart pointers solve the ABA problem if you unreference the object instead
> of reusing
> > it. The reference counting will ensure that no outstanding queue
> operations are going
> > on when it is deleted.
>
> So you saying I can use your smart pointers to implement the wait-free stack
> cause it will avoid the aba problem?
>
> If you mean something like the following quick sample will be safe using
> your pointers, it won't:
>

(snip)

You are right, it won't because is needs CAS which is not in atomic_ptr yet.

>
> This will not leak anything, but it is not thread safe at all.
>
> If multiple threads concurrently pop and push, the stack will crash for
> sure.
>
> How can I possibly convert the sample stack using your smart pointers? I
> know how to do it using per-thread hazard storage, but i think that solution
> is crappy. I would rather use your pointers. How would you implement a
> compare and swap of a atomic_ptr? If smart pointers could be cas'd, you
> could implement wait-free stacks and queues with these pointers... Without
> it, i don't see how you can use these pointers for wait-free collections (
> multiple threads popping and pushing concurrently ).
>
> Any ideas???


You can use the technique from the z architecture manual with plain pointers.
Or, you can wait until I put CAS into atomic_ptr which won't be right away.
I have other stuff I should be doing.

I'd recommend the former for now. atomic_ptr is pretty heavyweight and there is
a forward progress issue with non-trivial CAS loops. Trivial ones are ok since
the hardware implementers are told they'd better be.

The atomic smart pointer are ok as is for read lock-free algorithms where you use
normal synchronization for write access. DCL and AWTEventMulticaster are
examples of this.

Joe Seigh

Joseph Seigh

unread,
Mar 13, 2003, 10:02:02 AM3/13/03
to

I wrote:
> ... atomic_ptr is pretty heavywait and there is


> a forward progress issue with non-trivial CAS loops. Trivial ones are ok since
> the hardware implementers are told they'd better be.

Speaking of that, I actually ran into that testing CAS on w2k. All this time
I was trying to get thread interleaving on linux and windows does it fine.

Test program uses copy on write to increment a shared integer object. Ten
threads are running looping the following code. z is the global shared ptr,
x and y are local ptrs.

y = new Item();
do {
x = z;
y->count = x->count + 1;
}
while (z.cas(x, y) == false);

This has a 80% cas retry rate. The likely cause is window heap manager locking (and/or
the debug mode C runtime lib on top of that). It probably uses critical sections which
have really bad contention characteristics.

Adding one simple statement to the above changes the cas retry rate to under 1% and it
runs 20% faster.

y = new Item();
do {
x = 0;
x = z;
y->count = x->count + 1;
}
while (z.cas(x, y) == false);

The "x = 0" forces the deallocation of the object referenced by x before the cas "critical
section" proper is entered. I.e., the copy of z is more recent and so a retry is less
likely to be required.


y = new Item();
do {
x = z;
y->count = x->count + 1;
}
while (z.cas(x, y) == false);
SwitchToThread();

The above has a 0% cas retry rate and runs 50% faster. Windows critical sections run much faster
when there is no contention apparently.

y = new Item();
do {
x = z;
y->count = x->count + 1;
}
while (z.cas(x, y) == false);
y = x; // reuse old Item

Avoiding heap manager calls is of course the most efficient. The above runs 5x faster and has
less than 1% cas retry rate.

I haven't run this on linux yet. But most heap managers are likely to use locking. So, strictly
speaking, I'd have to make atomic_ptr use asynchronous deletes to achieve true lock-free behavior.
That would probably require virtual destructors and a designated thread to do it.

Joe Seigh

SenderX

unread,
Mar 13, 2003, 6:01:52 PM3/13/03
to
> y = new Item();
> do {
> x = z;
> y->count = x->count + 1;
> }
> while (z.cas(x, y) == false);
>
> This has a 80% cas retry rate. The likely cause is window heap manager
locking (and/or
> the debug mode C runtime lib on top of that). It probably uses critical
sections which
> have really bad contention characteristics.

The heap manager does use critical sections with spin counts ( on smp ).

> y = new Item();
> do {
> x = z;
> y->count = x->count + 1;
> }
> while (z.cas(x, y) == false);
> y = x; // reuse old Item
>
> Avoiding heap manager calls is of course the most efficient. The above
runs 5x faster and has
> less than 1% cas retry rate.
>
> I haven't run this on linux yet. But most heap managers are likely to use
locking. So, strictly
> speaking, I'd have to make atomic_ptr use asynchronous deletes to achieve
true lock-free behavior.
> That would probably require virtual destructors and a designated thread to
do it.

So now the problem with deleting nodes is the heap manager, which is minor
compared with an application crashing due to threads modifying deleted
pointers. I would really be interested in how you can compare an atomic_ptr
with a local_ptr, and swap a local_ptr with an atomic_ptr?

Even if you need to tweak the way nodes are deleted, you don't have to worry
about other threads working on deleted pointers ( cause of your
atomic_ptr ). I think that cas'ing atomic_ptrs would be the, by far, the
best solution for implementing wait-free dynamic collections. The only other
solution that seems to work is hazard pointers.

By the way, on using hazard pointers for deleting nodes ( mine haven't
crashed yet )... It that a good solution or is it asking for trouble? It
seems the algorithm for hazard pointers is to provide a simple delay of node
deletion based on the number of threads and number of hazard pointers. This
seems like the odds of crashing are slim. The faster the processors and the
more concurrent running threads they can handle ( hyperthreading ), the more
solid the hazard pointer algo. gets cause other threads that have pointers
to nodes in hazard will move on to the next node ( avoiding aba ) faster,
and faster.

But as i said before, your atomic_ptrs seem like the best wait-free solution
out there, with atomic_ptr cas that is.


Joseph Seigh

unread,
Mar 13, 2003, 7:29:45 PM3/13/03
to

SenderX wrote:
>
> So now the problem with deleting nodes is the heap manager, which is minor
> compared with an application crashing do to threads modifying deleted
> pointers. I would really be interested in how you can compare an atomic_ptr
> with a local_ptr, and swap a local_ptr with an atomic_ptr?


// NOTE(review): method excerpt quoted in the post; the enclosing
// atomic_ptr<T> class (and the members 'xxx' and 'refptr') are not visible
// here, so this fragment cannot compile on its own.
// Compare-and-swap on an atomic_ptr: if *this currently references the same
// atomic_ptr_ref as 'cmp', atomically replace it with 'xchg'.
bool cas(local_ptr<T> cmp, atomic_ptr<T> xchg) {
ref<T> temp;
bool rc = false;

// Seed the comparand with the current ephemeral count and the expected
// ref-block pointer.
temp.ecount = xxx.ecount;
temp.ptr = cmp.refptr;

do {
// cas64 is expected to write the current value of the target back into
// 'temp' on failure (IBM cs/cds style); without that write-back this
// loop spins forever -- the exact bug discussed later in the thread.
if (cas64(&xxx, &temp, &xchg.xxx) != 0) {
// Success: hand the displaced value to 'xchg' so its destructor
// releases the old reference.
xchg.xxx = temp;
rc = true;
break;
}

}
// Retry only while the failure was a mere ecount change; a different
// pointer means the comparison genuinely failed.
while (cmp.refptr == temp.ptr);

return rc;
}

Either parameter can be atomic_ptr or local_ptr. There are copy ctors
to cover either case. (note to self - I do need to come up with a
better field name than xxx sometime)

>
> Even if you need to tweak the way nodes are deleted, you don't have to worry
> about other threads working on deleted pointers ( cause of your
> atomic_ptr ). I think that cas'ing atomic_ptrs would be the, by far, the
> best solution for implementing wait-free dynamic collections. The only other
> solution that seems to work is hazard pointers.
>
> By the way, on using hazard pointers for deleting nodes ( mine haven't
> crashed yet )... It that a good solution or is it asking for trouble? It
> seems the algorithm for hazard pointers is to provide a simple delay of node
> deletion based on the number of threads and number of hazard pointers. This
> seems like the odds of crashing are slim. The faster the processors and the
> more concurrent running threads they can handle ( hyperthreading ), the more
> solid the hazard pointer algo. gets cause other threads that have pointers
> to nodes in hazard will move on to the next node ( avoiding aba ) faster,
> and faster.

I didn't look at hazard pointers in detail. They're probably ok. I just
don't know how well they scale though.

Joe Seigh

SenderX

unread,
Mar 13, 2003, 11:26:28 PM3/13/03
to
When i converted that sample stack code i posted to use the new cas:


/*
 * Lock-free LIFO stack built on the reference-counted atomic_ptr/local_ptr
 * smart pointers quoted earlier in the thread.  The reference counts keep a
 * popped node alive while other threads still hold it, which sidesteps the
 * ABA / use-after-free problems of a naive CAS stack.
 */
class C_AtomicStack
{
public:

class C_Node
{
public:

C_Node( LPVOID pObj = NULL )
{
m_pObj = pObj;
}

atomic_ptr< C_Node > m_Next;  // next node down the stack
LPVOID m_pObj;                // caller's payload
};

// Push pObj onto the front of the stack; classic lock-free retry loop.
VOID Push( LPVOID pObj )
{
// Alloc new node

atomic_ref< C_Node > NewNode( new C_Node( pObj ) );

do
{
// Read the front

NewNode->m_Next = m_Front;
}

// Compare the read front with the real front, and swap the new node

while( ! m_Front.cas( NewNode->m_Next, NewNode ) );
}

// Pop the front node.  *ppObj receives the payload, or NULL when the
// stack is empty.
VOID Pop( LPVOID *ppObj )
{
*ppObj = NULL;

local_ref< C_Node > PopNode;

do
{
// Read the front

PopNode = m_Front;

// FIX: the original dereferenced PopNode->m_Next without checking
// for an empty stack, crashing on a null pointer.  Bail out and
// leave *ppObj == NULL instead.
if ( PopNode == NULL )
{
return;
}
}

// Compare the read front with the real front, and swap the next node

while( ! m_Front.cas( PopNode, PopNode->m_Next ) );

*ppObj = PopNode->m_pObj;
}

private:

atomic_ptr< C_Node > m_Front;  // top of stack

};


There was an unresolved issue of the following pieces of code looping forever:

1. bool atomic_ptr::cas(local_ptr<T> cmp, atomic_ptr<T> xchg)

> do {
> if (cas64(&xxx, &temp, &xchg.xxx) != 0) {
> xchg.xxx = temp;
> rc = true;
> break;
> }
> while (cmp.refptr == temp.ptr);


2. int atomic_ptr_ref::adjust(long xephemeralCount, long xreferenceCount)

> oldval.ecount = count.ecount;
> oldval.rcount = count.rcount;
> do {
> newval.ecount = oldval.ecount + xephemeralCount;
> newval.rcount = oldval.rcount + xreferenceCount;
> }
> while (cas64(&count, &oldval, &newval) == 0);


Is there something in the sample stack code that can cause this?


Joseph Seigh

unread,
Mar 14, 2003, 7:08:37 AM3/14/03
to

SenderX wrote:
>
> When i converted that sample stack code i posted to use the new cas:
>
> class C_AtomicStack

...


>
> The was an unresolved issue of the following pieces of code looping forever:
>
> 1. bool atomic_ptr::cas(local_ptr<T> cmp, atomic_ptr<T> xchg)

...


> 2. int atomic_ptr_ref::adjust(long xephemeralCount, lon

...


> Is there something in the sample stack code that can cause this?

Your cas_64 posted earlier doesn't return the updated value. The
cas64 I am using is styled after IBM's cs/cds. So the loops in the
above methods will loop forever since they don't get a more current
copy of the target. The atomic_ptr::cas method doesn't return a
current target value. I haven't decided if it's worth it yet.

The cas64 I am using is below. Unlocked cmpxchg8b is 20% faster than
locked. I need to test the call vs. inline overhead against always
using the lock prefix.

I'll post an updated atomic_ptr.h later. I need to write a testcase
to check all the valid uses compile ok.

Joe Seigh

-- cas64.h --
#ifndef CAS64_H
#define CAS64_H

#ifdef __cplusplus
extern "C" {
#endif

/*
 * 64-bit compare-and-swap, IBM cs/cds style: on failure the comparand
 * (second argument) is refreshed with the current value of the target so
 * caller retry loops see fresh data.  Returns nonzero on success.
 * Dispatched through a function pointer so the first call can select a
 * uniprocessor vs multiprocessor implementation (see cas64.c).
 */
extern int (*cas64_)(void *, void *, void *);
#define cas64(a, b, c) (*cas64_)(a, b, c)

#ifdef __cplusplus
}
#endif

#endif // CAS64_H

-- cas64.c --
#define _WIN32_WINNT 0x0500
#include <windows.h>
#include <winbase.h>
/* FIX: cas64.h is a project-local header, not a system header; the quote
 * form searches the source file's directory first, whereas <cas64.h> only
 * works if the project directory happens to be on the include path. */
#include "cas64.h"


//-----------------------------------------------------------------------------------
// Uniprocessor cas64: cmpxchg8b WITHOUT the lock prefix (the post notes the
// unlocked form is ~20% faster; safe only on a single-CPU machine).
// On failure, copies the current value of *dest back into *xcmp
// (IBM cs/cds style) so caller retry loops get a fresh comparand.
// Returns 1 on success, 0 on failure.
//-----------------------------------------------------------------------------------
int cas64_up(void * dest, void * xcmp, void * xxchg) {
BYTE retval;

__asm
{
mov esi, [xcmp] ; comparand -> EDX:EAX
mov eax, [esi + 0]
mov edx, [esi + 4]

mov esi, [xxchg] ; exchange -> ECX:EBX
mov ebx, [esi + 0]
mov ecx, [esi + 4]

mov esi, [dest] ; destination
cmpxchg8b [esi]
sete retval ; ZF set -> the swap happened
jz xxxx

mov esi, [xcmp] ; update comparand with the value actually seen
mov [esi + 0], eax;
mov [esi + 4], edx;
xxxx:
};

return (int)retval;
}


//-----------------------------------------------------------------------------------
// Multiprocessor cas64: identical to cas64_up but with the lock prefix so
// cmpxchg8b is atomic with respect to other processors.  On failure, copies
// the current value of *dest back into *xcmp (cs/cds style).
// Returns 1 on success, 0 on failure.
//-----------------------------------------------------------------------------------
int cas64_mp(void * dest, void * xcmp, void * xxchg) {
BYTE retval;

__asm
{
mov esi, [xcmp] ; comparand -> EDX:EAX
mov eax, [esi + 0]
mov edx, [esi + 4]

mov esi, [xxchg] ; exchange -> ECX:EBX
mov ebx, [esi + 0]
mov ecx, [esi + 4]

mov esi, [dest] ; destination
lock cmpxchg8b [esi]
sete retval ; ZF set -> the swap happened
jz yyyy

mov esi, [xcmp] ; update comparand with the value actually seen
mov [esi + 0], eax;
mov [esi + 4], edx;
yyyy:
};

return (int)retval;
}


//-----------------------------------------------------------------------------------
// First-call bootstrap: probes the processor count once, installs the
// matching implementation into the cas64_ function pointer, then forwards
// this first request.  Subsequent calls dispatch straight to the selected
// routine.
// NOTE(review): concurrent first calls may each run GetSystemInfo and store
// the same pointer -- the store itself is unsynchronized; presumably relied
// on being benign on x86.  Confirm if ported.
//-----------------------------------------------------------------------------------
int cas64_a(void * dest, void * xcmp, void * xxchg) {
SYSTEM_INFO sysinfo;
GetSystemInfo(&sysinfo);
if (sysinfo.dwNumberOfProcessors > 1)
cas64_ = cas64_mp;
else
cas64_ = cas64_up;

return (*cas64_)(dest, xcmp, xxchg);
}

//-----------------------------------------------------------------------------------
// cas64_ starts life pointing at the bootstrap; cas64_a replaces it on
// first use with cas64_up or cas64_mp.
//-----------------------------------------------------------------------------------
int (*cas64_)(void *, void *, void *) = &cas64_a;

/*-*/

Joseph Seigh

unread,
Mar 14, 2003, 5:13:26 PM3/14/03
to
This is the current version. Changed to exploit stuff that turns out
only works with vc++ but not gcc on linux. Changed back except for
a couple of things to make vc++ recognise a null pointer on the left
side of a comparison. wtf is c++ so incredibly brain-damaged about
null pointers?

Joe Seigh

-- atomic_ptr.h --
/* FIX: cas64.h is a project-local header; use the quote form so the
 * preprocessor searches the source file's directory first. */
#include "cas64.h"

template<typename T> class atomic_ptr;
template<typename T> class local_ptr;
template<typename T> class atomic_ptr_ref;

// Paired reference counts for atomic_ptr_ref, sized and laid out so the
// pair can be updated as one 64-bit unit via cas64.
struct refcount {


long ecount; // ephemeral count: transient holds by local_ptr instances

long rcount; // reference count: long-lived atomic_ptr owners
};

// The word pair that atomic_ptr actually cas64's: an ephemeral count packed
// next to the pointer to the shared atomic_ptr_ref block.  The ecount
// bridges the window between reading ptr and adjusting the shared counts.
template<typename T> struct ref {


long ecount; // ephemeral count accumulated on this atomic_ptr
atomic_ptr_ref<T> *ptr; // shared count block + pointer to the real object
};

//=============================================================================


// atomic_ptr_ref
//
// Shared control block: holds the object pointer plus the ephemeral /
// reference count pair.  Constructed and destroyed only by atomic_ptr and
// local_ptr (all members private; both are friends).
//=============================================================================
template<typename T> class atomic_ptr_ref {
friend class atomic_ptr<T>;
friend class local_ptr<T>;

private:

// Starts with one reference (the creating owner) and no ephemeral holds.
atomic_ptr_ref(T * p = 0) {
count.ecount = 0;
count.rcount = 1;
ptr = p;
};

// atomic
// Adds the signed deltas to both counts in a single cas64 loop.
// Returns 0 when both counts hit zero (caller must delete object + block),
// 1 otherwise.  Relies on cas64 refreshing oldval on failure (cs/cds
// style); with a cas64 that does not write back, this loop spins forever
// -- the exact symptom reported earlier in the thread.
int adjust(long xephemeralCount, long xreferenceCount) {
refcount oldval, newval;

oldval.ecount = count.ecount;


oldval.rcount = count.rcount;
do {
newval.ecount = oldval.ecount + xephemeralCount;
newval.rcount = oldval.rcount + xreferenceCount;

}
while (cas64(&count, &oldval, &newval) == 0);

return (newval.ecount == 0 && newval.rcount == 0) ? 0 : 1;
}

refcount count; // reference counts
T * ptr; // ptr to actual object

}; // class atomic_ptr_ref


//=============================================================================
// local_ptr
//
// Thread-private smart pointer: holds one ephemeral count on the shared
// atomic_ptr_ref.  Cheap to copy within a thread; not itself atomic.
//
// NOTE(review): this listing was mangled by the archive -- operator=(T*) has
// no body and the class appears to end mid-definition (the member
// declaration of 'refptr' and the remaining operators are missing).  Do not
// treat this excerpt as the complete class.
//=============================================================================
template<typename T> class local_ptr {

friend class atomic_ptr<T>;
public:

// Take ownership of a raw object: build a fresh control block holding a
// single ephemeral count (and no long-lived references).
local_ptr(T * obj = 0) {
if (obj != 0) {
refptr = new atomic_ptr_ref<T>(obj);
refptr->count.ecount = 1;
refptr->count.rcount = 0;
}
else
refptr = 0;
}

// Copy: share the control block, bump the ephemeral count.
local_ptr(const local_ptr<T> & src) {
if ((refptr = src.refptr) != 0)
refptr->adjust(+1, 0);
}

// Snapshot an atomic_ptr; getrefptr() atomically bumps its ecount so the
// block cannot be freed while this local_ptr lives.
local_ptr(atomic_ptr<T> & src) {
refptr = src.getrefptr();
}

// Drop the ephemeral hold; last one out deletes object and control block.
~local_ptr() {


if (refptr != 0 && refptr->adjust(-1, 0) == 0) {
delete refptr->ptr;
delete refptr;
}
}

// NOTE(review): body truncated by the archive from here on.
local_ptr<T> & operator = (T * obj) {
local_ptr<T> temp(obj);

atomic_ptr_ref<T> * refptr;


}; // class local_ptr

// Reversed-operand comparison helpers so `p == rhd` works with the pointer
// on the left.  The int overloads exist so a literal 0 / NULL on the left
// compiles -- the pre-nullptr VC++ workaround the author complains about
// above.
template<typename T> inline bool operator == (int lhd, local_ptr<T> & rhd)
{ return ((T *)lhd == rhd); }

template<typename T> inline bool operator != (int lhd, local_ptr<T> & rhd)
{ return ((T *)lhd != rhd); }

template<typename T> inline bool operator == (T * lhd, local_ptr<T> & rhd)
{ return (rhd == lhd); }

template<typename T> inline bool operator != (T * lhd, local_ptr<T> & rhd)
{ return (rhd != lhd); }


//=============================================================================
// atomic_ptr
//
// Shared, atomically-updatable smart pointer.  The (ecount, ptr) pair 'xxx'
// is always read-modified-written as one 64-bit unit via cas64; the
// ephemeral count records readers that grabbed 'ptr' but have not yet
// bumped the shared counts in the control block.
//=============================================================================
template<typename T> class atomic_ptr {
friend class local_ptr<T>;

public:

// Take ownership of a raw object (control block starts with rcount = 1).
atomic_ptr(T * obj = 0) {
xxx.ecount = 0;
if (obj != 0) {
xxx.ptr = new atomic_ptr_ref<T>(obj);
}
else
xxx.ptr = 0;
}

// Share a local_ptr's control block as a new long-lived reference.
atomic_ptr(local_ptr<T> & src) { // copy constructor
xxx.ecount = 0;
if ((xxx.ptr = src.refptr) != 0)
xxx.ptr->adjust(0, +1);
}

// Copy from another atomic_ptr: getrefptr() takes an ephemeral hold
// atomically, then adjust(-1, +1) converts it into a real reference.
atomic_ptr(atomic_ptr<T> & src) { // copy constructor


xxx.ecount = 0;
xxx.ptr = src.getrefptr(); // atomic

// adjust link count
if (xxx.ptr != 0)
xxx.ptr->adjust(-1, +1); // atomic
}

// Flush any ephemeral counts accumulated here into the shared block and
// drop our reference; last one out deletes object + block.
~atomic_ptr() { // destructor


if (xxx.ptr != 0 && xxx.ptr->adjust(xxx.ecount, -1) == 0) {
delete xxx.ptr->ptr;
delete xxx.ptr;
}
}

// All assignments funnel through the atomic swap(); the displaced value
// rides out in 'temp' and is released by temp's destructor.
atomic_ptr & operator = (T * obj) {
atomic_ptr<T> temp(obj);


swap(temp); // atomic
return *this;
}

atomic_ptr & operator = (local_ptr<T> & src) {
atomic_ptr<T> temp(src);


swap(temp); // atomic
return *this;
}

atomic_ptr & operator = (atomic_ptr<T> & src) {
atomic_ptr<T> temp(src);


swap(temp); // atomic
return *this;
}

// generate local temp ptr to guarantee validity of ptr
// during lifetime of expression. temp is dtor'd after
// expression has finished execution.
local_ptr<T> operator -> () { return local_ptr<T>(*this); }
local_ptr<T> operator * () { return local_ptr<T>(*this); }

// Null checks read xxx.ptr directly; comparisons against a real object
// go through a temporary local_ptr to pin the block during the test.
bool operator == (T * rhd) {
if (rhd == 0)
return (xxx.ptr == 0);
else
return (local_ptr<T>(*this) == rhd);
}

bool operator != (T * rhd) {
if (rhd == 0)
return (xxx.ptr != 0);
else
return (local_ptr<T>(*this) != rhd);
}

bool operator == (local_ptr<T> & rhd) {return (local_ptr<T>(*this) == rhd); }
bool operator != (local_ptr<T> & rhd) {return (local_ptr<T>(*this) != rhd); }
bool operator == (atomic_ptr<T> & rhd) {return (local_ptr<T>(*this) == local_ptr<T>(rhd)); }
bool operator != (atomic_ptr<T> & rhd) {return (local_ptr<T>(*this) != local_ptr<T>(rhd)); }

// Compare-and-swap: if *this still references cmp's block, atomically
// replace it with xchg's value.  On success the displaced value is handed
// to 'xchg' so its destructor releases the old reference.  Retries only
// while the failure was a mere ecount change (same ptr); requires a cas64
// that refreshes 'temp' on failure.
bool cas(local_ptr<T> cmp, atomic_ptr<T> xchg) {


ref<T> temp;
bool rc = false;

temp.ecount = xxx.ecount;
temp.ptr = cmp.refptr;

do {


if (cas64(&xxx, &temp, &xchg.xxx) != 0) {
xchg.xxx = temp;
rc = true;
break;
}

}
while (cmp.refptr == temp.ptr);

return rc;
}

private:

ref<T> xxx;

// atomic
// Unconditional exchange of xxx with obj.xxx; obj must be a private,
// unshared temporary (callers pass a stack-local copy).
void swap(atomic_ptr & obj) { // obj is local & non-shared
ref<T> temp;

temp.ecount = xxx.ecount;
temp.ptr = xxx.ptr;

while(cas64(&xxx, &temp, &obj.xxx) == 0);

obj.xxx.ecount = temp.ecount;
obj.xxx.ptr = temp.ptr;
}

// atomic
// Take an ephemeral hold: bump xxx.ecount by one in a cas64 loop and
// return the pinned block pointer.  The hold is later folded into the
// shared counts via adjust().
atomic_ptr_ref<T> * getrefptr() {
ref<T> oldval, newval;

oldval.ecount = xxx.ecount;
oldval.ptr = xxx.ptr;

do {


newval.ecount = oldval.ecount + 1;
newval.ptr = oldval.ptr;
}
while (cas64(&xxx, &oldval, &newval) == 0);

return oldval.ptr;
}

}; // class atomic_ptr

// Reversed-operand null comparisons for atomic_ptr: lets a literal 0 / NULL
// appear on the left-hand side (pre-nullptr VC++ workaround).
template<typename T> inline bool operator == (int lhd, atomic_ptr<T> & rhd)
{ return ((T *)lhd == rhd); }

template<typename T> inline bool operator != (int lhd, atomic_ptr<T> & rhd)
{ return ((T *)lhd != rhd); }

SenderX

unread,
Mar 15, 2003, 1:54:40 AM3/15/03
to
Houston, we have liftoff! ;)

I have successfully implemented the lock-free lifo stack and fifo queue
using your super pointers...

Thank you very much for your time... This pointer works great!!!

Come to think of it, you could use this pointer for very complex
wait-free structures, perfect for real-time systems and hyperthreading, smp,
ect...

I have added per-thread memory pools for the collection nodes and the
atomic_ptr_ref structs, it improves performance. Your pointers combined with
free lists can possibly turn c++ into a wait-free garbage collected language
that allows threads to reference shared data, when they don't already own a
reference themselves. Programmers that use .net just for garbage collection
should look at your pointers...

Thank you.


0 new messages