http://groups.google.com/group/comp.programming.threads/browse_frm/thread/4e8717d3bcdedfe9
All of its "critical functionality", as of now, is created in pure MASM IA32 
assembly language; a GAS compatible version is coming soon. I decided to go 
ahead and do most of it in assembly language simple because I did not want 
to deal with any compiler ordering. Atomic lock-free reference counting 
algorithms need to be precisely executed, as is. The assembly language API 
adheres to the C ABI; you can use it in your existing C programs.
I will posted the source code in a couple of following posts; there are a 
total of 5 files:
refcount_ia32.asm
refcount_ia32.h
spinlock_ia32.h
helper.h
refcount.h
If all goes well, I think that my algorithm could be a great candidate for a 
replacement for Boost's shared_ptr. As you may know by now, shared_ptr is not 
atomically thread-safe. I advise you to take a look at this code if you are 
interested in improving Boost shared_ptr's functionality and performance...
My current algorithm does not use anything like GC or PDR. Instead, I have 
developed a new double-checked method that is coupled with a hashed static 
spinlock table scheme in order to get the job done, the right way. The 
algorithm only has to access the hashed spinlock table during strong 
increments and when the count drops to, or below, zero. It's a fairly 
efficient setup...
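For reference, here is a rough C rendering of the pointer hash the assembly 
uses to pick a spinlock; the constant 567 and the 0FFh mask come straight 
from the .asm, while the helper name itself is just mine:

#include <stddef.h>
#include <stdint.h>

/* Mirrors the IMUL 567 / SHR 8 / AND 0FFh sequence in refcount_ia32.asm.
   The assembly then scales this index by sizeof(spinlock_ia32_padded_t)
   to form a byte offset into the static lock table. */
static size_t spinlock_ia32_hash(void const *ptr) {
  return (size_t)((((uintptr_t)ptr * 567) >> 8) & 0xFF);
}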
I would be very interested to hear if anybody can assemble and compile it. 
If you're having any trouble whatsoever, or if you have any questions, feel 
free to fire away!
:)
Any thoughts?
; Copyright 2005 Chris Thomasson
.686
.XMM
.MODEL FLAT, C
.CODE
align 16
; int refcount_ia32_init(
;   refcount_ia32_t volatile*,
;   int,
;   refcount_ia32_fpdtor_t,
;   void*)
refcount_ia32_init PROC
  MOV ECX, [ESP + 4]
  MOV EAX, [ESP + 8]
  CMP EAX, 0
  JLE refcount_ia32_init_failed
  MOV EDX, [ESP + 12]
  MOV [ECX], EAX
  MOV [ECX + 4], EDX
  MOV EDX, [ESP + 16]
  MOV [ECX + 8], EDX
  MOV EAX, 1
  RET
refcount_ia32_init_failed:
  XOR EAX, EAX
  RET
refcount_ia32_init ENDP
align 16
; refcount_ia32_t* refcount_ia32_add_load_strong(
;   refcount_ia32_t* volatile*,
;   int,
;   spinlock_ia32_table_t const*,
;   size_t)
refcount_ia32_add_load_strong PROC
  PUSH ESI
  PUSH EBX
  MOV ECX, [ESP + 12]
  MOV EDX, [ESP + 20]
  MOV ESI, [ESP + 16]
  CMP ESI, 0
  JG refcount_ia32_add_load_strong_retry
  XOR EAX, EAX
  RET
refcount_ia32_add_load_strong_retry:
  MOV EAX, [ECX]
  MOV EBX, EAX
  IMUL EBX, 567
  SHR EBX, 8
  AND EBX, 0FFh
  IMUL EBX, [ESP + 24]
  LEA EBX, [EDX + EBX]
refcount_ia32_add_load_strong_lock_retry:
  MOV ESI, 1
  XCHG [EBX], ESI
  TEST ESI, ESI
  JNE refcount_ia32_add_load_strong_lock_pause_retry
  MOV ESI, [ECX]
  CMP EAX, ESI
  JNE refcount_ia32_add_load_strong_unlock_retry
  XADD [EAX], ESI
  CMP ESI, 0
  JLE refcount_ia32_add_load_strong_unlock_retry
  XOR ESI, ESI
  MOV [EBX], ESI
  POP EBX
  POP ESI
  RET
refcount_ia32_add_load_strong_lock_pause_retry:
  PAUSE
  JMP refcount_ia32_add_load_strong_lock_retry
refcount_ia32_add_load_strong_unlock_retry:
  XOR ESI, ESI
  MOV [EBX], ESI
  JMP refcount_ia32_add_load_strong_retry
refcount_ia32_add_load_strong ENDP
align 16
; int refcount_ia32_sub_strong(
;   refcount_ia32_t volatile*,
;   int,
;   spinlock_ia32_table_t const*,
;   size_t)
refcount_ia32_sub_strong PROC
  MOV EAX, [ESP + 4]
  MOV EDX, [ESP + 8]
  CMP EDX, 0
  JLE refcount_ia32_sub_strong_skip
  NEG EDX
  LOCK XADD [EAX], EDX
  JZ refcount_ia32_sub_strong_destroy
  JS refcount_ia32_sub_strong_destroy
refcount_ia32_sub_strong_skip:
  XOR EAX, EAX
  RET
refcount_ia32_sub_strong_destroy:
  MOV EDX, [ESP + 12]
  MOV ECX, EAX
  IMUL ECX, 567
  SHR ECX, 8
  AND ECX, 0FFh
  IMUL ECX, [ESP + 16]
  LEA ECX, [EDX + ECX]
refcount_ia32_sub_strong_lock_retry:
  MOV EDX, 1
  XCHG [ECX], EDX
  TEST EDX, EDX
  JNE refcount_ia32_sub_strong_lock_pause_retry
  MOV EDX, [EAX + 4]
  PUSH [EAX + 8]
  XOR EAX, EAX
  MOV [ECX], EAX
  TEST EDX, EDX
  JE refcount_ia32_sub_strong_done
  CALL EDX
refcount_ia32_sub_strong_done:
  POP EDX
  MOV EAX, 1
  RET
refcount_ia32_sub_strong_lock_pause_retry:
  PAUSE
  JMP refcount_ia32_sub_strong_lock_retry
refcount_ia32_sub_strong ENDP
align 16
; void refcount_ia32_add_weak(
;   refcount_ia32_t volatile*,
;   int)
refcount_ia32_add_weak PROC
  MOV ECX, [ESP + 4]
  MOV EAX, [ESP + 8]
  CMP EAX, 0
  JLE refcount_ia32_add_weak_done
  LOCK XADD [ECX], EAX
refcount_ia32_add_weak_done:
  RET
refcount_ia32_add_weak ENDP
align 16
; refcount_ia32_t* refcount_ia32_add_swap_weak(
;   refcount_ia32_t* volatile*,
;   refcount_ia32_t*,
;   int)
refcount_ia32_add_swap_weak PROC
  MOV ECX, [ESP + 4]
  MOV EAX, [ESP + 8]
  TEST EAX, EAX
  JE refcount_swap_weak_execute
  MOV EDX, [ESP + 12]
  CMP EDX, 0
  JLE refcount_swap_weak_execute
  LOCK XADD [EAX], EDX
refcount_swap_weak_execute:
  XCHG [ECX], EAX
  RET
refcount_ia32_add_swap_weak ENDP
END
/* Copyright 2005 Chris Thomasson */
#if ! defined(SPINLOCK_IA32_H_INCLUDED)
#define SPINLOCK_IA32_H_INCLUDED
#if defined(__cplusplus)
extern "C" {
#endif
#include "helper.h"
#define IA32_L2CACHELINESZ 128
#define SPINLOCK_IA32_PADSZ IA32_L2CACHELINESZ
#define SPINLOCK_IA32_TABLE_DEPTH 0xFF
#define SPINLOCK_IA32_TABLE_ALIGNSZ IA32_L2CACHELINESZ
#define SPINLOCK_IA32_TABLE_MEMALIGNSZ(mp_tabletype) \
  (MEMALIGNSZ(sizeof(mp_tabletype), SPINLOCK_IA32_TABLE_ALIGNSZ))
#define SPINLOCK_IA32_TABLE_MEMALIGNPTR(mp_tablemem) \
  ((spinlock_ia32_table_t*)MEMALIGNPTR((mp_tablemem).base, \
    SPINLOCK_IA32_TABLE_ALIGNSZ))
typedef struct spinlock_ia32_s spinlock_ia32_t;
typedef struct spinlock_ia32_padded_s spinlock_ia32_padded_t;
typedef struct spinlock_ia32_table_s spinlock_ia32_table_t;
typedef struct spinlock_ia32_table_mem_s spinlock_ia32_table_mem_t;
struct spinlock_ia32_s {
  void *state;
};
struct spinlock_ia32_padded_s {
  spinlock_ia32_t _this;
  char pad1[TYPEPADSZ(spinlock_ia32_t, SPINLOCK_IA32_PADSZ)];
};
struct spinlock_ia32_table_s {
  spinlock_ia32_padded_t locks[SPINLOCK_IA32_TABLE_DEPTH];
};
struct spinlock_ia32_table_mem_s {
  char base[SPINLOCK_IA32_TABLE_MEMALIGNSZ(spinlock_ia32_table_t)];
};
static spinlock_ia32_table_mem_t g_spinlock_ia32_table_mem = {0};
static spinlock_ia32_table_t* const g_spinlock_ia32_table =
SPINLOCK_IA32_TABLE_MEMALIGNPTR(g_spinlock_ia32_table_mem);
#if defined(__cplusplus)
}
#endif
#endif
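The header above only lays out the lock table; the actual lock/unlock loops 
live in the assembly (an XCHG to acquire, PAUSE while spinning, a plain store 
to release). Purely as a C-level picture of what those loops do, here is a 
minimal GCC-style sketch; the function names are mine and are not part of 
the posted API:

#include "spinlock_ia32.h"

/* Acquire: atomically exchange 1 into the lock word and spin with PAUSE
   while it was already held; mirrors the *_lock_retry loops in the .asm. */
static void spinlock_ia32_acquire(spinlock_ia32_t *lock) {
  void *old;
  for (;;) {
    old = (void*)1;
    __asm__ __volatile__("xchgl %0, %1"
                         : "+r"(old), "+m"(lock->state)
                         :
                         : "memory");
    if (! old) return;          /* we own the lock */
    __asm__ __volatile__("pause" ::: "memory");
  }
}

/* Release: a plain store of zero suffices on IA-32; the empty asm acts as a
   compiler barrier so the critical section is not sunk below the store. */
static void spinlock_ia32_release(spinlock_ia32_t *lock) {
  __asm__ __volatile__("" ::: "memory");
  lock->state = 0;
}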
/* Copyright 2005 Chris Thomasson */
#if ! defined(REFCOUNT_IA32_H_INCLUDED)
#define REFCOUNT_IA32_H_INCLUDED
#if defined(__cplusplus)
extern "C" {
#endif
#include "spinlock_ia32.h"
typedef struct refcount_ia32_s refcount_ia32_t;
typedef struct refcount_ia32_userstate_s refcount_ia32_userstate_t;
typedef void (*refcount_ia32_fpdtor_t) (void*);
struct refcount_ia32_userstate_s {
  refcount_ia32_fpdtor_t fpdtor;
  void *state;
};
struct refcount_ia32_s {
  int refs;
  refcount_ia32_userstate_t ustate;
};
extern int refcount_ia32_init(refcount_ia32_t*, int,
  refcount_ia32_fpdtor_t, void*);
extern refcount_ia32_t* refcount_ia32_add_load_strong(refcount_ia32_t* volatile*,
  int, spinlock_ia32_table_t const*, size_t);
extern int refcount_ia32_sub_strong(refcount_ia32_t volatile*, int,
  spinlock_ia32_table_t const*, size_t);
extern refcount_ia32_t* refcount_ia32_add_swap_weak(refcount_ia32_t* volatile*,
  refcount_ia32_t*, int);
extern void refcount_ia32_add_weak(refcount_ia32_t volatile*, int);
typedef refcount_ia32_t refcount_t;
#define refcount_init refcount_ia32_init
#define refcount_add_swap_weak refcount_ia32_add_swap_weak
#define refcount_add_weak refcount_ia32_add_weak
#define refcount_add_load_strong(mp_pthis, mp_count) \
  (refcount_ia32_add_load_strong( \
    (mp_pthis), \
    (mp_count), \
    g_spinlock_ia32_table, \
    sizeof(spinlock_ia32_padded_t) \
  ))
#define refcount_sub_strong(mp_this, mp_count) \
  (refcount_ia32_sub_strong( \
    (mp_this), \
    (mp_count), \
    g_spinlock_ia32_table, \
    sizeof(spinlock_ia32_padded_t) \
  ))
#if defined(__cplusplus)
}
#endif
#endif
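To make the calling convention concrete, here is a hedged usage sketch of the 
wrappers above. The node type, the global location and the ownership 
convention (the shared location holds exactly one reference) are my own 
assumptions for the example, not something the headers mandate:

#include <stdlib.h>
#include "refcount.h"

typedef struct node_s {
  refcount_t rc;   /* embedded counter */
  int payload;
} node_t;

static void node_dtor(void *state) {
  free(state);     /* refcount_init below passes the node itself as state */
}

static refcount_t* volatile g_shared = 0;

/* Publisher: create a node holding one reference and swap it into the shared
   location; the previous object, if any, loses that location's reference. */
static void publish(int value) {
  refcount_t *old;
  node_t *n = malloc(sizeof *n);   /* error handling omitted */
  n->payload = value;
  refcount_init(&n->rc, 1, node_dtor, n);
  old = refcount_add_swap_weak(&g_shared, &n->rc, 0);
  if (old) {
    refcount_sub_strong(old, 1);   /* may run node_dtor */
  }
}

/* Reader: grab a strong reference, use the object, then drop it. */
static int read_payload(void) {
  int value;
  refcount_t *rc = refcount_add_load_strong(&g_shared, 1);
  if (! rc) return -1;
  value = ((node_t*)rc->ustate.state)->payload;
  refcount_sub_strong(rc, 1);
  return value;
}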
/* Copyright 2005 Chris Thomasson */
#if ! defined(HELPER_H_INCLUDED)
#define HELPER_H_INCLUDED
#if defined(__cplusplus)
extern "C" {
#endif
#include <stddef.h>
#define ROUNDUP(mp_val, mp_rnd) \
  (((mp_val) % (mp_rnd)) ? (mp_val) + ((mp_rnd) - ((mp_val) \
    % (mp_rnd))) : (mp_val))
#define ROUNDDOWN(mp_val, mp_rnd) \
  (ROUNDUP((mp_val), (mp_rnd)) - (mp_rnd))
#define MEMALIGNSZ(mp_align) \
  (((size_t)(mp_align)) - 1)
#define MEMALIGNPTR(mp_base, mp_align) \
  ((void*)((((ptrdiff_t)(mp_base)) + MEMALIGNSZ(mp_align)) \
    & (-((ptrdiff_t)(mp_align)))))
#define MEMPADSZ(mp_sz, mp_pad) \
  (((size_t)(mp_pad)) - ((size_t)(mp_sz)))
#define TYPEPADSZ(mp_type, mp_pad) \
  (MEMPADSZ(sizeof(mp_type), (mp_pad)))
#if defined(__cplusplus)
}
#endif
#endif
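As a tiny illustration of what the alignment macros above compute (the buffer 
and the sizes are just for the example):

#include <stdio.h>
#include "helper.h"

int main(void) {
  /* Over-allocate by align-1 bytes, then round the base address up so the
     usable region starts on a 128-byte boundary, which is what
     spinlock_ia32.h does for its static lock table. */
  static char raw[1024 + 127];
  void *aligned = MEMALIGNPTR(raw, 128);
  printf("raw=%p aligned=%p\n", (void*)raw, aligned);
  return 0;
}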
> 
> I will post the source code in a couple of follow-up posts; there are a 
> total of 5 files:
> 
> 
> refcount_ia32.asm
> refcount_ia32.h
> spinlock_ia32.h
> helper.h
> refcount.h
> 
> 
> If all goes well, I think that my algorithm could be a great candidate for a 
> replacement for Boost's shared_ptr. As you may know by now, shared_ptr is not 
> atomically thread-safe. I advise you to take a look at this code if you are 
> interested in improving Boost shared_ptr functionality and performance...
> 
> My current algorithm does not use anything like GC or PDR. Instead, I have 
> developed a new double-checked method that is coupled with a hashed static 
> spinlock table scheme in order to get the job done, the right way. The 
> algorithm only has to access the hashed spinlock table during strong 
> increments and when the count drops to, or below, zero. It's a fairly 
> efficient setup...
> 
> I would be very interested to hear if anybody can assemble and compile it. 
> If you're having any trouble whatsoever, or if you have any questions, feel 
> free to fire away!
> 
Some pseudocode so we don't have to figure out the assembler would be nice.
-- 
Joe Seigh
When you get lemons, you make lemonade.
When you get hardware, you make software. 
/* Copyright 2005 Chris Thomasson */
#if ! defined(REFCOUNT_H_INCLUDED)
#define REFCOUNT_H_INCLUDED
#if defined(__cplusplus)
extern "C" {
#endif
#include "refcount_ia32.h"
#if defined(__cplusplus)
}
#endif
#endif
:)
The specific file that I noticed where the extra lines were added is:
spinlock_ia32.h
This is the exact macro where extra line-feeds were added:
SPINLOCK_IA32_TABLE_MEMALIGNPTR
I don't think it messed with any of the others; fix that macro and it 
should cut-paste-and-compile. Sorry for the minor inconvenience... Like I 
said, I am going to post all of this stuff to my AppCore website.
Enjoy!
;)
If you have already downloaded it, please REPLACE the 
refcount_ia32_add_load_strong that you have with the following one:
align 16
; refcount_ia32_t* refcount_ia32_add_load_strong(
;   refcount_ia32_t* volatile*,
;   int,
;   spinlock_ia32_table_t const*,
;   size_t)
refcount_ia32_add_load_strong PROC
  PUSH ESI
  PUSH EBX
  MOV ECX, [ESP + 12]
  MOV EDX, [ESP + 20]
  MOV ESI, [ESP + 16]
  CMP ESI, 0
  JLE refcount_ia32_add_load_strong_failed
refcount_ia32_add_load_strong_retry:
  MOV EAX, [ECX]
  TEST EAX, EAX
  JE refcount_ia32_add_load_strong_failed
  MOV EBX, EAX
  IMUL EBX, 567
  SHR EBX, 8
  AND EBX, 0FFh
  IMUL EBX, [ESP + 24]
  LEA EBX, [EDX + EBX]
refcount_ia32_add_load_strong_lock_retry:
  MOV ESI, 1
  XCHG [EBX], ESI
  TEST ESI, ESI
  JNE refcount_ia32_add_load_strong_lock_pause_retry
  MOV ESI, [ECX]
  CMP EAX, ESI
  JNE refcount_ia32_add_load_strong_unlock_retry
  XADD [EAX], ESI
  CMP ESI, 0
  JLE refcount_ia32_add_load_strong_unlock_retry
  XOR ESI, ESI
  MOV [EBX], ESI
  POP EBX
  POP ESI
  RET
refcount_ia32_add_load_strong_failed:
  XOR EAX, EAX
  RET
refcount_ia32_add_load_strong_lock_pause_retry:
  PAUSE
  JMP refcount_ia32_add_load_strong_lock_retry
refcount_ia32_add_load_strong_unlock_retry:
  XOR ESI, ESI
  MOV [EBX], ESI
  JMP refcount_ia32_add_load_strong_retry
refcount_ia32_add_load_strong ENDP
I am going to post full working code and a Visual C++ 6.0 Workspace sometime 
today.
I am sorry for making this stupid mistake and making you replace the 
function...
;^(... 
[...]
> Some pseudocode so we don't have to figure out the assembler would be 
> nice.
Okay. ;)
Let me start off with getting the hashed spinlock layer (e.g., 
spinlock_ia32.h) out of the way:
static spinlock_ia32_t g_spinlock_ia32_table[DEPTH] = ...;
spinlock_ia32_t* spinlock_hash_and_lock(void *ptr) {
  spinlock_ia32_t *lock = &g_spinlock_ia32_table[HASH(ptr)];
  SPINLOCK_LOCK(lock);
  return lock;
}
void spinlock_unlock(spinlock_ia32_t *lock) {
  SPINLOCK_UNLOCK(lock);
}
That should be fairly self-explanatory. Now, on to how strong 
subtractions/decrements are performed. The pseudo-code shows how the 
following function works, error checking omitted:
int refcount_ia32_sub_strong(refcount_ia32_t volatile*, int,
  spinlock_ia32_table_t const*, size_t);
bool sub_strong(refcount_t *_this, int count, /* lock table access */) {
  // decrement the reference count by count; XADD returns the old value.
  if (XADD(&_this->refs, -count) <= count) {
    // the count dropped to zero or below.
    // now, we need to lock the associated spinlock because
    // there could be a strong incrementer/adder holding it.
    // after we lock and unlock, we are guaranteed to be alone.
    spinlock_ia32_t *lock = spinlock_hash_and_lock(_this);
    // this copy is actually unnecessary; there is no need to read ahead of
    // time. as i stated earlier, once we unlock, we are alone for good.
    // well, might as well do something in here!   ;)
    refcount_t copy = *_this;
    spinlock_unlock(lock);
    // okay; we're quiescent!   :)
    // call the user-provided dtor, if any
    if (copy.ustate.fpdtor) {
      copy.ustate.fpdtor(copy.ustate.state);
    }
    // return true to let the caller know it can call free on _this.
    return true;
  }
  // return false to let the caller know it cannot call free on _this.
  return false;
}
Finally, we get to how the strong additions/increments are performed. The 
pseudo-code shows how the following function works, error checking omitted:
refcount_ia32_t* refcount_ia32_add_load_strong(refcount_ia32_t* volatile*,
  int, spinlock_ia32_table_t const*, size_t);
refcount_t* add_strong(refcount_t **_pthis, int count, /* lock table access */) {
  // first things first; we need to make the initial load from the pointer
  // to the shared location.
  refcount_t *_this = *_pthis;
  // only operate on NON-NULL values. I made a mistake and posted an older
  // version of the strong increment function; it did not check for null!
  // Stupid mistake. anyway, here is the check for null in the pseudo-code,
  // and I posted a correction!
  // :)
  while (_this) {
    // we now need to lock the associated spinlock. We need to be alone wrt
    // the refcount structure that _this points to.
    spinlock_ia32_t *lock = spinlock_hash_and_lock(_this);
    // we now need to double-check our previous load. the pointer could
    // have been swapped out, and might be in the middle of a concurrent
    // reference count subtraction/decrement.
    refcount_t *cmp = *_pthis;
    // make sure they are the same; remember we are under the
    // critical-section provided by the hashed spinlock for the pointer
    // value of _this. ditto for the pointer value of cmp. if they are
    // equal, then we have consistency...
    if (_this == cmp) {
      // go ahead and make the increment, and record the previous value
      int prev = XADD(&_this->refs, count);
      // we need to make sure that we incremented a reference count value
      // that was at least greater than zero!
      if (prev > 0) {
        // we have successfully incremented the reference counter from a
        // previous value that was greater than zero. we now own a
        // reference. finally, we need to unlock the previously acquired
        // spinlock, and return a pointer to the newly acquired refcount
        // object.
        spinlock_unlock(lock);
        // that's all folks!    ;)
        return _this;
      }
      // we detected a drop-to-zero condition; the previous value was zero
      // or less! this is safe because the decrement function takes the
      // exact same hashed lock that we own, before it frees any memory.
      // we need to unlock, and re-read. the shared location is not going
      // to hold the same pointer as the one we have now; it had to have
      // been swapped out of this shared location before any decrements
      // that would drop it to zero got applied...
    } //   if (_this == cmp) {
    // we need to unlock, and
    spinlock_unlock(lock);
    // make another read from the pointer to the shared location
    // and try the increment all over again!
    _this = *_pthis;
  } //   while (_this) {
  // we detected a null pointer value read from the shared location!
  return 0;
}
I think I only needed to show how the strong dec's are performed alongside 
strong inc's. After you replace the bugged version of 
refcount_ia32_add_load_strong with the corrected one I posted, it should 
work. It's still experimental; however, I believe the algorithm is correct. 
I really need to polish my code. Luckily, I don't have any algorithm logic 
errors; just perhaps another stupid programming error wrt not checking for 
null. I have been scouring over the assembly language and, other than that 
failure to test a loaded pointer value against null, everything looks 
okay... Man, I really hate this type of bug!
Let's have fun dereferencing null pointers, everybody, oh yeah!!!
;)
Any questions? 
> I am presenting some source code for a sample implementation of my new 
> mostly lock-free reference counting algorithm that I first mention here:
> [...]
     Completely beside the point, but I find the phrase "mostly
lock-free" peculiar.  Is it anything like "mostly virginal?"
If another algorithm that used only half as many locks came along,
would it be "really mostly lock-free," or perhaps "more mostly
lock-free?"  And if another were invented that cut the number of
locks in half yet again, would it be "really more mostly lock-free
and this time I almost sort of mean it?"
     And after all this Madison Avenue hyperbole, would any term
remain to describe an algorithm that used no locks at all?  Or
have we "mostly cheapened" the language?
-- 
Eric "mostly harmless" Sosman
eso...@acm-dot-org.invalid
/* C bugs */
1. Passed too many parameters to a macro
2. Initializer not constant
3. Missing braces in initializer
4. None of these showed up in Visual C++ 7
/* Assembly Bugs */
1. Forgot to assert the LOCK prefix on some atomic operations
2. Forgot to pop the stack before returning
 - function: refcount_ia32_add_load_strong
3. Forgot to check for null pointer
 - function: refcount_ia32_sub_strong
4. Adds in a corrupted value
 - refcount_ia32_add_load_strong
I have fixed all of the god damn bugs, and am going to post them all to my 
AppCore website before tomorrow.
Let me see if I can clear some things up...
;^)
Here is an IA-32 based prototype of my idea; it's currently in "pre-alpha" 
status:
http://appcore.home.comcast.net/vzoom/refcount/
Here is an example of how my reference counting implementation is "mostly" 
lock-free... Take a look at the refcount_ia32_add_swap_weak function; it's 
located at the bottom of the following file:
http://appcore.home.comcast.net/vzoom/refcount/refcount_ia32_gcc.asm
(in fact, browse the whole file; there's not a lot of locking going on at 
all.)
This is a 100% pure lock-free swap function; I am also adding a 100% 
lock-free CAS function. That means that you can use cas/swap on the pointers 
to the reference counters themselves, with no locks at all. In fact, the only 
time my refcount implementation uses ANY locking at all is when a thread 
grabs its "first/initial" atomic reference to a shared object; this is the 
first slow-path. After the thread acquires its "first/initial" reference, 
everything wrt counter modifications or pointer swaps/cas's stays 100% 
lock-free until the count drops to zero; this is the last slow-path. So, 
my entire counting algorithm only has 2 slow-paths:
1:  Thread Acquiring Very First/Initial Reference
2:  Reference Count Dropped To Zero
The locking that the slow-paths make use of is based on a global and static 
hashed spinlock scheme:
http://appcore.home.comcast.net/vzoom/refcount/spinlock_ia32.h
So, multiple threads can indeed lock multiple reference counting objects in 
parallel, and they never park in the kernel. The critical-sections that the 
global spinlocks enforce are naturally short; they are made up of 
handcrafted assembly language...
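To illustrate the lock-free fast path in C, here is a hedged sketch of 
handing an already-owned reference off to another shared location without 
touching any lock; the slot name and helper are illustrative only:

#include "refcount.h"

static refcount_t* volatile g_other_slot = 0;

/* 'rc' is a reference we already own (acquired earlier on the slow path via
   refcount_add_load_strong). Everything below stays lock-free. */
static void hand_off(refcount_t *rc) {
  refcount_t *old;
  /* weakly bump the count by one on behalf of the slot (legal because we
     already own a reference), then atomically swap our pointer in. */
  old = refcount_add_swap_weak(&g_other_slot, rc, 1);
  if (old) {
    /* the slot's previous object loses its reference; this only falls back
       to the hashed locks if the count happens to drop to zero. */
    refcount_sub_strong(old, 1);
  }
}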
Does that address any of your concerns?
:)
Let's see if I have this right.
You're replacing using a single global lock with hashed global
locks to reduce lock contention.
The rest of it looks like how you'd do refcounting with a global
lock(s).
The problem with Boost is it's not that kind of thread safe, so
they're not likely to buy into the extra overhead.  If you do
add thread safety to C++ it's going to be in the form of a new
library similar to what Java did with java.util.concurrent.
Also there's the issue of async safety since you have spin locks
in there.  What would be the advantage compared to the refcount
algorithms that are async safe, apart from patent issues of course?
> DOH!!! I mixed some files up when I was making the post. I accidentally 
> posted an older version of this file. It has an error in the 
> refcount_ia32_add_load_strong function. Basically, the error goes like 
> this... It did not check the loaded refcount pointer from the shared 
> location for null. So, if it loads a null, it tries to process anyway!
>
Why don't you just stick all this on a maintained web site? Posted code
is soon out of date and google doesn't forget.
> Let's see if I have this right.
>
> You're replacing using a single global lock with hashed global
> locks to reduce lock contention.
Yes. I only use them when a thread grabs its "first/initial" reference, and 
when the count drops to zero.
> The rest of it look like how you'd do refcounting with a global
> lock(s).
All of the pointer operations (e.g., the swaps and the cas on shared pointer 
locations) are 100% lock-free. I only make use of the global hashed locking 
scheme for the initial increment and the drop-to-zero.
> The problem with Boost is it's not that kind of thread safe, so
> they're not likely to buy into the extra overhead.
Well, all of the lock-free pointer operations "currently" have less overhead 
than shared_ptr...
> If you do
> add thread safety to C++ it's going to be in the form of a new
> library similar to what Java did with java.util.concurrent.
> Also there's the issue of async safety since you have spin locks
> in there.  What would be the advantage compared to the refcount
> algorithms that are async safe, apart from patent issues of course?
Well, the spinlocks can easily be changed into POSIX mutexes:
static pthread_mutex_t g_spinlock_table[0x100];
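For instance, a hedged sketch of what the hash-and-lock helper might look 
like with POSIX mutexes swapped in; the helper names are mine, and the hash 
mirrors the one used in the assembly:

#include <stddef.h>
#include <stdint.h>
#include <pthread.h>

#define MUTEX_TABLE_DEPTH 0x100

static pthread_mutex_t g_mutex_table[MUTEX_TABLE_DEPTH];
static pthread_once_t g_mutex_table_once = PTHREAD_ONCE_INIT;

static void mutex_table_init(void) {
  size_t i;
  for (i = 0; i < MUTEX_TABLE_DEPTH; ++i) {
    pthread_mutex_init(&g_mutex_table[i], NULL);
  }
}

/* Same hash as the assembly: (ptr * 567 >> 8), masked to the table depth. */
static pthread_mutex_t* mutex_hash_and_lock(void const *ptr) {
  size_t i;
  pthread_once(&g_mutex_table_once, mutex_table_init);
  i = (size_t)((((uintptr_t)ptr * 567) >> 8) & (MUTEX_TABLE_DEPTH - 1));
  pthread_mutex_lock(&g_mutex_table[i]);
  return &g_mutex_table[i];
}

static void mutex_unlock(pthread_mutex_t *lock) {
  pthread_mutex_unlock(lock);
}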
The advantage is that after you grab the initial reference, everything is 
100% lock-free until drop-to-zero. The spinlock critical-sections are 
extremely short. They can be replaced with a different type of mutex (e.g., 
a POSIX mutex). Humm. I need to go right now, but I will be thinking some 
more about your well-founded questions, Joe...
Thank you.
http://appcore.home.comcast.net/vzoom/refcount/
> Posted code
> is soon out of date and google doesn't forget.
Indeed! :O
> Also there's the issue of async safety since you have spin locks
> in there.  What would be the advantage compared to the refcount
> algorithms that are async safe, apart from patent issues of course?
I think I misunderstood you earlier wrt spinlocks and "true" async-safety. 
Okay. The only time you can safely use my refcount algorithm in a 
signal-handler context, for now, is when a thread has "already grabbed an 
initial reference to a shared data-structure". You simply "cannot use it in 
a signal-handler context if a thread tries to grab a reference to a 
data-object that it has not previously acquired". So, it kind of breaks down 
to sticking to "shared_ptr semantics" in a signal-handler; a reference must 
be owned prior to any new ones being acquired...
Bottom line: you can use refcount in a signal-handler after the first 
reference is acquired; this is on a per-refcount-object basis. A thread that 
grabs a pointer to an object X can safely grab another reference in a 100% 
lock-free manner (e.g., to pass to another thread perhaps, whatever...) in a 
signal-handler context.
Humm... Does that come close to addressing what you're getting at here, Joe?
The only time I would use a decrement in a signal-handler is when you are 
100% sure that the count will not drop to zero. A simple example:
static refcount_t* volatile g_loc = {...};
static ourapp_server_msg_ref_t* const g_appsrv_msg = {...};
/*==============================================\
\\//\\//   void my_sighandler_add_and_queue(int);
\\//\\//________________________________________________
\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//
===============================================/
This signal handler increments the reference count of the object pointed to 
by 'g_loc'. This *process has already acquired a reference to the object, so 
it is okay to add to the reference count.
The function increments the reference count, and tries to enqueue a message 
on a lock-free queue that exists in shared memory and is monitored by our 
application's server/manager process. If the server rejects the message, or 
if it's processing a self-maintenance cycle, the function will subsequently 
decrement the counter. This is okay because we are 100% guaranteed that the 
count will not drop to zero; the previous increment simply gets revoked.
_____________________________________________________
\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//\\//
===============================================/
void my_sighandler_add_and_queue(int sig) {
  refcount_t *local = refcount_add_load_weak(&g_loc, 1);
  /* ourapp_server_msgpush is 100% lock-free; signal handler safe */
  if (! ourapp_server_msgpush(g_appsrv_msg, local)) {
    /* the server says no, revoke our previous increment */
    (void)refcount_sub_strong(local, 1);
  }
}
Any thoughts? 
[...]
> by 'g_loc'. This *process has already acquired a reference to the object, 
> so it is okay to add to the reference count.
* I have a version of refcount that supports process-wide atomic reference 
counting. A thread can grab a reference to a piece of shared memory that was 
created by another process. The refcount implementation has a server process 
that acts as an arbiter of sorts. If a process commits some shared memory, 
and wants to only decommit it when it's no longer being referenced by 
'anything', it can register it with my server process via one of the 
server's lock-free communication channels. An entity (e.g., a thread within 
a process) can atomically acquire its first/initial reference to any 
registered object by simply applying the refcount API to pointers to 
locations in shared memory.
I only did this shared memory "thing" because I knew it would be a fairly 
good "learning experience and academic exercise". I am not sure if I will 
make a submission to Boost that includes a version of refcount that can 
operate on shared memory. Humm...
Also, I am not sure exactly how useful this would actually be. I guess it 
would be good for so-called robust server applications that are made up of 
multiple multi-threaded processes. Can anybody else think of practical ways 
to make use of process-wide atomic reference counting in any of their 
"existing" applications? 
Well, there are a number of solutions out there which have various trade-offs.
You need to position it against them.  I don't know how important async
safety is, but when you need it, you really need it.
The possible exception to this is using read-only shared memory with 
process-level smart pointers. Using refcounting with read-only memory would 
be a little tricky though.
[...]
> Well, there's a number of solutions out there which have various trade 
> offs.
> You need to position it against them.  I don't know how important async
> safety is, but when you need it, you really need it.
How many of those solutions allow you to decrement the reference count in a 
signal-handler context? It could drop to zero and call the user dtor... I 
have a workaround for this. As a side-effect, it allows the thread that 
drops the count to zero to avoid taking a spinlock in my refcount code. The 
basic idea is that the thread that drops an object to zero does not 
necessarily have to be the thread that calls the user-provided dtor... 
Humm... I guess I should prototype this in assembly asap! It would give my 
refcount algorithm the ability to have weak increments, and decrements, in a 
signal-handler context... Pretty cool huh?  lol...
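Purely as an illustration of that workaround (nothing like this is in the 
posted code, and I am using GCC's __sync atomics as stand-ins for the 
hand-rolled assembly primitives): the thread that observes the drop to zero 
pushes the dead object onto a lock-free stack instead of running the dtor, 
and a dedicated reaper thread detaches the stack and runs the user dtors, so 
a signal handler never executes a dtor itself.

#include <sched.h>
#include "refcount.h"

/* one of these would be embedded in, or pre-allocated alongside, each
   refcounted object so no allocation is needed at push time */
typedef struct reap_node_s {
  struct reap_node_s *next;
  refcount_t *dead;
} reap_node_t;

static reap_node_t* volatile g_reap_head = 0;

/* async-signal-safe hand-off: a simple lock-free (Treiber) push */
static void reap_defer(reap_node_t *n, refcount_t *dead) {
  reap_node_t *head;
  n->dead = dead;
  do {
    head = g_reap_head;
    n->next = head;
  } while (! __sync_bool_compare_and_swap(&g_reap_head, head, n));
}

/* reaper thread: the only place user-provided dtors ever execute */
static void* reap_thread(void *arg) {
  for (;;) {
    /* atomically detach the whole list */
    reap_node_t *n = __sync_lock_test_and_set(&g_reap_head, (reap_node_t*)0);
    while (n) {
      reap_node_t *next = n->next;
      if (n->dead->ustate.fpdtor)
        n->dead->ustate.fpdtor(n->dead->ustate.state);
      n = next;
    }
    sched_yield();  /* real code would block on a condvar or eventcount */
  }
  return arg;
}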
;)
> You're replacing using a single global lock with hashed global
> locks to reduce lock contention.
[...]
I am thinking of augmenting my original algorithm with one of two tweaks 
before I submit anything to Boost:
One is replacing the hashed spinlocks with a hashed rw-spinlock scheme...
http://appcore.home.comcast.net/appcore/src/ac_rwspinlock_algo1_c.html
http://appcore.home.comcast.net/appcore/include/ac_rwspinlock_algo1_h.html
The second tweak involves a PDR technique I posted earlier:
http://groups.google.com/group/comp.arch/browse_frm/thread/b2f751d5dad6c07b
I could replace the hashed spinlocks with hashed proxy collector anchors 
(e.g., static my_proxy_anchor_t g_px[0x100])... I will probably have an 
easier time explaining to Boost reviewers how the reference counting stays 
atomic under solution one. Humm... 
The other solution involves...