It supports:
- Reference counting with basic-thread safety
- Reference counting with strong-thread safety
- Reference counting with cross-thread references
- Ultra low overhead PDR
- PDR with long-term cross-thread references
Assumptions I made about the environment:
- Fixed number of threads. I made this assumption partly for
simplicity and partly because I'm interested in such an environment. It
can be overcome with the addition of a thread registration mechanism.
- Fixed maximum number of reference counted objects. Solely for
simplicity. Dynamic allocation of the helper structures would have to
be added.
- User threads must periodically execute a special function. I made
this assumption partly for simplicity and partly because I'm interested
in such an environment. It can be overcome by inserting a call to this
special function into the acquire/release operations, or with some
other mechanism like signals/APC etc.
Here is the public API:
typedef void(*rcx_dtor_fp_t)(void*);
typedef unsigned rcx_t [4];
int rcx_sys_init(unsigned thread_count);
void rcx_sys_deinit();
int rcx_sys_thread_init(unsigned thread_idx);
void rcx_sys_thread_deinit();
// must be periodically executed by all user threads
void rcx_process();
// voluntarily initiate garbage collection (optional)
void rcx_collect();
int rcx_create(rcx_t* rcx, rcx_dtor_fp_t dtor_fp, void* state);
void rcx_destroy(rcx_t* rcx);
void* rcx_get_state(rcx_t* rcx);
void rcx_acquire(rcx_t* rcx);
void rcx_release(rcx_t* rcx);
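A minimal usage sketch (the node_t type is hypothetical, and I assume
the user dtor itself calls rcx_destroy to recycle the descriptor):
#include "rcx.h"
#include <stdlib.h>
typedef struct node_s
{
// embedded reference counting handle
rcx_t rcx_;
int payload_;
}
node_t;
// user dtor: recycle the descriptor, then free the object
static void node_dtor(void* state)
{
node_t* n = (node_t*)state;
rcx_destroy(&n->rcx_);
free(n);
}
// executed by a thread which has already called rcx_sys_thread_init()
void example()
{
node_t* n = (node_t*)malloc(sizeof(node_t));
rcx_create(&n->rcx_, node_dtor, n); // rc starts at 1
rcx_acquire(&n->rcx_); // +1
// ... rcx_get_state(&n->rcx_) returns n ...
rcx_release(&n->rcx_); // -1
rcx_release(&n->rcx_); // drop the initial reference
// node_dtor will run from the epoch machinery
// after rc stays zero for one full epoch
rcx_process();
}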
Here is the implementation:
#include "rcx.h"
#include <stdlib.h>
typedef char rcx_cacheline_pad_t [128];
static unsigned const rcx_max_object_count = 1024;
static unsigned const rcx_activity_acq = 1;
static unsigned const rcx_activity_rel = 1024;
static unsigned const rcx_activity_rel_new = 16 * 1024;
static unsigned const rcx_activity_threshold = 128 * 16 * 1024;
static unsigned const rcx_in_list_flag = 1u << 31;
#define FULL_FENCE() __asm mfence
static void rcx_process_epoch();
// decoding of rcx_t
typedef struct rcx_int_s
{
// index of the per-thread and global object descriptors
unsigned idx_;
// index of owner thread
unsigned thread_idx_;
// user supplied dtor and state
rcx_dtor_fp_t dtor_fp_;
void* state_;
}
rcx_int_t;
// per thread object descriptor
typedef struct rcx_obj_data_s
{
// cache for thread's acquire/release operations
// the high bit is used as an 'in list' flag
unsigned rc_;
}
rcx_obj_data_t;
// global object descriptor
typedef struct rcx_gl_obj_data_s
{
// pointer to owning rcx_t object
rcx_int_t* rcx_;
// global reference count
unsigned rc_;
// doubly-linked list of object descriptors
// whose rc has dropped to zero
struct rcx_gl_obj_data_s* volatile next_;
struct rcx_gl_obj_data_s* volatile prev_;
}
rcx_gl_obj_data_t;
// per thread data
typedef struct rcx_thread_s
{
// index [0 .. thread_count-1]
unsigned index_;
// list of free object descriptors
rcx_gl_obj_data_t* volatile freelist_head_;
rcx_gl_obj_data_t* freelist_tail_;
// cache of foreign free object descriptors
rcx_gl_obj_data_t* freelist_cache_;
// array of per thread object descriptors
rcx_obj_data_t* obj_data_;
// array of object descriptors
// for which acquire/release operations
// were executed in the current epoch
rcx_int_t** rc_list_;
// list of object descriptors
// whose rc has dropped to zero
rcx_gl_obj_data_t* dtor_list_;
// head and tail nodes for dtor_list_
rcx_gl_obj_data_t dtor_fake_ [2];
rcx_cacheline_pad_t pad_;
}
rcx_thread_t;
// per thread data stored in tls
typedef struct rcx_thread_local_s
{
// copy of rcx_thread_t::obj_data_
rcx_obj_data_t* obj_data_;
// copy of rcx_thread_t::rc_list_
rcx_int_t** rc_list_;
// estimate of the 'activity' produced by the thread,
// used to initiate an epoch shift
unsigned activity_;
// current position in rc_list_
unsigned rc_list_pos_;
// copy of rcx_thread_t::index_
unsigned index_;
}
rcx_thread_local_t;
// global data
typedef struct rcx_global_s
{
// array of global object descriptors
rcx_gl_obj_data_t* gl_obj_data_;
// array of per thread data
rcx_thread_t* threads_;
// thread count
unsigned thread_count_;
// flag indicating that some thread
// wants to make an epoch shift
unsigned volatile epoch_pending_;
// index of the thread which is allowed
// to copy local counters to global counters
unsigned volatile epoch_order_;
}
rcx_global_t;
static rcx_global_t rcx_global;
static __declspec(thread) rcx_thread_local_t rcx_thread;
int rcx_sys_init(unsigned thread_count)
{
unsigned i, j, first, last;
rcx_global.gl_obj_data_ =
calloc(rcx_max_object_count, sizeof(rcx_gl_obj_data_t));
rcx_global.threads_ =
calloc(thread_count, sizeof(rcx_thread_t));
rcx_global.thread_count_ = thread_count;
rcx_global.epoch_pending_ = 0;
rcx_global.epoch_order_ = thread_count;
for (i = 0; i != rcx_global.thread_count_; ++i)
{
rcx_thread_t* th = &rcx_global.threads_[i];
th->index_ = i;
th->rc_list_ =
calloc(rcx_max_object_count, sizeof(rcx_int_t*));
th->obj_data_ =
calloc(rcx_max_object_count, sizeof(rcx_obj_data_t));
// partition descriptors evenly between threads
// (remainder descriptors, if any, are left unused)
first = i * (rcx_max_object_count / thread_count);
last = (i + 1) * (rcx_max_object_count / thread_count) - 1;
for (j = first; j != last; ++j)
{
// while free, rcx_ stores the descriptor's own index
rcx_global.gl_obj_data_[j].rcx_ = (rcx_int_t*)j;
rcx_global.gl_obj_data_[j].next_ =
&rcx_global.gl_obj_data_[j + 1];
}
rcx_global.gl_obj_data_[last].rcx_ = (rcx_int_t*)last;
th->freelist_head_ = &rcx_global.gl_obj_data_[last];
th->freelist_tail_ = &rcx_global.gl_obj_data_[first];
th->freelist_cache_ = 0;
th->dtor_list_ = &th->dtor_fake_[0];
th->dtor_fake_[0].next_ = &th->dtor_fake_[1];
th->dtor_fake_[0].prev_ = 0;
th->dtor_fake_[1].next_ = 0;
th->dtor_fake_[1].prev_ = &th->dtor_fake_[0];
}
return 0;
}
void rcx_sys_deinit()
{
// here we should free all surviving rcx_t objects;
// that's not implemented yet
free(rcx_global.threads_);
free(rcx_global.gl_obj_data_);
}
int rcx_sys_thread_init(unsigned thread_idx)
{
rcx_thread.obj_data_ =
rcx_global.threads_[thread_idx].obj_data_;
rcx_thread.rc_list_ =
rcx_global.threads_[thread_idx].rc_list_;
rcx_thread.rc_list_pos_ = 0;
rcx_thread.activity_ = 0;
rcx_thread.index_ = thread_idx;
return 0;
}
void rcx_sys_thread_deinit()
{
}
int rcx_create(rcx_t* rcx, rcx_dtor_fp_t dtor_fp, void* state)
{
rcx_int_t* x = (rcx_int_t*)rcx;
rcx_thread_t* th = &rcx_global.threads_[rcx_thread.index_];
// get a free object descriptor from the per-thread queue
// (note: assumes the freelist is never exhausted;
// see the rcx_max_object_count cap)
rcx_gl_obj_data_t* gl = th->freelist_tail_;
th->freelist_tail_ = th->freelist_tail_->next_;
x->idx_ = (unsigned)gl->rcx_;
x->thread_idx_ = rcx_thread.index_;
x->dtor_fp_ = dtor_fp;
x->state_ = state;
gl->rcx_ = x;
gl->rc_ = 1;
gl->next_ = 0;
gl->prev_ = 0;
return 0;
}
void rcx_destroy(rcx_t* rcx)
{
rcx_int_t* x = (rcx_int_t*)rcx;
rcx_thread_t* th = &rcx_global.threads_[rcx_thread.index_];
rcx_gl_obj_data_t* gl = &rcx_global.gl_obj_data_[x->idx_];
gl->rcx_ = (rcx_int_t*)x->idx_;
// rc_ is reused here to remember the owner thread index
// (consumed in rcx_process_epoch)
gl->rc_ = x->thread_idx_;
// return the object descriptor to the local cache
gl->next_ = th->freelist_cache_;
th->freelist_cache_ = gl;
}
void rcx_acquire(rcx_t* rcx)
{
rcx_int_t* x = (rcx_int_t*)rcx;
rcx_thread_local_t* th = &rcx_thread;
// find the per-thread object descriptor
rcx_obj_data_t* obj = &th->obj_data_[x->idx_];
// check whether the object descriptor is already in the local list
// of descriptors for which this thread has executed
// an acquire/release operation
if (obj->rc_)
{
// descriptor already in list
// just increment counter
obj->rc_ += 1;
}
else
{
// descriptor not in list
// increment and mark counter
obj->rc_ += 1 + rcx_in_list_flag;
// add descriptor to list
th->rc_list_[th->rc_list_pos_++] = x;
}
th->activity_ += rcx_activity_acq;
}
void rcx_release(rcx_t* rcx)
{
rcx_int_t* x = (rcx_int_t*)rcx;
rcx_thread_local_t* th = &rcx_thread;
rcx_obj_data_t* obj = &th->obj_data_[x->idx_];
if (obj->rc_)
{
obj->rc_ -= 1;
th->activity_ += rcx_activity_rel;
}
else
{
obj->rc_ += -1 + rcx_in_list_flag;
th->rc_list_[th->rc_list_pos_++] = x;
th->activity_ += rcx_activity_rel_new;
}
}
void* rcx_get_state(rcx_t* rcx)
{
rcx_int_t* x = (rcx_int_t*)rcx;
return x->state_;
}
void rcx_process()
{
rcx_thread_local_t* th = &rcx_thread;
if (th->activity_ >= rcx_activity_threshold
&& 0 == rcx_global.epoch_pending_)
{
// we've created enough activity
// initiate epoch shift
rcx_global.epoch_pending_ = 1;
}
if (0 == th->index_
&& rcx_global.epoch_pending_
&& rcx_global.epoch_order_ ==
rcx_global.thread_count_)
{
// thread with index 0
// starts epoch processing
rcx_global.epoch_order_ = 0;
}
if (th->index_ == rcx_global.epoch_order_)
{
// it's my turn - process epoch
rcx_process_epoch();
// notify next thread
rcx_global.epoch_order_ += 1;
if (rcx_global.epoch_order_ ==
rcx_global.thread_count_)
{
rcx_global.epoch_pending_ = 0;
}
}
}
void rcx_collect()
{
if (0 == rcx_global.epoch_pending_)
rcx_global.epoch_pending_ = 1;
rcx_process();
}
static void rcx_process_epoch()
{
rcx_thread_t* th = &rcx_global.threads_[rcx_thread.index_];
rcx_thread_t* th2;
rcx_gl_obj_data_t* cur;
rcx_gl_obj_data_t* next;
rcx_obj_data_t* loc;
unsigned i;
rcx_int_t* req;
// transfer cached object descriptors to their owner threads
while (th->freelist_cache_)
{
cur = th->freelist_cache_;
next = cur->next_;
// cur->rc_ holds the owner thread index (set in rcx_destroy)
th2 = &rcx_global.threads_[cur->rc_];
th2->freelist_head_->next_ = cur;
th2->freelist_head_ = cur;
th->freelist_cache_ = next;
}
// execute the dtor function for objects
// which have been in the dtor list
// for a full epoch
cur = th->dtor_list_->next_;
if (cur->next_)
{
while (cur->next_)
{
next = cur->next_;
cur->rc_ = 1;
cur->next_ = 0;
cur->prev_ = 0;
// at this point the object descriptor is ready for reuse
cur->rcx_->dtor_fp_(cur->rcx_->state_);
cur = next;
}
th->dtor_fake_[0].next_ = &th->dtor_fake_[1];
th->dtor_fake_[1].prev_ = &th->dtor_fake_[0];
}
// transfer all acquire/release operations
// from local descriptors to global ones
for (i = 0; i != rcx_thread.rc_list_pos_; ++i)
{
req = th->rc_list_[i];
// global descriptor
cur = &rcx_global.gl_obj_data_[req->idx_];
// local descriptor
loc = &th->obj_data_[req->idx_];
cur->rc_ += loc->rc_ - rcx_in_list_flag;
loc->rc_ = 0;
if (cur->rc_ && cur->next_)
{
// remove object from dtor list
cur->prev_->next_ = cur->next_;
cur->next_->prev_ = cur->prev_;
cur->next_ = 0;
cur->prev_ = 0;
}
else if (0 == cur->rc_ && 0 == cur->next_)
{
// insert the object into the dtor list
cur->next_ = th->dtor_list_->next_;
cur->prev_ = th->dtor_list_;
th->dtor_list_->next_->prev_ = cur;
th->dtor_list_->next_ = cur;
}
else if (0 == cur->rc_ && cur->next_)
{
// remove and reinsert the object in the dtor list
cur->prev_->next_ = cur->next_;
cur->next_->prev_ = cur->prev_;
cur->next_ = th->dtor_list_->next_;
cur->prev_ = th->dtor_list_;
th->dtor_list_->next_->prev_ = cur;
th->dtor_list_->next_ = cur;
}
}
// reset local per-epoch variables
rcx_thread.activity_ = 0;
rcx_thread.rc_list_pos_ = 0;
// to support PDR
FULL_FENCE();
}
Dmitriy V'jukov
I need to verify the claims. However, vZOOM ref-counting still looks good.
Basic outline of the algorithm:
One global descriptor, plus one local descriptor per thread, is
allocated for every reference counted object.
When executing an acquire/release operation, a thread
increments/decrements the counter in its local descriptor and adds the
local descriptor to a local list (if the descriptor is not already in
the list).
Threads process the epoch shift in order: thread 0 processes, then
notifies thread 1; thread 1 processes, then notifies thread 2; and so on.
When processing an epoch shift, a thread transfers counters from local
descriptors to global descriptors. If the counter in a global descriptor
drops to zero, the global descriptor is added to the dtor list. If the
counter becomes non-zero, the global descriptor is removed from the dtor
list. After one full epoch all objects in the dtor list are deleted.
Space consumption per object: (8 words) + (1 word per thread) + (1
word per thread, if the thread executed acquire/release in the current
epoch).
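For example, with 8 threads and 4-byte words that is at most
(8 + 8 + 8) * 4 = 96 bytes for an object touched by every thread in the
current epoch, and (8 + 8) * 4 = 64 bytes otherwise.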
The first acquire/release operation on an object takes around 20
cycles; subsequent acquire/release operations take around 10 cycles.
The algorithm especially well handles the situation where there is a
large number of acquire/release operations on a small number of
objects.
Dmitriy V'jukov
> > > It must gracefully handle zillions of acquire/release operations :)
> > I have almost completed the implementation. It will handle:
> > 1. Reference counting with basic-thread safety
> > 2. Reference counting with strong-thread safety
> > 3. Reference counting with cross-thread references
> > 4. Ultra low overhead PDR
> > 5. PDR with long-term cross-thread references
> > I.e. basically any usage pattern.
> > I hope to post the implementation in the near future.
>
> So do I, as I am interested to see how it completely differs from the
> very-basic vZOOM distributed reference counting scheme...
First of all, my algorithm (RCX) is a bit similar to VZOOM, in the
sense that there are local and global counters.
But RCX is more powerful. VZOOM relies on the user to call a defer
function for an object; then VZOOM comes into play and tracks when
the counter eventually drops to zero. Sometimes the user doesn't know
when to call the defer function. For example, in a message-passing
environment: threads pass messages through queues and execute acquire
before passing and release after consumption. And nobody knows who is
'the last'. RCX can handle such a situation.
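For illustration, a sketch of this pattern (queue_t, msg_t with an
embedded rcx_ member, queue_push and process_msg are hypothetical
names here):
// producer: acquire before passing the message on
void send_message(queue_t* q, msg_t* m)
{
rcx_acquire(&m->rcx_);
queue_push(q, m);
}
// consumer: release after consumption -
// RCX itself determines who is 'the last'
void consume_message(msg_t* m)
{
process_msg(m);
rcx_release(&m->rcx_);
}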
Or, for example, an arbitrary graph of objects. Each object holds an
arbitrary number of references to other objects. Reference counting can
handle the lifetime of objects in such an environment (I assume that
application logic ensures that there are no circular graphs of hanging
objects).
On the other hand, VZOOM starts aggregating counters for an object
only when the object is deferred. RCX has to aggregate counters in
every epoch.
Dmitriy V'jukov
There are some differences in the internal workings too.
VZOOM has to aggregate counters from *all* threads, no matter whether
they worked with the object or not. RCX only aggregates counters from
threads which actually worked with the object.
For example, if only one thread ever works with an object, then only
this thread will transfer its counter to the global counter. I.e.
there will be no cache-line transfers, because all locations are in
this thread's cache.
Also, in VZOOM the collector thread reads the local counters of user
threads. This incurs 2 cache-line transfers: first to transfer the
local counter to the collector thread, second to transfer the cache
line back when the user thread next writes to it.
In RCX user threads transfer local counters to global counters
themselves. This incurs only one cache-line transfer.
Dmitriy V'jukov
> There are some differences in the internal workings too.
> VZOOM has to aggregate counters from *all* threads, no matter whether
> they worked with the object or not. RCX only aggregates counters from
> threads which actually worked with the object.
> For example, if only one thread ever works with an object, then only
> this thread will transfer its counter to the global counter. I.e.
> there will be no cache-line transfers, because all locations are in
> this thread's cache.
> Also, in VZOOM the collector thread reads the local counters of user
> threads. This incurs 2 cache-line transfers: first to transfer the
> local counter to the collector thread, second to transfer the cache
> line back when the user thread next writes to it.
> In RCX user threads transfer local counters to global counters
> themselves. This incurs only one cache-line transfer.
Totally forgot: VZOOM is coarse-grained (a fixed array of counters
with hashing). RCX is fine-grained (per-object counters).
Btw, I think that RCX can work with a VZOOM-style coarse-grained fixed
array of counters and hashing of pointers.
I.e. for every hash cell there will be a list of living objects. When
a hash cell's count drops to zero, all objects in the list are deleted.
All other things remain the same. I think it will work.
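A rough sketch of what the per-cell data could look like (names and
sizes hypothetical):
// one cell of the fixed counter array
typedef struct rcx_hash_cell_s
{
// aggregated count for all objects which hash here
unsigned rc_;
// list of living objects mapped to this cell; deleted
// together when rc_ stays zero for a full epoch
rcx_gl_obj_data_t* objects_;
}
rcx_hash_cell_t;
#define RCX_HASH_SIZE 4096
static rcx_hash_cell_t rcx_hash[RCX_HASH_SIZE];
// pointer -> cell index
static unsigned rcx_hash_idx(void* p)
{
return (unsigned)(((size_t)p >> 6) % RCX_HASH_SIZE);
}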
Or to put it another way: VZOOM can work with an RCX-style
fine-grained dynamic array of per-object counters.
What do you think?
Dmitriy V'jukov
A possible tweak for the algorithm:
During epoch processing, remember whether more than one thread has
ever worked with the object. When the counter drops to zero, if only
one thread ever worked with the object, the object can be deleted
instantly, without settling in the dtor list. This increases the
reactivity of reclamation.
Here is the patch:
> void rcx_process_epoch()
> {
> [...]
> // transfer all acquire/release operations
> // from local descriptor to global
> for (i = 0; i != rcx_thread.rc_list_pos_; ++i)
> {
> req = th->rc_list_[i];
> // global descriptor
> cur = &rcx_global.gl_obj_data_[req->idx_];
> // local descriptor
> loc = &th->obj_data_[req->idx_];
> cur->rc_ += loc->rc_ - rcx_in_list_flag;
> loc->rc_ = 0;
// single_thread_ is a new field in rcx_gl_obj_data_t,
// set to 1 in rcx_create
cur->single_thread_ = cur->single_thread_
&& (th->index_ == req->thread_idx_);
> if (cur->rc_ && cur->next_)
> {
> // remove object from dtor list
> cur->prev_->next_ = cur->next_;
> cur->next_->prev_ = cur->prev_;
> cur->next_ = 0;
> cur->prev_ = 0;
> }
> else if (0 == cur->rc_ && 0 == cur->next_)
> {
if (cur->single_thread_)
{
cur->rc_ = 1;
cur->rcx_->dtor_fp_(cur->rcx_->state_);
}
else
{
// insert object to dtor list
cur->next_ = th->dtor_list_->next_;
cur->prev_ = th->dtor_list_;
th->dtor_list_->next_->prev_ = cur;
th->dtor_list_->next_ = cur;
}
> }
> else if (0 == cur->rc_ && cur->next_)
> {
> // remove and reinsert object to dtor list
> cur->prev_->next_ = cur->next_;
> cur->next_->prev_ = cur->prev_;
> cur->next_ = th->dtor_list_->next_;
> cur->prev_ = th->dtor_list_;
> th->dtor_list_->next_->prev_ = cur;
> th->dtor_list_->next_ = cur;
> }
> }
> [...]
> }
Dmitriy V'jukov
all __registered__ threads; not all application threads...
> no matter whether
> they worked with the object or not. RCX only aggregates counters from
> threads which actually worked with the object.
> For example, if only one thread ever works with an object, then only
> this thread will transfer its counter to the global counter. I.e.
> there will be no cache-line transfers, because all locations are in
> this thread's cache.
> Also, in VZOOM the collector thread reads the local counters of user
> threads. This incurs 2 cache-line transfers: first to transfer the
> local counter to the collector thread, second to transfer the cache
> line back when the user thread next writes to it.
> In RCX user threads transfer local counters to global counters
> themselves. This incurs only one cache-line transfer.
If RCX user threads are required to transfer counters to global
counters, how can they use an automatic method of execution? With
vZOOM in automatic mode, a user thread is not required to do anything,
aside from grabbing and throwing away references at will. Does your
rcx_process function need to be executed on a fairly periodic basis?
This sounds similar to vZOOM configured to run in manual mode...
Please clarify.
You suggest sticking rcx_process into acquire/release functions?
Yup; in manual mode. vZOOM only requires a void* in the user's
deferred object space. It also does not require any special logic
added to the long-term inc/dec logic. Apparently, you want to stick
rcx_acquire/release into that code path. You mention signals/APC...
Well, that should work.
> I.e. for every hash cell there will be a list of living objects. When
> a hash cell's count drops to zero, all objects in the list are deleted.
> All other things remain the same. I think it will work.
> Or to put it another way: VZOOM can work with an RCX-style
> fine-grained dynamic array of per-object counters.
> What do you think?
vZOOM can work under other designs indeed. I have fleshed out around 8
different embodiments which hold to the patent teachings.
> >> > So do I, as I am interested to see how it completely differs from the
> >> > very-basic vZOOM distributed reference counting scheme...
>
> >> First of all, my algorithm (RCX) is a bit similar to VZOOM, in the
> >> sense that there are local and global counters.
> [...]
>
> > There are some differences in the internal workings too.
> > VZOOM has to aggregate counters from *all* threads,
>
> all __registered__ threads; not all application threads...
I understand this. I mean all threads that have some relation to
reference counting.
> > no matter whether
> > they worked with the object or not. RCX only aggregates counters
> > from threads which actually worked with the object.
> > For example, if only one thread ever works with an object, then
> > only this thread will transfer its counter to the global counter.
> > I.e. there will be no cache-line transfers, because all locations
> > are in this thread's cache.
> > Also, in VZOOM the collector thread reads the local counters of
> > user threads. This incurs 2 cache-line transfers: first to transfer
> > the local counter to the collector thread, second to transfer the
> > cache line back when the user thread next writes to it.
> > In RCX user threads transfer local counters to global counters
> > themselves. This incurs only one cache-line transfer.
>
> If RCX user threads are required to transfer counters to global
> counters, how can they use an automatic method of execution? With
> vZOOM in automatic mode, a user thread is not required to do anything,
> aside from grabbing and throwing away references at will. Does your
> rcx_process function need to be executed on a fairly periodic basis?
> This sounds similar to vZOOM configured to run in manual mode...
>
> Please clarify.
First of all, I am currently interested in an environment where manual
periodic execution of rcx_process() is not a problem (it's an
event-driven environment). I think that in many real-life programs
there are natural points where rcx_process() can be inserted.
Well, if one wants to plug in automatic epoch processing... I haven't
investigated this possibility yet.
I think that maybe I can send a signal/APC to every thread.
> You suggest sticking rcx_process into acquire/release functions?
Yes. I think this will work. It's only one additional 'if' in
rcx_acquire()/rcx_release().
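E.g. a sketch of what it could look like inside the library
(rcx_acquire_auto is a hypothetical name; rcx_release would be
wrapped the same way):
void rcx_acquire_auto(rcx_t* rcx)
{
rcx_acquire(rcx);
// the one additional 'if': join epoch processing when enough
// activity has accumulated or an epoch shift is already pending
if (rcx_thread.activity_ >= rcx_activity_threshold
|| rcx_global.epoch_pending_)
rcx_process();
}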
Dmitriy V'jukov
What do you think about this approach in general? I would like to hear
your professional opinion on it.
Dmitriy V'jukov
It will work and scale well if a user's environment can cope with some
of its requirements. The per-object meta-data is a bit expensive, IMHO
that is. I am not too sure about the rcx_process() function... How can
this aspect cope with threads that block? AFAICT, an epoch requires
threads to process in "lock-step"; if one of those threads blocks,
well, it's going to hold up an epoch. I don't think you can execute
rcx_process() from signals because you can't get TLS in a handler. APC
should work, but then you need to be sure that you're in an alertable
wait. Also, do you put a cap on the total number of objects (e.g., the
'rcx_max_object_count' constant)?
Have you been running this in your actor framework?
Try not to flame me too badly if I am completely wrong on this!
;^)
I believe that APC should work. However, since you need to access
per-thread data, well, that rules out signals...
Correct me if I am wrong, but even if a thread executes rcx_process()
before it blocks, and the thread's index did not match the current
epoch_order, won't that block further epochs from being processed for
at least the duration of the subsequent wait?
Right.
> Sometimes the user doesn't know when to call
> the defer function. For example, in a message-passing environment:
> threads pass messages through queues and execute acquire before
> passing and release after consumption. And nobody knows who is 'the last'.
I have some usage patterns that can deal with "some" of this.
> RCX can handle such a situation.
I see.
> Or, for example, an arbitrary graph of objects. Each object holds an
> arbitrary number of references to other objects. Reference counting
> can handle the lifetime of objects in such an environment (I assume
> that application logic ensures that there are no circular graphs of
> hanging objects).
IMHO, circular references are a red herring. I can always find ways to
get around them. I don't think I would worry about that too much.
> On the other hand, VZOOM starts aggregating counters for an object
> only when the object is deferred. RCX has to aggregate counters in
> every epoch.
True. Also, vZOOM epoch processing samples references from registered
threads, while an RCX thread must give the epoch processor the reference
counter mutations it has built up.
I have some more questions, but I need to examine the code in greater detail
to see if I can answer them myself and not waste your time.
:^)
This kind of has me worried a bit... If one of those threads blocks, well, I
need to examine the rcx_process()-->rcx_process_epoch() relationship some
more...
> When processing an epoch shift, a thread transfers counters from
> local descriptors to global descriptors. If the counter in a global
> descriptor drops to zero, the global descriptor is added to the dtor
> list. If the counter becomes non-zero, the global descriptor is
> removed from the dtor list. After one full epoch all objects in the
> dtor list are deleted.
>
> Space consumption per object: (8 words) + (1 word per thread) + (1
> word per thread, if the thread executed acquire/release in the
> current epoch).
This might be a problem under certain scenarios. Think about handling
hundreds of thousands of objects with a lot of threads. I still need
to examine your code in detail. I have printed it out and plan on
reading it.
At least it's nice and clear. You're a good coder.
> The first acquire/release operation on an object takes around 20
> cycles; subsequent acquire/release operations take around 10 cycles.
> The algorithm especially well handles the situation where there is a
> large number of acquire/release operations on a small number of objects.
Completely agreed.
For APC I need an alertable wait, as you noticed. So it won't work in
an arbitrary user environment. Special care must be taken to make it
work with APC.
I am not very familiar with signals. Can I really not access TLS/TSS?
Maybe I can get the thread id and map it to a thread index?
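E.g. a sketch (assuming threads record their pthread_t during
rcx_sys_thread_init(), and that pthread_self() is usable inside the
handler, which POSIX does not formally guarantee):
#include <pthread.h>
// filled in by rcx_sys_thread_init()
static pthread_t rcx_thread_ids[64];
// map the interrupted thread to its RCX thread index
static unsigned rcx_idx_from_handler()
{
pthread_t self = pthread_self();
unsigned i;
for (i = 0; i != rcx_global.thread_count_; ++i)
{
if (pthread_equal(rcx_thread_ids[i], self))
return i;
}
return (unsigned)-1; // unregistered thread
}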
> >> On Feb 15, 1:44 pm, "Chris Thomasson" <cris...@comcast.net> wrote:
> >> You suggest sticking rcx_process into acquire/release functions?
>
> > Yes. I think this will work. It's only one additional 'if' in
> > rcx_acquire()/rcx_release().
>
> Correct me if I am wrong, but even if a thread executes rcx_process()
> before it blocks, and the thread's index did not match the current
> epoch_order, won't that block further epochs from being processed for
> at least the duration of the subsequent wait?
Yes, you are totally right.
I was considering a variant where threads process the epoch shift not
in strict order, but in arbitrary order. Something like
try_enter_critical_section logic:
int try_process_epoch_shift()
{
if (epoch_processing_busy)
// we are not lucky this time
return 1;
if (0 != atomic_xchg(&epoch_processing_busy, 1))
// we are not lucky this time
return 1;
// process epoch shift here
epoch_processing_busy = 0;
return 0;
}
But then I would have to keep objects in rcx_thread_s::dtor_list_ for
one additional epoch.
And I think that I can add speculative out-of-order epoch processing:
void process_epoch_before_blocking()
{
// process epoch shift
// even if epoch_pending_ is not set yet
while (try_process_epoch_shift()) yield();
}
In my environment I have one thread per processor, threads are bound
to processors, and threads are non-blocking. So I can control when a
thread is going to block. And I can wake a thread up if needed:
> void rcx_process()
> {
> rcx_thread_local_t* th = &rcx_thread;
> if (th->activity_ >= rcx_activity_threshold
> && 0 == rcx_global.epoch_pending_)
> {
> // we've created enough activity
> // initiate epoch shift
> rcx_global.epoch_pending_ = 1;
rcx_global.threads_[0].eventcount_.notify();
// (eventcount_ is a new per-thread field in this sketch:
// a blocking notify/wait primitive)
> }
> if (0 == th->index_
> && rcx_global.epoch_pending_
> && rcx_global.epoch_order_ ==
> rcx_global.thread_count_)
> {
> // thread with index 0
> // starts epoch processing
> rcx_global.epoch_order_ = 0;
> }
> if (th->index_ == rcx_global.epoch_order_)
> {
> // it's my turn - process epoch
> rcx_process_epoch();
> // notify next thread
> rcx_global.epoch_order_ += 1;
> if (rcx_global.epoch_order_ ==
> rcx_global.thread_count_)
> {
> rcx_global.epoch_pending_ = 0;
> }
else
{
rcx_global.threads_[rcx_global.epoch_order_].eventcount_.notify();
}
> }
> }
I think this will work nicely.
Dmitriy V'jukov
> > What do you think about this approach in general? I would like to hear
> > your professional opinion on it.
>
> It will work and scale well if a user's environment can cope with some
> of its requirements.
Yeah, it's a bit difficult to push it into a self-contained library
and make it work off the shelf. But I think that one can integrate RCX
into virtually any application, at least into server event-driven/
request-processing software.
> The per-object meta-data is a bit expensive, IMHO that is.
I was trying hard to reduce memory overhead, so this is the minimum I
can produce so far.
I think that with some dirty hacks I can reduce the per-object
overhead to 4 words (+ 1 word per thread). 1 word per thread is a
necessary evil; otherwise there will be no scalability.
On the other hand, we are living in a world where
sizeof(std::string) == 32, and nobody cares :)
> I
> am not too sure about the rcx_process() function... How can this aspect cope
> with threads that block?
As-is, it can't...
> AFAICT, an epoch requires threads to process in
> "lock-step"; if one of those threads blocks, well, it's going to hold
> up an epoch.
Yes, it's a 'little' problem...
I believe that in a 'cooperative' environment RCX can be made to work.
And I'm interested in *how* it will work then.
> I don't think you can execute rcx_process() from signals because you
> can't get TLS in a handler. APC should work, but then you need to be
> sure that you're in an alertable wait.
I've answered in a previous post.
> Also, do you put a cap on the total number
> of objects (e.g., the 'rcx_max_object_count' constant)?
rcx_max_object_count is solely for simplicity. In real life one must
allocate object descriptors dynamically. I think that one can put the
descriptors in a double-indirection array, to make random access fast
and allow growing at the same time.
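E.g. a sketch of such an array (chunk sizes are arbitrary; concurrent
growth would additionally need synchronization):
#define RCX_CHUNK_SHIFT 10
#define RCX_CHUNK_SIZE (1u << RCX_CHUNK_SHIFT) // 1024 per chunk
#define RCX_CHUNK_MASK (RCX_CHUNK_SIZE - 1)
#define RCX_MAX_CHUNKS 1024
static rcx_gl_obj_data_t* rcx_chunks[RCX_MAX_CHUNKS];
// O(1) random access, chunks allocated on demand
static rcx_gl_obj_data_t* rcx_descriptor(unsigned idx)
{
rcx_gl_obj_data_t** chunk = &rcx_chunks[idx >> RCX_CHUNK_SHIFT];
if (0 == *chunk)
*chunk = calloc(RCX_CHUNK_SIZE, sizeof(rcx_gl_obj_data_t));
return &(*chunk)[idx & RCX_CHUNK_MASK];
}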
> Have you been running this in your actor framework?
Not yet. RCX is piping hot from my mind.
My actor framework is a kind of 'virtually zero overhead' framework,
because I can spend virtually no time at all on it :)
Dmitriy V'jukov
> > First of all, my algorithm (RCX) is a bit similar to VZOOM, in the
> > sense that there are local and global counters.
>
> > But RCX is more powerful. VZOOM relies on the user to call a defer
> > function for an object; then VZOOM comes into play and tracks when
> > the counter eventually drops to zero.
>
> Right.
I think I can add similar logic to RCX. I already have an algorithm in
mind.
The usage pattern will be something like this:
rcx_create(..., rcx_temporary_disable_rc);
// until call to rcx_enable_rc()
// reference counters for this object will not be aggregated
...
rcx_t* x = remove_node_from_shared_list();
rcx_enable_rc(x);
// from this point aggregation is enabled
So it will work virtually like VZOOM, except that user threads are
responsible for transferring the local counters to the global counter,
instead of a collector thread.
Of course, the rcx_temporary_disable_rc flag will be optional.
> > Sometimes the user doesn't know when to call
> > the defer function. For example, in a message-passing environment:
> > threads pass messages through queues and execute acquire before
> > passing and release after consumption. And nobody knows who is 'the last'.
>
> I have some usage patterns that can deal with "some" of this.
This:
http://groups.google.com/group/comp.programming.threads/browse_frm/thread/2f18a8892a8326f
?
Btw, now I'm not sure that it will work, because with cross-thread
references the collector thread can see a release operation from one
thread but miss the corresponding acquire operation from another
thread. (Concretely: thread A acquires and hands the object to thread
B; the collector samples B's counter after B's release, but samples
A's counter before A's acquire is visible, so the aggregate
transiently reads zero.) Without cross-thread references this is not
possible.
Are cross-thread references allowed in VZOOM now?
> > Or, for example, an arbitrary graph of objects. Each object holds
> > an arbitrary number of references to other objects. Reference
> > counting can handle the lifetime of objects in such an environment
> > (I assume that application logic ensures that there are no circular
> > graphs of hanging objects).
>
> IMHO, circular references are a red herring. I can always find ways
> to get around them. I don't think I would worry about that too much.
Totally agree. It's just an example.
But consider a 'reversed' tree structure where 'leaves' contain
references to 'parent' nodes. Hmmm... That's not very realistic either...
> > On the other hand, VZOOM starts aggregating counters for an object
> > only when the object is deferred. RCX has to aggregate counters in
> > every epoch.
>
> True. Also, vZOOM epoch processing samples references from registered
> threads, while an RCX thread must give the epoch processor the reference
> counter mutations it has built up.
It simplifies handling of blocking user threads.
But IMVHO it is worse from a performance/scalability POV, because the
thread-local counters are touched (read) by the collector thread.
> I have some more questions, but I need to examine the code in greater detail
> to see if I can answer them myself and not waste your time.
> :^)
Feel free to ask ;)
Dmitriy V'jukov
I've answered here:
http://groups.google.com/group/comp.programming.threads/tree/browse_frm/thread/dab22a17c32c6b13
> > When processing an epoch shift, a thread transfers counters from
> > local descriptors to global descriptors. If the counter in a global
> > descriptor drops to zero, the global descriptor is added to the dtor
> > list. If the counter becomes non-zero, the global descriptor is
> > removed from the dtor list. After one full epoch all objects in the
> > dtor list are deleted.
>
> > Space consumption per object: (8 words) + (1 word per thread) + (1
> > word per thread, if the thread executed acquire/release in the
> > current epoch).
>
> This might be a problem under certain scenarios. Think about handling
> hundreds of thousands of objects with a lot of threads. I still need
> to examine your code in detail.
I've answered here:
http://groups.google.com/group/comp.programming.threads/msg/eaf04bcc293e1df3
In addition, one can build a simplified proxy-collector on top of RCX.
For example, there is one proxy object per collection. Readers
acquire/release references to the proxy, not to the nodes themselves.
Writers defer removed nodes to the proxy. When the proxy has collected
N nodes, it is replaced with a new proxy. The old proxy will eventually
be deleted, and its dtor will free all collected nodes.
So the nodes themselves don't have to include an rcx_t object.
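A sketch of such a proxy (proxy_t, proxy_node_t and install_new_proxy
are hypothetical names):
typedef struct proxy_node_s
{
struct proxy_node_s* next_;
}
proxy_node_t;
typedef struct proxy_s
{
// readers acquire/release this instead of the nodes
rcx_t rcx_;
// nodes removed by writers
proxy_node_t* defer_list_;
unsigned defer_count_;
}
proxy_t;
static void proxy_dtor(void* state)
{
proxy_t* p = (proxy_t*)state;
proxy_node_t* n = p->defer_list_;
// free all nodes collected by this proxy
while (n)
{
proxy_node_t* next = n->next_;
free(n);
n = next;
}
rcx_destroy(&p->rcx_);
free(p);
}
// writer: defer a removed node to the current proxy
void proxy_defer(proxy_t* p, proxy_node_t* n)
{
n->next_ = p->defer_list_;
p->defer_list_ = n;
if (++p->defer_count_ == 128) // N == 128, arbitrary
{
// publish a fresh proxy and drop the old proxy's initial
// reference; readers still inside keep it alive
install_new_proxy();
rcx_release(&p->rcx_);
}
}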
Dmitriy V'jukov
The signal handler would need to access a data structure in order to
acquire the thread id that was interrupted and whose context is
transferred to the handler.
Then it boils down to wrapping existing APIs that may, or may not,
block.
> In my environment I have one thread per processor, threads are bound
> to processors, and threads are non-blocking. So I can control when a
> thread is going to block. And I can wake a thread up if needed:
>
>> void rcx_process()
>> {
>> rcx_thread_local_t* th = &rcx_thread;
>> if (th->activity_ >= rcx_activity_threshold
>> && 0 == rcx_global.epoch_pending_)
>> {
>> // we've created enough activity
>> // initiate epoch shift
>> rcx_global.epoch_pending_ = 1;
>
> rcx_global.threads_[0].eventcount_.notify();
^^^^^^^^^^^^^^^^^^
>> }
[...]
>> if (th->index_ == rcx_global.epoch_order_)
>> {
[...]
> else
> {
>
> rcx_global.threads_[rcx_global.epoch_order_].eventcount_.notify();
^^^^^^^^^^^^^^^^^^
> }
>
>> }
>> }
>
> I think this will work nicely.
If the "user-threads" potential blocking logic takes adequate role in the
epoch processing scheme, then it should work fine.
Yes. The vZOOM stuff, well, it's like every thread is always in a
constant proxy-collected region from registration. The readers need to
perform a membar every so often. Persistent references can exist
across membars. The membars are tracked on a wide scope such that a
thread does not even need to keep any local meta-data about epoch
algorithm variables.
Cooperative in the sense of multiplexing work on groups of threads bound to
a single cpu?
No, I didn't mean "cooperative user-level threading". I just meant
that user threads must be 'friendly' to RCX, and not work against
RCX.
Dmitriy V'jukov