
Proxy-collector with atomic-free fast-path


Dmitriy V'jukov

Feb 2, 2008, 9:12:44 PM
Proxy-collector with atomic-free/membar-free fast-path for acquire/release.

Chris Thomasson's original proxy-collector is the basis for this algorithm:
http://home.comcast.net/~vzoom/demos/pc_sample.c

Instead of a collector per user object I use a fixed collector count, so
every collector holds multiple user objects. In the code below the
collector count is 4.

Also I mix in my acquire/release amortization trick:
http://groups.google.ru/group/comp.programming.threads/msg/91d01886ae690185
It allows me to achieve an atomic-free/membar-free fast-path for
acquire/release.

And finally I add some amortizations, like a local per-thread cache for
deferred objects.

So it should kick the ass of any other proxy-collector out there :)
Strictly speaking, it's no longer a pure proxy-collector; it's something
like PC+RCU+SMR :)


Public API:

void pc_init();
void pc_deinit();

void pc_thread_init();
void pc_thread_deinit();

void pc_acquire();
void pc_release();

struct pc_node_t;
void pc_defer(pc_node_t* node);

// this is the only new call;
// it must be executed before blocking for a long time,
// so as not to stall memory reclamation
void pc_flush();
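
To make the API concrete, here is a minimal usage sketch (mine, not part
of the original post; node, g_head, use() and the writer lock are
illustrative placeholders). pc_defer() is called inside an
acquire/release region, per the author's clarification later in this
thread:

struct node : pc_node_t
{
    node* next_;
    int data_;
};

node* g_head; // shared list head; writers serialize with a lock

void reader()
{
    // readers iterate under acquire/release; nodes removed by
    // writers are reclaimed only after every collector that could
    // still reference them drains
    pc_acquire();
    for (node* n = g_head; n; n = n->next_)
        use(n->data_);
    pc_release();
}

void writer_remove()
{
    lock();
    node* n = g_head;
    if (n)
        g_head = n->next_;
    unlock();
    if (n)
    {
        pc_acquire();
        pc_defer(n); // destroyed once no reader can still reference it
        pc_release();
    }
}

Each thread brackets its lifetime with pc_thread_init()/pc_thread_deinit()
and calls pc_flush() before blocking for a long time.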


Implementation:

#include <windows.h>
#include <intrin.h>
#pragma intrinsic (_InterlockedAnd)

typedef char cacheline_pad_t [128];

unsigned const pc_word_bits = 32;
unsigned const pc_collector_count = 4;
unsigned const pc_collector_bits = 2;
unsigned const pc_counter_inc =
    1 << (pc_collector_count + pc_collector_bits);
unsigned const pc_is_current_inc = 1;
unsigned const pc_back_link_inc = 1 << 1;

unsigned const pc_local_garbage_cache = 8;
unsigned const pc_global_garbage_threshold = 32;

struct pc_node_t
{
    pc_node_t* pc_next_;
    virtual ~pc_node_t() {}
};

struct pc_master_t
{
    cacheline_pad_t pad1_;

    union union_t
    {
        long whole_;
        struct
        {
            unsigned current_collector_ : pc_collector_bits; // lsb
            unsigned collector_busy_mask_ : pc_collector_count;
            unsigned outer_counter_ : pc_word_bits
                - pc_collector_count - pc_collector_bits; // msb
        };
    };

    union_t union_;

    cacheline_pad_t pad2_;

    union
    {
        long copy_whole_;
        struct
        {
            unsigned current_collector_copy_ : pc_collector_bits; // lsb
            unsigned pad_ : pc_word_bits - pc_collector_bits; // msb
        };
    };
};

struct pc_collector_t
{
    cacheline_pad_t pad1_;

    pc_node_t* defer_list_;
    unsigned defer_list_size_;

    cacheline_pad_t pad2_;

    union
    {
        long whole_counter_;
        struct
        {
            unsigned is_current_ : 1; // lsb
            unsigned back_link_ : 1;
            unsigned pad_ : pc_collector_count + pc_collector_bits - 2;
            unsigned inner_counter_ : pc_word_bits
                - pc_collector_count - pc_collector_bits; // msb
        };
    };
};

struct pc_thread_data_t
{
    unsigned recursion_count_;
    unsigned is_acquired_;
    unsigned collector_index_;
    pc_node_t* local_defer_list_;
    pc_node_t* local_defer_list_tail_;
    unsigned local_defer_list_size_;
};

pc_master_t pc_master;
pc_collector_t pc_collectors [pc_collector_count];
__declspec(thread) pc_thread_data_t pc_thread_data;

unsigned pc_acquire_impl()
{
    // load current collector
    // and increment outer counter
    long const prev =
        _InterlockedExchangeAdd(
            &pc_master.union_.whole_, pc_counter_inc);
    pc_master_t::union_t u = {prev};
    return u.current_collector_;
}

void pc_release_impl(unsigned const index, unsigned count)
{
    // decrement inner counter
    pc_collector_t& collector = pc_collectors[index];
    unsigned const prev =
        _InterlockedExchangeAdd(
            &collector.whole_counter_, -count);
    if (0 == prev - count)
    {
        // delete all nodes from collector
        while (collector.defer_list_)
        {
            pc_node_t* next = collector.defer_list_->pc_next_;
            delete collector.defer_list_;
            collector.defer_list_ = next;
        }
        collector.defer_list_size_ = 0;

        // prepare collector for next run
        collector.back_link_ = 1;
        collector.is_current_ = 1;

        // update collector busy mask
        pc_master_t::union_t u = {0}; // zero all bits before building the mask
        u.collector_busy_mask_ = 1 << index;
        _InterlockedAnd(&pc_master.union_.whole_, ~u.whole_);

        // reset back-link
        pc_release_impl(
            (index + 1) % pc_collector_count, pc_back_link_inc);
    }
}

void pc_flush_impl()
{
    // transfer local defer list to collector
    pc_thread_data_t& local = pc_thread_data;
    pc_collector_t& collector = pc_collectors[local.collector_index_];

    pc_node_t* prev = (pc_node_t*)_InterlockedExchange(
        (long*)&collector.defer_list_, (long)local.local_defer_list_);
    local.local_defer_list_tail_->pc_next_ = prev;

    unsigned new_count = local.local_defer_list_size_ +
        _InterlockedExchangeAdd((long*)&collector.defer_list_size_,
            local.local_defer_list_size_);

    local.local_defer_list_ = 0;
    local.local_defer_list_tail_ = 0;
    local.local_defer_list_size_ = 0;

    if (pc_global_garbage_threshold < new_count)
    {
        // trying to shift collector
        pc_master_t::union_t cmp;
        pc_master_t::union_t val;
        do
        {
            cmp = pc_master.union_;
            if (cmp.current_collector_ != local.collector_index_)
                return;
            unsigned next_mask =
                (1 << ((cmp.current_collector_ + 1)
                    % pc_collector_count));
            if (cmp.collector_busy_mask_ & next_mask)
                return;
            val = cmp;
            val.collector_busy_mask_ |= next_mask;
            val.current_collector_ += 1;
            val.outer_counter_ = 0;
        }
        while (cmp.whole_ != _InterlockedCompareExchange(
            (long*)&pc_master.union_.whole_, val.whole_, cmp.whole_));
        // collector is shifted
        // increment collector index copy
        _InterlockedIncrement((long*)&pc_master.copy_whole_);
        // reset current flag and transfer
        // outer count to inner counter
        _InterlockedExchangeAdd((long*)&collector.whole_counter_,
            (cmp.outer_counter_
                << (pc_collector_count + pc_collector_bits))
            - pc_is_current_inc);
    }
}

void pc_init()
{
    pc_master.union_.current_collector_ = 0;
    pc_master.union_.collector_busy_mask_ = 1;
    pc_master.union_.outer_counter_ = 0;
    pc_master.current_collector_copy_ = 0;
    for (unsigned i = 0; i != pc_collector_count; ++i)
    {
        pc_collectors[i].defer_list_ = 0;
        pc_collectors[i].defer_list_size_ = 0;
        pc_collectors[i].is_current_ = 1;
        pc_collectors[i].back_link_ = 1;
        pc_collectors[i].inner_counter_ = 0;
    }
    pc_collectors[0].back_link_ = 0;
}

void pc_deinit()
{
    pc_release_impl(
        pc_master.union_.current_collector_, pc_is_current_inc);
}

void pc_acquire()
{
    pc_thread_data_t& local = pc_thread_data;
    local.recursion_count_ += 1;
    if (1 != local.recursion_count_)
        return;
    if (local.is_acquired_)
        return;
    local.collector_index_ = pc_acquire_impl();
    local.is_acquired_ = 1;
}

void pc_release()
{
    pc_thread_data_t& local = pc_thread_data;
    local.recursion_count_ -= 1;
    if (0 == local.recursion_count_)
    {
        if ((pc_master.current_collector_copy_)
            != local.collector_index_)
        {
            if (local.local_defer_list_size_)
                pc_flush_impl();
            pc_release_impl(local.collector_index_, pc_counter_inc);
            local.is_acquired_ = 0;
        }
    }
}

void pc_flush()
{
    pc_thread_data_t& local = pc_thread_data;
    if (local.recursion_count_)
        return;
    if (local.is_acquired_)
    {
        if (local.local_defer_list_size_)
            pc_flush_impl();
        pc_release_impl(local.collector_index_, pc_counter_inc);
        local.is_acquired_ = 0;
    }
}

void pc_defer(pc_node_t* node)
{
    pc_thread_data_t& local = pc_thread_data;
    node->pc_next_ = local.local_defer_list_;
    local.local_defer_list_ = node;
    if (0 == local.local_defer_list_tail_)
        local.local_defer_list_tail_ = node;
    local.local_defer_list_size_ += 1;
    if (pc_local_garbage_cache < local.local_defer_list_size_)
        pc_flush_impl();
}

void pc_thread_init()
{
    pc_thread_data.recursion_count_ = 0;
    pc_thread_data.is_acquired_ = 0;
    pc_thread_data.collector_index_ = 0;
    pc_thread_data.local_defer_list_ = 0;
    pc_thread_data.local_defer_list_tail_ = 0;
    pc_thread_data.local_defer_list_size_ = 0;
}

void pc_thread_deinit()
{
    pc_flush();
}


Dmitriy V'jukov

Chris Thomasson

Feb 3, 2008, 12:01:48 AM
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:e36058fc-f534-432a...@v67g2000hse.googlegroups.com...

> Proxy-collector with atomic-free/membar-free fast-path for acquire/
> release.
[...]

Heh. Wow, I was revisiting pc_sample.c:

http://groups.google.com/group/comp.programming.threads/msg/4da7428f91773ddc
(last sentence)

and came up with something very similar! It's not ready for prime time, but
wow. We think alike!!!!


http://groups.google.com/group/comp.programming.threads/msg/f58fa040b64a4577

!

:^)

Chris Thomasson

Feb 3, 2008, 12:13:25 AM
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:e36058fc-f534-432a...@v67g2000hse.googlegroups.com...
> Proxy-collector with atomic-free/membar-free fast-path for acquire/
> release.
>
> Chris Thomasson's original proxy-collector is algorithm basis:
> http://home.comcast.net/~vzoom/demos/pc_sample.c
>
> Instead of collector per user object I use fixed collector count. So
> every collector holds multiple user objects. In code below collector
> count = 4.
[...]

It's not zero-garbage, but that's fine.

Dmitriy V'jukov

Feb 3, 2008, 5:41:12 AM
On Feb 3, 05:12, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
> Proxy-collector with atomic-free/membar-free fast-path for acquire/
> release.


I think I can create N per-thread caches for deferred nodes:

struct pc_thread_data_t
{
    unsigned recursion_count_;
    unsigned is_acquired_;
    unsigned collector_index_;

    pc_node_t* local_defer_list_ [pc_collector_count];
    pc_node_t* local_defer_list_tail_ [pc_collector_count];
    unsigned local_defer_list_size_ [pc_collector_count];
};

Then a thread will not transfer deferred nodes to the global collector
at all. Only dying threads will transfer their nodes to the global
collectors.
A thread will only check local_defer_list_size_ to initiate a global
collector shift when it has too many local deferred nodes (a sketch
follows below).
This will make it even more like RCU...
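
A hypothetical pc_defer() under this scheme might look like the
following sketch (mine, not the author's; pc_try_shift_collector() is
an invented placeholder for "initiate a global collector shift without
transferring the nodes"). It assumes the node is deferred while the
collector named by collector_index_ is still acquired:

void pc_defer(pc_node_t* node)
{
    pc_thread_data_t& local = pc_thread_data;
    unsigned const idx = local.collector_index_;
    // push onto the per-collector local list; nothing is published
    node->pc_next_ = local.local_defer_list_[idx];
    local.local_defer_list_[idx] = node;
    if (0 == local.local_defer_list_tail_[idx])
        local.local_defer_list_tail_[idx] = node;
    local.local_defer_list_size_[idx] += 1;
    // too much local garbage: request a collector shift, but keep
    // the nodes cached locally
    if (pc_global_garbage_threshold < local.local_defer_list_size_[idx])
        pc_try_shift_collector(); // hypothetical placeholder
}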

Dmitriy V'jukov

Dmitriy V'jukov

Feb 4, 2008, 6:03:40 AM
On Feb 3, 8:01 am, "Chris Thomasson" <cris...@comcast.net> wrote:

> and came up with something very similar! It's not ready for prime time, but
> wow. We think alike!!!!

Yeah. We are trampling on a very small piece of ground. It's not
surprising that we are constantly colliding :)

Now I'm thinking about solutions/alternatives for plain reference
counting (like boost::shared_ptr). This is the only thing which I
haven't made "atomic-free on fast-path" yet :)
http://groups.google.com/group/comp.programming.threads/browse_frm/thread/2c24bea21c90b613


Dmitriy V'jukov

ti_chris

Feb 4, 2008, 6:26:43 PM
On Feb 2, 6:12 pm, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
> Proxy-collector with atomic-free/membar-free fast-path for acquire/
> release.
>
> Chris Thomasson's original proxy-collector is algorithm basis:http://home.comcast.net/~vzoom/demos/pc_sample.c
>
> Instead of collector per user object I use fixed collector count. So
> every collector holds multiple user objects. In code below collector
> count = 4.
>
> Also I mix in my acquire/release amortization trick:http://groups.google.ru/group/comp.programming.threads/msg/91d01886ae...


Very cool. Someone finally got around to writing/posting one. I don't
have time right now, but I'll def take a look at it :). Looks like
all the stuff that we were churning on for the last couple of weeks on
PCs came to something good :). If I get enough time to do it, I'll
get some numbers on the performance and see how well it fares with my
other multiple versions of the PC against a Stack.

ti_chris

Feb 4, 2008, 7:26:12 PM
I know I must be missing something here... But isn't the
InterlockedExchangeAdd an atomic op?

Chris Thomasson

Feb 4, 2008, 9:45:12 PM
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:ee5cb73d-829d-4621...@s19g2000prg.googlegroups.com...

> On Feb 3, 8:01 am, "Chris Thomasson" <cris...@comcast.net> wrote:
>
>> and came up with something very similar! It's not ready for prime time,
>> but
>> wow. We think alike!!!!
>
> Yeah. We are trampling on the very small piece of ground. It's not
> surprising that we are constantly colliding :)

You produce good works Dmitriy.


>
> Now I'm thinking about solutions/alternatives for plain reference
> counting (like boost::shared_ptr). This is the only thing which I
> can't made "atomic-free on fast-path" yet :)
> http://groups.google.com/group/comp.programming.threads/browse_frm/thread/2c24bea21c90b613

Humm...

Dmitriy V'jukov

Feb 5, 2008, 4:37:32 AM
On Feb 5, 2:26 am, ti_chris <tiChri...@gmail.com> wrote:

> Very cool. Someone finally got around writing/posting one. I don't
> have time right now; but I'll def take a look at it :). Looks like
> all the stuff that we were churning for the last couple of weeks on
> PCs came to something good :). If I get enough time to do it, I'll
> get some numbers on the performance and see how well it fares wiht my
> other multiple versions of the PC against a Stack.

First of all, thank you for your interest. Usually only Chris Thomasson
answers my posts. It's a bit demoralizing :)
It would be great if you benchmark it. Btw, on what hardware platform
will you test? And note that I haven't run it, so there may be some
bugs :) But I'm sure that the algorithm as a whole is correct.

Dmitriy V'jukov

Dmitriy V'jukov

Feb 5, 2008, 4:47:22 AM
On Feb 5, 3:26 am, ti_chris <tiChri...@gmail.com> wrote:
> I know I must be missing something here... But isn't the
> InterlockedExchangeAdd an atomic op?

Yes, it's an atomic op.
Writers execute atomic ops. Readers don't execute atomic ops or heavy
memory barriers on the *fast-path*. A reader executes 2 atomic ops when
the current collector is shifted - 1 atomic op to release the previous
collector, and 1 atomic op to acquire the new collector.

For example, assume there is 1 write per 100 reads, and
pc_global_garbage_threshold = 32. The collector then shifts roughly once
per 32 deferred writes, i.e. once per 3200 reads, so there will be only
2 atomic ops per 3200 reads on the reader side.
For an mpmc-stack the situation is a bit worse: there is 1 write per 1
read. But take into account that you can tweak pc_local_garbage_cache
and pc_global_garbage_threshold. For example you can set:
unsigned const pc_local_garbage_cache = 128;
unsigned const pc_global_garbage_threshold = 1024;

Actually you can also tweak the collector count. But I'm not sure how it
will affect performance.

Dmitriy V'jukov

Dmitriy V'jukov

Feb 5, 2008, 4:52:34 AM
On Feb 5, 5:45 am, "Chris Thomasson" <cris...@comcast.net> wrote:
> "Dmitriy V'jukov" <dvyu...@gmail.com> wrote in message

>
> news:ee5cb73d-829d-4621...@s19g2000prg.googlegroups.com...
>
> > On Feb 3, 8:01 am, "Chris Thomasson" <cris...@comcast.net> wrote:
>
> >> and came up with something very similar! It's not ready for prime time,
> >> but
> >> wow. We think alike!!!!
>
> > Yeah. We are trampling on the very small piece of ground. It's not
> > surprising that we are constantly colliding :)
>
> You produce good works Dmitriy.


Thank you.
I just hope that we are not the only people reading this!
http://groups.google.com/group/comp.programming.threads/msg/cc5eb374d5b65aeb
:)
I think there is some bug in the USENET delivery system, and messages
containing 'lock-free' are delivered only to you and me :)

Dmitriy V'jukov

ti_chris

Feb 5, 2008, 1:30:19 PM

Got it :). I read atomic-free and assumed the whole thing was :).
My bad

Chris Thomasson

Feb 5, 2008, 7:11:48 PM

"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:7c9a16cc-f729-41d3...@c4g2000hsg.googlegroups.com...

> On Feb 5, 3:26 am, ti_chris <tiChri...@gmail.com> wrote:
>> I know I must be missing something here... But isn't the
>> InterlockedExchangeAdd an atomic op?
>
> Yes, it's atomic op.
> Writer executes atomic ops. Reader doesn't execute atomic ops or heavy
> memory barriers on *fast-path*. Reader executes 2 atomic ops when
> current collector is shifted - 1 atomic op to release previous
> collector, and 1 atomic op to acquire new collector.
>
[...]

How does a reader thread introduce itself into the algorithm? I am under
the impression that the setup is something like:
________________________________________________________
struct node : public pc_node_t {
    node* nx;
};


static node* g_stack = 0;


void write_push() {
    node* const n = node_cache_pop();
    lock();
    n->nx = g_stack;
    membar #LoadStore | #StoreStore;
    g_stack = n;
    unlock();
}


void write_pop() {
    node* n;
    lock();
    n = g_stack;
    if (n) {
        g_stack = n->nx;
    }
    unlock();
    if (n) {
        pc_defer(n);
    }
}


void read_iterate() {
    node* n;
    pc_acquire();
    n = load_depends(&g_stack);
    while (n) {
        node* const nx = load_depends(&n->nx);
        [...];
        n = nx;
    }
    pc_release();
}
________________________________________________________


Am I using your collector interface properly?


Dmitriy V'jukov

Feb 6, 2008, 5:44:43 AM
On Feb 6, 3:11 am, "Chris Thomasson" <cris...@comcast.net> wrote:

> How does a reader thread introduce itself into the algorihtm?

What do you mean?


> I am under
> impression that the setup is something like:
> ________________________________________________________
> struct node : public pc_node_t {
> node* nx;
> };

node::nx here is user data; it's not related to the proxy-collector.
The virtual destructor in pc_node_t is a substitution for an arbitrary
user-supplied function. But maybe it was a poor choice, because the user
may want to *not* destroy the object but only return the object to some
pool, for example.


>
> static node* g_stack = 0;
>
> void write_push() {
> node* const n = node_cache_pop();
> lock();
> n->nx = g_stack;
> membar #LoadStore | #StoreStore;
> g_stack = n;
> unlock();
> }

right

> void write_pop() {
> node* n;
> lock();
> n = g_stack;
> if (n) {
> g_stack = n->nx;
> }
> unlock();
> if (n) {
> pc_defer(n);
> }
> }

pc_defer() can be executed only inside a pc_acquire()/pc_release()
region.
Otherwise it's difficult to determine to which collector the node must
be enqueued.

void write_pop() {
    node* n;
    lock();
    n = g_stack;
    if (n) {
        g_stack = n->nx;
    }
    unlock();
    if (n) {
        pc_acquire();
        pc_defer(n);
        pc_release();
    }
}


>
> void read_iterate() {
> node* n;
> pc_acquire();
> n = load_depends(&g_stack);
> while (n) {
> node* const nx = load_depends(&n->nx);
> [...];
> n = nx;
> }
> pc_release();
> }


right

> Am I using your collector interface properly?

approximately :)

Dmitriy V'jukov

Dmitriy V'jukov

Feb 6, 2008, 6:00:18 AM
On Feb 3, 5:12 am, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
> Proxy-collector with atomic-free/membar-free fast-path for acquire/
> release.

I've found a bug in the algorithm.
pc_flush_impl() enqueues deferred nodes to the collector pointed to by
pc_thread_data.collector_index_. That's wrong.

A sequence which will crash the algorithm:
Current collector is 0.
Thread 0 acquires collector 0.
The current collector is shifted; the current collector is now 1.
Thread 1 acquires collector 1.
Thread 0 executes pc_defer(Node). Node is enqueued to collector 0
(pointed to by pc_thread_data.collector_index_ of thread 0).
Thread 0 releases collector 0.
Because collector 0 is not current, all nodes from collector 0 are
deleted, including the Node.
Thread 1 can still access the Node. Crash!

To fix it I have to enqueue deferred nodes not to the collector pointed
to by pc_thread_data.collector_index_, but to the current collector:

> void pc_flush_impl()
> {
> // transfer local defer list to collector
> pc_thread_data_t& local = pc_thread_data;
> pc_collector_t& collector = pc_collectors[local.collector_index_];

Replace the last line with:

    // #StoreLoad to order node removal from collection
    // and load of current collector
    // #StoreLoad is not needed if node removal
    // issues full memory barrier
    pc_collector_t& collector =
        pc_collectors[pc_master.union_.current_collector_];

Dmitriy V'jukov

Chris Thomasson

Feb 6, 2008, 6:23:23 AM
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:a3b78c55-95ec-445a...@s19g2000prg.googlegroups.com...

> On Feb 6, 3:11 am, "Chris Thomasson" <cris...@comcast.net> wrote:
>
>> How does a reader thread introduce itself into the algorihtm?
>
> What do you mean?

How does the algorithm know that a reader thread is in a proxy-collected
region? I know the answer now: it's the pc_acquire(...) function, of course!

:^)


>> I am under
>> impression that the setup is something like:
>> ________________________________________________________
>> struct node : public pc_node_t {
>> node* nx;
>> };
>
> node::nx here is user data. It's not related to proxy-collector.
> virtual destructor in pc_node_t is substitution for arbitrary user
> supplied function. But maybe it was poor choice because user can want
> to *not* destroy object but only return object to some pool for
> example.

Right. I looked over the basic outline, and realized that I should derive
from pc_node_t. However, like you said, I may want to return to a pool.
Therefore, I think you might want to alter the pc_node_t data-structure to
something like:

struct pc_node_t
{
    pc_node_t* pc_next_;
    void (*fp_dtor_) (void*);
    void* state_;
};


And change the function that deletes it to:

// delete all nodes from collector
while (collector.defer_list_)
{
    pc_node_t* next = collector.defer_list_->pc_next_;

    // delete collector.defer_list_;
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    collector.defer_list_->fp_dtor_(collector.defer_list_->state_);

    collector.defer_list_ = next;
}


Then perhaps change the defer function to something like:


void pc_defer(
    pc_node_t* node,
    void (*fp_dtor) (void*),
    void* const state
) {
    node->fp_dtor_ = fp_dtor;
    node->state_ = state;

    pc_thread_data_t& local = pc_thread_data;
    node->pc_next_ = local.local_defer_list_;
    local.local_defer_list_ = node;
    if (0 == local.local_defer_list_tail_)
        local.local_defer_list_tail_ = node;
    local.local_defer_list_size_ += 1;
    if (pc_local_garbage_cache < local.local_defer_list_size_)
        pc_flush_impl();
}


That way I could do this:


struct node {
    node* nx;
    pc_node_t pcnode;
};


void node_defer(void* state) {
    node* const _this = (node*)state;
    // return to a cache or something...
}


and call into defer like:

node* n = pop(...);
if (n) {
    pc_defer(&n->pcnode, node_defer, n);
}


[...]

>> void write_pop() {
[...]


>> }
>
> pc_defer() can be executed only inside pc_acquire()/pc_release()
> region
> Otherwise it's difficult to determine to what collector node must be
> enqueued.

Ahhh, I see.


> void write_pop() {
> node* n;
> lock();
> n = g_stack;
> if (n) {
> g_stack = n->nx;
> }
> unlock();
> if (n) {
> pc_acquire();
> pc_defer(n);
> pc_release();
> }
> }

>> void read_iterate() {
>> node* n;
>> pc_acquire();
>> n = load_depends(&g_stack);
>> while (n) {
>> node* const nx = load_depends(&n->nx);
>> [...];
>> n = nx;
>> }
>> pc_release();
>> }
>
>
> right
>
>> Am I using your collector interface properly?
>
> approximately :)


;^)

Chris Thomasson

Feb 6, 2008, 6:40:00 AM

"Chris Thomasson" <cri...@comcast.net> wrote in message
news:LKadnXAPcIRbCzTa...@comcast.com...
[...]

> That way I could do this:
>
>
> struct node {
> node* nx;
> pc_node_t pcnode;
> };
>
>
> void node_defer(void* state) {
> node* const _this = state;
> // return to a cache or something...
> }
>
>
> and call into defer like:
>
> node* n = pop(...);
> if (n) {
> pc_defer(&n->pcnode, node_defer, n);
> }
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

ARGH! I made the same mistake. Let me try and fix that:


node* n = pop(...);
if (n) {
    pc_acquire();
    pc_defer(&n->pcnode, node_defer, n);
    pc_release();
}

[...]


:^)

Dmitriy V'jukov

Feb 6, 2008, 12:37:58 PM
On Feb 6, 2:23 pm, "Chris Thomasson" <cris...@comcast.net> wrote:

> > node::nx here is user data. It's not related to proxy-collector.
> > virtual destructor in pc_node_t is substitution for arbitrary user
> > supplied function. But maybe it was poor choice because user can want
> > to *not* destroy object but only return object to some pool for
> > example.
>
> Right. I looked over the basic outline, and realized that I should derive
> from pc_node_t. However, like you said, I may want to return to a pool.
> Therefore, I think you might want to alter the pc_node_t data-structure to
> something like:

Agreed, it's reasonable.
It's like in your pc_sample - I remember.
But I think that state_ can be omitted in pc_node_t, because the user
can always convert his object to pc_node_t and vice versa.

Inheritance:

struct X : pc_node_t {
};
pc_node_t* convert(X* x) {
    return x;
}
X* convert(pc_node_t* x) {
    return static_cast<X*>(x);
}

Inclusion:

struct Y {
    pc_node_t pc_node_;
};
pc_node_t* convert2(Y* y) {
    return &y->pc_node_;
}
Y* convert2(pc_node_t* y) {
    return (Y*)((char*)y - offsetof(Y, pc_node_));
}

Thus we can save as much as 4 bytes :)

Dmitriy V'jukov

Dmitriy V'jukov

Feb 6, 2008, 12:55:08 PM
On Feb 3, 5:12 am, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
> Proxy-collector with atomic-free/membar-free fast-path for acquire/
> release.

I've made it wait-free.
The only place with a "CAS-loop" is where we try to shift the current
collector in the pc_flush_impl() function:

> void pc_flush_impl()
> {
> [...]


> // trying to shift collector
> pc_master_t::union_t cmp;
> pc_master_t::union_t val;
> do
> {
> cmp = pc_master.union_;
> if (cmp.current_collector_ != local.collector_index_)
> return;
> unsigned next_mask =
> (1 << ((cmp.current_collector_ + 1)
> % pc_collector_count));
> if (cmp.collector_busy_mask_ & next_mask)
> return;
> val = cmp;
> val.collector_busy_mask_ |= next_mask;
> val.current_collector_ += 1;
> val.outer_counter_ = 0;
> }
> while (cmp.whole_ != _InterlockedCompareExchange(

> [...]
> }

I add a special flag, shift_ticket_, to the pc_collector_t structure,
and add the following check to the pc_flush_impl() function:

void pc_flush_impl()
{
    [...]

    // trying to shift collector

    if (collector.shift_ticket_)
        return;
    if (1 != _InterlockedIncrement(&collector.shift_ticket_))
        return;
    // we are the only thread here
    [...]

Thus the collector shift can be accomplished with _InterlockedExchange()
instead of _InterlockedCompareExchange().

Dmitriy V'jukov

Chris Thomasson

Feb 7, 2008, 12:17:21 AM
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:0f5349fe-b534-46e9...@i29g2000prf.googlegroups.com...

> On Feb 6, 2:23 pm, "Chris Thomasson" <cris...@comcast.net> wrote:
>
>> > node::nx here is user data. It's not related to proxy-collector.
>> > virtual destructor in pc_node_t is substitution for arbitrary user
>> > supplied function. But maybe it was poor choice because user can want
>> > to *not* destroy object but only return object to some pool for
>> > example.
>>
>> Right. I looked over the basic outline, and realized that I should derive
>> from pc_node_t. However, like you said, I may want to return to a pool.
>> Therefore, I think you might want to alter the pc_node_t data-structure
>> to
>> something like:
>
> Agree. It's reasonable.
> It's like in your pc_sample - I remember.
> But I think that state_ can be omitted in pc_node_t. Because user can
> always convert his object to pc_node_t and vice versa.
[...]

Right. But that means that it has to be the first member, or the offsetof
macro needs to be used.

Dmitriy V'jukov

Feb 7, 2008, 11:30:51 AM
On Feb 3, 8:13 am, "Chris Thomasson" <cris...@comcast.net> wrote:

> > Instead of collector per user object I use fixed collector count. So
> > every collector holds multiple user objects. In code below collector
> > count = 4.
>
> [...]
>
> It's not zero-garbage, but that's fine.

You can make it nearly prompt "zero-garbage" if you set:

unsigned const pc_local_garbage_cache = 1;
unsigned const pc_global_garbage_threshold = 1;

and use this function instead of pc_release():

void pc_release_strong()
{
    pc_release(); // fake release
    pc_flush();   // real release
}

Dmitriy V'jukov

Chris Thomasson

Feb 8, 2008, 2:18:42 AM
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:e36058fc-f534-432a...@v67g2000hse.googlegroups.com...

> Proxy-collector with atomic-free/membar-free fast-path for acquire/
> release.
[...]

AFAICT, in order to get atomic-free/membar-free semantics on reader threads,
you're going to have to use a specific usage-pattern that is compatible with
manual vZOOM periodic/episodic epoch detection. That is, the reader thread
has to do something similar to the following:
_____________________________________________________________
void reader() {
    int i;
    int const epoch = 1000;
    pc_acquire();
    for (i = 1 ;; ++i) {
        if (! try_get_search_request(...)) {
            i = 1;
            pc_release();
            wait_for_search_request(...);
            pc_acquire();
        }
        perform_extensive_search(...);
        if (! (i % epoch)) {
            pc_release();
            pc_acquire();
        }
    }
    pc_release();
}
_____________________________________________________________


I gather you cannot use something like this usage-pattern, which is
compatible with vZOOM automatic detection:
_____________________________________________________________
void reader() {
    for ( ;; ) {
        wait_for_search_request(...);
        perform_extensive_search(...);
    }
}
_____________________________________________________________

right?

Chris Thomasson

Feb 8, 2008, 2:20:02 AM
"Chris Thomasson" <cri...@comcast.net> wrote in message
news:15edncC6j7b7nTHa...@comcast.com...
[...]

The reason I think this is that every non-recursive call to pc_acquire()
executes an atomic op that has to be followed by a #StoreLoad, right?

Chris Thomasson

Feb 8, 2008, 2:35:08 AM
"Chris Thomasson" <cri...@comcast.net> wrote in message
news:84ydnRPlUtUonTHa...@comcast.com...

> "Chris Thomasson" <cri...@comcast.net> wrote in message
> news:15edncC6j7b7nTHa...@comcast.com...
>> "Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
>> news:e36058fc-f534-432a...@v67g2000hse.googlegroups.com...
>>> Proxy-collector with atomic-free/membar-free fast-path for acquire/
>>> release.
>> [...]
>>
>> AFAICT, in order to get atomic-free/membar-free semantics on reader
>> threads, your going to have to use a specific usage-pattern that is
>> compatible with manual vZOOM periodic/episodic epoch detection. That is,
>> the reader thread has to something similar to the following:
>> _____________________________________________________________
[...]

>> _____________________________________________________________
> [...]
>
> The reason I think this is because every non-recursive call to
> pc_acquire() executes an atomic-op that has to be followed by a #StoreLoad
> right?

Okay; you're making use of the pc_thread_data_t::is_acquired_ variable to
make the whole per-thread process dependent on a call to pc_flush(). I think
I see that in the pc_release() function here:


_____________________________________________________________
void pc_release()
{
    pc_thread_data_t& local = pc_thread_data;

    local.recursion_count_ -= 1;
    if (0 == local.recursion_count_)
    {
        if ((pc_master.current_collector_copy_)
            != local.collector_index_)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        {

            /* this path is only taken after a call to pc_flush() right? */

            if (local.local_defer_list_size_)
                pc_flush_impl();
            pc_release_impl(local.collector_index_, pc_counter_inc);
            local.is_acquired_ = 0;
        }
    }
}

_____________________________________________________________

so, to try and fix my example:
_____________________________________________________________
void reader_thread() {
    int i;
    int const epoch = 1000;
    pc_acquire();
    for (i = 1 ;; ++i) {
        if (! try_get_search_request(...)) {
            i = 1;
            pc_release();
            pc_flush();
            wait_for_search_request(...);
            pc_acquire();
        }
        perform_extensive_search(...);
        if (! (i % epoch)) {
            pc_release();
            pc_flush();
            pc_acquire();
        }
    }
    pc_flush();
    pc_release();
}
_____________________________________________________________

What am I missing here, Dmitriy? I hope I am not 100% wrong... Anything
less would make me happy!

;^)


So far, I think the algorithm itself is going to be okay.

Dmitriy V'jukov

Feb 8, 2008, 6:57:31 AM


Ok. I will try to make things clearer. First of all, you are not
100% wrong :)
Second, the example must look like this:

void reader_thread() {
    int i;
    // not needed
    // int const epoch = 1000;
    // pc_acquire();

    for (i = 1 ;; ++i) {

        // to protect try_get_search_request()
        // and perform_extensive_search()
        pc_acquire();

        if (! try_get_search_request(...)) {
            i = 1;

            // needed! because we don't want to block
            // for a long time with an acquired collector
            // I think I must add a function pc_release_strong() for this
            pc_release();
            pc_flush();
            wait_for_search_request(...);
            // needed! reacquire collector
            pc_acquire();
        }
        perform_extensive_search(...);
        // not needed
        // acquire/release is already amortized
        // so we don't need further amortizations
        //if (! (i % epoch)) {
        //    pc_release();
        //    pc_flush();
        //    pc_acquire();
        //}
        // needed!
        pc_release();
    }
    // not needed - pc_flush() will be executed by pc_thread_deinit()
    // pc_flush();
    // pc_release();
}

So the refined code is:

void reader_thread() {
    pc_thread_init();
    int i;

    for (i = 1 ;; ++i) {

        pc_acquire();

        if (! try_get_search_request(...)) {
            i = 1;

            pc_release_strong();
            wait_for_search_request(...);
            pc_acquire();
        }
        perform_extensive_search(...);
        pc_release();
    }
    pc_thread_deinit();
}

Dmitriy V'jukov

Dmitriy V'jukov

Feb 8, 2008, 7:27:51 AM
On Feb 8, 10:35 am, "Chris Thomasson" <cris...@comcast.net> wrote:

> > The reason I think this is because every non-recursive call to
> > pc_acquire() executes an atomic-op that has to be followed by a #StoreLoad
> > right?

NO. NO. NO.
A non-recursive call to pc_acquire() executes an atomic op that has to
be followed by a #StoreLoad ONLY if the current collector has shifted
since the last non-recursive call to pc_acquire().
That's the main point!

So you can freely use it like this:
for (...)
{
    pc_acquire();
    ...
    pc_release();
}

Heavy operations will be executed only if the current collector is
shifted.
It's more reasonable to think of the collector as acquired ALL THE TIME,
and actually released only when a shift of the current collector is
detected.

So I rely on the reader thread periodically and frequently executing
pc_acquire()/pc_release() pairs. And if that's not the case (for example
long blocking, or just rare occasional pc_acquire()/pc_release()), then
the reader thread has to use pc_release_strong().

I hope that things are more clear now. I'm ready to provide more
comments.

Dmitriy V'jukov

Dmitriy V'jukov

Feb 8, 2008, 7:41:52 AM
On Feb 8, 10:35 am, "Chris Thomasson" <cris...@comcast.net> wrote:

> Okay; you're making use of the pc_thread_data_t::is_acquired_ variable to
> make the whole per-thread process dependent on a call to pc_flush(). I think
> I see that in the pc_release() function here:
> _____________________________________________________________
> void pc_release()
> {
> pc_thread_data_t& local = pc_thread_data;
> local.recursion_count_ -= 1;
> if (0 == local.recursion_count_)
> {
> if ((pc_master.current_collector_copy_)
> != local.collector_index_)
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> {
>
> /* this path is only taken after a call to pc_flush() right? */
>
> if (local.local_defer_list_size_)
> pc_flush_impl();
> pc_release_impl(local.collector_index_, pc_counter_inc);
> local.is_acquired_ = 0;
> }
> }
> }


If a thread periodically executes pc_acquire()/pc_release(), then it
need *not* execute pc_flush() at all.

If ((0 == local.recursion_count_) &&
(pc_master.current_collector_copy_ == local.collector_index_)) (i.e.
the current collector is *not* shifted), then pc_release() doesn't
actually release the collector. Thus the next pc_acquire() will not
actually acquire the collector, because it stays acquired.

So on the fast-path the reader only checks that the once-acquired
collector remains current: (pc_master.current_collector_copy_ ==
local.collector_index_)

pc_master.current_collector_copy_ is situated on an exclusive cache
line, so it's disturbed only by an actual collector shift; otherwise it
stays cached in all caches.
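
To make the fast-path concrete, here is a sketch (mine, not part of the
posted code) of what a pc_acquire()/pc_release() pair reduces to while
the collector has not shifted:

void reader_fast_path()
{
    // pc_acquire(): recursion_count_ goes 0 -> 1; is_acquired_ is
    // still 1, so pc_acquire_impl() (the atomic op) is skipped
    pc_acquire();

    // ... read the protected data structure ...

    // pc_release(): recursion_count_ goes 1 -> 0;
    // current_collector_copy_ still equals collector_index_, so the
    // collector stays acquired - no atomic RMW, no #StoreLoad
    pc_release();
}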

Dmitriy V'jukov

Dmitriy V'jukov

Feb 8, 2008, 7:57:13 AM
On Feb 5, 5:45 am, "Chris Thomasson" <cris...@comcast.net> wrote:

> > Now I'm thinking about solutions/alternatives for plain reference
> > counting (like boost::shared_ptr). This is the only thing which I
> > can't made "atomic-free on fast-path" yet :)

> >http://groups.google.com/group/comp.programming.threads/browse_frm/th...
>
> Humm...

It seems that I've got something to go on. It will be 100% scalable
and 100% atomic-free. I will call it '30mm machine-GC' because of its
ability to completely blow all other silly stuff out of the water :)

Dmitriy V'jukov

Chris Thomasson

Feb 8, 2008, 9:22:43 PM
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:c1edf311-c3f1-4bc0...@e23g2000prf.googlegroups.com...

Is it something like this:
_________________________________________________________________
#define OBJECT_FLAG_DTOR() 0x1

struct object {
    object* next;
    object* prev;
    int flags;
    int rc;
};

struct rc_req {
    rc_req* next;
    object* obj;
};


struct per_thread {
    per_thread* next;
    per_thread* prev;
    rc_req* rc_cache;
    vz_spsc_queue rc_inc;
    vz_spsc_queue rc_dec;
};


struct collector {
    per_thread* head;
    per_thread* tail;
    object* dtor_head;
    object* dtor_tail;
    object* dtor_final;
    rc_req* rc_inc;
    rc_req* rc_dec;
};


void object_inc(object* const obj) {
    per_thread* const self = pthread_getspecific(...);
    rc_req* const rc = per_thread_get_rc(self, obj);
    vz_spsc_queue_push_nosig(&self->rc_inc, rc);
}


void object_dec(object* const obj) {
    per_thread* const self = pthread_getspecific(...);
    rc_req* const rc = per_thread_get_rc(self, obj);
    vz_spsc_queue_push_nosig(&self->rc_dec, rc);
}


void* collector_thread(void* state) {
    collector* const _this = state;
    rc_req* rc;
    object* dtor;
    for (;;) {
        wait_for_gc_interval(...);
        for_each_thread_in_collector(...) {
            /* grab increments */
            rc = vz_spsc_queue_pop(&thread->rc_inc);
            while (rc) {
                rc->next = _this->rc_inc;
                _this->rc_inc = rc;
                rc = vz_spsc_queue_pop(&thread->rc_inc);
            }

            /* grab decrements */
            rc = vz_spsc_queue_pop(&thread->rc_dec);
            while (rc) {
                rc->next = _this->rc_inc;
                _this->rc_inc = rc;
                rc = vz_spsc_queue_pop(&thread->rc_dec);
            }
        }

        /* process increments first */
        rc = _this->rc_inc;
        while (rc) {
            _this->rc_inc = rc->next;
            ++rc->obj->rc;
            if (rc->obj->rc &&
                    rc->obj->flags & OBJECT_FLAG_DTOR()) {
                rc->obj->flags &= ~OBJECT_FLAG_DTOR();
                collector_object_dtor_pop(_this, rc->obj);
            }
            rc = _this->rc_inc;
        }

        dtor = _this->dtor_final;
        _this->dtor_final = 0;

        /* process decrements last */
        rc = _this->rc_dec;
        while (rc) {
            _this->rc_dec = rc->next;
            --rc->obj->rc;
            if (rc->obj->rc &&
                    rc->obj->flags & OBJECT_FLAG_DTOR()) {
                rc->obj->flags &= ~OBJECT_FLAG_DTOR();
                collector_object_dtor_pop(_this, rc->obj);
            } else if (! rc->obj->rc &&
                    ! (rc->obj->flags & OBJECT_FLAG_DTOR())) {
                rc->obj->flags |= OBJECT_FLAG_DTOR();
                collector_object_dtor_push(_this, rc->obj);
            } else if (! rc->obj->rc) {
                collector_object_dtor_pop(_this, rc->obj);
                rc->obj->next = _this->dtor_final;
                _this->dtor_final = rc->obj;
            }
            rc = _this->rc_dec;
        }

        /* destroy! */
        while (dtor) {
            object* const next = dtor->next;
            free(dtor);
            dtor = next;
        }
    }
}
_________________________________________________________________

This is a very quick, crude and dirty example of distributed deferred
reference counting. It requires no membars, no atomic ops, and it's
scalable. I think Joe first mentioned it here:

http://groups.google.com/group/comp.programming.threads/msg/cbf6c48157db9202


How far off am I?


;^)

Chris Thomasson

Feb 8, 2008, 9:32:07 PM
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:2b79205a-a2b0-4348...@m34g2000hsb.googlegroups.com...

> On Feb 8, 10:35 am, "Chris Thomasson" <cris...@comcast.net> wrote:
>
>> > The reason I think this is because every non-recursive call to
>> > pc_acquire() executes an atomic-op that has to be followed by a
>> > #StoreLoad
>> > right?
>
> NO. NO. NO.
> Non-recursive call to pc_acquire() executes an atomic-op that has to
> be followed by a #StoreLoad ONLY if current collector is shifted since
> last non-recursive call to pc_acquire().
> It's main point!

Yeah, I noticed that here:

http://groups.google.com/group/comp.programming.threads/msg/72928e733a151d4e
(first paragraph in my response.)

It's governed by the pc_thread_data_t::is_acquired_ variable, which is reset
in pc_flush(), and pc_defer(...) when the thresholds are exceeded.

I have another important question. I will post it in the form of example
code, and that will be later on today. First, I need to make sure I am
understanding this more clearly.

[...]

Chris Thomasson

Feb 8, 2008, 9:35:34 PM
"Chris Thomasson" <cri...@comcast.net> wrote in message
news:6e6dnc7k3OQZkTDa...@comcast.com...

> "Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
[...]

> void* collector_thread(void* state) {
> collector* const _this = state;
> rc_req* rc;
> object* dtor;
> for (;;) {
> wait_for_gc_interval(...);
> for_each_thread_in_collector(...) {
> /* grab increments */
> rc = vz_spsc_queue_pop(&thread->rc_inc);
> while (rc) {
> rc->next = _this->rc_inc;
> _this->rc_inc = rc;
> rc = vz_spsc_queue_pop(&thread->rc_inc);
> }
>
> /* grab decrements */
> rc = vz_spsc_queue_pop(&thread->rc_dec);
> while (rc) {
> rc->next = _this->rc_inc;
> _this->rc_inc = rc;
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Of course it would help if I actually pushed the decrement request onto the
decrement list! Correction:

rc->next = _this->rc_dec;
_this->rc_dec = rc;

ARGH!!!

> rc = vz_spsc_queue_pop(&thread->rc_dec);
> }
> }
[...]


I sketched out that code in about 2 minutes. Sorry for that!

Chris Thomasson

Feb 9, 2008, 3:01:16 AM
"Chris Thomasson" <cri...@comcast.net> wrote in message
news:ms2dnc5PbN0lkzDa...@comcast.com...

> "Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
> news:2b79205a-a2b0-4348...@m34g2000hsb.googlegroups.com...
>> On Feb 8, 10:35 am, "Chris Thomasson" <cris...@comcast.net> wrote:
>>
>>> > The reason I think this is because every non-recursive call to
>>> > pc_acquire() executes an atomic-op that has to be followed by a
>>> > #StoreLoad
>>> > right?
>>
>> NO. NO. NO.
>> Non-recursive call to pc_acquire() executes an atomic-op that has to
>> be followed by a #StoreLoad ONLY if current collector is shifted since
>> last non-recursive call to pc_acquire().
>> It's main point!
>
> Yeah, I noticed that here:
>
> http://groups.google.com/group/comp.programming.threads/msg/72928e733a151d4e
> (first paragraph in my response.)
>
> It's governed by the pc_thread_data_t::is_acquired_ variable which is
> reset in pc_flush(), and pc_defer(...) when the thresholds are exceeded.

Also, it's governed by the swap in collector indexes. The pc_release()
function detects that and acts accordingly...


>
> I have another important question. I will post it in the form of example
> code, and that will be later on today. First, I need to make sure I am
> understanding this more clearly.
>
> [...]

My question was answered by reading your source code precisely. Good job.

Dmitriy V'jukov

Feb 9, 2008, 6:16:41 AM
On Feb 9, 05:32, "Chris Thomasson" <cris...@comcast.net> wrote:
> "Dmitriy V'jukov" <dvyu...@gmail.com> wrote in message

>
> news:2b79205a-a2b0-4348...@m34g2000hsb.googlegroups.com...
>
> > On Feb 8, 10:35 am, "Chris Thomasson" <cris...@comcast.net> wrote:
>
> >> > The reason I think this is because every non-recursive call to
> >> > pc_acquire() executes an atomic-op that has to be followed by a
> >> > #StoreLoad
> >> > right?
>
> > NO. NO. NO.
> > Non-recursive call to pc_acquire() executes an atomic-op that has to
> > be followed by a #StoreLoad ONLY if current collector is shifted since
> > last non-recursive call to pc_acquire().
> > It's main point!
>
> Yeah, I noticed that here:
>
> http://groups.google.com/group/comp.programming.threads/msg/72928e733...

> (first paragraph in my response.)

'the whole per-thread process dependent on a call to pc_flush()'
No. Maybe we just don't understand each other...
A reader thread need *not* execute pc_flush() at all.

> It's governed by the pc_thread_data_t::is_acquired_ variable which is reset
> in pc_flush(), and pc_defer(...) when the thresholds are exceeded.

pc_defer()? pc_defer() has nothing to do with
pc_thread_data_t::is_acquired_.
pc_thread_data_t::is_acquired_ is reset in pc_flush() and sometimes
in pc_release().

Dmitriy V'jukov

Dmitriy V'jukov

Feb 9, 2008, 6:19:28 AM
On Feb 9, 11:01, "Chris Thomasson" <cris...@comcast.net> wrote:

> > It's governed by the pc_thread_data_t::is_acquired_ variable which is
> > reset in pc_flush(), and pc_defer(...) when the thresholds are exceeded.
>
> Also, its governed by the swap in collector indexes. The pc_release()
> function detects that and acts accordingly...

Yes.

Dmitriy V'jukov

Chris Thomasson

Feb 9, 2008, 7:17:55 AM
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:fde09d03-8b41-42e5...@y5g2000hsf.googlegroups.com...

Whoops! I saw the call to pc_flush_impl() in pc_defer() and thought that
it resets pc_thread_data_t::is_acquired_, when in fact pc_flush() and
pc_release() do it... NOT pc_flush_impl()! Sorry about that...

CRAP!

Anyway, this algorithm is looking good. I think it's about time that I
implement this from scratch on my end and run it. That way, I will not
make any more false statements about your stuff.

Chris Thomasson

Feb 9, 2008, 7:25:12 AM
The last post screwed the quoted text to a point of no return! Here is my
response:

> Whoops! I saw the call to pc_flush_impl() in pc_defer() and thought that
> it resets pc_thread_data_t::is_acquired_, when in fact pc_flush() and
> pc_release() do it... NOT pc_flush_impl()! Sorry about that...
>
> CRAP!
>
> Anyway, this is algorithm is looking good. I think its about time that I
> implement this from scratch on my end, and run it. That way, I will not
> make anymore false statements about your stuff.

Sorry about that nonsense.

Chris Thomasson

Feb 9, 2008, 3:57:30 PM
"Chris Thomasson" <cri...@comcast.net> wrote in message
news:6e6dnc7k3OQZkTDa...@comcast.com...

> "Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
> news:c1edf311-c3f1-4bc0...@e23g2000prf.googlegroups.com...
>> On Feb 5, 5:45 am, "Chris Thomasson" <cris...@comcast.net> wrote:
>>
>>> > Now I'm thinking about solutions/alternatives for plain reference
>>> > counting (like boost::shared_ptr). This is the only thing which I
>>> > can't made "atomic-free on fast-path" yet :)
>>> >http://groups.google.com/group/comp.programming.threads/browse_frm/th...
>>>
>>> Humm...
>>
>> It seems that I've got something to go on. It will be 100% scalable
>> and 100% atomic-free. I will call it '30mm machine-GC' because of its
>> ability to completely blow all other silly stuff out of the water :)
>
> Is it something like this:
> _________________________________________________________________
[...]

Here is a cleaner version that does not try to erroneously support
strong-thread safety:

http://groups.google.com/group/comp.programming.threads/msg/8b7b22c32cf3a7d9


> _________________________________________________________________
[...]

Chris Thomasson

Feb 10, 2008, 4:05:59 AM
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:0062e70d-b1fb-4602...@d70g2000hsb.googlegroups.com...

Right. FWIW, here is an example of amortized read access using pc_sample.c:


___________________________________________________
static pc_t volatile g_pc;


void reader() {
    int loops;
    int const epoch_interval = 1000;
    pc_deferred_t* epoch = pc_inc(&g_pc);
    for (loops = 1 ;; ++loops) {
        if (! search_request_trywait(...)) {
            pc_dec(epoch);
            search_request_wait(...);
            epoch = pc_inc(&g_pc);
        }
        do_extensive_search(...);
        if (! (loops % epoch_interval)) {
            if (g_pc.pc_anchor != epoch) {
                pc_dec(epoch);
                epoch = pc_inc(&g_pc);
            }
        }
    }
    pc_dec(epoch);
}
___________________________________________________

This works with the posted pc_sample.c code as-is. I am currently
tweaking this thing. It's looking pretty good so far.

Dmitriy V'jukov

Feb 10, 2008, 5:52:40 AM
On Feb 9, 05:22, "Chris Thomasson" <cris...@comcast.net> wrote:

> > It seems that I've got something to go on. It will be 100% scalable
> > and 100% atomic-free. I will call it '30mm machine-GC' because of its
> > ability to completely blow all other silly stuff out of the water :)
>
> Is it something like this:

> How far off am I?
> ;^)

Nice *try*, Chris ;)
I will answer in the 'Alternatives to reference counting?' thread.

Dmitriy V'jukov

Dmitriy V'jukov

unread,
Feb 11, 2008, 6:25:42 AM2/11/08
to
On Feb 3, 5:12 am, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
> Proxy-collector with atomic-free/membar-free fast-path for acquire/
> release.
>
> Chris Thomasson's original proxy-collector is algorithm basis:http://home.comcast.net/~vzoom/demos/pc_sample.c

>
> Instead of collector per user object I use fixed collector count. So
> every collector holds multiple user objects. In code below collector
> count = 4.
>
> Also I mix in my acquire/release amortization trick:http://groups.google.ru/group/comp.programming.threads/msg/91d01886ae...
> It allows me to achieve atomic-free/membar-free fast-path for acquire/
> release.
>
> And finally I add some amortizations like local per thread cache for
> deferred objects.
>
> So it must kick ass of any other proxy-collector out there :)
> Strictly saying, it's no more pure proxy-collector, it's something
> like PC+RCU+SMR :)

I've noticed that this proxy-collector uses only single-word atomic
RMW operations (_InterlockedAnd, _InterlockedExchangeAdd,
_InterlockedExchange, _InterlockedIncrement) and at the same time
allows an unlimited number of references, as opposed to other proxy-
collectors, which use double-word atomic RMW operations or allow only
a small finite number of references.
It's possible because of the small fixed number of collectors.
And it's totally wait-free.
Cool!
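
To make this concrete, the whole scheme fits in one 32-bit master word
(my annotation of the constants from the original post, not the
author's text):

// bits  0..1  : current_collector_   (pc_collector_bits = 2)
// bits  2..5  : collector_busy_mask_ (pc_collector_count = 4)
// bits  6..31 : outer_counter_       (26 bits - hence the 2^26 figure
//                                     mentioned elsewhere in the thread)
//
// A single _InterlockedExchangeAdd(&pc_master.union_.whole_,
// pc_counter_inc) therefore both reads the current collector index
// and bumps that collector's outer reference count in one
// single-word atomic op.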

Dmitriy V'jukov

Chris Thomasson

Feb 11, 2008, 8:52:36 PM
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:f00ca005-2d7e-4fab...@n20g2000hsh.googlegroups.com...

You're right. Cool indeed!

Chris Thomasson

Feb 13, 2008, 3:51:56 AM

"Chris Thomasson" <cri...@comcast.net> wrote in message
news:d7ydna1R9vZzZC3a...@comcast.com...

Humm, let me play simplistic devil's advocate for just a brief moment:
What about a dumb app that has an absolutely ridiculous shi%load of
threads executing pc_acquire concurrently, all bumping the master (e.g.,
outer) count until it does something "bad"... This scenario is basically
stupid, and if it ever happens you can tell the author to buy a book and
learn the <errno.h> of his ways...

;^)

Chris Thomasson

Feb 13, 2008, 3:55:57 AM

"Chris Thomasson" <cri...@comcast.net> wrote in message
news:qe6dnQ6E2M7lDzfa...@comcast.com...

Well, okay, you can always just provide a "global" dtor function to the
"main" pc_init() call... It should heavily decrease dtor granularity (e.g.,
per-object level), but you don't have to store a func-ptr per-pc_node_t...

;^)

Dmitriy V'jukov

Feb 14, 2008, 10:57:04 AM
On Feb 13, 11:55 am, "Chris Thomasson" <cris...@comcast.net> wrote:

> >> Agree. It's reasonable.
> >> It's like in your pc_sample - I remember.
> >> But I think that state_ can be omitted in pc_node_t. Because user can
> >> always convert his object to pc_node_t and vice versa.
> > [...]
>
> > Right. But that means that it has to be the first member, or the offsetof
> > macro needs to be used.
>
> Well, okay, you can always just provide a "global" dtor function to the
> "main" pc_init() call... It should heavily decrease dtor granularity (e.g.,
> per-object level), but you don't have to store a func-ptr per-pc_node_t...
>
> ;^)

The dtor function can be global, and the user can provide a 4-bit
(16 choices) 'node type' for every node:

struct pc_node_t
{
    unsigned next_pointer_ : 28;
    unsigned node_type_ : 4;
};

So the dtor function will be something like this:

void my_global_dtor(pc_node_t* node, unsigned node_type)
{
    switch (node_type)
    {
    case 0:
        {
            my_node_type0* n = static_cast<my_node_type0*>(node);
            delete n;
            break;
        }
    case 1:
        {
            my_node_type1* n = (my_node_type1*)
                ((char*)node - offsetof(my_node_type1, pc_node_));
            return_to_pool(n);
            break;
        }
    case ...
        ...
    }
}

Dmitriy V'jukov

Dmitriy V'jukov

Feb 14, 2008, 11:05:30 AM
On Feb 13, 11:51 am, "Chris Thomasson" <cris...@comcast.net> wrote:

> Humm, let me play simplistic devils advocate for just a brief moment: What
> about dumb app that has absolutely a ridiculous shi%load number of threads
> executing pc_acquire concurrently and all bumping the master (e.g., outer)
> count until it does something "bad"... This scenario is basically stupid,
> and if it ever happens you can tell the author to buy a book and learn the
> <errno.h> of his ways...

Do you mean overflow of the outer counter?
Every thread can increase the outer counter by a *maximum* of 1. So you
must have 2^26 threads. Are you talking about such a situation?
Whatever. This algorithm will handle such a situation. Overflow of the
outer counter will *not* break the algorithm:

http://groups.google.com/group/comp.programming.threads/msg/a79b9b68b9771e0a
http://groups.google.com/group/comp.programming.threads/msg/2eb0ff35d50e696e

Dmitriy V'jukov

Alexander Shuvaev

Feb 15, 2008, 8:53:01 AM

I think it's cool. I spent some time analyzing this code, but the
reading was not boring :).
But some features of the algorithm are not clear to me:
For what reason is the back_link_ bit used? And why is the additional
copy_whole_ member incremented and tested in order to detect a
collector switch?

I see only one disadvantage for very wide usage of the algorithm. There
is a need for a pc_flush call before executing long operations, not
only so as not to prevent memory reclamation but also to guarantee the
lock freedom of the algorithm (in case of very long reads all collectors
can be busy). There is no need for such an operation in SMR, for
example. At the same time I understand the disadvantages/overheads of
SMR as compared to this implementation. On the other hand the
proxy-collector suits perfectly when implementing non-blocking
algorithms. Cool.


Alexander Shuvaev

Feb 15, 2008, 9:17:28 AM
On Feb 6, 14:00, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
> On Feb 3, 5:12 am, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
>
> > Proxy-collector with atomic-free/membar-free fast-path for acquire/
> > release.
>
> I've found bug in the algorithm.
> pc_flush_impl() enqueues deferred nodes to collector pointed by
> pc_thread_data.collector_index_. It's wrong.
>
> Sequence which will crash algorithm:
> Current collector 0
> Thread 0 acquires collector 0
> Current collector shifted. Current collector 1
> Thread 1 acquires collector 1
> Thread 0 executes pc_defer(Node). Node is enqueued to collector 0
> (pointed by pc_thread_data.collector_index_ of thread 0)
> Thread 0 releases collector 0
> Because collector 0 is not current all nodes from collector 0 are
> deleted. Including the Node.
> Thread 1 still can access the Node. Crash!
>
> To fix it I have to enqueue deferred nodes not to collector pointed by
> pc_thread_data.collector_index_, but to the current collector:
>
> > void pc_flush_impl()
> > {
> >         // transfer local defer list to collector
> >         pc_thread_data_t& local = pc_thread_data;
> >         pc_collector_t& collector = pc_collectors[local.collector_index_];
>
> Replace last line with:
>
>         // #StoreLoad to order node removal from collection
>         // and load of current collector
>         // #StoreLoad is not needed if node removal
>         // issues full memory barrier
>         pc_collector_t& collector =
>             pc_collectors[pc_master.union_.current_collector_];
>
> Dmitriy V'jukov

After your correction the implementation is correct only if an object
being deferred can't be acquired. Does the algorithm have such a
restriction? Otherwise a switch and an acquire may be performed by
another thread after the collector reference has been obtained.

Alexander Shuvaev

Feb 15, 2008, 9:20:42 AM

Alexander Shuvaev

Feb 15, 2008, 9:25:33 AM

Alexander Shuvaev

Feb 15, 2008, 9:27:33 AM
On Feb 15, 17:25, Alexander Shuvaev <alex.shuv...@gmail.com> wrote:

> On Feb 6, 14:00, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
>
>
>
> > On Feb 3, 5:12 am, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
>
> > > Proxy-collector with atomic-free/membar-free fast-path for acquire/
> > > release.
>
> > I've found bug in the algorithm.
> > pc_flush_impl() enqueues deferred nodes to collector pointed by
> > pc_thread_data.collector_index_. It's wrong.
>
> > Sequence which will crash algorithm:
> > Current collector 0
> > Thread 0 acquires collector 0
> > Current collector shifted. Current collector 1
> > Thread 1 acquires collector 1
> > Thread 0 executes pc_defer(Node). Node is enqueued to collector 0
> > (pointed by pc_thread_data.collector_index_ of thread 0)
> > Thread 0 releases collector 0
> > Because collector 0 is not current all nodes from collector 0 are
> > deleted. Including the Node.
> > Thread 1 still can access the Node. Crash!
>
> > To fix it I have to enqueue deferred nodes not to collector pointed by
> > pc_thread_data.collector_index_, but to the current collector:
>
> > > void pc_flush_impl()
> > > {
> > > š š š š // transfer local defer list to collector
> > > š š š š pc_thread_data_t& local = pc_thread_data;
> > > š š š š pc_collector_t& collector = pc_collectors[local.collector_index_];
>
> > Replace last line with:
>
> > š š š š // #StoreLoad to order node removal from collection
> > š š š š // and load of current collector
> > š š š š // #StoreLoad is not needed if node removal
> > š š š š // issues full memory barrier
> > š š š š pc_collector_t& collector =
> > š š š š š š pc_collectors[pc_master.union_.current_collector_];

>
> > Dmitriy V'jukov
>
> After your correction the implementation is correct only if an object
> being deferred can't be acquired. Has the algorithm such restriction?
> Otherwise switching and acquiring may be produced by the other thread
> after the collector reference is received.
I am spammer :). It's bug.

Alexander Shuvaev

unread,
Feb 15, 2008, 9:37:52 AM2/15/08
to
On Feb 14, 19:05, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
> On Feb 13, 11:51 am, "Chris Thomasson" <cris...@comcast.net> wrote:
>
> > Humm, let me play simplistic devils advocate for just a brief moment: What
> > about dumb app that has absolutely a ridiculous shi%load number of threads
> > executing pc_acquire concurrently and all bumping the master (e.g., outer)
> > count until it does something "bad"... This scenario is basically stupid,
> > and if it ever happens you can tell the author to buy a book and learn the
> > <errno.h> of his ways...
>
> Do you mean overflow of the outer counter?
> Every thread can increase the outer counter by at most 1, so you must
> have 2^26 threads. Are you talking about such a situation?
> Whatever. The algorithm will handle it. Overflow of the outer
> counter will *not* break the algorithm:
>
> http://groups.google.com/group/comp.programming.threads/msg/a79b9b68b...
> http://groups.google.com/group/comp.programming.threads/msg/2eb0ff35d...
>
> Dmitriy V'jukov

I think you're wrong. The overflow of the outer counter will not break
the algorithm, but an overflow of readers (threads) will.

Dmitriy V'jukov

unread,
Feb 15, 2008, 10:28:18 AM2/15/08
to
On Feb 15, 5:17 pm, Alexander Shuvaev <alex.shuv...@gmail.com> wrote:

> [... re-quote of the Feb 6 bug fix snipped; see the previous messages ...]
>
> After your correction the implementation is correct only if an object
> being deferred can't be acquired. Does the algorithm have such a
> restriction? Otherwise switching and acquiring may be performed by
> another thread after the collector reference has been obtained.

I didn't get you. Please clarify your point.
Btw, objects themselves can't be acquired. Only a collector can be
acquired. A collector holds an arbitrary number of user objects.

Dmitriy V'jukov

Dmitriy V'jukov

unread,
Feb 15, 2008, 10:38:06 AM2/15/08
to
On Feb 15, 5:37 pm, Alexander Shuvaev <alex.shuv...@gmail.com> wrote:

> > [... re-quote of the outer-counter overflow discussion snipped; see
> > the previous message ...]
>
> I think you're wrong. The overflow of the outer counter will not break
> the algorithm, but an overflow of readers (threads) will.

Hmmm... Yes... You are perfectly right. You beat me again :)
If there are more than 2^26 live references to a collector (at the
moment the collector becomes non-current and the back-link is dropped),
the inner counter will overflow and all its user objects will be freed
prematurely.
Note: here a reference is effectively == a thread, i.e. one must have
more than 2^26 threads.
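
To make the 2^26 figure concrete, here is the bit budget implied by the
constants in the original post (pc_max_live_refs is a name introduced
here just for illustration):

// bits left for the outer/inner counters after the collector
// index and the busy mask: 32 - 4 - 2 = 26
unsigned const pc_counter_bits =
    pc_word_bits - pc_collector_count - pc_collector_bits; // 26

// holding more simultaneous references to one collector than this
// (one per thread on the fast-path) wraps the inner counter and
// drains the collector's deferred nodes prematurely
unsigned const pc_max_live_refs =
    (1u << pc_counter_bits) - 1; // 2^26 - 1, about 67 million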

Dmitriy V'jukov

Dmitriy V'jukov

unread,
Feb 15, 2008, 10:53:22 AM2/15/08
to
On Feb 15, 4:53 pm, Alexander Shuvaev <alex.shuv...@gmail.com> wrote:

> I think it's cool. I spent some time analyzing this code, but the
> reading was not boring :).
> But some features of the algorithm are not clear to me:

> Why is the additional copy_whole_ member incremented and tested
> in order to detect collector switching?

OK, I'll begin with the easy question :)
copy_whole_ is needed solely as an optimization;
pc_master.union_.current_collector_ could be used instead.
On the fast-path in pc_release() I have to check whether the previously
acquired collector is still current:
(pc_master.current_collector_copy_ != local.collector_index_)
Every real acquire of a collector and every drain of a collector
modifies the cache-line where pc_master.union_.current_collector_
resides. I don't want these modifications to affect a reader who just
wants to get the current collector number, so I put a copy in a
separate cache-line.
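
A sketch of how that test might sit in pc_release(); only the quoted
comparison comes from the thread, the surrounding bookkeeping is an
assumption based on the amortization trick described in the original
post:

void pc_release()
{
    pc_thread_data_t& local = pc_thread_data;
    if (--local.recursion_count_)
        return; // nested acquire/release pair: nothing to do
    if (pc_master.current_collector_copy_ != local.collector_index_)
    {
        // the collector has shifted: drop the real reference so the
        // old collector can be drained
        local.is_acquired_ = 0;
        pc_release_impl(local.collector_index_, pc_counter_inc);
    }
    // otherwise keep the reference cached (plain loads/stores only);
    // the next pc_acquire() can reuse it without any atomic RMW
}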

Dmitriy V'jukov

Dmitriy V'jukov

unread,
Feb 15, 2008, 11:25:02 AM2/15/08
to
On Feb 15, 4:53 pm, Alexander Shuvaev <alex.shuv...@gmail.com> wrote:

> I think it's cool. I spent some time analyzing this code, but the
> reading was not boring :).
> But some features of the algorithm are not clear to me:
> For what reason is the back_link_ bit used?

That's a more difficult question.
A proxy-collector can be used for lock-free read-only traversals over
big collections. The reader doesn't have to acquire/release a reference
for every node; he just has to acquire a reference to the current
collector before the traversal, and release it after the traversal.
So the usage pattern for the proxy-collector is:

void writer_thread()
{
    for (;;)
    {
        pc_acquire();
        pc_node_t* node = remove_some_node_from_collection();
        pc_defer(node);
        pc_release();
        sleep_for_a_while();
    }
}

void reader_thread()
{
    for (;;)
    {
        pc_acquire();
        pc_node_t* node = head;
        while (node)
        {
            process_node(node);
            node = node->next;
        }
        pc_release();
    }
}

So why is back_link_ needed?
Example:
0. Collector 0 is current
1. Reader acquires collector 0
2. Current collector is shifted. Collector 1 is current.
   Collector 0 is not drained because the reader still holds a
   reference to it.
3. Writer defers some node to collector 1
4. Current collector is shifted. Collector 2 is current.
   At this point, without back-links, collector 1 would be drained,
   because there are no outstanding references to it and it is not
   current. But the reader can still be working with the node that was
   deferred to collector 1. Oops! Page fault!
   With back-links, collector 1 is not drained, because collector 0
   holds a reference (back-link) to it.

So back-links ensure FIFO order of draining of collectors. I.e.
collector 1 can *not* be drained until collector 0 has been drained.
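
This also suggests why is_current_ and back_link_ share one word with
the inner counter: a collector may be drained exactly when that packed
word drops to zero (not current, back-link released, no reader
references). A minimal sketch of the idea; pc_drain() is a hypothetical
helper, and the cascade into the next collector is an assumption about
the unquoted slow path:

void pc_release_impl(unsigned const index, unsigned count)
{
    pc_collector_t& collector = pc_collectors[index];
    long const prev =
        _InterlockedExchangeAdd(&collector.whole_counter_, -(long)count);
    if (prev == (long)count) // whole word is now 0: safe to drain
    {
        pc_drain(collector); // delete all deferred nodes (hypothetical)
        // release the back-link held on the next collector; if that
        // zeroes its word too, draining cascades in FIFO order
        pc_release_impl((index + 1) % pc_collector_count,
                        pc_back_link_inc);
    }
}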

Dmitriy V'jukov

Dmitriy V'jukov

unread,
Feb 15, 2008, 11:38:53 AM2/15/08
to
On Feb 15, 4:53 pm, Alexander Shuvaev <alex.shuv...@gmail.com> wrote:

> I think it's cool. I spent some time analyzing this code, but the
> reading was not boring :).
> But some features of the algorithm are not clear to me:
> For what reason is the back_link_ bit used? Why is the additional
> copy_whole_ member incremented and tested in order to detect
> collector switching?
>
> I see only one disadvantage for very wide usage of the algorithm.
> pc_flush must be called before executing long operations, not only to
> avoid delaying memory reclamation but also to preserve the algorithm's
> lock-freedom (with very long readings, all collectors can be busy).

Every collector can hold an arbitrary number of user objects (nodes)!
So a collector can *not* be busy; I can always put a deferred user
object into the current collector.
The only problem here is if *all* system memory is exhausted.

> There is no need for such an operation in SMR, for example. At the
> same time, I understand the disadvantages/overheads of SMR compared to
> this implementation. On the other hand, the proxy-collector suits
> perfectly when implementing non-blocking algorithms. Cool.

With the proxy-collector the reader thread doesn't have to perform any
activity on a per-node basis. See the usage example:
http://groups.google.com/group/comp.programming.threads/msg/c3fa2ec8f80058fd
The reader thread just uses plain pointer loads of the 'next' node
while traversing a big linked data structure. It's a killer advantage.
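
For contrast, here is roughly what the same traversal costs under an
SMR/hazard-pointer scheme. This is only a simplified sketch:
hazard_slot and validate_link() are hypothetical, and real SMR needs
more careful re-validation than shown:

    pc_node_t* node = head;
    while (node)
    {
        hazard_slot = node; // per-node store to publish the hazard
        _mm_mfence();       // per-node #StoreLoad
        if (!validate_link(node)) // node may already be unlinked
        {
            node = head;    // restart the traversal
            continue;
        }
        process_node(node);
        node = node->next;
    }

Under the proxy-collector the whole loop sits inside a single
pc_acquire()/pc_release() pair, and every hop is a plain load.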

Dmitriy V'jukov

Alexander Shuvaev

unread,
Feb 18, 2008, 8:50:06 AM2/18/08
to

Everything is clear, thanks.
I remember that I used a similar method (like the back-link) in my old
lock-free queue implementation to get the same guarantee.


Alexander Shuvaev

unread,
Feb 18, 2008, 9:13:40 AM2/18/08
to
On Feb 15, 18:28, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
>
> I didn't get you. Please clarify your point.
> > Btw, objects themselves can't be acquired. Only a collector can be
> > acquired. A collector holds an arbitrary number of user objects.
>
> Dmitriy V'jukov

By now I have no questions, because the usage pattern is something like
this:

some_t global_ptr; // shared data, globally visible

void writer_thread()
{
    for (;;)
    {
        pc_acquire();
        ...
        global_ptr = new_global_ptr; // for a single writer
        _mm_mfence();
        pc_defer(old_global_ptr); // before the defer operation the old
            // value of global_ptr *must* no longer be visible, so that
            // it cannot be acquired
        pc_release();
    }
}

Alexander Shuvaev

unread,
Feb 18, 2008, 9:27:20 AM2/18/08
to
On Feb 15, 19:38, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:

>
> Every collector can hold an arbitrary number of user objects (nodes)!
> So a collector can *not* be busy; I can always put a deferred user
> object into the current collector.
> The only problem here is if *all* system memory is exhausted.
>

Ok, I see.

>
> With the proxy-collector the reader thread doesn't have to perform any
> activity on a per-node basis. See the usage example:
> http://groups.google.com/group/comp.programming.threads/msg/c3fa2ec8f...
> The reader thread just uses plain pointer loads of the 'next' node
> while traversing a big linked data structure. It's a killer advantage.
>

I think so!

Dmitriy V'jukov

unread,
Feb 18, 2008, 1:10:37 PM2/18/08
to
On Feb 18, 5:13 pm, Alexander Shuvaev <alex.shuv...@gmail.com> wrote:
>
> > I didn't get you. Please clarify your point.
> > Btw, objects themselves can't be acquired. Only a collector can be
> > acquired. A collector holds an arbitrary number of user objects.
>
> By now I have no questions, because the usage pattern is something like
> this:
>
> [... usage pattern code snipped; quoted in full above ...]

The usage pattern is correct. But I'm not sure what you mean by
acquiring an object.
Actually, it is only after a collector shift that other threads can
*not* acquire the object (i.e. the collector where the deferred object
resides). I.e. other threads still *can* acquire a reference to the
object (i.e. the collector where the deferred object resides) right
after pc_defer(old_global_ptr).

Dmitriy V'jukov

Dmitriy V'jukov

unread,
Feb 18, 2008, 1:15:07 PM2/18/08
to
On Feb 18, 4:50 pm, Alexander Shuvaev <alex.shuv...@gmail.com> wrote:
>
> > So why is back_link_ needed?
> > [... example snipped; quoted in full above ...]
> > So back-links ensure FIFO order of draining of collectors. I.e.
> > collector 1 can *not* be drained until collector 0 has been drained.
>
> Everything is clear, thanks.
> I remember that I used a similar method (like the back-link) in my old
> lock-free queue implementation to get the same guarantee.

Yeah, I remember; it was a multi-producer/multi-consumer queue with an
embedded proxy-collector-like algorithm (based on DWCAS) for node
reclamation. Right?

Dmitriy V'jukov

Chris Thomasson

unread,
Feb 18, 2008, 1:49:31 PM2/18/08
to
"Alexander Shuvaev" <alex.s...@gmail.com> wrote in message
news:fbd207dc-3696-4638...@k2g2000hse.googlegroups.com...

> On Feb 15, 18:28, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
> >
> > I didn't get you. Please clarify your point.
> > Btw, objects themselves can't be acquired. Only a collector can be
> > acquired. A collector holds an arbitrary number of user objects.
> >
> > Dmitriy V'jukov

> By now I have no questions, because the usage pattern is something like
> this:

> some_t global_ptr; // shared data, globally visible
>
> void writer_thread()
> {
>     for (;;)
>     {
>         pc_acquire();
>         ...
>         global_ptr = new_global_ptr; // for a single writer
>         _mm_mfence();

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
No! You execute the membar __before__ you set the global pointer.

>         pc_defer(old_global_ptr); // before the defer operation the old
>             // value of global_ptr *must* no longer be visible, so that
>             // it cannot be acquired
>         pc_release();
>     }
> }


Here is the corrected code:

some_t global_ptr;

void writer_thread()
{
    for (;;)
    {
        some_t local_ptr = new some_t;
        _mm_mfence();
        pc_acquire();
        some_t old_global_ptr = global_ptr;
        global_ptr = local_ptr;
        pc_defer(old_global_ptr);
        pc_release();
    }
}
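
The point of the ordering: the fence must act as a release
(#StoreStore) barrier between constructing the new object and
publishing it through global_ptr, so a reader that loads the new
pointer can never observe uninitialized fields. Placed after the store,
as in the original pattern, it comes too late to order the object's
initialization with its publication.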
