
My first Proxy Collector w/ a stack


ti_chris

Jan 23, 2008, 3:14:46 AM
Alright, here's my code for the implementation of a stack. The
results I've obtained with this code are as follows:


Stack: Stack with Mutexes = 28.8Kops/s
LFStack: Microsoft Lock-Free Stack with page-fault catch = 880Kops/s
LFPCStack: Lock-Free Proxy Collector Stack = 28Kops/s


This was implemented using pthreads-win32 with Visual Studio in debug
mode (release mode requires assembly language to stop the compiler from
re-ordering things it shouldn't). It confirms that mutexes are really
slow and that a good lock-free algo can seriously kick some ass (30
times faster). What's not much of a surprise to me is that my
implementation of the Proxy Collector came out more or less on par
with the mutex version. I know I can optimize my proxy a little and
perhaps marginally beat the mutexed version (that's my next step). I
also know that there would be some significant gains if I were to write
the PC in assembly (again: next version). A good memory allocator would
speed things up as well, but arguably every version of the stack could
use one and would gain equally from it. It may help the lock-free
versions a little more, though, since there would no longer be any
locks between frees and mallocs (so a likely gain against the mutexed
version).

In the meantime, I've attached the source code for my implementation
and my test. It sounds a little weird to me that the lock-free version
is 30 times faster than the mutexed version. BTW, my system is a
quad-core running at 3 GHz, and my test has one thread pushing elements
while 2 other threads pop elements non-stop. I use this split because
it takes longer to Pop than it does to Push, and I want to balance out
the load to maximize my CPU usage. I also know that the way I link my
deleted memory may not be safe for certain algorithms; I do believe,
however, that it's perfectly fine for a stack. In practice, I could
allocate an extra word with every node merely to use it for that
purpose.
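
Something like the following layout is what I have in mind (the field
names here are hypothetical; the point is that the proxy's deletion
list would link through the extra word, never through the Next pointer
that concurrent readers may still be following):

struct LFPCStackNode {
    void *DeferLink;     // used only by the proxy's deletion list
    LFPCStackNode *Next; // stack link; left intact after removal
    unsigned long Data;
};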

Comments are appreciated!

#include <windows.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

// externs...

// 64-bit read
void __fastcall Read64(void *volatile Src, void *Dst);

// 64-bit write
void __fastcall Write64(void *volatile Dst, void *Src);

// Atomic ++
void __fastcall Inc(void *Dst);

// Atomic --
bool __fastcall Dec(void *Dst);

// Atomic +=
unsigned long __fastcall XAdd(void *Dst, long Addend);

// Compare and exchange (memory version)
char __fastcall CAS(void *volatile Memory, long New, long Old);

// Compare and exchange (64bit version)
char __fastcall CAS2(void *volatile Memory, void *New, void *Old);

// Compare and exchange (64bit version)
char __fastcall CAS2(void *volatile Memory, long NewL, long NewH, long OldL, long OldH);

/*
** Lock-Free Proxy Collector
*/
struct ProxyObject {
long Count; // Reference count
ProxyObject *Next; // Next reference
void *List; // Deletion list

~ProxyObject()
{
void *Next;
for (; List; List = Next) {
Next = *(void **)List; // First word is the 'next'

free(List);
}
}

void Defer(void *Memory)
{
do {
*(void **)Memory = List;
} while (!CAS((long *)&List, (long)Memory, *(unsigned long *)Memory));
}
};

struct ProxyCollector {
ProxyObject *volatile Collector; // (this)
volatile long Count; // Reference count

static ProxyCollector Master;

static void __fastcall Init(void)
{
Master.Collector = new ProxyObject;
Master.Count = 0;

Master.Collector->Count = -1;
Master.Collector->List = NULL;
Master.Collector->Next = NULL;
}

static ProxyObject * __fastcall AddRef(void)
{
ProxyCollector New, Old;

memset((void *)&Old.Collector, 0, 4 * 2);

Read64((void *)&Master.Collector, (void *)&Old.Collector);

int Diff = Master.Count + Master.Collector->Count;

// Increment and get the pointer reference to the master collector
do {
New.Collector = Old.Collector;
New.Count = Old.Count + 1;
} while (!CAS2((void *)&Master.Collector, (void *)&New.Collector, (void *)&Old.Collector));

return Old.Collector;
}

static void __fastcall RemRef(ProxyObject *Collect)
{
// If we're out of elements, free/pop the collector's stack
while (Dec(&(Collect->Count))) {
ProxyObject *const Next = Collect->Next;

delete Collect;

Collect = Next;
}
}

static void Mutate(void)
{
ProxyCollector New, Old;
ProxyObject *NewObj = new ProxyObject;

NewObj->Count = 0; // It always starts with one back-link
NewObj->List = NULL;
NewObj->Next = NULL;

Read64((void *)&Master.Collector, (void *)&Old.Collector);

New.Collector = NewObj;
New.Count = 0;
Master.Collector->Next = NewObj;

// Insert the new collector
do {
} while (!CAS2((void *)&Master.Collector, (void *)&New.Collector, (void *)&Old.Collector));

// Combine the reads with the releases plus the backlink
if (XAdd((void *)&Old.Collector->Count, Old.Count + 1) == 0) {
ProxyObject *const Next = Old.Collector->Next;

delete Old.Collector;

RemRef(Next);
}
}
};

ProxyCollector ProxyCollector::Master;

pthread_mutex_t Mutex; // protects the mutex-based Stack below

/*
** Mutex-Based Stack
*/
struct Stack {
Stack() : Top(NULL) { }

struct StackNode {
StackNode *Next;
unsigned long Data;
};

void Push(register unsigned long Data)
{
pthread_mutex_lock(&Mutex);

StackNode *Node = (StackNode *)malloc(sizeof(StackNode)); // new StackNode();

Node->Data = Data;
Node->Next = (StackNode *)Top;
Top = Node;

pthread_mutex_unlock(&Mutex);
}

unsigned long Pop(void)
{
pthread_mutex_lock(&Mutex);

if (!Top) {
pthread_mutex_unlock(&Mutex);
return -1;
}

StackNode *Old = Top;
const unsigned long Data = Top->Data;

Top = Top->Next;

pthread_mutex_unlock(&Mutex);
free(Old);

return Data;
}


public:
StackNode *Top;
};

/*
** Lock-Free Stack
*/
struct LFStack {
LFStack() : Count(0), Top(NULL) { }

struct LFStackNode {
LFStackNode *Next;
unsigned long Data;
};

struct LFStackPtr {
LFStackNode *Top;
unsigned long Count;
};

void Push(register unsigned long Data)
{
LFStackNode *Node = (LFStackNode *)malloc(sizeof(LFStackNode)); // new LFStackNode();

Node->Data = Data;

do {
Node->Next = (LFStackNode *)Top;
} while (!CAS((long *)&Top, (long)Node, (long)Node->Next));
/*

__asm {
mov edi, [this] ; edi = &this
mov esi, [Node] ; esi = &Node
};
LoopPush:
__asm {
mov eax, [edi] ; eax = Top
mov [esi], eax ; Node->Next = Top;
lock cmpxchg [edi], esi
jne LoopPush
};
*/
}

unsigned long Pop(void)
{
do {
__try {
/*
register LFStackNode *OldNode;

__asm {
mov edi, [this] ; edi = &this
xor eax, eax
xor edx, edx
xor ebx, ebx
xor ecx, ecx

lock cmpxchg8b [edi] ; edx:eax = this
}
LoopPop:
__asm {
test eax, eax
je PopOut

mov ebx, [eax] ; ebx = Top->Next
lea ecx, [edx + 1] ; ebx = Top->Count + 1
lock cmpxchg8b [edi] ; ebx:ecx = Next
jne LoopPop

mov OldNode, eax ; OldNode = Top
}

const unsigned long Data = OldNode->Data;

free(OldNode);

return Data;


PopOut:
return -1;
*/

LFStackPtr Head, Next;

do {
Read64((void *volatile)&Top, &Head.Top);

if (!Head.Top)
return -1;

Next.Count = Head.Count + 1;
Next.Top = Head.Top->Next;

} while (!CAS2((void *volatile)&Top, (long)Head.Count, (long)Head.Top, (long)Next.Count, (long)Next.Top));

const unsigned long Data = Head.Top->Data;

free(Head.Top);

return Data;

} __except ( EXCEPTION_EXECUTE_HANDLER ) {
// If we got a page-fault, then we want to re-start the loop
// because the OS paged the memory out on us.
}
} while (1);
}


public:
LFStackNode *volatile Top;
volatile unsigned long Count;
};


/*
** Lock-Free Proxy Collected Stack
*/
struct LFPCStack {
LFPCStack() : Top(NULL) { }

struct LFPCStackNode {
LFPCStackNode *Next;
unsigned long Data;
};

void Push(register unsigned long Data)
{
LFPCStackNode *Node = (LFPCStackNode *)malloc(sizeof(LFPCStackNode)); // new LFPCStackNode();

Node->Data = Data;

ProxyObject *Obj = ProxyCollector::AddRef();

do {
Node->Next = (LFPCStackNode *)Top;
} while (!CAS((long *)&Top, (long)Node, (long)Node->Next));

ProxyCollector::RemRef(Obj);
}

unsigned long Pop(void)
{
LFPCStackNode *Head, *Next;
ProxyObject *Obj = ProxyCollector::AddRef();

do {
Head = Top;

if (!Head) {
ProxyCollector::RemRef(Obj);
return -1;
}

Next = Head->Next;
} while (!CAS((void *volatile)&Top, (long)Next, (long)Head));

const unsigned long Data = Head->Data;

Obj->Defer(Head);
ProxyCollector::RemRef(Obj);

return Data;
}


public:
LFPCStackNode *volatile Top;
};

LFPCStack Stack;
bool GO = true;
pthread_t Thread, Thread2, Thread3;

void *ThreadFunc(void *)
{
do {
Stack.Pop();
} while (GO);

while (Stack.Pop() != -1);

return NULL;
}

void *ThreadFunc2(void *)
{
do {
Stack.Pop();
} while (GO);

while (Stack.Pop() != -1);

return NULL;
}

void *ThreadFunc3(void *)
{
do {
Stack.Pop();
} while (GO);

while (Stack.Pop() != -1);

return NULL;
}

int main(int argc, char* argv[])
{
char c;
long long Freq;
long long Start, End;
double Frequency;

QueryPerformanceFrequency((LARGE_INTEGER *)&Freq);
Frequency = (double)Freq / 1000;

pthread_mutex_init(&Mutex, NULL);

ProxyCollector::Init();

printf("\n");

QueryPerformanceCounter((LARGE_INTEGER *)&Start);
pthread_create (&Thread, NULL, ThreadFunc, NULL);
pthread_create (&Thread3, NULL, ThreadFunc2, NULL);
// pthread_create (&Thread2, NULL, ThreadFunc3, NULL);

unsigned long Int = 0;

do {
Stack.Push(rand() % 100);

if (rand() == 0x5EAF)
ProxyCollector::Mutate();

} while (++Int != 0x10000);

GO = false;

while (Stack.Pop() != -1);

ProxyCollector::Mutate();

QueryPerformanceCounter((LARGE_INTEGER *)&End);

double Speed = (End - Start) / Frequency;
printf("Time: [%lf ms] Rate: [%fK]\n", Speed, Int*2 / Speed);

getchar();
return 0;
}

ti_chris

Jan 23, 2008, 5:33:29 PM
Some more disturbing facts and validation of the locking issue
regarding memory...

I have moved my mutex lock so that it includes the "free" (ie: one
line down from its current location in the Pop function) and I got an
8.8 times speed improvement.
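
If it's not clear, the change amounts to swapping the last two
statements of Stack::Pop so the free happens inside the critical
section; a sketch of the changed tail of the function:

Top = Top->Next;

free(Old); // now inside the critical section
pthread_mutex_unlock(&Mutex);

return Data;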

The lock implementation now does 253Kops/s, leaving the lock-free Proxy
Collector in the dust. I am sure, however, that these numbers will
change once a decent memory manager is in place. I notice that in all
cases where the algo is slow, the pushes get done really fast and the
pop operations are basically queued up for a while at the end. This
looks like some form of contention which favors the Push for some
reason.

ti_chris

Jan 23, 2008, 5:50:23 PM
Finally, a spin-lock version of this algorithm does about 725Kops/s.

Chris Thomasson

Jan 23, 2008, 6:56:15 PM
"ti_chris" <tiCh...@gmail.com> wrote in message
news:3a471d88-2c14-4ba7...@u10g2000prn.googlegroups.com...

> Alright, here's my code for the implementation of a stack. The
> results I've obtained with this code are as follows:
>
>
> Stack: Stack with Mutexes = 28.8Kops/s
> LFStack: Microsoft Lock-Free Stack with page-fault catch = 880Kops/s
> LFPCStack: Lock-Free Proxy Collector Stack = 28Kops/s
[...]

I am going to read through your code; thanks for posting it. However, it's
good to keep in mind that the proxy collector in question uses interlocking
on every operation. The LFPCStack::Pop function now has at least 4
interlocked ops, and the LFPCStack::Push function has at least 3. An
uncontended mutex will have only 2 for push _and_ pop; you're twice as
expensive on the fast-path. This basically holds true for any proxy
collector that is based on interlocked ops.

Also, the whole point of proxy collection is to amortize the cost of the
interlocking and membars over the depth of the "protected" collection. You
are using it to collect the pop function of a lock-free stack. There is only
1 operation in there, so you're simply not going to see any amortization.
However, if you were to iterate over the stack while it contained multiple
items, then you're going to see the benefits... That's why Proxy GC is good
for implementing robust reader/writer patterns.

Your code uses malloc/free on every operation. This will muck up testing
results unless you code the allocator from scratch. I would suggest that you
stick a little node caching algorithm into your testing code; see what that
does.
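
(A minimal sketch of what I mean, assuming one cache per thread so that
no synchronization is needed, and fixed-size blocks no smaller than a
pointer; it sidesteps cross-thread recycling entirely, which your test
would need to account for:)
_________________________________________________________________
struct NodeCache {
    struct CachedNode { CachedNode* next; };
    CachedNode* free_list;

    NodeCache() : free_list(0) { }

    /* reuse a cached block if we have one, else fall back to malloc */
    void* alloc(size_t size) {
        if (free_list) {
            CachedNode* const n = free_list;
            free_list = n->next;
            return n;
        }
        return malloc(size);
    }

    /* push the block onto the private cache instead of calling free */
    void release(void* mem) {
        CachedNode* const n = (CachedNode*)mem;
        n->next = free_list;
        free_list = n;
    }
};
_________________________________________________________________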

One more thing... The LFPCStack::Push function does not need to be in a
collected region because it's not subject to any faulty memory access
issues...

Humm... Can the ProxyCollector::Mutate function be called by multiple
threads? There is a line in there that caught my eye:

static void Mutate(void)
{
[...]
Master.Collector->Next = NewObj;
[...]
}

What if multiple threads set the Next pointer in succession? Won't some
NewObj's get lost? Please correct me if I am wrong...

ti_chris

Jan 23, 2008, 10:49:45 PM
On Jan 23, 3:56 pm, "Chris Thomasson" <cris...@comcast.net> wrote:
> "ti_chris" <tiChri...@gmail.com> wrote in message

>
> news:3a471d88-2c14-4ba7...@u10g2000prn.googlegroups.com...
> > Alright, here's my code for the implementation of a stack. The
> > results I've obtained with this code are as follows:
>
> > Stack: Stack with Mutexes = 28.8Kops/s
> > LFStack: Microsoft Lock-Free Stack with page-fault catch = 880Kops/s
> > LFPCStack: Lock-Free Proxy Collector Stack = 28Kops/s
>
> [...]
>
> I am going to read through your code; thanks for posting it. However, it's
> good to keep in mind that the proxy collector in question uses interlocking
> on every operation. The LFPCStack::Pop function now has at least 4
> interlocked ops, and the LFPCStack::Push function has at least 3. An
> uncontended mutex will have only 2 for push _and_ pop; you're twice as
> expensive on the fast-path. This basically holds true for any proxy
> collector that is based on interlocked ops.

Yes; I agree. I merely wanted to implement it to see how it works and
how I could possibly use it to solve the memory reclamation issue in a
clean fashion.


>
> Also, the whole point of proxy collection is to amortize the cost of the
> interlocking and membars over the depth of the "protected" collection. You
> are using it to collect the pop function of a lock-free stack. There is only
> 1 operation in there, so you're simply not going to see any amortization.
> However, if you were to iterate over the stack while it contained multiple
> items, then you're going to see the benefits... That's why Proxy GC is good
> for implementing robust reader/writer patterns.
>

So are you saying that it should keep a "freelist" which goes onto the
proxy or something along those lines?

> Your code uses malloc/free on every operation. This will muck up testing
> results unless you code the allocator from scratch. I would suggest that you
> stick a little node caching algorithm into your testing code; see what that
> does.

Totally agree. I need to use a form of pre-allocated array/bucket
scheme to seriously test this one out. I'll do that and post my
results as a means of comparison :). With the above numbers, it's
clear that malloc/free is mucking with me.


>
> One more thing... The LFPCStack::Push function does not need to be in a
> collected region because it's not subject to any faulty memory access
> issues...

Very true. Good point.


>
> Humm... Can the ProxyCollector::Mutate function be called by multiple
> threads? There is a line in there that caught my eye:
>
> static void Mutate(void)
> {
> [...]
> Master.Collector->Next = NewObj;
> [...]
> }
>
> What if multiple threads set the Next pointer in succession? Won't some
> NewObj's get lost? Please correct me if I am wrong...

No; the mutator is called by a single thread whenever it feels like
it. It is not made to be thread-safe. In theory.

ti_chris

Jan 24, 2008, 5:27:06 AM
Alright. I've tested my algorithm with an extremely simple lock xadd
using an array as a means to do memory allocation and have come up
with some more realistic numbers that may be interesting to all of you
guys. This version of the code has Chris' suggested fix for the
proxied version and no longer has any locks in the lock-free
implementations *YAY*.

Mutex: 1.5Mops/s (spinlocks were about the same speed)
LFPC: 3.6Mops/s (Much nicer indeed)
LF: 3.8Mops/s (still the winner)


So it's pretty clear that locking memory management is a HUGE
hindrance to lock-free algos. This is a somewhat unrealistic
implementation since it statically allocates a HUGE amount of memory
that is never re-used, and the code contains a lot of validation (plus
debug code) to make sure that it works correctly. However, it does
indeed illustrate that lock-free is significantly faster (more than
twice as fast) than traditional mutex algorithms for stacks.
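
The allocator amounts to nothing more than a big static array and one
lock xadd bump index, sketched below (the arena size is arbitrary, and
again, nothing is ever reused or freed):

static char Arena[64 * 1024 * 1024]; // HUGE, never reused
static volatile long ArenaIndex = 0;

void *ArenaAlloc(long Size)
{
    // XAdd returns the old index, i.e. the start of our block
    return &Arena[XAdd((void *)&ArenaIndex, Size)];
}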

I was initially surprised to see the PC version seriously outperform
the mutex version. Thinking about it, however, I am not so surprised
anymore: the mutual exclusion makes sure I can't read while I write and
vice versa, whereas with 3 threads running on a quad-core, every thread
is churning full-blast in the lock-free methods; so these results make
sense to me.

At least in this specific case, it looks like my implementation of the
PC costs about 5% more. I suppose this would be more in the case of a
queue, which would require more work from the proxy. However, it is
interesting to me to see that memory reclamation is not that big of an
overhead in this case. I haven't compared, but I'm assuming that
hazard pointers would be more or less on par with these numbers, i.e.
the GC overhead wouldn't be too big (on the order of 5%, I would
assume).

I do, however, want to try and finish my optimized 1-word proxy
collector and see how much I can get out of a wait-free PC. So
possibly more code and numbers coming up ahead :)

Dmitriy Vyukov

Jan 24, 2008, 8:09:54 AM
On Jan 24, 1:27 pm, ti_chris <tiChri...@gmail.com> wrote:

> Mutex: 1.5Mops/s (spinlocks were about the same speed)
> LFPC: 3.6Mops/s (Much nicer indeed)
> LF: 3.8Mops/s (still the winner)


If you are looking at lock-free algos as a means for improving
performance and scalability, then you had better look at more
distributed solutions. Centralized/global containers are inherently
*not* efficient and *not* scalable, because of high memory contention.
Memory contention is even more expensive than an atomic RMW on a local
memory location.
For example, you can replace one multiple-producer/multiple-consumer
container with N multiple-producer/single-consumer containers, or N*N
single-producer/single-consumer containers. Such containers use fewer
atomic RMWs per operation (down to zero atomic RMWs per operation) and,
more importantly, naturally reduce contention for memory locations. But
of course you then have to cope with new problems such as load-balancing
and polling.
A centralized memory allocator can be replaced with N local memory
allocators. A proxy-collector can be replaced with an RCU-like (or
SMR+RCU) PDR scheme. And so on.
With a smart distributed solution you can see an order of magnitude
improvement in performance and scalability, and exactly at this point
you can see the ungovernable power of lock-free algos over lock-based.
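
For illustration, a bare-bones single-producer/single-consumer ring
(just a sketch; it ignores the memory-barrier/ordering details, which
you must of course handle in real code) contains zero atomic RMW:

struct SPSCQueue
{
    static const unsigned long SIZE = 1024; // power of two
    unsigned long Buffer[SIZE];
    volatile unsigned long Head; // advanced only by the consumer
    volatile unsigned long Tail; // advanced only by the producer

    SPSCQueue() : Head(0), Tail(0) { }

    bool Push(unsigned long Data) // producer thread only
    {
        if (Tail - Head == SIZE)
            return false; // full
        Buffer[Tail % SIZE] = Data;
        Tail = Tail + 1; // publish (needs a store-release here)
        return true;
    }

    bool Pop(unsigned long *Data) // consumer thread only
    {
        if (Head == Tail)
            return false; // empty
        *Data = Buffer[Head % SIZE];
        Head = Head + 1; // (needs a store-release here)
        return true;
    }
};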


Dmitriy V'jukov

ti_chris

Jan 24, 2008, 6:03:32 PM

You are right on all counts. However, I am not interested in using
lock-free algorithms for SMP servers or something no one has in their
backyard. I am looking at this from the perspective of the simple
home PC user who has a quad-core CPU. As such, I'm not really
interested in having 1,000 threads running, since any more than 4-6
threads on a quad-core is going to slow down the system anyhow. Some
points you raised, such as multiple memory allocators, will still
inevitably improve performance at a low thread count. Given my numbers,
I don't think that SMR + RCU will give me much of an improvement (if
any) over a proxy collector. At least for the stack, it can't be any
more than 5%.

I'm curious to see how Intel/AMD think that their processors will
scale. Right now, the memory bus is shared between the multiple
cores. This is a serious bottleneck for multi-processor communication
unless the bus scales linearly. I'm not sure how much sharing an L2
cache line will help going forward.

Point taken, however: you want to minimize inter-thread contention and
opt for more localized, autonomous work.

Dmitriy Vyukov

Jan 25, 2008, 5:48:43 AM
On Jan 25, 2:03 am, ti_chris <tiChri...@gmail.com> wrote:

> You are right on all counts. However I am not interested in using
> lock-free algorithms for SMP servers or something no-one has in their
> backyard. I am looking at this from the perspective of the simple
> home PC user that has a Quad-core CPU.


I'm targeting exactly the same case.
Btw, think about the next Intel processor: 8 cores x 2 hardware threads.

Try the following test with the lock-free stack. Create 4 threads.
Bind them to:
1. All threads to core 0
2. All threads to cores 0 and 1
3. All threads to cores 0 and 2
4. All threads to cores 0, 1, 2, 3

Calculate throughput and scalability relative to case 1. I think the
results will be interesting to you.
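
On Win32, the binding itself is one call at the top of each thread
function; something like this (the mask values correspond to the four
cases above):

#include <windows.h>

void BindToCores(DWORD_PTR Mask)
{
    // case 1: 0x1, case 2: 0x3, case 3: 0x5, case 4: 0xF
    SetThreadAffinityMask(GetCurrentThread(), Mask);
}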


Dmitriy V'jukov

Dmitriy Vyukov

Jan 25, 2008, 8:37:52 AM
On Jan 23, 11:14 am, ti_chris <tiChri...@gmail.com> wrote:

> Alright, here's my code for the implementation of a stack


I think one can combine the lock-free stack algorithm and the
proxy-collector into one.
If we use DWCAS we can place into one DWORD: the pointer to the stack
head, the pointer to the proxy-collector head, and the reference
counter for the proxy-collector.

struct stack
{
unsigned head : 28;
unsigned pc_head : 28;
unsigned pc_rc : 8;
};

I assume 16-byte alignment. If we also steal the high bit from the
pointers then we get a 10-bit pc_rc.
So pop() will be something like:

pop()
{
DWCAS to increment pc_rc
do
{
calculate new value
DWCAS to pop node, decrement rc and push node onto pc_head (3 in
1!)
} while (success)
}


So pop() is only 2 interlocked RMWs (on the fast-path) instead of 4.
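
Packing and unpacking by hand could look like this sketch (assuming
32-bit pointers with 16-byte alignment, so only 28 significant bits per
pointer; the names here are mine):

typedef unsigned long long U64;

U64 Pack(void *Head, void *PcHead, unsigned Rc)
{
    U64 h = (unsigned long)Head >> 4;   // 28 significant bits
    U64 p = (unsigned long)PcHead >> 4; // 28 significant bits
    return h | (p << 28) | ((U64)(Rc & 0xFF) << 56);
}

void Unpack(U64 W, void **Head, void **PcHead, unsigned *Rc)
{
    *Head = (void *)((unsigned long)(W & 0xFFFFFFF) << 4);
    *PcHead = (void *)((unsigned long)((W >> 28) & 0xFFFFFFF) << 4);
    *Rc = (unsigned)(W >> 56);
}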

What do you think?


Dmitriy V'jukov

Chris Thomasson

Jan 25, 2008, 2:47:32 PM
On Wed, 23 Jan 2008 19:49:45 -0800, ti_chris wrote:

> On Jan 23, 3:56 pm, "Chris Thomasson" <cris...@comcast.net> wrote:
>> "ti_chris" <tiChri...@gmail.com> wrote in message
>>

>> news:3a471d88-2c14-4ba7-b9f0-767...@u10g2000prn.googlegroups.com...>
[...]


>> Also, the whole point of proxy collection is to amortize the cost of
>> the interlocking and membars over the depth of the "protected"
>> collection. You are using it to collect the pop function of a lock-free
>> stack. There is only 1 operation in there, so you're simply not going to
>> see any amortization. However, if you were to iterate over the stack
>> while it contained multiple items, then you're going to see the
>> benefits... That's why Proxy GC is good for implementing robust
>> reader/writer patterns.
>>
>>
> So are you saying that it should keep a "freelist" which goes onto the
> proxy or something along those lines?

No. Here is how I make use of proxy collection; try and bear with me
here... I will use the API from my pc_sample.c collector:
_________________________________________________________________
struct node {
node* nx;
[...];
};

struct list {
node* hd;
};


static pc_t g_proxy; /* pc_alloc(&g_proxy) */
static list g_list = {0};
static pthread_mutex_t g_mtx = PTHREAD_MUTEX_INITIALIZER;


void list_iterate(list* const _this) {
pc_deferred* const defer = pc_inc(&g_proxy);
node* n = _this->hd;
while (n) {
node* const nx = n->nx;
do_something_readonly(n);
n = nx;
}
pc_dec(defer);
}

node* list_pop(list* const _this) {
pthread_mutex_lock(&g_mtx);
node* const n = _this->hd;
if (n) {
_this->hd = n->nx;
}
pthread_mutex_unlock(&g_mtx);
return n;
}

void node_defer(void* state) {
free(state);
}

void list_pop_and_free(list* const _this) {
node* const n = list_pop(_this);
if (n) {
pc_defer(&g_proxy, node_defer, n);
}
}
_________________________________________________________________


Read here:

https://coolthreads.dev.java.net/servlets/ProjectForumMessageView?forumID=1797&messageID=11068

Refer to the pseudo-code at the end. The code under the "/* Read-Only
Threads */" section can be protected via a proxy reference. This basically
allows threads to perform full-blown traversals of data-structures while
there are concurrent mutations.


Also, read here:

http://groups.google.com/group/comp.programming.threads/browse_frm/thread/b586dbbbf273764d

It's analogous to a simple RCU pattern; I think that you can try this out
in your code by adding the following function:
_________________________________________________________________
void LFPCStack::Iterate() {
ProxyObject* const Obj = ProxyCollector::AddRef();
LFPCStackNode* Node = Top;
while (Node) {
LFPCStackNode* const Next = Node->Next;
// DoSomethingReadOnly(Node);
Node = Next;
}
ProxyCollector::RemRef(Obj);
}
_________________________________________________________________


I think this should work for you because you're back-linking proxy objects.
However, I don't know for sure because I have not had time to study your
implementation. If you're not back-linking properly, FIFO, then you're going
to experience a nasty race-condition... Even though it seems like it
should be okay, it's not.


>> Your code uses malloc/free on every operation. This will muck up testing
>> results unless you code the allocator from scratch. I would suggest
>> that you stick a little node caching algorithm into your testing code;
>> see what that does.
>
> Totally agree. I need to use a form of pre-allocated array/bucket
> scheme to seriously test this one out. I'll do that and post my results
> as a means of comparison :). With the above numbers, it's clear that
> malloc/free is mucking with me.

;^)


>> Humm... Can the ProxyCollector::Mutate function be called by multiple
>> threads? There is a line in there that caught my eye:
>>
>> static void Mutate(void)
>> {
>> [...]
>> Master.Collector->Next = NewObj;
>> [...]
>> }
>>
>> What if multiple threads set the Next pointer in succession? Won't some
>> NewObj's get lost? Please correct me if I am wrong...
>
> No; the mutator is called by a single thread whenever it feels like it.
> It is not made to be thread-safe. In theory,

Okay. I see what you're doing here with the Mutate function. Humm... I
think I am going to revisit pc_sample.c...

Chris Thomasson

Jan 26, 2008, 1:39:27 AM
On Thu, 24 Jan 2008 15:03:32 -0800, ti_chris wrote:

> On Jan 24, 5:09 am, Dmitriy Vyukov <dvyu...@gmail.com> wrote:
>> On Jan 24, 1:27 pm, ti_chris <tiChri...@gmail.com> wrote:
>>
>> > Mutex: 1.5Mops/s (spinlocks were about the same speed)
>> > LFPC: 3.6Mops/s (Much nicer indeed)
>> > LF: 3.8Mops/s (still the winner)

>> [...] A proxy-collector can be replaced with an RCU-like (or SMR+RCU) PDR scheme.


>> And so on.
>> With smart distributed solution you can see order of magnitude
>> improvement in performance and scalability, and exactly at this point
>> you can see ungovernable power of lock-free algos over lock-based.
>>
>> Dmitriy V'jukov
>
> You are right on all counts. However I am not interested in using
> lock-free algorithms for SMP servers or something no-one has in their
> backyard.

[...]

What are you going to do when most working families can actually afford
systems that contain hundreds or thousands of processing units per
physical chip?

ti_chris

Jan 26, 2008, 1:56:11 AM
On Jan 25, 11:47 am, Chris Thomasson <cris...@comcast.net> wrote:
> It analogous of a simple RCU pattern; I think that you can try this out in
> your code by adding the following function:
> _________________________________________________________________
> LFPCStack::Iterate() {
> ProxyObject* const Obj = ProxyCollector::AddRef();
> LFPCStackNode* Node = Top;
> while (Node) {
> LFPCStackNode* const Next = Node->Next;
> // DoSomethingReadOnly(Node);
> Node = Next;
> }
> ProxyCollector::RemRef(Obj);
> }
>
> _________________________________________________________________
>
> I think this should work for you because you're back-linking proxy objects.
> However, I don't know for sure because I have not had time to study your
> implementation. If you're not back-linking properly, FIFO, then you're going
> to experience a nasty race-condition... Even though it seems like it
> should be okay, it's not.

Yes; this works flawlessly and is effectively the right way to do
this. Adding to the reference count basically makes sure that no
nodes that existed at the time of the increment will be deleted. I'm
curious, however: are you using a mutex for the write threads (aka
pop)?

> >> Humm... Can the ProxyCollector::Mutate function be called by multiple
> >> threads? There is a line in there that caught my eye:
>
> >> static void Mutate(void)
> >> {
> >> [...]
> >> Master.Collector->Next = NewObj;
> >> [...]
> >> }
>
> >> What if multiple threads set the Next pointer in succession? Won't some
> >> NewObj's get lost? Please correct me if I am wrong...
>
> > No; the mutator is called by a single thread whenever it feels like it.

> > It is not made to be thread-safe. In theory.


>
> Okay. I see what you're doing here with the Mutate function. Humm... I think
> I am going to revisit pc_sample.c...

;^)

ti_chris

Jan 26, 2008, 1:59:38 AM

I think we have bigger hardware design problems related to regular
memory access when we even get close to 8 CPUs :). There's
contention on the memory bus already; I can't even imagine how 8 cores
are going to kill it (they'll need to assign memory to specific cores
to make this work), and even that doesn't solve the intercommunication
issues at a hardware level :/.

ti_chris

Jan 26, 2008, 2:17:11 AM
> Okay. I see what you're doing here with the Mutate function. Humm... I think
> I am going to revisit pc_sample.c...


haha. I think I'm just starting to see how you used the proxy
collector. It seems to be somewhat different. In theory, mine should
be a little more optimal since it batches deleted nodes instead of
"mutating" the proxy every time a node is deleted (to use my
terminology). That's also why I can get away with using only 256
proxy objects. If I batch my nodes, I can actually remove many nodes
and handle a very big load without really taking the hit of mutating
the proxy every time something gets written. Then it really just
comes down to choosing a good frequency at which to mutate that will
balance the load nicely.
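
A hypothetical policy for that, assuming a single writer thread (my
Mutate() is not thread-safe anyway): count the deferrals and mutate
once per batch instead of once per node.

static long DeferredCount = 0;       // writer-private counter
static const long MUTATE_EVERY = 64; // tuning knob: slack vs. overhead

void DeferWithBatching(ProxyObject *Obj, void *Memory)
{
    Obj->Defer(Memory);

    if (++DeferredCount == MUTATE_EVERY) {
        DeferredCount = 0;
        ProxyCollector::Mutate();
    }
}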

Chris Thomasson

Jan 26, 2008, 2:28:15 AM
On Fri, 25 Jan 2008 23:17:11 -0800, ti_chris wrote:

>> Okay. I see what you're doing here with the Mutate function. Humm... I
>> think I am going to revisit pc_sample.c...
>
>
> haha. I think I'm just starting to see also how you used the proxy
> collector. It seems to be somewhat different. In theory, mine should
> be a little more optimal since it batches deleted nodes instead of
> "mutating" the proxy every time a node is deleted (to use my
> terminology).

Indeed. Well, I know that I posted this as an optimization possibility to
pc_sample.c to this group in the past two or three years. DAMN! Time to do
a crappy Google search! Shi%!!! This is going to SUCK!


> That's also why I can get away with using only 256 proxy
> objects. If I batch my nodes, I can actually remove many nodes and
> handle a very big load without really taking the hit of mutating the
> proxy every time something gets written to.

Right. But you will still "probably" have to deal with batch sizing
issues at some point in time...


> Then it really just comes
> down to choosing a good frequency at which to mutate that will balance
> the load nicely.

Welcome to my world! ;^)

Chris Thomasson

Jan 26, 2008, 2:39:46 AM
On Fri, 25 Jan 2008 22:56:11 -0800, ti_chris wrote:

> On Jan 25, 11:47 am, Chris Thomasson <cris...@comcast.net> wrote:

[...]


>
> Yes; this works flawlessly and is effectively the right way to do this.
> Adding to the reference count basically makes sure that no nodes that
> existed at the time of the increment will be deleted. I'm curious,
> however: are you using a mutex for the write threads (aka pop)?

[...]

I use a mutex simply because I can do double-lists, complex trees, or
anything else that _cannot_ be done with lock-free methods. See, I am all
about joining lock-based and lock-free methods together in a highly
effective marriage; I am not a lock-free die-hard religious freak...
SenderX is another story altogether, as he was a hard-core _assho%;e_
ready to blow people's posts away with random gunfire; however, he did
seem to have the ability to spur debate, to say the least!

;^)

Chris Thomasson

Jan 26, 2008, 1:34:21 PM
On Fri, 25 Jan 2008 22:59:38 -0800, ti_chris wrote:
[...]

>
> I think we have bigger hardware design problems related to regular
> memory access when we even get close to 8 CPUs :). There's contention
> on the memory bus already; I can't even imagine how 8 cores are going to
> kill it (they'll need to assign memory to specific cores to make this
> work), and even that doesn't solve the intercommunication issues at a
> hardware level :/.

What about the 64-core processors from IBM that are available today?

John Dallman

Jan 26, 2008, 3:11:00 PM
In article <-rednWiFeNOw4Aba...@comcast.com>,
cri...@comcast.net (Chris Thomasson) wrote:

> What about the 64-core processors from IBM that are available today?

Details, please? How do they connect to memory?

--
John Dallman, j...@cix.co.uk, HTML mail is treated as probable spam.

ti_chris

Jan 26, 2008, 5:44:05 PM

I'd have to look at their architecture to understand more about them.
In any case, you cannot have 64 processors share the same memory bus as
their main line to memory the way the Intel/AMD cores do: you reach a
point at which raw memory bandwidth simply doesn't keep up with that
many cores accessing memory. Now, there are a couple of ways to design
the hardware to minimize contention on the main bus. I know that
Intel shares one L2 cache per 2 cores (thus making communication
between those 2 cores relatively fast), so the only way you hit the
main bus is if you use too much memory or if you need to communicate
with other threads. Caching like this alleviates the problem, but it
does not solve it. Another possible solution would be to assign
core-owned memory to each core. This still leaves the open question of
how you efficiently resolve cross-processor communication, something
you could possibly fix by implementing multi-leveled caches. However,
such a setup is likely to be too expensive.

Personally, I'm less interested in the server/mainframe type of code
than I am in the average Joe's computer today. I may be wrong, but
I don't think we can generally treat server-type applications the
same way we treat consumer-type applications. And I think that there
are still a couple of unresolved memory issues with computers that
have a lot of cores. A solution may already be out there, but I
haven't seen or read about it.

Chris Thomasson

Jan 27, 2008, 12:30:58 AM
On Sat, 26 Jan 2008 20:11:00 +0000, John Dallman wrote:

> In article <-rednWiFeNOw4Aba...@comcast.com>,
> cri...@comcast.net (Chris Thomasson) wrote:
>
>> What about the 64-core processors from IBM that are available today?
>
> Details, please?

Ask IBM.


> How do they connect to memory?

Distributed:

http://groups.google.com/group/comp.arch/msg/51b390b63b733ff8

Joe Seigh

Jan 27, 2008, 9:18:14 AM

Shared cache is dead, or will be, along with UMA. Multi-core systems
in the future will be NUMA. Caching algorithms will be developed
by software programmers who understand how shared-memory multi-threading
works, not by hardware engineers who are congenitally incapable of ever
understanding that kind of stuff. They'll try to hang on to their strongly
coherent MESI protocols as long as they can, but the market will quickly
take care of that.

--
Joe Seigh

When you get lemons, you make lemonade.
When you get hardware, you make software.

John Dallman

Jan 27, 2008, 10:35:00 AM
In article <RsWdnSeXdMyPigHa...@comcast.com>,
cri...@comcast.net (Chris Thomasson) wrote:
> On Sat, 26 Jan 2008 20:11:00 +0000, John Dallman wrote:
> > cri...@comcast.net (Chris Thomasson) wrote:
> >> What about the 64-core processors from IBM that are available
> > today?
> > Details, please?
> Ask IBM.

Unlike many people on Usenet these days, I wasn't asking for details as
a way of claiming that you were making something up. All I wanted was
some kind of name that I could use to search IBM's website and try in
Google. I admit, thinking about it, that asking for details is often
considered a hostile response these days, so here's a polite request:

Do you have any names or model numbers that could be used to look up
some details of this 64-core processor?

The results of Google on "IBM 64 core" are almost all about the p5 model
595 high-end server, which isn't a 64-core processor: it's a system with
32 conventional dual-core POWER processors, each with attached RAM. Since
programming that would be distinctly different from a true 64-core
processor, and you generally know what you're talking about with
parallel programming, I deduce that you're talking about some other IBM
product. But with no idea of what it's called, I can't look it up.

If you meant this thing, it isn't an IBM product - just mentioned on an
IBM blog:
http://www.ibm.com/developerworks/blogs/page/johnston?entry=64_core_thinkpad_anyone
http://arstechnica.com/articles/paedia/cpu/MIT-startup-raises-multicore-bar-with-new-64-core-CPU.ars

Chris Thomasson

Jan 27, 2008, 1:15:27 PM
On Sun, 27 Jan 2008 15:35:00 +0000, John Dallman wrote:

> In article <RsWdnSeXdMyPigHa...@comcast.com>,
> cri...@comcast.net (Chris Thomasson) wrote:
>> On Sat, 26 Jan 2008 20:11:00 +0000, John Dallman wrote:
>> > cri...@comcast.net (Chris Thomasson) wrote:
>> >> What about the 64-core processors from IBM that are available
>> > today?
>> > Details, please?
>> Ask IBM.
>
> Unlike many people on Usenet these days, I wasn't asking for details as
> a way of claiming that you were making something up.

[...]

http://www.tgdaily.com/content/view/33451/135

I screwed up and thought that IBM created this; sorry about that.

;^(


Chris Thomasson

Jan 27, 2008, 1:31:11 PM
On Sun, 27 Jan 2008 15:35:00 +0000, John Dallman wrote:

> In article <RsWdnSeXdMyPigHa...@comcast.com>,
> cri...@comcast.net (Chris Thomasson) wrote:
>> On Sat, 26 Jan 2008 20:11:00 +0000, John Dallman wrote:
>> > cri...@comcast.net (Chris Thomasson) wrote:
>> >> What about the 64-core processors from IBM that are available
>> > today?
>> > Details, please?

[...]

Check this out as well:

http://www.news.com/Intel-shows-off-80-core-processor/2100-1006_3-6158181.html

John Dallman

Jan 27, 2008, 2:03:00 PM
In article <e9WdnYo6_JmiVwHa...@comcast.com>,
cri...@comcast.net (Chris Thomasson) wrote:

> http://www.tgdaily.com/content/view/33451/135

Aha... it's TILE64, and that page says: "Initial customers using the
processor in upcoming products include 3Com, Top Layer, Codian and
GoBackTV."

Makes sense: two likely doing network packet processing and two doing
video compression/decompression. Both are tasks that can be split over a
pipeline set of cores and have memory access patterns that are amenable
to the much lower memory bandwidth:processing power ratio of this kind
of CPU. They can also afford to write code for the number of cores
available, without having to worry about there being more or less of
them, because the code will only be running on hardware the software
vendor built. Writing threaded code for desktop PCs doesn't really allow
that. Much as Dell or HP would love people to produce software
specifically for their machines and no others, it's the wrong thing to
do in the medium to long term.

There's nearly an excuse for giving that tile style of hardware a name
of its own, "corepipelined", or something like that. There's an analogy
to a pipelined single-core processor that you can draw, with each core
in a pipeline doing a stage in an algorithm that's tightly defined and
doesn't have much in the way of variations.

> I screwed up an thought that IBM created this; sorry about that.

No problem; makes sense of the situation.

ti_chris

Jan 27, 2008, 10:48:45 PM
> Shared cache is dead or will be along with UMA. Multi-core systems
> in the future will be NUMA. Caching algorithms will be developed
> by software programmers who understand how shared memory multi-threading
> works, not by hardware engineers who are congenitally incapable of ever
> understanding that kind of stuff. They'll try to hang on to their strongly
> coherent MESI protocols as long as they can but the market will quickly
> take care of that.

Yup, agreed. Shared caches (MESI) are only a cheap short-term solution
that will work for a few cores. To make things scale, you need
something like NUMA (even though the memory-op costs are not really
as predictable). I wonder, however, whether it's going to take a flop
from Intel for the hardware guys to understand and implement a better
memory scheme. Historically with Intel, it did require screw-ups. I
personally wouldn't mind moving off to a newer design (overall). The
x86 design has enough archaic things in it as it is. In the meantime,
we'll just have to try and do the best that we can with 40-year-old
hardware that was born on steroids.

Chris Thomasson

Jan 28, 2008, 12:47:53 AM

Read this:
http://www.news.com/Designer-puts-96-cores-on-single-chip/2100-1006_3-5399128.html?tag=st.nl

Dmitriy Vyukov

Jan 28, 2008, 4:11:04 AM
On Jan 26, 10:39 am, Chris Thomasson <cris...@comcast.net> wrote:
>
> I use mutex simply because I can do double-lists, complex trees, or
> anything else that _cannot_ be done with lock-free methods.


But if you use lock-free reader pattern you still have to expose all
modifications in atomic transactional style. How can you expose two
links in transactional style in double-list? Or how can you rebalance
tree (move node from one place to another) in transactional style?
You still have to use lock-free algorithms. For example, when you
rebalance tree, you can make something like this: mark original node,
make copy of original node, mark copy, insert copy into new position,
wait for quiescent period, remove original node, wait for quiescent
period, free original node, unmark copy. And reader have to be
involved into this mess too.
Mutex for writes make things a bit easier, but it's still not mutex-
based programming.


Dmitriy V'jukov

Chris Thomasson

Jan 28, 2008, 2:39:20 PM
On Mon, 28 Jan 2008 01:11:04 -0800, Dmitriy Vyukov wrote:

> On Jan 26, 10:39 am, Chris Thomasson <cris...@comcast.net> wrote:
>>
>> I use mutex simply because I can do double-lists, complex trees, or
>> anything else that _cannot_ be done with lock-free methods.
>
>
> But if you use lock-free reader pattern you still have to expose all
> modifications in atomic transactional style. How can you expose two
> links in transactional style in double-list?

If I create the reader _and_ writer algorithm, then I can create all the
special rules I need for the data-structure at hand; I have some simple
tricks. I will type up some pseudo-code some time today. I am applying a
fix to AppCore right now...


> Or how can you rebalance
> tree (move node from one place to another) in transactional style? You
> still have to use lock-free algorithms.

[...]

You can read some of this:

http://groups.google.com/group/comp.programming.threads/browse_frm/thread/7d202e726424c0bf


> Mutex for writes make things a bit easier, but it's still not mutex-
> based programming.

IMHO, using a mutex for writes makes things a lot easier...

Chris Thomasson

Jan 28, 2008, 3:30:56 PM
On Sat, 26 Jan 2008 01:28:15 -0600, Chris Thomasson wrote:

> On Fri, 25 Jan 2008 23:17:11 -0800, ti_chris wrote:
>
>>> Okay. I see what you're doing here with the Mutate function. Humm... I
>>> think I am going to revisit pc_sample.c...
>>
>>
>> haha. I think I'm just starting to see also how you used the proxy
>> collector. It seems to be somewhat different. In theory, mine should
>> be a little more optimal since it batches deleted nodes instead of
>> "mutating" the proxy every time a node is deleted (to use my
>> terminology).
>
> Indeed. Well, I know that I posted this as an optimization possibility
> to pc_sample.c to this group in the past two or three years. DAMN! Time
> to do a crappy Google search! Shi%!!! This is going to SUCK!

[...]

I found it:

http://groups.google.com/group/comp.programming.threads/msg/aa1b34e932c022b0

John Dallman

Jan 28, 2008, 4:15:00 PM
In article <b5SdnSjy_cAU8QDa...@comcast.com>,
cri...@comcast.net (Chris Thomasson) wrote:

> Read this:
> http://www.news.com/Designer-puts-96-cores-on-single-chip/2100-1006_3-5399128.html?tag=st.nl

That's ClearSpeed. I read up on their thing and talked to them a bit.
It's a hell of a lot less generally usable than journalists try to make
out. Simple core count doesn't mean very much any more.

Chris Thomasson

Jan 28, 2008, 6:30:00 PM
"John Dallman" <j...@cix.co.uk> wrote in message
news:memo.2008012...@jgd.compulink.co.uk...

> In article <b5SdnSjy_cAU8QDa...@comcast.com>,
> cri...@comcast.net (Chris Thomasson) wrote:
>
>> Read this:
>> http://www.news.com/Designer-puts-96-cores-on-single-chip/2100-1006_3-5399128.html?tag=st.nl
>
> That's ClearSpeed. I read up on their thing and talked to them a bit.
> It's a hell of a lot less generally usable than journalists try to make
> out. Simple core count doesn't mean very much any more.

What do you think about Intel's experimental chip?

ti_chris

Jan 28, 2008, 7:25:35 PM
On Jan 28, 1:15 pm, j...@cix.co.uk (John Dallman) wrote:
> In article <b5SdnSjy_cAU8QDanZ2dnUVZ_hOdn...@comcast.com>,

>
> cris...@comcast.net (Chris Thomasson) wrote:
> > Read this:
> > http://www.news.com/Designer-puts-96-cores-on-single-chip/2100-1006_3-5399128.html?tag=st.nl
>
> That's ClearSpeed. I read up on their thing and talked to them a bit.
> It's a hell of a lot less generally usable than journalists try to make
> out. Simple core count doesn't mean very much any more.
>
> --
> John Dallman, j...@cix.co.uk, HTML mail is treated as probable spam.

Yeah, totally agree with you on that. It's just like a couple of years
ago when Intel wanted everyone to believe that pure frequency was how
you should go about comparing computers' speed. We all know how wrong
that is now. Of course, the same stunt has to be pulled for cores.

Chris Thomasson

Jan 28, 2008, 7:57:50 PM
"ti_chris" <tiCh...@gmail.com> wrote in message
news:ae13718c-142e-4730...@c23g2000hsa.googlegroups.com...

> On Jan 25, 11:47 am, Chris Thomasson <cris...@comcast.net> wrote:
>> It analogous of a simple RCU pattern; I think that you can try this out
>> in
>> your code by adding the following function:
>> _________________________________________________________________
>> LFPCStack::Iterate() {
>> ProxyObject* const Obj = ProxyCollector::AddRef();
>> LFPCStackNode* Node = Top;
>> while (Node) {
>> LFPCStackNode* const Next = Node->Next;
>> // DoSomethingReadOnly(Node);
>> Node = Next;
>> }
>> ProxyCollector::RemRef(Obj);
>> }
>>
>> _________________________________________________________________
>>
>> I think this should work for you because you're back-linking proxy objects.
>> However, I don't know for sure because I have not had time to study your
>> implementation. If you're not back-linking properly, FIFO, then you're going
>> to experience a nasty race-condition... Even though it seems like it
>> should be okay, it's not.
>
> Yes; this works flawlessly and is effectively the right way to do
> this.

[...]

Wait a minute here. Okay, I think I found a bug that makes it so you can't
do safe traversals with your design as-is; here is why:
_________________________________________________________________
void ProxyObject::Defer(void *Memory)
{
do {

*(void **)Memory = List;
//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

} while (!CAS((long *)&List,
(long)Memory,
*(unsigned long*)Memory));
}
_________________________________________________________________


'Memory' can still be referenced and accessed by concurrent reader threads
traversing through the data-structure from which it was just removed.
This can make their iterations go from the list right into your private
defer list; that would be very bad! Think: if reader thread A has loaded a
pointer to NodeA, and writer thread B removes NodeA without modifying it
and then defers NodeA, thus modifying its next pointer to point into nodes
in the defer list, then poor reader thread A reads the next pointer and
goes right into the defer list. The contents of Memory should never be
mutated in the Defer function. You should think about using some
per-memory-block meta-data; here is an example:
_________________________________________________________________
struct MemoryHeader {
MemoryHeader* Next;
};

void ProxyObject::Defer(void *MemoryBlock)
{
MemoryHeader* const Header =
(MemoryHeader*)MemoryBlock;
do {
Header->Next = List;
} while (!CAS((long *)&List,
(long)Header,
Header->Next));
}


// A node in a lock-free stack can look like:

struct Node {
MemoryHeader Header;
Node* Next;
[...];
};
_________________________________________________________________


That way, a thread can remove a node, and a concurrent reader thread can
still use its links because everything is intact and there will be no
mutations until the Node is indeed quiescent. The way your defer function
is written now, you modify memory after it's been removed from further
global reach but before it's quiescent. You can send concurrent reader
threads that acquired references just prior to deferment right off into
the private structures of your proxy implementation. That's bad.

What do you think? Am I missing something in your implementation wrt using
it for lock-free traversals?

ti_chris

Jan 29, 2008, 2:43:53 AM
On Jan 28, 4:57 pm, "Chris Thomasson" <cris...@comcast.net> wrote:
> "ti_chris" <tiChri...@gmail.com> wrote in message

Oh yah; I totally agree. I thought about this when I wrote it, and
you are right. I wrote it off as "I would write a memory manager
that would have an extra word just for that sort of management".
Otherwise, your solution is acceptable also. The other option would
be to make sure that no pointer can reside in the first word of the
data structure: I could effectively sort my node structure such
that the first word is the count instead of being the next pointer, and
that would also fix the issue (see the sketch below). But yes; as-is,
this is a sneaky bug that would really hurt.
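
A sketch of that reordering; Defer()'s write to the first word of the
block now clobbers Data, which only the one successful popper reads,
and it reads it before deferring the node:

struct LFPCStackNode {
    unsigned long Data;  // first word; safe to clobber in Defer()
    LFPCStackNode *Next; // link stays intact until quiescence
};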

Dmitriy Vyukov

Jan 29, 2008, 6:42:06 AM
On Jan 28, 10:39 pm, Chris Thomasson <cris...@comcast.net> wrote:

> > Mutex for writes make things a bit easier, but it's still not mutex-
> > based programming.
>
> IMHO, using mutex for writes makes thing a lot easier...


Yes, you are right. It's better to say: using a mutex for writes makes
things a lot easier, but some problems remain in place.

I've read Joe Seigh's post. Yeah, it looks like a general approach for
some data structures. But not for all. For example: a double-list.

Dmitriy V'jukov

Dmitriy Vyukov

Jan 29, 2008, 6:51:49 AM
> http://groups.google.com/group/comp.programming.threads/msg/aa1b34e93...


The only problem I see here is that such optimizations make the
proxy-collector not a "zero garbage" system. By "zero garbage" I mean
that memory is freed as soon as the last thread stops using it. Your
original pc_sample is "zero garbage". Mutexes are "zero garbage". SMR
is something like "bounded garbage".
And all other techniques (RCU, RCU+SMR, vZOOM) are "unbounded garbage".
This optimization makes the proxy-collector "unbounded garbage" too.
The natural question arises: what reason is there, in the general case,
to use an "unbounded garbage" system with atomic ops per read access,
when you can use an "unbounded garbage" system without atomic ops per
read access? (Definitely there are some reasons in *special cases*.)


Dmitriy V'jukov

ti_chris

Jan 29, 2008, 4:17:39 PM

I know this is nitpicking. But in theory, the queued proxy collector
only makes the GC more flexible. If you so choose, you can Mutate the
proxy every time you delete something to achieve a "zero garbage" GC.
If you can tolerate more slack, then you can change the rules :)

John Dallman

Jan 30, 2008, 2:00:00 PM
In article <xp-dnZ3Mr8Pg_gPa...@comcast.com>,
cri...@comcast.net (Chris Thomasson) wrote:
> > Simple core count doesn't mean very much any more.
> What do you think about Intel's experimental chip?

Haven't looked at it. Since they said it was only a demo and would never
see production, I didn't worry.

Chris Thomasson

Jan 30, 2008, 3:30:15 PM
"John Dallman" <j...@cix.co.uk> wrote in message
news:memo.2008013...@jgd.compulink.co.uk...

> In article <xp-dnZ3Mr8Pg_gPa...@comcast.com>,
> cri...@comcast.net (Chris Thomasson) wrote:
>> > Simple core count doesn't mean very much any more.
>> What do you think about Intel's experimental chip?
>
> Haven't looked at it. Since they said it was only a demo and would never
> see production, I didn't worry.

I think it's on one of their roadmaps somewhere:

http://www.news.com/Intel-pledges-80-cores-in-five-years/2100-1006_3-6119618.html?tag=st.nl

5 years... It may be a pipedream.

Chris Thomasson

Jan 31, 2008, 7:28:31 PM
"ti_chris" <tiCh...@gmail.com> wrote in message
news:3a471d88-2c14-4ba7...@u10g2000prn.googlegroups.com...
> Alright, here's my code for the implementation of a stack. The
> results I've obtained with this code are as follows:
[...]

I think I found a reference counting bug! Please correct me if I am wrong...

> static ProxyObject * __fastcall AddRef(void)
> {
> ProxyCollector New, Old;
>
> memset((void *)&Old.Collector, 0, 4 * 2);
>
> Read64((void *)&Master.Collector, (void *)&Old.Collector);
>
> int Diff = Master.Count + Master.Collector->Count;
>
> // Increment and get the pointer reference to the master collector
> do {
> New.Collector = Old.Collector;
> New.Count = Old.Count + 1;
> } while (!CAS2((void *)&Master.Collector, (void *)&New.Collector, (void *)&Old.Collector));
>
> return Old.Collector;
> }

You always increment the anchor count in ProxyCollector::AddRef(); fine...


> static void __fastcall RemRef(ProxyObject *Collect)
> {
> // If we're out of elements, free/pop the collector's stack
> while (Dec(&(Collect->Count))) {
> ProxyObject *const Next = Collect->Next;
>
> delete Collect;
>
> Collect = Next;
> }
> }


However, you always decrement the proxy count in ProxyCollector::RemRef(...)
and test it for zero in order to know when it's quiescent; well, what
happens if a single thread does something like this:
_______________________________________________________________
for (;;) {
ProxyObject* const Region = ProxyCollector::AddRef();
ProxyCollector::RemRef(Region);
}
_______________________________________________________________


?

Think of the following scenario:
_______________________________________________________________
Anchor = (PC1, 0);
PC1 = (-1, 0, 0);

* AddRef / RemRef *


Anchor = (PC1, 1);
PC1 = (-2, 0, 0);

* AddRef / RemRef *


Anchor = (PC1, 2);
PC1 = (-3, 0, 0);

* Repeat until PC1 "private" count rolls over to zero *
_______________________________________________________________

Whoops! Once PC1 count rolls to zero, you destroy it when there has been no
mutation whatsoever; please correct me if I am dead wrong on this one.


Thank you.

_______
P.S.

This cannot happen in pc_sample.c because I decrement the Anchor count if no
mutations have occurred; that's why I use CAS in pc_dec()...
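
Here is a sketch of that idea transposed onto your naming (this is not
my actual pc_sample.c code, just the shape of the fix): if the proxy
being released is still the current master, CAS the anchor count back
down instead of touching the proxy's private count; only a superseded
proxy ever takes the private-count path, so the rollover can't happen:
_________________________________________________________________
static void __fastcall RemRef(ProxyObject *Collect)
{
    for (;;) {
        ProxyCollector Old, New;

        Read64((void *)&Master.Collector, (void *)&Old.Collector);

        if (Old.Collector != Collect)
            break; // a mutation happened; use the private count

        New.Collector = Old.Collector;
        New.Count = Old.Count - 1; // undo AddRef on the anchor itself

        if (CAS2((void *)&Master.Collector,
                 (void *)&New.Collector, (void *)&Old.Collector))
            return; // private count never touched; no rollover

        // CAS failed: re-read the anchor and retry
    }

    // superseded: original back-link walking path
    while (Dec(&(Collect->Count))) {
        ProxyObject *const Next = Collect->Next;

        delete Collect;

        Collect = Next;
    }
}
_________________________________________________________________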

Dmitriy Vyukov

Feb 1, 2008, 9:06:35 AM
On Feb 1, 3:28 am, "Chris Thomasson" <cris...@comcast.net> wrote:

> I think I found a reference counting bug! Please correct me if I am wrong...

> Whoops! Once PC1 count rolls to zero, you destroy it when there has been no
> mutation whatsoever; please correct me if I am dead wrong on this one.


Yes, it can take place. I already noticed it:
http://groups.google.com/group/comp.programming.threads/msg/d9941fe4ddda734a

ti_chris said that it is not a problem because Mutate() has to be
called frequently enough:
http://groups.google.com/group/comp.programming.threads/msg/8c5db508759c762f

24 bits == 16M. I think the inner counter can overflow in one second if
Mutate is not called.


Dmitriy V'jukov

Chris Thomasson

Feb 1, 2008, 5:13:45 PM
On Fri, 01 Feb 2008 06:06:35 -0800, Dmitriy Vyukov wrote:

> On Feb 1, 3:28 am, "Chris Thomasson" <cris...@comcast.net> wrote:
>
>> I think I found a reference counting bug! Please correct me if I am
>> wrong... Whoops! Once PC1 count rolls to zero, you destroy it when
>> there has been no mutation whatsoever; please correct me if I am dead
>> wrong on this one.
>
>
> Yes, it can take place. I already notice it:

[...]

Whoops! I missed that.
