multi-producer/single-consumer memory passing queue

Dmitriy V'jukov

Apr 1, 2008, 5:28:06 PM

multi-producer/single-consumer memory passing queue.

Not suitable for situations where data must be passed promptly and
reactively. Suitable for situations where data may be passed
eventually, in a 'best-effort' manner.

The queue is designed for use in this memory allocator algorithm:
http://groups.google.com/group/comp.programming.threads/tree/browse_frm/thread/8245f4b48591fc69

The queue internally works like N single-producer/single-consumer
queues with automatic multiplexing. But the multiplexing works in such
a way that the dequeue operation is O(1), not O(N).

There is always 1 item per producer delayed in the queue. Sometimes more.

The fast path for dequeue and enqueue operations is free of atomics
and heavy memory barriers.


Here is the code:

// requires <windows.h>, <intrin.h>, <new>, <cassert>
// note: assumes 32-bit pointers (anchor_ptr fits in an __int64,
// and attach_anchor CASes a pointer as a 'long')

struct node
{
    node* next_;
};

struct anchor;
struct mpsc_mem_queue;

// target for DWCAS
struct anchor_ptr
{
    anchor* ptr_;
    unsigned cntr_; // ABA counter

    anchor_ptr(anchor* ptr = 0, unsigned cntr = 0)
        : ptr_(ptr)
        , cntr_(cntr)
    {
    }

    anchor_ptr(__int64 raw)
    {
        new (this) anchor_ptr (*(anchor_ptr*)&raw);
    }

    operator __int64 () const
    {
        return *(__int64*)this;
    }
};

// an anchor relates to a (queue, producer) pair,
// so in an environment with N threads and N queues
// there must be N*(N-1) anchors;
// an anchor is no more than an spsc queue
struct anchor
{
    anchor_ptr next_;
    node* tail_;
    char cl_pad1_[64];
    node* head_;
    bool linked_;
    anchor* anchor_list_;
    mpsc_mem_queue* parent_; // for debug
    unsigned owner_thread_; // for debug
    bool attached_;
    char cl_pad2_[64];

    anchor()
    {
        next_.ptr_ = 0;
        next_.cntr_ = 0;
        tail_ = 0;
        head_ = 0;
        linked_ = false;
        parent_ = 0;
        owner_thread_ = 0;
        anchor_list_ = 0;
        attached_ = false;
    }
};

struct mpsc_mem_queue
{
    anchor stub_;
    node stub_node_;
    anchor* tail_;
    char cl_pad1_[64];
    anchor_ptr head_;
    anchor* anchor_list_;
    unsigned owner_thread_;
    char cl_pad2_[64];

    mpsc_mem_queue()
    {
        stub_node_.next_ = 0;
        stub_.head_ = &stub_node_;
        stub_.tail_ = &stub_node_;
        tail_ = &stub_;
        head_.ptr_ = &stub_;
        head_.cntr_ = 0;
        anchor_list_ = 0;
        owner_thread_ = 0;
    }

    // must be called by the consumer thread before any dequeue
    void init()
    {
        assert(0 == owner_thread_);
        owner_thread_ = GetCurrentThreadId();
    }

    // called by a producer thread to register its anchor with this queue
    void attach_anchor(anchor* a)
    {
        assert(0 == a->parent_);
        assert(0 == a->owner_thread_);
        assert(false == a->attached_);

        a->parent_ = this;
        a->owner_thread_ = GetCurrentThreadId();
        a->attached_ = true;

        // lock-free push onto the list of attached anchors
        for (;;)
        {
            a->anchor_list_ = anchor_list_;
            anchor* old = (anchor*)_InterlockedCompareExchange
                ((long*)&anchor_list_,
                (long)a, (long)a->anchor_list_);
            if (old == a->anchor_list_)
                break;
        }
    }

    // fast path: plain stores into the producer's private spsc part
    void enqueue(anchor* a, node* n)
    {
        assert(this == a->parent_);
        assert(GetCurrentThreadId() == a->owner_thread_);
        assert(true == a->attached_);

        n->next_ = 0;
        if (a->head_)
        {
            a->head_->next_ = n;
            a->head_ = n;
            if (false == a->linked_)
                enqueue_anchor(a);
        }
        else
        {
            // the very first node stays delayed until the next enqueue
            a->head_ = n;
            a->tail_ = n;
        }
    }

    node* dequeue()
    {
        assert(GetCurrentThreadId() == owner_thread_);

        anchor* tail;
        anchor_ptr next;
        anchor_ptr head;
        for (;;)
        {
            tail = tail_;
            if (tail->tail_->next_)
            {
                // fast path: pop from the current anchor's spsc part
                node* n = tail->tail_;
                tail->tail_ = tail->tail_->next_;
                return n;
            }
            else
            {
                head = head_;
                next = tail->next_;
                if (next.ptr_)
                {
                    // current anchor is exhausted - unlink it and advance
                    if (head.ptr_ == tail)
                    {
                        _InterlockedCompareExchange64
                            ((__int64*)&head_, next, head);
                    }
                    tail->linked_ = false;
                    tail_ = next.ptr_;
                }
                else
                {
                    if (tail != &stub_)
                    {
                        // re-insert the stub so the last real anchor
                        // can be unlinked
                        enqueue_anchor(&stub_);
                        tail_ = tail->next_.ptr_;
                        tail->linked_ = false;
                    }
                    return 0;
                }
            }
        }
    }

    // MS-queue style push of an anchor onto the anchor list (DWCAS)
    void enqueue_anchor(anchor* a)
    {
        a->next_.ptr_ = 0;
        a->linked_ = true;
        for (;;)
        {
            anchor_ptr head = head_;
            anchor_ptr next = head.ptr_->next_;
            if (next.ptr_)
            {
                // help move the lagging head forward
                _InterlockedCompareExchange64
                    ((__int64*)&head_, next, head);
                continue;
            }
            else
            {
                a->next_.cntr_ = head.cntr_ + 1;
                anchor_ptr xchg(a, next.cntr_ + 1);
                anchor_ptr* dst = &head_.ptr_->next_;
                anchor_ptr old = _InterlockedCompareExchange64
                    ((__int64*)dst, xchg, next);
                if (old == next)
                {
                    _InterlockedCompareExchange64
                        ((__int64*)&head_, xchg, head);
                    break;
                }
            }
        }
    }

    // consumer-side teardown: drain the delayed nodes of all anchors
    node* finalize()
    {
        assert(GetCurrentThreadId() == owner_thread_);

        while (anchor_list_)
        {
            if (anchor_list_->tail_)
            {
                node* n = anchor_list_->tail_;
                anchor_list_->tail_ = anchor_list_->tail_->next_;
                return n;
            }
            else
            {
                anchor_list_->attached_ = false;
                anchor_list_ = anchor_list_->anchor_list_;
            }
        }
        return 0;
    }
};


Dmitriy V'jukov

Dmitriy V'jukov

Apr 1, 2008, 5:31:12 PM

On Apr 2, 1:28 am, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
> multi-producer/single-consumer memory passing queue.


Here is a simple test/usage example (it compiles under MSVC 2005):

#define _WIN32_WINNT 0x0500
#include <windows.h>
#include <process.h>
#include <intrin.h>
#include <time.h>
#include <stdlib.h>
#include <new> // placement new

#include <deque>
#include <cassert>


struct my_node : node
{
    my_node(int sender, int data)
        : sender_(sender)
        , data_(data)
    {
    }

    int sender_;
    int data_;
};

// one-shot barrier: the last thread to arrive releases the rest
class barrier
{
public:
    barrier(int count)
        : count_(count)
        , event_(CreateEvent(0, 1, 0, 0))
    {
    }

    void wait()
    {
        if (_InterlockedDecrement(&count_))
            WaitForSingleObject(event_, INFINITE);
        else
            SetEvent(event_);
    }

private:
    long count_;
    HANDLE event_;
};

// plain mutex-based mpsc queue used as the 'request' channel
class mpsc_queue
{
public:
    mpsc_queue()
    {
        InitializeCriticalSection(&cs_);
    }

    void enqueue(my_node* n)
    {
        EnterCriticalSection(&cs_);
        queue_.push_back(n);
        LeaveCriticalSection(&cs_);
    }

    my_node* dequeue()
    {
        my_node* n = 0;
        EnterCriticalSection(&cs_);
        if (queue_.size())
        {
            n = queue_.front();
            queue_.pop_front();
        }
        LeaveCriticalSection(&cs_);
        return n;
    }

private:
    std::deque<my_node*> queue_;
    CRITICAL_SECTION cs_;
};

int const thread_count = 8;
int const iter_count = 1000000;
int const batch_size = 16;

mpsc_mem_queue queues [thread_count]; // memory returned to its owner
mpsc_queue req_queues [thread_count]; // requests to other threads

barrier start (thread_count);
barrier finalize1 (thread_count);
barrier finalize2 (thread_count);
barrier stop (thread_count);


unsigned __stdcall thread_func(void* p)
{
    srand((unsigned)time(0));
    int const id = (int)p;
    anchor anchors [thread_count];
    queues[id].init();
    for (int i = 0; i != thread_count; ++i)
        queues[i].attach_anchor(&anchors[i]);

    // private heap: all allocation and freeing is thread-local
    HANDLE heap = HeapCreate(HEAP_NO_SERIALIZE, 0, 0);

    start.wait();

    for (int i = 0; i != iter_count; ++i)
    {
        if (0 == (i / batch_size) % 2)
        {
            // send a batch of nodes to random threads
            int const target = rand() % thread_count;
            my_node* n = (my_node*)HeapAlloc(heap, 0,
                sizeof(my_node));
            new (n) my_node(id, i);
            req_queues[target].enqueue(n);
        }
        else
        {
            // return received nodes to their senders...
            if (my_node* n = req_queues[id].dequeue())
            {
                int const sender = n->sender_;
                n->sender_ = id;
                queues[sender].enqueue(&anchors[sender], n);
            }

            // ...and free nodes that came back to us
            if (my_node* n = (my_node*)queues[id].dequeue())
                HeapFree(heap, 0, n);
        }
    }

    finalize1.wait();

    // drain the remaining requests back to their senders
    while (my_node* n = req_queues[id].dequeue())
    {
        int const sender = n->sender_;
        n->sender_ = id;
        queues[sender].enqueue(&anchors[sender], n);
    }

    finalize2.wait();

    // collect the delayed nodes
    while (my_node* n = (my_node*)queues[id].finalize())
        HeapFree(heap, 0, n);

    stop.wait();

    HeapDestroy(heap);

    return 0;
}

int main()
{
    srand((unsigned)time(0));
    HANDLE threads [thread_count] = {};
    for (int i = 0; i != thread_count; ++i)
        threads[i] = (HANDLE)_beginthreadex(0, 0, &thread_func,
            (void*)i, 0, 0);
    WaitForMultipleObjects(thread_count, threads, TRUE, INFINITE);
}


Dmitriy V'jukov

Chris Thomasson

Apr 1, 2008, 6:46:35 PM

"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:c08b3862-624d-4afb...@e6g2000prf.googlegroups.com...

> multi-producer/single-consumer memory passing queue.
>
> Not suitable for situations where data must be passed promptly and
> reactively. Suitable for situations where data may be passed
> eventually, in a 'best-effort' manner.

Interesting...


> The queue is designed to use in this memory allocator algorithm:
> http://groups.google.com/group/comp.programming.threads/tree/browse_frm/thread/8245f4b48591fc69
>
> The queue internally works like N single-producer/single-consumer
> queues with automatic multiplexing. But the multiplexing works in such
> a way that the dequeue operation is O(1), not O(N).

That's a pretty neat property indeed. The thing that vZOOM uses is O(N)
per-consumer where N is the number of registered producers. The consumer
iterates through its list of registered producers and attempts to dequeue
something. It's not that bad, but I would definitely like to find ways to
amortize or reduce N.


> There is always 1 item per producer delayed in the queue. Sometimes more.

I see. Well, I think that behavior will be tolerable for certain usage
patterns for sure.


> The fast path for dequeue and enqueue operations is free of atomics
> and heavy memory barriers.

[...]

Dmitriy V'jukov

Apr 1, 2008, 7:15:16 PM

On Apr 2, 2:46 am, "Chris Thomasson" <cris...@comcast.net> wrote:

> > The queue internally works like N single-producer/single-consumer
> > queues with automatic multiplexing. But the multiplexing works in such
> > a way that the dequeue operation is O(1), not O(N).
>
> That's a pretty neat property indeed. The thing that vZOOM uses is O(N)
> per-consumer where N is the number of registered producers. The consumer
> iterates through its list of registered producers and attempts to dequeue
> something. It's not that bad, but I would definitely like to find ways to
> amortize or reduce N.


I was using the same algorithm - N spsc queues + O(N) multiplexing.
But I'm looking for ways to eliminate the O(N). Because by the end of
2008 we may find ourselves using low-end servers with 32 hardware
threads (2 processors x 8 cores x 2 hardware threads), and 32 hardware
threads end up being 64/128 software threads.
Iterating through 128 queues just to see that they are all empty is
not cool indeed.

A single mpsc queue based on a reversed stack is not cool either,
because of contention on the producer side.

In this queue I was trying to combine the strengths of spsc with the
strengths of mpsc.
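
For reference, the plain O(N) multiplexing looks roughly like this (a
minimal sketch; the 'spsc_queue' type and its 'trypop' are assumed, not
code from any particular library):

struct consumer_side
{
    spsc_queue* queues_; // one spsc queue per registered producer
    unsigned count_; // N = number of registered producers
    unsigned cursor_; // round-robin scan position

    node* dequeue()
    {
        // worst case touches all N queues just to find them all empty
        for (unsigned i = 0; i != count_; ++i)
        {
            unsigned idx = (cursor_ + i) % count_;
            if (node* n = queues_[idx].trypop())
            {
                cursor_ = idx; // resume scanning here next time
                return n;
            }
        }
        return 0;
    }
};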


> > There is always 1 item per producer delayed in the queue. Sometimes more.
>
> I see. Well, I think that behavior will be tolerable for certain usage
> patterns for sure.


Yes. I was developing the queue especially for a slab allocator.
Now I am developing an analogous queue but w/o delayed items, so it
will be more suitable for general-purpose usage. I think I can make it
at the cost of 1 StoreLoad barrier on the producer and on the consumer
side. All I have to do is resolve the race between a concurrent
enqueue operation and a dequeue removing the anchor from the queue.


Dmitriy V'jukov

Chris Thomasson

Apr 2, 2008, 12:13:40 AM

"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:207edf36-e8c2-41d4...@i36g2000prf.googlegroups.com...

You can help eliminate some threads by relying on programmer inputs. That
is, if the API lets you. For some applications, you do not need to maintain
a complete network such that an arbitrary thread can communicate with any
other arbitrary thread present in the "system". A producer only registers
with a consumer if it really needs to communicate with it for an "extended
period of time". So the thread-to-thread communication web is at least
customized to an end-user's programming requirements. This would be a part
of the "high-level" API usage-pattern which is designed to help reduce
N...

Dmitriy V'jukov

Apr 2, 2008, 4:59:12 AM

On Apr 2, 8:13 am, "Chris Thomasson" <cris...@comcast.net> wrote:


> You can help eliminate some threads by relying on programmer inputs. That
> is, if the API lets you. For some applications, you do not need to maintain
> a complete network such that an arbitrary thread can communicate with any
> other arbitrary thread present in the "system". A producer only registers
> with a consumer if it really needs to communicate with it for an "extended
> period of time". So the thread-to-thread communication web is at least
> customized to an end-user's programming requirements. This would be a part
> of the "high-level" API usage-pattern which is designed to help reduce
> N...


In some situations this can help, for example in a staged/pipelined
architecture.
In some situations this doesn't help, for example in a language
run-time where one has N equivalent threads.

Anyway, if a library provides an API for connecting threads, then the
library must be ready for the user to set up a fully-connected network,
i.e. N^2 connections. That's the so-called worst-case scenario. One
can't rely on the user setting up no more than N connections.


Dmitriy V'jukov

Chris Thomasson

Apr 2, 2008, 5:51:44 AM

"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:8dc36022-39d8-4959...@d21g2000prf.googlegroups.com...

The system can handle a fully connected network. However, I specifically
discourage that type of setup in the usage-pattern documentation for
distributed message-passing. I encourage spending time carefully designing
your application's messaging scheme, with a heavy emphasis on trying to
avoid creating complex inter-connected communication webs. But, in the end
you're correct: the user can do what they want, and I have no control over
that.

Chris Thomasson

Apr 2, 2008, 5:58:42 PM

"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:c08b3862-624d-4afb...@e6g2000prf.googlegroups.com...

> multi-producer/single-consumer memory passing queue.
>
> Not suitable for situations where data must be passed promptly and
> reactively. Suitable for situations where data may be passed
> eventually, in a 'best-effort' manner.

[...]

Could you clarify a bit more here? Is this because of the node delay in
the slab?

Dmitriy V'jukov

Apr 3, 2008, 4:38:32 AM

On Apr 3, 1:58 am, "Chris Thomasson" <cris...@comcast.net> wrote:
> "Dmitriy V'jukov" <dvyu...@gmail.com> wrote in message


Yes. Several nodes can be delayed for an *arbitrary* amount of time
*until* the producer pushes another node.
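
To make the delay concrete, here is a minimal single-threaded sketch
using the queue from the first post (in real use the producer and the
consumer are different threads):

mpsc_mem_queue q;
anchor a;
node n1, n2;

q.init(); // consumer side
q.attach_anchor(&a); // producer side

q.enqueue(&a, &n1); // the first node only primes the anchor
assert(0 == q.dequeue()); // n1 is delayed - not visible yet

q.enqueue(&a, &n2); // the second node links the anchor into the queue
assert(&n1 == q.dequeue()); // now n1 comes out; n2 is delayed in its turn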


Dmitriy V'jukov

Dmitriy V'jukov

Apr 3, 2008, 12:25:48 PM

On Apr 2, 1:51 pm, "Chris Thomasson" <cris...@comcast.net> wrote:

> > Anyway, if a library provides an API for connecting threads, then the
> > library must be ready for the user to set up a fully-connected network,
> > i.e. N^2 connections. That's the so-called worst-case scenario. One
> > can't rely on the user setting up no more than N connections.
>
> The system can handle a fully connected network. However, I specifically
> discourage that type of setup in the usage-pattern documentation for
> distributed message-passing. I encourage spending time carefully designing
> your application's messaging scheme, with a heavy emphasis on trying to
> avoid creating complex inter-connected communication webs. But, in the end
> you're correct: the user can do what they want, and I have no control over
> that.


As far as I understand, the whole 'producer-consumer registering' thing
is only an artifact of the multiplexing of spsc queues; it's unneeded by
the end user and needed solely to reduce the overheads of multiplexing.
Right? So if we have an effective mpsc queue (one which doesn't impose
any overhead on inactive producers), 'producer-consumer registering' can
be eliminated.


Dmitriy V'jukov

Chris Thomasson

Apr 3, 2008, 9:51:36 PM

"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:ca52f47b-86b4-46c0...@i7g2000prf.googlegroups.com...

I was messing around with a highly experimental scheme that did something
crazy like:
_____________________________________________________________
#define NODE_REAL() 0x1
#define NODE_DUMMY() 0x2

struct node {
    node* m_next;
    int m_state;
    node(int state) : m_next(0), m_state(state) {}
};


static spsc_stack g_cache(new node(NODE_DUMMY()));
static spsc_queue g_queue(new node(NODE_DUMMY()));
static eventcount g_ec;


node* producer_cache_pop(int state) {
    node* n = g_cache.trypop();
    if (! n) {
        n = new node(state);
    } else {
        n->m_state = state;
    }
    return n;
}


void producer() {
    for (;;) {
        // push a real node, then a dummy that flushes it through
        g_queue.push(producer_cache_pop(NODE_REAL()));
        g_queue.push(producer_cache_pop(NODE_DUMMY()));
        g_ec.signal();
    }
}


void consumer() {
    node* n;
    for (;;) {
        while (! (n = g_queue.trypop())) {
            eventcount::count_type key = g_ec.get();
            if ((n = g_queue.trypop())) { break; }
            g_ec.wait(key);
        }
        if (! (n->m_state & NODE_DUMMY())) {
            // we have real work! ;^)
        }
        g_cache.push(n);
    }
}
_____________________________________________________________

Basically, every producer pushes, and the consumer pops, two nodes: a real
one and a dummy. The queue starts with one dummy. It would look something
like:
_____________________________________________________________
Initial state:
Q->DUMMY->NULL;


produce single item:
1: Q->DUMMY->REAL->NULL;
2: Q->DUMMY->REAL->DUMMY->NULL;


consume a single item:
1: Q->REAL->DUMMY->NULL;
2: Q->DUMMY->NULL;


Final state:
Q->DUMMY->NULL;
_____________________________________________________________


What do you think of that? It sure seems like it should work... Humm...

;^)

Dmitriy V'jukov

Apr 4, 2008, 4:54:55 AM

On Apr 4, 5:51 am, "Chris Thomasson" <cris...@comcast.net> wrote:
> "Dmitriy V'jukov" <dvyu...@gmail.com> wrote in message
>
> news:ca52f47b-86b4-46c0...@i7g2000prf.googlegroups.com...
>
>
>
> > On Apr 3, 1:58 am, "Chris Thomasson" <cris...@comcast.net> wrote:
> >> "Dmitriy V'jukov" <dvyu...@gmail.com> wrote in message
>
> >>news:c08b3862-624d-4afb...@e6g2000prf.googlegroups.com...
>
> >> > multi-producer/single-consumer memory passing queue.
>
> >> > Not suitable for situations where data must be passed promptly and
> >> > reactively. Suitable for situations where data may be passed
> >> > eventually, in a 'best-effort' manner.
>
> >> [...]
>
> >> Could you clarify a bit more here? Is this because of the node delay in
> >> the slab?
>
> > Yes. Several nodes can be delayed for an *arbitrary* amount of time
> > *until* the producer pushes another node.
>
> I was messing around with a highly experimental scheme that did something
> crazy like:
> _____________________________________________________________
> What do you think of that? It sure seems like it should work... Humm...


Is user_node+dummy_node not really the same as user_node+internal_node?


Dmitriy V'jukov

Chris Thomasson

Apr 4, 2008, 5:15:54 AM

"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:93921b99-d7a0-426f...@s19g2000prg.googlegroups.com...

Humm... The only thing I can think of is if the program "cheated" a little
and did something like this:


_____________________________________________________________
Initial state:
Q->DUMMY->NULL;


produce single item
- (actually two real items; one important, the other no so much...)
1: Q->DUMMY->REAL_IMPORTANT->NULL;
2: Q->DUMMY->REAL_IMPORTANT->NOT_THAT_IMPORTANT->NULL;


consume a single item:
1: Q->REAL_IMPORTANT->NOT_THAT_IMPORTANT->NULL;
2: Q->REAL_IMPORTANT->NULL;


Final state:
- (the final state would always be left with an item that is not that
important.)
Q->NOT_THAT_IMPORTANT->NULL;
_____________________________________________________________

lol. :^)

Other than that, well, there really is no difference, because as soon as a
program inserts the dummy node it is basically the same as an internal
allocator creating a node for every new item inserted. There is one _little_
difference I guess... That would be the elimination of the data copy to/from
the dummy node. Let's take the push function from version 2 of your
non-intrusive spsc_lifo as an example:
void push(node* n, T const& v) {
    n->link_ = 0;
    head_->data_ = v;
    _ReadWriteBarrier();
    head_->link_ = n;
    _ReadWriteBarrier();
    head_ = n;
}

You're not storing the data 'v' in the passed node 'n', therefore you must
copy 'v' over to the dummy node located in 'head_'. This is usually fine as
long as 'T' is a pointer or something very small... However, if 'T' is
larger than that, well, the copy operation gets "more expensive". AppCore
queues make the data copy from a real node to the dummy in the pop function.
However, I think this is only a minor point... Humm... I need to really take
a good look at your most recent intrusive solutions...

Chris Thomasson

Apr 4, 2008, 5:23:21 AM

"Chris Thomasson" <cri...@comcast.net> wrote in message
news:UeKdnSp9cuINbGja...@comcast.com...

> "Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
> news:93921b99-d7a0-426f...@s19g2000prg.googlegroups.com...
[...]

>> Is user_node+dummy_node not really the same as user_node+internal_node?
>
[...]

In the rules for this algorithm, you could say something like:


If you want to produce an item which must be promptly acted upon, then you
must produce two nodes into the queue: the first one represents the
important item; the second can be less important, or even a dummy.
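
In code, that rule could be a small hypothetical helper on top of the
sketch up-thread (reusing 'g_queue', 'g_ec' and 'producer_cache_pop' from
there):

void produce_prompt(node* important) {
    important->m_state = NODE_REAL();
    g_queue.push(important); // the node that must be acted upon promptly
    g_queue.push(producer_cache_pop(NODE_DUMMY())); // pushes it through
    g_ec.signal();
}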


;^)

Dmitriy V'jukov

Apr 4, 2008, 5:27:10 AM

On Apr 4, 1:15 pm, "Chris Thomasson" <cris...@comcast.net> wrote:

> > Is user_node+dummy_node not really the same as user_node+internal_node?
>
> Humm... The only thing I can think of is if the program "cheated" a little
> and did something like this:
> _____________________________________________________________
> Initial state:
> Q->DUMMY->NULL;
>
> produce single item
> - (actually two real items; one important, the other no so much...)
> 1: Q->DUMMY->REAL_IMPORTANT->NULL;
> 2: Q->DUMMY->REAL_IMPORTANT->NOT_THAT_IMPORTANT->NULL;
>
> consume a single item:
> 1: Q->REAL_IMPORTANT->NOT_THAT_IMPORTANT->NULL;
> 2: Q->REAL_IMPORTANT->NULL;
>
> Final state:
> - (the final state would always be left with an item that is not that
> important.)
> Q->NOT_THAT_IMPORTANT->NULL;
> _____________________________________________________________
>
> lol. :^)

:)


> Other than that, well, there really is no difference, because as soon as a
> program inserts the dummy node it is basically the same as an internal
> allocator creating a node for every new item inserted. There is one _little_
> difference I guess... That would be the elimination of the data copy to/from
> the dummy node. Let's take the push function from version 2 of your
> non-intrusive spsc_lifo as an example:
>
> void push(node* n, T const& v) {
>     n->link_ = 0;
>     head_->data_ = v;
>     _ReadWriteBarrier();
>     head_->link_ = n;
>     _ReadWriteBarrier();
>     head_ = n;
> }
>
> You're not storing the data 'v' in the passed node 'n', therefore you must
> copy 'v' over to the dummy node located in 'head_'. This is usually fine as
> long as 'T' is a pointer or something very small... However, if 'T' is
> larger than that, well, the copy operation gets "more expensive". AppCore
> queues make the data copy from a real node to the dummy in the pop function.
> However, I think this is only a minor point... Humm...


Yeah, all the previous spsc primitives not only allocate an additional
internal node, but also copy the user data.
The copy can be worked around this way:

T& stack::get_placeholder_for_push()
{
    return head_->data_;
}

void stack::push(node* n)
{
    n->link_ = 0;
    // assume the user has already stored the item in head_->data_
    // head_->data_ = v;

    _ReadWriteBarrier();
    head_->link_ = n;
    _ReadWriteBarrier();
    head_ = n;
}

void user_thread()
{
    T& placeholder = stack.get_placeholder_for_push();
    fill_data_inplace(placeholder);
    stack.push(allocate_node());
}

But I don't think that it's worth doing, at least in an end-user API.


> I need to really take a
> good look at your most recent intrusive solutions...


I eliminated the need for double-word atomic operations in the last
version.


Dmitriy V'jukov

Chris Thomasson

Apr 8, 2008, 5:44:04 AM

"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:ef2f1a9b-0e8b-4148...@s8g2000prg.googlegroups.com...

> On Apr 2, 1:51 pm, "Chris Thomasson" <cris...@comcast.net> wrote:
>
>> > Anyway, if a library provides an API for connecting threads, then the
>> > library must be ready for the user to set up a fully-connected network,
>> > i.e. N^2 connections. That's the so-called worst-case scenario. One
>> > can't rely on the user setting up no more than N connections.
>>
>> The system can handle a fully connected network. However, I specifically
>> discourage that type of setup in the usage-pattern documentation for
>> distributed message-passing. I encourage spending time carefully designing
>> your application's messaging scheme, with a heavy emphasis on trying to
>> avoid creating complex inter-connected communication webs. But, in the end
>> you're correct: the user can do what they want, and I have no control over
>> that.
>
>
> As far as I understand, the whole 'producer-consumer registering' thing
> is only an artifact of the multiplexing of spsc queues; it's unneeded by
> the end user and needed solely to reduce the overheads of multiplexing. Right?

Yeah. It's designed to avoid creating a fully connected network by default.
There is another way to get a pseudo fully connected network:

http://www.sicortex.com/5832_newsletter/just_plane_fast/just_plane_fast

That way, a given thread would only need a total of 6 spsc_queues (3 in, 3
out) to communicate with any other thread in the network. I would only need
to add the ability to simulate node hops. The worst-case scenario would be 2
hops, e.g., a message goes from the origin thread, to a middle thread, and
finally to the destination thread.
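
A rough sketch of what simulating those hops could look like (everything
here - the 'spsc_queue' type, the routing function, 'consume' - is an
assumption for illustration, not vZOOM code):

struct message {
    message* m_next; // queue link
    int m_dest; // final destination thread id
};

struct relay {
    int m_id;
    spsc_queue* m_in[3]; // 3 incoming spsc queues
    spsc_queue* m_out[3]; // 3 outgoing spsc queues
    int (*m_route)(int from, int to); // which out-queue leads toward 'to'

    void pump() {
        for (int i = 0; i != 3; ++i) {
            while (message* m = (message*)m_in[i]->trypop()) {
                if (m->m_dest == m_id)
                    consume(m); // reached its destination (at most 2 hops)
                else
                    m_out[m_route(m_id, m->m_dest)]->push(m); // one more hop
            }
        }
    }

    void consume(message* m);
};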
