Portable epoch auto detection algorithm

106 views
Skip to first unread message

Dmitriy Vyukov

unread,
Jan 23, 2008, 4:42:47 PM1/23/08
to
This algorithm allows user thread to determine when all processors
execute #StoreLoad memory barrier.

Basic outline:
At system start, one thread per processor is created (let's
call those threads sys_synch_threads). The sys_synch_threads are bound
to processors - one thread per processor. When a user thread wants to
detect an epoch, it enqueues a request to every sys_synch_thread and
waits for a response. Every sys_synch_thread dequeues the request and
executes a #StoreLoad memory barrier. When the last sys_synch_thread
processes the request, the user thread is notified.

Epoch detection operation is relatively heavy. So it's not suitable
for frequent execution. But imho it's perfectly suitable for all kind
of "asymmetric" algorithms, when you want to move overhead from reader
to writer/GC.

It doesn't guarantee any kind of quiescent user state, because it's
not intrusive. It only guarantees execution of #StoreLoad. On the
other hand it's automatic and suitable for preemptive threads.

I think it can be used in such algorithms as SMR+RCU (as RCU
replacement) and vZOOM (as epoch auto detection). And I'm going to
demonstrate one more application in a couple of days.


API:

// initialization
// must be called once at process start, before any other sys_synch_* call;
// spawns one worker thread per processor (none on a uniprocessor machine)
bool sys_synch_init();
// deinitialization
// must be called once at process shutdown; joins and destroys the workers
void sys_synch_deinit();

// thread initialization
// must be called on every thread
// which executes sys_synch()
// (allocates the per-thread request record used by sys_synch())
bool sys_synch_thread_init();
// thread deinitialization
// must be called before a thread that called sys_synch_thread_init() exits
void sys_synch_thread_deinit();

// epoch detection
// when function returns
// all processors execute #StoreLoad
// (blocking call; relatively heavy - not intended for hot paths)
void sys_synch();


Implementation (no error handling for simplicity):

// per thread data
// represents request for system synchronization;
// a single instance lives in TLS, so each thread has at most
// one outstanding sys_synch() request at a time
struct sys_synch_req_t
{
// number of sys_synch_threads that have not yet processed this
// request; the worker that decrements it to 0 completes the request
long volatile remain_count_;
// 1 while the requester is still spinning; whoever moves it from
// 1 to 0 first decides whether event_ must be signaled
long volatile spin_wait_;
// auto-reset event the requester blocks on if spinning times out
HANDLE event_;
// array of proc_count_ next-links: next_[i] chains this request
// into processor i's lock-free pending-request stack
sys_synch_req_t** next_;
};

// the per-thread request record (TLS)
__declspec(thread) sys_synch_req_t sys_synch_req_tls;

// per processor data
struct sys_synch_proc_t
{
// index of the processor this record (and its worker thread) serves
unsigned proc_index_;
// lock-free stack of pending requests: pushed by requesters,
// drained wholesale by the worker with an atomic exchange
sys_synch_req_t* volatile anchor_;
// requests already taken off anchor_; touched only by the worker
sys_synch_req_t* local_anchor_;
// 1 when the worker parked itself on event_;
// requesters CAS it back to 0 before signaling the event
long volatile blocked_;
// set at shutdown to make the worker exit its loop
bool volatile need_stop_;
// auto-reset event the idle worker blocks on
HANDLE event_;
// handle of the worker thread bound to this processor
HANDLE thread_;
};

// global data
struct sys_synch_mgr_t
{
// number of processors (== number of worker threads; 1 means no workers)
unsigned proc_count_;
// how many timeslices a requester spins before blocking on its event
unsigned spin_count_;
// array of proc_count_ per-processor records
sys_synch_proc_t* procs_;
};

// the single global manager instance
sys_synch_mgr_t sys_synch_mgr;

// thread function for sys_synch_thread
// (one instance per processor, bound to it via affinity mask)
// drains the per-processor request stack; the interlocked RMW
// operations themselves provide the required #StoreLoad barrier
// on this processor.
// NOTE(fix): the original exchanged the anchor through a 32-bit
// 'long', truncating pointers on Win64; _InterlockedExchangePointer
// is the pointer-sized primitive.
static unsigned long __stdcall sys_synch_thread(void* p)
{
    sys_synch_proc_t* proc = (sys_synch_proc_t*)p;
    for (;;)
    {
        if (proc->need_stop_)
            break;
        for (;;)
        {
            // process every request grabbed so far
            while (proc->local_anchor_)
            {
                sys_synch_req_t* next =
                    proc->local_anchor_->next_[proc->proc_index_];
                // this decrement executes #StoreLoad; if we are the
                // last worker to acknowledge, complete the request
                if (0 == _InterlockedDecrement(
                    &proc->local_anchor_->remain_count_))
                {
                    // signal only if the requester already gave up
                    // spinning (spin_wait_ was 0 before our decrement)
                    if (0 != _InterlockedDecrement(
                        &proc->local_anchor_->spin_wait_))
                    {
                        SetEvent(proc->local_anchor_->event_);
                    }
                }
                proc->local_anchor_ = next;
            }
            // grab the whole pending stack in one atomic exchange
            proc->local_anchor_ = (sys_synch_req_t*)
                _InterlockedExchangePointer(
                    (void* volatile*)&proc->anchor_, 0);
            if (0 == proc->local_anchor_)
                break;
        }
        // announce that we are about to block, then re-check the
        // anchor once more to close the race with a late requester
        _InterlockedExchange(&proc->blocked_, 1);
        proc->local_anchor_ = (sys_synch_req_t*)
            _InterlockedExchangePointer(
                (void* volatile*)&proc->anchor_, 0);
        if (0 == proc->local_anchor_)
            WaitForSingleObject(proc->event_, INFINITE);
    }
    return 0;
}

bool sys_synch_init()
{
SYSTEM_INFO si = {};
GetSystemInfo(&si);
sys_synch_mgr.proc_count_ = si.dwNumberOfProcessors;
if (1 == sys_synch_mgr.proc_count_)
return true;
sys_synch_mgr.spin_count_ = 3;
sys_synch_mgr.procs_ = (sys_synch_proc_t*)
calloc(sizeof(sys_synch_proc_t), sys_synch_mgr.proc_count_);
for (unsigned i = 0; i != sys_synch_mgr.proc_count_; ++i)
{
sys_synch_mgr.procs_[i].proc_index_ = i;
sys_synch_mgr.procs_[i].event_ = CreateEventW(0, 0, 0, 0);
sys_synch_mgr.procs_[i].thread_ =
CreateThread(0, 8 * 1024, &sys_synch_thread,
&sys_synch_mgr.procs_[i], CREATE_SUSPENDED, 0);
SetThreadPriority(sys_synch_mgr.procs_[i].thread_,
THREAD_PRIORITY_ABOVE_NORMAL);
SetThreadAffinityMask(
sys_synch_mgr.procs_[i].thread_, 1 << i);
ResumeThread(sys_synch_mgr.procs_[i].thread_);
}
return true;
}

void sys_synch_deinit()
{
if (1 == sys_synch_mgr.proc_count_)
return;
for (unsigned i = 0; i != sys_synch_mgr.proc_count_; ++i)
{
sys_synch_mgr.procs_[i].need_stop_ = true;
SetEvent(sys_synch_mgr.procs_[i].event_);
WaitForSingleObject(sys_synch_mgr.procs_[i].thread_,
INFINITE);
CloseHandle(sys_synch_mgr.procs_[i].thread_);
CloseHandle(sys_synch_mgr.procs_[i].event_);
}
free(sys_synch_mgr.procs_);
}

// thread initialization
// must be called on every thread which executes sys_synch();
// allocates this thread's request record members
bool sys_synch_thread_init()
{
    // uniprocessor: sys_synch() never touches the TLS record
    if (sys_synch_mgr.proc_count_ == 1)
        return true;
    // one next-link slot per processor: a request is linked into
    // every per-processor stack simultaneously
    sys_synch_req_tls.next_ = (sys_synch_req_t**)calloc(
        sys_synch_mgr.proc_count_, sizeof(sys_synch_req_t*));
    // auto-reset event used when the requester has to block
    sys_synch_req_tls.event_ = CreateEventW(0, 0, 0, 0);
    return true;
}

// thread deinitialization
// releases what sys_synch_thread_init() allocated for this thread
void sys_synch_thread_deinit()
{
    // uniprocessor: nothing was allocated
    if (sys_synch_mgr.proc_count_ == 1)
        return;
    CloseHandle(sys_synch_req_tls.event_);
    free(sys_synch_req_tls.next_);
}

void sys_synch()
{
if (1 == sys_synch_mgr.proc_count_)
{
// little bonus
_ReadWriteBarrier();
return;
}

sys_synch_req_tls.remain_count_ = sys_synch_mgr.proc_count_;
sys_synch_req_tls.spin_wait_ = 1;

for (unsigned i = 0; i != sys_synch_mgr.proc_count_; ++i)
{
// enqueue request for every sys_synch_thread
do
{
sys_synch_req_tls.next_[i] =
sys_synch_mgr.procs_[i].anchor_;
}
while (0 != _InterlockedCompareExchange(
(long*)&sys_synch_mgr.procs_[i].anchor_,
(long)&sys_synch_req_tls,
(long)sys_synch_req_tls.next_[i]));

// check whether we need to wake sys_synch_thread
if (1 == sys_synch_mgr.procs_[i].blocked_)
{
if (1 == _InterlockedCompareExchange(
&sys_synch_mgr.procs_[i].blocked_, 0, 1))
{
SetEvent(sys_synch_mgr.procs_[i].event_);
}
}
}

for (unsigned i = 0; i != sys_synch_mgr.spin_count_; ++i)
{
// wait in spin for a while
SwitchToThread();
if (0 == sys_synch_req_tls.spin_wait_)
return;
}

if (0 == _InterlockedCompareExchange(
&sys_synch_req_tls.spin_wait_, 0, 1))
return;

// and then block
WaitForSingleObject(sys_synch_req_tls.event_, INFINITE);
}

-----------------------------------------------------------------------------------------------------------------


What do you think?

Dmitriy V'jukov

Chris Thomasson

unread,
Jan 23, 2008, 6:40:49 PM1/23/08
to
"Dmitriy Vyukov" <dvy...@gmail.com> wrote in message
news:2243e2e8-1331-4b28...@d4g2000prg.googlegroups.com...

> This algorithm allows user thread to determine when all processors
> execute #StoreLoad memory barrier.
>
> Basic outline:
> On system start processor_number number of threads is created (let's
> call those threads sys_synch_threads). sys_synch_threads binded to
> processors - one thread per processor. When user thread wants to
> detect epoch it enqueues request to every sys_synch_thread and waits
> for response. Every sys_synch_thread dequeues request and executes
> #StoreLoad memory barrier. When last sys_synch_thread process request
> user thread is notified.
[...]

That has a patent application:

http://groups.google.com/group/comp.lang.c++/msg/878a80ea30b2e849

Dmitriy Vyukov

unread,
Jan 23, 2008, 7:31:43 PM1/23/08
to
On 24 янв, 02:40, "Chris Thomasson" <cris...@comcast.net> wrote:
> "Dmitriy Vyukov" <dvyu...@gmail.com> wrote in message


Sh#t!
Stop. You said that:
"Global Memory Fence is, well, AFICT for now, covered by prior
art on multiple fronts."
here:
http://groups.google.com/group/comp.lang.c++/msg/f93b439a1baac9d2

Prior art in public domain. Right?


Dmitriy V'jukov

Reply all
Reply to author
Forward
0 new messages