Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

Re: atomically thread-safe Meyers singleton impl (fixed)...

3 views
Skip to first unread message

Chris Thomasson

unread,
Jul 30, 2008, 12:44:19 AM7/30/08
to
Here is the fixed version of my atomically thread-safe singleton
implementation using pthreads,
x86, MSVC and the double-checked locking pattern with some error checking
omitted for brevity:
__________________________________________________________________
#include <cstdio>
#include <cassert>
#include <cstdlib>
#include <pthread.h>

#if ! defined(_MSC_VER)
# error MSVC REQUIRED FOR NOW!
#elif (_MSC_VER > 1300)
using namespace std;
#endif


// RAII scope guard for a pthread mutex: locks on construction, unlocks
// on destruction. Logs each lock/unlock with the mutex address.
class mutex_guard {
    pthread_mutex_t* const m_mtx; // borrowed; never owned

    // Non-copyable: copying a guard would unlock the same mutex twice.
    // Declared private and left undefined (C++98-compatible idiom,
    // keeping the thread's stated VC6 portability goal).
    mutex_guard(const mutex_guard&);
    mutex_guard& operator=(const mutex_guard&);

public:
    // Acquires `mtx` for the lifetime of this object.
    explicit mutex_guard(pthread_mutex_t* const mtx)
    : m_mtx(mtx) {
        pthread_mutex_lock(m_mtx);
        printf("pthread_mutex_lock(%p);\n", (void*)m_mtx);
    }

    // Releases the mutex; throw() as destructors must not throw.
    ~mutex_guard() throw() {
        printf("pthread_mutex_unlock(%p);\n", (void*)m_mtx);
        pthread_mutex_unlock(m_mtx);
    }
};


// Hand-written atomic pointer load/store for 32-bit x86 under MSVC.
// __declspec(naked): the compiler emits no prologue/epilogue, so the
// functions consist of exactly the instructions below (cdecl calling
// convention: first arg at [ESP+4], second at [ESP+8], result in EAX).
// On x86, aligned 32-bit MOV loads/stores are atomic, loads are not
// reordered with other loads and stores are not reordered with other
// stores, which is what gives the "_acq"/"_rel" naming its meaning.
// NOTE(review): there is no compiler barrier here; as discussed later
// in this thread, MSVC may in principle move unrelated accesses across
// these calls -- confirm before relying on this in production.
namespace atomic {
// Load *arg with (x86-implied) acquire semantics; returns the pointer.
__declspec(naked)
static void*
ldptr_acq(void* volatile*) {
_asm {
// EAX = address of the pointer variable, then EAX = *EAX
MOV EAX, [ESP + 4]
MOV EAX, [EAX]
RET
}
}

// Store the second arg into *first-arg with (x86-implied) release
// semantics; returns the stored pointer (left in EAX).
__declspec(naked)
static void*
stptr_rel(void* volatile*, void* const) {
_asm {
// ECX = destination address, EAX = value; *ECX = EAX
MOV ECX, [ESP + 4]
MOV EAX, [ESP + 8]
MOV [ECX], EAX
RET
}
}
}


#if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER)
// This pthreads implementation (e.g. pthreads-win32) can statically
// initialize a recursive mutex: no run-time setup or teardown needed.
static pthread_mutex_t singleton_mtx =
PTHREAD_RECURSIVE_MUTEX_INITIALIZER;
#else
// No static recursive initializer available: the mutex is created
// lazily by singleton_mutex_static_init(). singleton_mtx_ptr doubles
// as the "initialized" flag (NULL until init completes) and is read
// through atomic::ldptr_acq, hence volatile.
static pthread_mutex_t* volatile singleton_mtx_ptr = NULL;
static pthread_mutex_t singleton_mtx;

// atexit() handler: destroys the lazily created recursive mutex.
// The assert documents the invariant that this is only registered
// after singleton_mtx_ptr was published.
static void
singleton_mutex_static_init_destroy() {
assert(singleton_mtx_ptr == &singleton_mtx);
pthread_mutex_destroy(&singleton_mtx);
printf("pthread_mutex_destroy(%p);\n", (void*)&singleton_mtx);
}
#endif


// Returns the process-wide recursive mutex guarding the singleton
// slow path, creating it on first use when the platform lacks
// PTHREAD_RECURSIVE_MUTEX_INITIALIZER. The mutex must be recursive
// because a singleton's constructor may itself request other
// singletons (re-entering the slow path on the same thread).
static pthread_mutex_t*
singleton_mutex_static_init() {
pthread_mutex_t* mtx;
#if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER)
// Statically initialized: nothing to do but hand it out.
mtx = &singleton_mtx;
#else
// Double-checked locking: fast-path atomic load of the published
// pointer...
mtx = (pthread_mutex_t*)atomic::ldptr_acq(
(void* volatile*)&singleton_mtx_ptr
);
if (! mtx) {
// ...slow path under a statically initializable (non-recursive)
// sentinel mutex, which serializes first-time construction.
static pthread_mutex_t this_mtx_sentinel =
PTHREAD_MUTEX_INITIALIZER;
mutex_guard lock(&this_mtx_sentinel);
// Re-check under the lock: another thread may have won the race.
if (! (mtx = singleton_mtx_ptr)) {
pthread_mutexattr_t mattr;
pthread_mutexattr_init(&mattr);
pthread_mutexattr_settype(&mattr, PTHREAD_MUTEX_RECURSIVE);
pthread_mutex_init(&singleton_mtx, &mattr);
pthread_mutexattr_destroy(&mattr);
atexit(singleton_mutex_static_init_destroy);
// Publish with release semantics only after the mutex is fully
// initialized, so fast-path readers see a usable mutex.
mtx = (pthread_mutex_t*)atomic::stptr_rel(
(void* volatile*)&singleton_mtx_ptr, &singleton_mtx
);
printf("pthread_mutex_init(%p);\n", (void*)mtx);
}
}
#endif
assert(mtx);
return mtx;
}


// Thread-safe lazy singleton via the double-checked locking pattern.
// One T instance per program; returns a stable non-NULL pointer.
template<typename T>
struct singleton {
static T* instance() {
// NULL is a constant, so this function-local static is zero-
// initialized statically (before any thread runs) -- see the
// discussion later in this thread; no dynamic-init race here.
static T* volatile this_ptr = NULL;
// Fast path: atomic acquire-load of the published instance pointer.
T* ptr = (T*)atomic::ldptr_acq((void* volatile*)&this_ptr);
if (! ptr) {
// Slow path: serialize construction under the shared recursive
// mutex (recursive so T's ctor may itself request singletons).
mutex_guard lock(singleton_mutex_static_init());
// Re-check under the lock: another thread may have constructed it.
if (! (ptr = this_ptr)) {
static T this_instance;
// Publish with release semantics only after construction, so
// fast-path readers never observe a partially built T.
ptr = (T*)atomic::stptr_rel(
(void* volatile*)&this_ptr, &this_instance
);
}
}
assert(ptr);
return ptr;
}
};


// Trivial probe type: logs its own address on construction and
// destruction so singleton lifetime can be traced in the output.
struct foo {
    foo() {
        printf("(%p)->foo::foo();\n", (void*)this);
    }

    ~foo() throw() {
        printf("(%p)->foo::~foo();\n", (void*)this);
    }
};


struct foo1 {
foo1() {
foo* ptr1 = singleton<foo>::instance();
foo* ptr2 = singleton<foo>::instance();
foo* ptr3 = singleton<foo>::instance();
assert(ptr1 == ptr2 && ptr2 == ptr3);
printf("(%p)->foo1::foo1();\n", (void*)this);
}

~foo1() throw() {
printf("(%p)->foo1::~foo1();\n", (void*)this);
}
};


// Second trivial probe type: logs construction/destruction with its
// address; used to verify independent singleton slots per type.
struct foo2 {
    foo2() {
        printf("(%p)->foo2::foo2();\n", (void*)this);
    }

    ~foo2() throw() {
        printf("(%p)->foo2::~foo2();\n", (void*)this);
    }
};


int main() {
foo1* ptr1 = singleton<foo1>::instance();
foo1* ptr2 = singleton<foo1>::instance();
foo1* ptr3 = singleton<foo1>::instance();
foo2* ptr11 = singleton<foo2>::instance();
foo2* ptr22 = singleton<foo2>::instance();
foo2* ptr33 = singleton<foo2>::instance();
assert(ptr1 == ptr2 && ptr2 == ptr3);
assert(ptr11 == ptr22 && ptr22 == ptr33);
return 0;
}
__________________________________________________________________

I think this is about as good as I can do. It uses a single recursive mutex
as a guard for the singleton slow-path. This is needed because a singleton
can contain other singletons in their ctors. The pthread-win32 library
features a `PTHREAD_RECURSIVE_MUTEX_INITIALIZER' definition which statically
initializes a recursive mutex. However, I don't think that this is standard.
Therefore, the code will automatically compensate for this if it is not
defined. This means that this singleton will work even if threads are
created before main. Also, it should be rather trivial to convert this over
to GCC and Linux. All you would need to do is create the atomic functions
in AT&T inline assembler syntax.


Any thoughts on this approach?

Chris Thomasson

unread,
Jul 30, 2008, 1:19:22 AM7/30/08
to
"Chris Thomasson" <x...@xxx.xxx> wrote in message
news:g6orcf$q2d$1...@aioe.org...

> Here is the fixed version of my atomically thread-safe singleton
> implementation using pthreads,
> x86, MSVC and the double-checked locking pattern with some error checking
> omitted for brevity:
> __________________________________________________________________
[...]

> __________________________________________________________________
>
>
>
> I think this is about as good as I can do. It uses a single recursive
> mutex as a guard for the singleton slow-path. This is needed because a
> singleton can contain other singletons in there ctor's. The pthread-win32
> library features a `PTHREAD_RECURSIVE_MUTEX_INITIALIZER' definition which
> statically initialized a recursive mutex. However, I don't think that this
> is standard. Therefore, the code will automatically compensate for this if
> it is not defined. This means that this singleton will work even if
> threads are created before main. Also, it should be rather trivial to
> convert this over to GCC and Linux. Alls you would need to do is create
> the atomic functions in AT&T inline assembler syntax.
>
>
> Any thoughts on this approach?


I think the only possible way to break this would be to do something
EXTREMELY STUPID like:

struct foo {
foo() {
foo* f = singleton<foo>::instance();
}
};


which would be analogous to doing:

struct foo {
foo() {
static foo f;
}
};


For now, AFAICT this thread-safe singleton is looking fairly bullet-proof.
Humm...


Can you think of a way to bust it?


Thanks.

anon

unread,
Jul 30, 2008, 3:18:43 AM7/30/08
to
Chris Thomasson wrote:
> #if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER)
> static pthread_mutex_t singleton_mtx =
> PTHREAD_RECURSIVE_MUTEX_INITIALIZER;
> #else
> static pthread_mutex_t* volatile singleton_mtx_ptr = NULL;
> static pthread_mutex_t singleton_mtx;
>

Not related to threads, but it might happen, depends how is it implemented:
http://www.parashift.com/c++-faq-lite/ctors.html#faq-10.12

Anthony Williams

unread,
Jul 30, 2008, 3:20:54 AM7/30/08
to
You need compiler barriers (_ReadWriteBarrier() in MSVC) to ensure
things don't get rearranged across your atomic access
functions. There's no need to drop to assembler either: you're not
doing anything more complicated than a simple MOV.

Anyway, if I was writing this (and I wouldn't be, because I really
dislike singletons), I'd just use boost::call_once. It doesn't use a
lock unless it has to and is portable across pthreads and win32
threads.

Oh, and one other thing: you don't need inline assembler for atomic
ops with gcc from version 4.2 onwards, as the compiler has built-in
functions for atomic operations.

Anthony
--
Anthony Williams | Just Software Solutions Ltd
Custom Software Development | http://www.justsoftwaresolutions.co.uk
Registered in England, Company Number 5478976.
Registered Office: 15 Carrallack Mews, St Just, Cornwall, TR19 7UL

Dmitriy V'jukov

unread,
Jul 30, 2008, 4:39:50 AM7/30/08
to


This can happen only with dynamic initialization, not with static. All
static initialization *guaranteed* to happen before all dynamic
initialization.
pthread_mutex_t must not have default ctor, because it has struct
initialization syntax "=PTHREAD_MUTEX_INITIALIZER", hense it will be
initialized statically.


Dmitriy V'jukov

Dmitriy V'jukov

unread,
Jul 30, 2008, 4:48:18 AM7/30/08
to
On 30 июл, 11:20, Anthony Williams <anthony....@gmail.com> wrote:
> You need compiler barriers (_ReadWriteBarrier() in MSVC) to ensure
> things don't get rearranged across your atomic access
> functions. There's no need to drop to assembler either: you're not
> doing anything more complicated than a simple MOV.


In MSVC one can use just volatile variables, accesses to volatile
variables guaranteed to be 'load-acquire' and 'store-release'. I.e. on
Itanium/PPC MSVC will emit hardware memory fences (along with compiler
fences).

http://msdn.microsoft.com/en-us/library/12a04hfd.aspx

Dmitriy V'jukov

James Kanze

unread,
Jul 30, 2008, 4:59:44 AM7/30/08
to
On Jul 30, 6:44 am, "Chris Thomasson" <x...@xxx.xxx> wrote:

[...]


> Any thoughts on this approach?

I'm not familiar enough with the modern x86 architecture and
VC++'s implementation of it to judge, but my first reaction is,
why bother? It's fairly easy to ensure (in practice) that the
first initialization of the singleton occurs either before main
(and thus, hopefully, before threads have started), or in a
protected environment (during dynamic loading, before the loaded
memory is made visible to threads other than the one doing the
loading). It's also fairly easy to fuse the mutex lock with
any mutex lock needed to use the singleton (so its effective
cost is 0). And of course, acquiring a mutex lock when there is
no contention is very, very fast anyway.

--
James Kanze (GABI Software) email:james...@gmail.com
Conseils en informatique orientée objet/
Beratung in objektorientierter Datenverarbeitung
9 place Sémard, 78210 St.-Cyr-l'École, France, +33 (0)1 30 23 00 34

Dmitriy V'jukov

unread,
Jul 30, 2008, 5:36:39 AM7/30/08
to
On 30 июл, 12:59, James Kanze <james.ka...@gmail.com> wrote:

> I'm not familiar enough with modern the x86 architecture and
> VC++'s implementation of it to judge, but my first reaction is,
> why bother? It's fairly easy to ensure (in practice) that the
> first initialization of the singleton occurs either before main
> (and thus, hopefully, before threads have started)


For example this way :)
http://groups.google.ru/group/comp.lang.c++.moderated/msg/e6c403a43169a8de


Dmitriy V'jukov

Anthony Williams

unread,
Jul 30, 2008, 5:44:11 AM7/30/08
to

I am aware of this. However, the description only says it orders
accesses to "global and static data", and that it refers to objects
*declared* as volatile. I haven't tested it enough to be confident
that it is entirely equivalent to _ReadWriteBarrier(), and that it
works on variables *cast* to volatile.

Chris M. Thomasson

unread,
Jul 30, 2008, 5:54:11 AM7/30/08
to
Sorry for the nasty top-post, however, I need to let you all know that I
want this code to work on VC 6.0. Why? Well, I don't have a soluble answer
for this... Well, there has to be somebody who still uses the ancient
relic... If you want to flame me; so be it!

:^o


"Anthony Williams" <antho...@gmail.com> wrote in message
news:umyjz6...@gmail.com...


> You need compiler barriers (_ReadWriteBarrier() in MSVC) to ensure
> things don't get rearranged across your atomic access
> functions.


Okay. Is there documentation that MSVC will explicitly rearrange things
across a function declared as __declspec(naked) which contains nothing but
assembly?

http://groups.google.com/group/comp.programming.threads/browse_frm/thread/29ea516c5581240e
(read all...)


I suspect that there is not, therefore, I should really make the code as a
__separately_ASSEMBLED_library__; I am probably misguided on that...
However, I could use AppCore, but the website is totally dead; it will be
resurrected in less than 48 hours...


> There's no need to drop to assembler either: you're not
> doing anything more complicated than a simple MOV.

The functions contained within the `atomic' namespace are IMVHO, "divine",
in the sense that I was under the impression that MSVC would not perform
introspection into the instructions which make up such functions. I thought
that said functions would be treated as an externally assembled with
link-time optimizations turned off... Well, how useful would the `_asm'
keyword be if the compiler could say, basically, "F-YOU" I know better:

http://groups.google.com/group/comp.programming.threads/msg/0afc1109d18c2991

http://groups.google.com/group/comp.programming.threads/msg/fd3b4b5a5cd7841e

OUCH!!!

:^O


> Anyway, if I was writing this (and I wouldn't be, because I really
> dislike singletons)

:^D


> , I'd just use boost::call_once.

Sure. However, IMVHO, that does not teach anybody anything wrt the MEAT of
the algorithm at hand. For some reason, I think that could possibly be
fairly important within certain contexts... Perhaps I am living a
pipe-dream...

;^(...


> It doesn't use a

> lock unless it has to and is portable across pthreads and win32
> threads.

The code I posted does not use a lock unless it absolutely has to because it
attempts to efficiently take advantage of the double checked locking
pattern. AFAICT, ASM is the only "universal" way to implement that
particular pattern across multiple architectures. Does compiler X for arch Y
directly support DCL? The answer is NO. Therefore programmers MUST take
advantage of ABI's and ASM to go ahead and implement code and the API's that
declare these types of sensitive algorithm arch-dependant implementations.


> Oh, and one other thing: you don't need inline assembler for atomic
> ops with gcc from version 4.2 onwards, as the compiler has built-in
> functions for atomic operations.

GREAT! I have NASTY habit... I like to make my code compatible with several
previous versions of any given compiler, API, ABI platform. Therefore, the
GCC version of this singleton should work WELL with GCC versions < 4.2.

;^o


Does that make any sense? Please elaborate because I LOVE to LEARN NEW
THINGS!


Thank you Anthony. I really do appreciate your time!


Seriously.

Chris M. Thomasson

unread,
Jul 30, 2008, 5:56:12 AM7/30/08
to
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:e9e8900f-0031-4d0f-beda-> 98d4b4...@k37g2000hsf.googlegroups.com...

YES! However, there might be some retarded program which spawns threads in a
singleton before main is called. Hence, my contrived band-aid...

;^o

Chris M. Thomasson

unread,
Jul 30, 2008, 6:20:25 AM7/30/08
to

"Chris M. Thomasson" <n...@spam.invalid> wrote in message
news:OPWjk.3880$QX3....@newsfe02.iad...

> Sorry for the nasty top-post, however, I need to let you all know that I
> want this code to work on VC 6.0. Why? Well, I don't have a soluble answer
> for this... Well, there has to be somebody who still uses the ancient
> relic... If you want to flame me; so be it!
>
> :^o
>
>
>
>
> "Anthony Williams" <antho...@gmail.com> wrote in message
> news:umyjz6...@gmail.com...
>> You need compiler barriers (_ReadWriteBarrier() in MSVC) to ensure
>> things don't get rearranged across your atomic access
>> functions.
>
>
> Okay. Is there documentation that MSVC will explicitly rearrange things
> across a function declared as __declspec(naked) which contains nothing but
> assembly?
>
> http://groups.google.com/group/comp.programming.threads/browse_frm/thread/29ea516c5581240e
> (read all...)
>
>
> I suspect that there is not, therefore, I should really make the code as a
> __separately_ASSEMBLED_library__; I am probably misguided on that...

Well, it does not matter what a compiler says... If you want to avoid
problems, learn the ABI, create a separately assembled library with a C API
compatible with your platform of choice, and "decorate" said API
declarations with "magic" that disables link-time optimizations and you
should be free of "worries". Please correct me where I am going wrong...
BTW, this means that portable code be DAMNED, and specific impl per-arch,
and perhaps per-compiler, is OKAY!

;^o

> However, I could use AppCore, but the website is totally dead; it will be
> resurrected in less than 48 hours...

[...]

Anthony Williams

unread,
Jul 30, 2008, 6:50:27 AM7/30/08
to
"Chris M. Thomasson" <n...@spam.invalid> writes:

> Sorry for the nasty top-post, however, I need to let you all know that
> I want this code to work on VC 6.0. Why? Well, I don't have a soluble
> answer for this... Well, there has to be somebody who still uses the
> ancient relic... If you want to flame me; so be it!

If you really do have a client that uses VC 6, then I understand
completely. If you're just wanting to support it for the sake of it,
you must be a glutton for punishment. Oh, and why use pthread
emulation if you're targeting MSVC?

> "Anthony Williams" <antho...@gmail.com> wrote in message
> news:umyjz6...@gmail.com...
>> You need compiler barriers (_ReadWriteBarrier() in MSVC) to ensure
>> things don't get rearranged across your atomic access
>> functions.
>
>
> Okay. Is there documentation that MSVC will explicitly rearrange
> things across a function declared as __declspec(naked) which contains
> nothing but assembly?

The compiler docs say:

"The presence of an __asm block in a function affects optimization
in several ways. First, the compiler doesn't try to optimize the
__asm block itself. What you write in assembly language is exactly
what you get. Second, the presence of an __asm block affects
register variable storage. The compiler avoids enregistering
variables across an __asm block if the register's contents would
be changed by the __asm block. Finally, some other function-wide
optimizations will be affected by the inclusion of assembly
language in a function."

So, you get what you write, but I guess the compiler may still move
things across the ASM block if they are not accessed within it.

> I suspect that there is not, therefore, I should really make the code
> as a __separately_ASSEMBLED_library__;

In theory that might be subject to link-time optimization too, but
maybe not in practice.

>> There's no need to drop to assembler either: you're not
>> doing anything more complicated than a simple MOV.
>
> The functions contained within the `atomic' namespace are IMVHO,
> "divine", in the sense that I was under the impression that MSVC would
> not perform introspection into the instructions which make up such
> functions. I thought that said functions would be treated as an
> externally assembled with link-time optimizations turned off... Well,
> how useful would the `_asm' keyword be if the compiler could say,
> basically, "F-YOU" I know better:

The compiler knows which registers you use, and which variables you
access. Unless you tell it otherwise (e.g. with _ReadWriteBarrier()),
it *may* rearrange other accesses around and across the asm block. I
imagine the same is true for gcc, which is why it has the "memory"
modifier for asm blocks.

>> , I'd just use boost::call_once.
>
> Sure. However, IMVHO, that does not teach anybody anything wrt the
> MEAT of the algorithm at hand. For some reason, I think that could
> possibly be fairly important within certain contexts... Perhaps I am
> living a pipe-dream...

The algorithm used by boost::call_once on pthreads platforms is
described here:

http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2444.html

>> It doesn't use a
>> lock unless it has to and is portable across threads and win32
>> threads.
>
> The code I posted does not use a lock unless it absolutely has to
> because it attempts to efficiently take advantage of the double
> checked locking pattern.

Oh yes, I realise that: the code for call_once is similar. However, it
attempts to avoid contention on the mutex by using thread-local
storage. If you have atomic ops, you can go even further in
eliminating the mutex, e.g. using compare_exchange and fetch_add.

> AFAICT, ASM is the only "universal" way to
> implement that particular pattern across multiple architectures.

Yes, at least until compilers start providing C++0x atomics.

>> Oh, and one other thing: you don't need inline assembler for atomic
>> ops with gcc from version 4.2 onwards, as the compiler has built-in
>> functions for atomic operations.
>
> GREAT! I have NASTY habit... I like to make my code compatible with
> several previous versions of any given compiler, API, ABI
> platform. Therefore, the GCC version of this singleton should work
> WELL with GCC versions < 4.2.

Sure. I just prefer to use compiler intrinsics when I can, as then the
compiler understands what I'm doing and can optimize (or not, where
important) accordingly.

James Kanze

unread,
Jul 30, 2008, 7:23:01 AM7/30/08
to
On Jul 30, 11:36 am, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:

> For example this way :)http://groups.google.ru/group/comp.lang.c++.moderated/msg/e6c403a4316...

That's basically what I've been proposing (and doing in my own
code) for the last four or five years.

James Kanze

unread,
Jul 30, 2008, 7:26:51 AM7/30/08
to
On Jul 30, 11:56 am, "Chris M. Thomasson" <n...@spam.invalid> wrote:
> "Dmitriy V'jukov" <dvyu...@gmail.com> wrote in message

> news:e9e8900f-0031-4d0f-beda-> 98d4b43be...@k37g2000hsf.googlegroups.com...

> > On 30 июл, 12:59, James Kanze <james.ka...@gmail.com> wrote:
> > > I'm not familiar enough with modern the x86 architecture and
> > > VC++'s implementation of it to judge, but my first reaction is,
> > > why bother? It's fairly easy to ensure (in practice) that the
> > > first initialization of the singleton occurs either before main
> > > (and thus, hopefully, before threads have started)
> > For example this way :)

> >http://groups.google.ru/group/comp.lang.c++.moderated/msg/e6c403a4316...

> YES! However, there might be some retarded program which
> spawns threads in a singleton before main is called. Hence, my
> contrived band-aid...

It's not necessarily retarded, depending on what the library
which does so is doing. But such a program either 1) knows
about the singleton, and its constraints, and will ensure that
it is constructed before starting the thread (e.g. by calling
Singleton::instance() before calling pthread_create), or it
doesn't know about the singleton, and in such cases, it won't
use it in the spawned threads.

The case seems constrained enough that I wouldn't worry about
it.

Chris M. Thomasson

unread,
Jul 30, 2008, 9:34:35 AM7/30/08
to

"Anthony Williams" <antho...@gmail.com> wrote in message
news:u63qn6...@gmail.com...

> "Chris M. Thomasson" <n...@spam.invalid> writes:
[...]

> The algorithm used by boost::call_once on pthreads platforms is
> described here:
>
> http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2444.html
>
>>> It doesn't use a
>>> lock unless it has to and is portable across threads and win32
>>> threads.
>>
>> The code I posted does not use a lock unless it absolutely has to
>> because it attempts to efficiently take advantage of the double
>> checked locking pattern.
>
> Oh yes, I realise that: the code for call_once is similar. However, it
> attempts to avoid contention on the mutex by using thread-local
> storage. If you have atomic ops, you can go even further in
> eliminating the mutex, e.g. using compare_exchange and fetch_add.
[...]

Before I reply to your entire post I should point out that:

http://groups.google.com/group/comp.lang.c++.moderated/msg/e39c7aff738f9102

the Boost mechanism is not 100% portable, but is elegant in practice. It
uses a similar technique that a certain distributed reference counting
algorithm I created claims:

http://groups.google.com/group/comp.programming.threads/browse_frm/thread/1216861ac568b2be

http://groups.google.com/group/comp.programming.threads/search?hl=en&group=comp.programming.threads&q=vzoom

Not 100% portable, but _highly_ portable indeed!

Dmitriy V'jukov

unread,
Jul 30, 2008, 9:35:08 AM7/30/08
to
On Jul 30, 8:44 am, "Chris Thomasson" <x...@xxx.xxx> wrote:

> template<typename T>
> struct singleton {
> static T* instance() {
> static T* volatile this_ptr = NULL;

I think here is a little problem. this_ptr is initialized dynamically,
and this initialization is not thread-safe. So some thread can
overwrite the pointer in this_ptr with NULL.
You have to make this_ptr global, not function local, so it will be
initialized with NULL statically before any user code is executed.

Dmitriy V'jukov

Anthony Williams

unread,
Jul 30, 2008, 9:42:36 AM7/30/08
to
"Dmitriy V'jukov" <dvy...@gmail.com> writes:

Initialization with a constant is still static initialization, even
for function locals.

Anthony Williams

unread,
Jul 30, 2008, 9:47:17 AM7/30/08
to
"Chris M. Thomasson" <n...@spam.invalid> writes:

> "Anthony Williams" <antho...@gmail.com> wrote in message
> news:u63qn6...@gmail.com...
>> "Chris M. Thomasson" <n...@spam.invalid> writes:
> [...]
>> The algorithm used by boost::call_once on pthreads platforms is
>> described here:
>>
>> http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2444.html
>>
>>>> It doesn't use a
>>>> lock unless it has to and is portable across threads and win32
>>>> threads.
>>>
>>> The code I posted does not use a lock unless it absolutely has to
>>> because it attempts to efficiently take advantage of the double
>>> checked locking pattern.
>>
>> Oh yes, I realise that: the code for call_once is similar. However, it
>> attempts to avoid contention on the mutex by using thread-local
>> storage. If you have atomic ops, you can go even further in
>> eliminating the mutex, e.g. using compare_exchange and fetch_add.
> [...]
>
> Before I reply to your entire post I should point out that:
>
> http://groups.google.com/group/comp.lang.c++.moderated/msg/e39c7aff738f9102
>
> the Boost mechanism is not 100% portable, but is elegant in
> practice.

Yes. If you look at the whole thread, you'll see a comment by me there
where I admit as much.

> It uses a similar technique that a certain distributed
> reference counting algorithm I created claims:

I wasn't aware that you were using something similar in vZOOM.

Chris M. Thomasson

unread,
Jul 30, 2008, 9:51:14 AM7/30/08
to
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:c7877461-e97b-40cf...@u12g2000prd.googlegroups.com...

Okay. However, by that logic and POSIX compiler magic aside for a moment,
the following is not "technically" thread-safe:


void foo() {
static pthread_mutex_t this_mtx = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_lock(&this_mtx);
[...];
pthread_mutex_unlock(&this_mtx);
}


unless the `pthread_mutex_lock()' and `pthread_mutex_unlock()' function
calls explicitly handle the condition you describe; where am I going wrong?

or even:


void foo() {
static atomic_word volatile this_mtx = 0;

while (ATOMIC_SWAP(&this_mtx, 1)) {
sched_yield();
}

[...];

ATOMIC_SWAP(&this_mtx, 0);
}


Could the above deadlock? Or encounter multiple threads within the
critical-section?

Chris M. Thomasson

unread,
Jul 30, 2008, 10:21:30 AM7/30/08
to
"Anthony Williams" <antho...@gmail.com> wrote in message
news:uhca74...@gmail.com...

Does the following line:

__thread fast_pthread_once_t _fast_pthread_once_per_thread_epoch;

explicitly set `_fast_pthread_once_per_thread_epoch' to zero? If so, is it
guaranteed?


>> It uses a similar technique that a certain distributed
>> reference counting algorithm I created claims:
>
> I wasn't aware that you were using something similar in vZOOM.

Humm, now that I think about it, it seems like I am totally mistaken. The
"most portable" version of vZOOM relies on an assumption that pointer
load/stores are atomic and the unlocking of a mutex executes at least a
release-barrier, and the loading of a shared variable executes at least a
data-dependant load-barrier; very similar to RCU without the explicit
#LoadStore | #StoreStore before storing into a shared pointer location...
Something like:


____________________________________________________________________
struct foo {
int a;
};


static foo* shared_f = NULL;


// single producer thread {
foo* local_f = new foo;
pthread_mutex_t* lock = get_per_thread_mutex();
pthread_mutex_lock(lock);
local_f->a = 666;
pthread_mutex_unlock(lock);
shared_f = local_f;
}


// single consumer thread {
foo* local_f;
while (! (local_f = shared_f)) {
sched_yield();
}
assert(local_f->a == 666);
delete local_f;
}
____________________________________________________________________

If the `pthread_mutex_unlock()' function does not execute at least a
release-barrier in the producer, and if the load of the shared variable does
not execute at least a data-dependant load-barrier in the consumer, the
"most portable" version of vZOOM will NOT work on that platform in any way
shape or form, it will need a platform-dependant version. However, the only
platform I can think of where the intra-node memory visibility requirements
do not hold is the Alpha... For multi-node super-computers, inter-node
communication is adapted to using MPI.

Chris M. Thomasson

unread,
Jul 30, 2008, 10:25:25 AM7/30/08
to
"Anthony Williams" <antho...@gmail.com> wrote in message
news:uljzj4...@gmail.com...

> "Dmitriy V'jukov" <dvy...@gmail.com> writes:
>
>> On Jul 30, 8:44 am, "Chris Thomasson" <x...@xxx.xxx> wrote:
>>
>>> template<typename T>
>>> struct singleton {
>>> static T* instance() {
>>> static T* volatile this_ptr = NULL;
>>
>> I think here is a little problem. this_ptr is initialized dynamically,
>> and this initialization is not thread-safe. So some thread can
>> overwrite pointer in this_ptr with NULL.
>> You have to made this_ptr global, not function local, so it will be
>> initialized with NULL statically before any user code is executed.
>
> Initialization with a constant is still static initialization, even
> for function locals.

That's what I always thought. However, perhaps he is thinking along the
lines of:


struct foo {
foo() {
puts("HELLO!");
}
};


void static_me_or_not(int flag) {
if (flag == 666) {
static foo x;
}
}

/* program 1 */
int main() {
static_me_or_not(1);
static_me_or_not(2);
static_me_or_not(3);
static_me_or_not(4);
return 0;
}


/* program 2 */
int main() {
static_me_or_not(1);
static_me_or_not(2);
static_me_or_not(666);
static_me_or_not(4);
return 0;
}


The execution of program 1 will not print "HELLO!", however, the execution
of program 2 will... Sounds dynamic...

Chris M. Thomasson

unread,
Jul 30, 2008, 10:29:33 AM7/30/08
to
"Chris M. Thomasson" <n...@spam.invalid> wrote in message
news:0i_jk.6451$1N1....@newsfe07.iad...

> "Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
> news:c7877461-e97b-40cf...@u12g2000prd.googlegroups.com...
>> On Jul 30, 8:44 am, "Chris Thomasson" <x...@xxx.xxx> wrote:
>>
>>> template<typename T>
>>> struct singleton {
>>> static T* instance() {
>>> static T* volatile this_ptr = NULL;
>>
>> I think here is a little problem. this_ptr is initialized dynamically,
>> and this initialization is not thread-safe. So some thread can
>> overwrite pointer in this_ptr with NULL.
>> You have to make this_ptr global, not function local, so it will be
>> initialized with NULL statically before any user code is executed.
>
> Okay.

[...]

I see:

http://groups.google.com/group/comp.programming.threads/msg/8c85c0c63831c6fd

Anthony Williams

unread,
Jul 30, 2008, 10:27:54 AM7/30/08
to

The algorithm assumes it does, but it depends which compiler you
use. In the Boost implementation, the value is explicitly
initialized (to ~0 --- I found it worked better with exception
handling to count backwards).

>
>>> It uses a similar technique that a certain distributed
>>> reference counting algorithm I created claims:
>>
>> I wasn't aware that you were using something similar in vZOOM.
>
> Humm, now that I think about it, it seems like I am totally
> mistaken. The "most portable" version of vZOOM relies on an assumption
> that pointer load/stores are atomic and the unlocking of a mutex
> executes at least a release-barrier, and the loading of a shared
> variable executes at least a data-dependant load-barrier; very similar
> to RCU without the explicit #LoadStore | #StoreStore before storing
> into a shared pointer location... Something like:
>

> // single producer thread {
> foo* local_f = new foo;
> pthread_mutex_t* lock = get_per_thread_mutex();
> pthread_mutex_lock(lock);
> local_f->a = 666;
> pthread_mutex_unlock(lock);
> shared_f = local_f;

So you're using the lock just for the barrier properties. Interesting
idea.

Anthony Williams

unread,
Jul 30, 2008, 10:29:32 AM7/30/08
to
"Chris M. Thomasson" <n...@spam.invalid> writes:

> "Anthony Williams" <antho...@gmail.com> wrote in message
> news:uljzj4...@gmail.com...
>> "Dmitriy V'jukov" <dvy...@gmail.com> writes:
>>
>>> On Jul 30, 8:44 am, "Chris Thomasson" <x...@xxx.xxx> wrote:
>>>
>>>> template<typename T>
>>>> struct singleton {
>>>> static T* instance() {
>>>> static T* volatile this_ptr = NULL;
>>>
>>> I think here is a little problem. this_ptr is initialized dynamically,
>>> and this initialization is not thread-safe. So some thread can
>>> overwrite pointer in this_ptr with NULL.
>>> You have to make this_ptr global, not function local, so it will be
>>> initialized with NULL statically before any user code is executed.
>>
>> Initialization with a constant is still static initialization, even
>> for function locals.
>
> That's what I always thought. However, perhaps he is thinking along
> the lines of:
>
>
> struct foo {
> foo() {
> puts("HELLO!");
> }
> };
>
>
> void static_me_or_not(int flag) {
> if (flag == 666) {
> static foo x;
> }
> }

Constructors are always dynamic initialization.

James Kanze

unread,
Jul 30, 2008, 10:34:13 AM7/30/08
to
On Jul 30, 4:25 pm, "Chris M. Thomasson" <n...@spam.invalid> wrote:
> "Anthony Williams" <anthony....@gmail.com> wrote in message

> news:uljzj4...@gmail.com...

Maybe because it is dynamic. Initializing an object with class
type and a user defined constructor is dynamic initialization.
Initializing a pointer with NULL is static initialization. Of
course, the current standard doesn't make this distinction for
local objects; it doesn't have to, since a local object cannot
be seen before the control flow has been executed, and there's
no way a conforming program could tell. Presumably now that
threads are being added to the standard, the distinction will
apply here as well (but I'm not sure that anyone has thought to
look at it yet).

Dmitriy V'jukov

unread,
Jul 30, 2008, 10:45:22 AM7/30/08
to
On Jul 30, 5:42 pm, Anthony Williams <anthony....@gmail.com> wrote:

> >> template<typename T>
> >> struct singleton {
> >> static T* instance() {
> >> static T* volatile this_ptr = NULL;
>
> > I think here is a little problem. this_ptr is initialized dynamically,
> > and this initialization is not thread-safe. So some thread can
> > overwrite pointer in this_ptr with NULL.
> > You have to make this_ptr global, not function local, so it will be
> > initialized with NULL statically before any user code is executed.
>
> Initialization with a constant is still static initialization, even
> for function locals.

Yes, I'm totally wrong here. Forgot about this.

Dmitriy V'jukov

James Kanze

unread,
Jul 30, 2008, 10:48:15 AM7/30/08
to

I've looked it up. Interestingly, C gives no real guarantee,
presumably because initialization of an object with static
storage duration can only be with constant expressions, and
without threads, there's no way for a program to see when it
took place. C++ addresses the issue in §3.6.2; the name of the
section is "Initialization of non-local objects", but the
first paragraph speaks of "Objects with static storage
duration", with no requirement that they be non-local, and so
applies here: "Objects with static storage duration (3.7.1)
shall be zero-initialized (8.5) before any other initialization
takes place. [..] Together, zero-initialization and constant
initialization are called static initialization; all other
initialization is dynamic initialization. Static initialization
shall be performed before any dynamic initialization takes
place." And there's no way you're going to start a thread from
static initialization (which requires constant expressions).

Anthony Williams

unread,
Jul 30, 2008, 10:55:34 AM7/30/08
to
James Kanze <james...@gmail.com> writes:

> C++ addresses the issue in §3.6.2; the name of the
> section is "Initialization of non-local objects", but it the
> first paragraph speaks of "Objects with static storage
> duration", with no requirement that they be non-local, and so
> applies here: "Objects with static storage duration (3.7.1)
> shall be zero-initialized (8.5) before any other initialization
> takes place. [..] Together, zero-initialization and constant
> initialization are called static initialization; all other
> initialization is dynamic initialization. Static initialization
> shall be performed before any dynamic initialization takes
> place." And there's no way you're going to start a thread from
> static initialization (which requires constant expressions).

6.7/4 of the C++ Standard describes initialization of local objects
with static storage duration:

"The zero-initialization (8.5) of all local objects with static
storage duration (3.7.1) is performed before any other
initialization takes place. A local object of POD type (3.9) with
static storage duration initialized with constant-expressions is
initialized before its block is first entered. An implementation
is permitted to perform early initialization of other local
objects with static storage duration under the same conditions
that an implementation is permitted to statically initialize an
object with static storage duration in namespace scope
(3.6.2). Otherwise such an object is initialized the first time
control passes through its declaration; such an object is
considered initialized upon the completion of its
initialization. If the initialization exits by throwing an
exception, the initialization is not complete, so it will be tried
again the next time control enters the declaration. If control
re-enters the declaration (recursively) while the object is being
initialized, the behavior is undefined."

Chris M. Thomasson

unread,
Jul 30, 2008, 11:37:49 AM7/30/08
to

"Anthony Williams" <antho...@gmail.com> wrote in message
news:ud4kv4...@gmail.com...

> "Chris M. Thomasson" <n...@spam.invalid> writes:
[...]

Yes. Actually, I did not show the whole algorithm. The code above is busted
because I forgot to show it all; STUPID ME!!! Its busted because the store
to shared_f can legally be hoisted up above the unlock. Here is the whole
picture... Each thread has a special dedicated mutex which is locked from
its birth... Here is exactly how production of an object can occur:


static foo* volatile shared_f = NULL;

// single producer thread {
00: foo* local_f;
01: pthread_mutex_t* const mem_mutex = get_per_thread_mem_mutex();
02: local_f = new foo;
03: local_f->a = 666;
04: pthread_mutex_unlock(mem_mutex);
05: pthread_mutex_lock(mem_mutex);
06: shared_f = local_f;
}


Here are the production rules wrt POSIX:

1. Steps 02-03 CANNOT sink below step 04
2. Step 06 CANNOT rise above step 05
3. vZOOM assumes that step 04 has a release barrier

Those __two__guarantees__and__single__assumption__ ensure the ordering and
visibility of the operations is correct. After that, the consumer can do:


// single consumer thread {
00: foo* local_f;
01: while (! (local_f = shared_f)) {
02: sched_yield();
}
03: assert(local_f->a == 666);
04: delete local_f;
}


Consumption rules:

01: vZOOM assumes that the load from `shared_f' will have implied
data-dependent load-barrier.

BTW, here is a brief outline of how the "most portable" version of vZOOM
distributed reference counting works with the above idea:

http://groups.google.ru/group/comp.programming.threads/msg/59e9b6e427b4a144

http://groups.google.com/group/comp.programming.threads/browse_frm/thread/fe24fe99f742ce6e
(an __excellent__ question from Dmitriy...)


What do you think Anthony?

Chris M. Thomasson

unread,
Jul 30, 2008, 11:40:38 AM7/30/08
to
[...]

>
> BTW, here is a brief outline of how the "most portable" version of vZOOM
> distributed reference counting works with the above idea:
>
> http://groups.google.ru/group/comp.programming.threads/msg/59e9b6e427b4a144

Take note of the per-thread memory lock. It's vital to vZOOM.

Chris M. Thomasson

unread,
Jul 30, 2008, 3:26:32 PM7/30/08
to

"Chris Thomasson" <x...@xxx.xxx> wrote in message
news:g6orcf$q2d$1...@aioe.org...
> Here is the fixed version of my atomically thread-safe singleton
> implementation using pthreads,
> x86, MSVC and the double-checked locking pattern with some error checking
> omitted for brevity:
> __________________________________________________________________
[...]
> __________________________________________________________________

>
[...]
> Any thoughts on this approach?

There is a NASTY deadlocking problem inherent with ANY initialize once
routine. It happens with my impl, boost::call_once or even POSIX
pthread_once()! There has to be some clear guidelines. Here is an example of
the problem:

http://groups.google.com/group/comp.programming.threads/msg/bfb9bc1f2517614e


Here is a proof in the form of a working program:

http://groups.google.com/group/comp.programming.threads/msg/08ffd67dcc28e118

We need rules! AFAICT, the guidelines can be as simple as:

- You cannot take a mutex in the ctor of any object which may be used as a
singleton.

or

- You cannot hold a mutex before you invoke the `instance()' function of the
singleton interface.


The problem should be solved if one of those two rules is ALWAYS followed.
Which rule do you think is the most flexible?

Thank you David Schwartz.

Dmitriy V'jukov

unread,
Jul 30, 2008, 4:54:58 PM7/30/08
to
On 30 июл, 23:26, "Chris M. Thomasson" <n...@spam.invalid> wrote:

> - You cannot take a mutex in the ctor of any object which may be used as a
> singleton.
>
> or
>
> - You cannot hold a mutex before you invoke the `instance()' function of the
> singleton interface.
>
> The problem should be solved if one of those two rules is ALWAYS followed.
> Which rule do you think is the most flexible?

First.

Dmitriy V'jukov

Chris M. Thomasson

unread,
Jul 30, 2008, 5:10:31 PM7/30/08
to
"James Kanze" <james...@gmail.com> wrote in message
news:8c6cb05a-0bec-4392...@m36g2000hse.googlegroups.com...

On Jul 30, 6:44 am, "Chris Thomasson" <x...@xxx.xxx> wrote:

[...]
> > Any thoughts on this approach?

> I'm not familiar enough with the modern x86 architecture and


> VC++'s implementation of it to judge, but my first reaction is,
> why bother?

:^D

Well, I wanted to see if I could implement a working generic DCL algorithm.
It looks like the code I posted is going to work as long as the rules are
followed:

http://groups.google.com/group/comp.lang.c++/msg/ce50b7b836b9f823


> It's fairly easy to ensure (in practice) that the
> first initialization of the singleton occurs either before main

> (and thus, hopefully, before threads have started), or in a
> protected environment (during dynamic loading, before the loaded
> memory is made visible to threads other than the one doing the
> loading).

Fair enough. However, I was thinking about somebody else using the
singleton. I don't have control over their code. On the other hand, I do
have control over the generic primitive I provide to them. Robustness is a
fairly significant factor. I want the DCL algorithm to be able to cope with
threads created before main.


> It's also fairly easy to fusion the mutex lock with
> any mutex lock needed to use the singleton (so its effective
> cost is 0).

I don't think I understand what you're getting at. Could you please elaborate
on this some more?


> And of course, acquiring a mutex lock when there is
> no contention is very, very fast anyway.

Well, most mutex implementations still will execute 2 atomic RMW's and a
#StoreLoad plus #LoadStore style memory barrier. 1 atomic op and 1 storeload
for mutex lock, and 1 atomic op and 1 loadstore for mutex unlock. AFAICT,
you can get cache ping-pong with multiple threads frequently accessing a
uncontended mutex. Say CPUS 1-6 frequently access a mutex and the execution
sequence is perfect such that the CPUS don't access it at the same time.
There still could be cache thrashing due to the frequent mutation to the
mutex internal state and stalls from the hard core memory barrier on
entry...

James Kanze

unread,
Jul 31, 2008, 4:27:26 AM7/31/08
to
On Jul 30, 11:10 pm, "Chris M. Thomasson" <n...@spam.invalid> wrote:
> "James Kanze" <james.ka...@gmail.com> wrote in message

> news:8c6cb05a-0bec-4392...@m36g2000hse.googlegroups.com...
> On Jul 30, 6:44 am, "Chris Thomasson" <x...@xxx.xxx> wrote:

> [...]

> > > Any thoughts on this approach?
> > I'm not familiar enough with the modern x86 architecture and
> > VC++'s implementation of it to judge, but my first reaction
> > is, why bother?

> :^D

> Well, I wanted to see if I could implement a working generic
> DCL algorithm. It looks like the code I posted is going to
> work as long as the rules are followed:

> http://groups.google.com/group/comp.lang.c++/msg/ce50b7b836b9f823

The problem is, of course, that to do so, you have to introduce
all sorts of complexity and portability issues (inline
assembler, etc.). Something that you should probably avoid
unless it is absolutely necessary.

> > It's fairly easy to ensure (in practice) that the
> > first initialization of the singleton occurs either before main
> > (and thus, hopefully, before threads have started), or in a
> > protected environment (during dynamic loading, before the loaded
> > memory is made visible to threads other than the one doing the
> > loading).

> Fair enough. However, I was thinking about somebody else using
> the singleton. I don't have control over their code.

It's still rather trivial to ensure initialization before main,
since *you* can also declare statics, and write code which
initializes them.

> On the other hand, I do have control over the generic
> primitive I provide to them. Robustness is a fairly
> significant factor. I want the DCL algorithm to be able to
> cope with threads created before main.

The case should be rare enough that you can require specific
actions from the client to guarantee it. (Which only apply if
the thread is going to use your singleton, of course). IMHO,
it's not worth adding complexity to the singleton (which every
user of the singleton has to pay for).

> > It's also fairly easy to fusion the mutex lock with
> > any mutex lock needed to use the singleton (so its effective
> > cost is 0).

> I don't think I understand what you're getting at. Could you
> please elaborate on this some more?

If the singleton is mutable, the client code will need a lock to
use it. The instance function acquires this lock before testing
the pointer, and returns a boost::shared_ptr, which releases it
(instead of destructing the object).

> > And of course, acquiring a mutex lock when there is
> > no contention is very, very fast anyway.

> Well, most mutex implementations still will execute 2 atomic
> RMW's and a #StoreLoad plus #LoadStore style memory barrier. 1
> atomic op and 1 storeload for mutex lock, and 1 atomic op and
> 1 loadstore for mutex unlock.

Certainly, but you're going to need some of that anyway in your
DCL algorithm. And compared to the function call, and whatever
the user is going to do with the singleton, the difference is
almost certainly negligible. It's not as if the call to the
instance() function will take place in a tight loop. (If it
does, and it causes a performance problem, the client code can
trivially hoist it out of the loop.)

> AFAICT,
> you can get cache ping-pong with multiple threads frequently
> accessing a uncontended mutex. Say CPUS 1-6 frequently access
> a mutex and the execution sequence is perfect such that the
> CPUS don't access it at the same time. There still could be
> cache thrashing due to the frequent mutation to the mutex
> internal state and stalls from the hard core memory barrier on
> entry...

Do you have an actual scenario where you can measure a
significant difference? If not, you're trying to solve
something that isn't really a problem.

Chris M. Thomasson

unread,
Aug 1, 2008, 5:47:17 AM8/1/08
to

"James Kanze" <james...@gmail.com> wrote in message
news:83d5286c-2993-4d9a...@s50g2000hsb.googlegroups.com...

On Jul 30, 11:10 pm, "Chris M. Thomasson" <n...@spam.invalid> wrote:
> "James Kanze" <james.ka...@gmail.com> wrote in message

[...]

I need to think some more on your other comments...

> > AFAICT,
> > you can get cache ping-pong with multiple threads frequently
> > accessing a uncontended mutex. Say CPUS 1-6 frequently access
> > a mutex and the execution sequence is perfect such that the
> > CPUS don't access it at the same time. There still could be
> > cache thrashing due to the frequent mutation to the mutex
> > internal state and stalls from the hard core memory barrier on
> > entry...

> Do you have an actual scenario where you can measure a
> significant difference? If not, you're trying to solve
> something that isn't really a problem.

Take a global read-write lock protecting a so-called read-mostly
data-structure. Under heavy read activity, forward progress can actually be
stalled waiting for the updates to the internal lock state to be shuttled
around the CPU's caches. If the size of the read-side critical section is
moderate, the cost of moving lock updates from one CPU's cache to another
can rival that of the critical-section itself; This can prevent readers from
truly executing in parallel.

I have seen the simple act of removing a memory barrier from a read-side
critical region give several orders of magnitude in the improvement of
performance, scalability and throughput. I have personally seen a read rate
of around 1000 reads-per-second-per-thread for rw-locks, and over a million
reads-per-second-per-thread for some forms of atomic-op and membar-free read
algorithms.

Others have had similar performance numbers:

http://groups.google.com/group/comp.programming.threads/browse_frm/thread/5d4e81b46352905d

http://www.rdrop.com/users/paulmck/RCU/lockperf.2004.01.17a.pdf

0 new messages