Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

A Java- / .NET-like monitor

719 views
Skip to first unread message

Bonita Montero

unread,
Nov 8, 2023, 10:28:08 PM11/8/23
to
Java and .NET have monitor objects instead of a combination of mutexes
and condition variables. The advantage of a monitor object is that when
you wait for a monitor to be signalled you can wait for that and for the
mutexe's semaphore to be unlocked in _one_ step. With a condition varia-
ble you have first to wait for the notify()-semaphore to be signalled
and for the mutexe's lock in two steps.
The below code simply has a 32 or 64 bit atomic (depending on whether the 64
bit variant is lock-free or not) and the lower half is the number of
threads waiting to enter the mutex part and the upper half is the num-
ber of threads waiting to be notified. As all threads wanting to be
notified are also waiting for the mutex the lower half is always >=
the upper half, which I check for in several asserts.
On Windows waiting to be notified and waiting to lock the mutex part
to be unlocked is done by WaitForMultipleObjects(). On Linux there's
no way to wait for multiple kernel handles to be signalled, but there
are SystemV semaphore sets which may consist of several semaphores and
you may have multiple operations to proceed atomically on this set.
The drawback of combining the mutex and condition-variable parts is
that you can't have multiple conditions associated with the same mutex.


// monitor.h

#pragma once
#if defined(_WIN32)
#define NOMINMAX
#include <Windows.h>
#elif defined(__unix__)
#include <sys/types.h>
#include <sys/sem.h>
#include <sys/stat.h>
#else
#error unsupported platform
#endif
#include <atomic>
#include <cstdint>
#include <initializer_list>
#include <limits>
#include <semaphore>
#include <type_traits>
#if defined(_WIN32)
#include "xhandle.h"
#endif

// A combined mutex + single condition variable ("monitor" in the Java/.NET
// sense). One atomic word packs two counters: the lower half counts threads
// contending for the mutex part, the upper half counts threads waiting to be
// notified. Every notification waiter also holds a slot in the lower half,
// so lower >= upper is an invariant (asserted throughout the .cpp).
struct monitor
{
	monitor();
	~monitor();
	void lock();            // recursive for the owning thread
	void unlock();
	void wait();            // caller must hold the lock non-recursively
	void notify();
	void notify_all();
private:
	// The address of this thread-local serves as a unique per-thread id.
	inline static thread_local char t_dummy;
	// Prefer a 64 bit word (32/32 split) when lock-free, else 32 bit (16/16).
	static constexpr bool USE64 = std::atomic_int64_t::is_always_lock_free;
	using atomic_word_t = std::conditional_t<USE64, uint64_t, uint32_t>;
	static constexpr unsigned BITS = USE64 ? 32 : 16;
	static constexpr atomic_word_t
		ENTER_VALUE = 1,
		// BUGFIX: the 32 bit fallback must step the UPPER half (1 << 16);
		// the original value 1 collided with ENTER_VALUE, so wait() would
		// have incremented the enter count instead of the signal count.
		SIGNAL_VALUE = USE64 ? 1ull << 32 : 1ull << 16,
		ENTER_MASK = USE64 ? (uint32_t)-1 : (uint16_t)-1,
		SIGNAL_MASK = USE64 ? (uint64_t)(uint32_t)-1 << 32 :
			(uint32_t)(uint16_t)-1 << 16;
	std::atomic<atomic_word_t> m_atomic;   // packed waiter counters (see above)
	std::atomic<char *> m_threadId;        // &t_dummy of the owning thread, or null
	std::atomic_uint32_t m_recCount;       // extra recursive acquisitions by owner
#if defined(_WIN32)
	static constexpr uint32_t SEM_MAX = std::numeric_limits<LONG>::max();
	XHANDLE
		m_xhEnterEvt,    // auto-reset event: wakes one mutex waiter
		m_xhSignalSem;   // counting semaphore: wakes notification waiters
#elif defined(__unix__)
	static constexpr uint32_t SEM_MAX = std::numeric_limits<short>::max();
	int m_sems;          // SysV semaphore set: sem 0 = mutex, sem 1 = notifications
	// semop(2) wrapper retrying on EINTR; returns 0 or -1 with errno set.
	int semop( std::initializer_list<sembuf> sems );
#endif
};

// monitor.cpp

#include <iostream>
#include <limits>
#include <system_error>
#include <cassert>
#include "monitor.h"

using namespace std;

// Construct an unlocked monitor. On Windows this creates the auto-reset
// enter event and the counting signal semaphore; on Unix a process-private
// SysV semaphore set of two semaphores. Throws system_error on failure.
// (The string literals were re-joined here: the posted version had them
// wrapped across lines, which does not compile.)
monitor::monitor() :
	m_atomic( 0 ),
	m_threadId( nullptr )
#if defined(_WIN32)
	, m_xhEnterEvt( CreateEventA( nullptr, FALSE, FALSE, nullptr ) ),
	m_xhSignalSem( CreateSemaphoreA( nullptr, 0, SEM_MAX, nullptr ) )
#elif defined(__unix__)
	, m_sems( semget( IPC_PRIVATE, 2, S_IRUSR | S_IWUSR ) )
#endif
{
#if defined(_WIN32)
	if( !m_xhEnterEvt.get() || !m_xhSignalSem.get() )
		throw system_error( GetLastError(), system_category(), "can't initialize monitor object" );
#elif defined(__unix__)
	// Caller-supplied union required by semctl(SETVAL); both semaphores start at 0.
	union semun { int val; void *p; } su;
	su.val = 0;
#if defined(__linux__)
	// Linux guarantees newly created SysV semaphores are zero-initialized,
	// so the explicit SETVAL calls are only needed elsewhere.
	if( m_sems == -1 )
#else
	if( m_sems == -1 || semctl( m_sems, 0, SETVAL, su ) == -1 || semctl( m_sems, 1, SETVAL, su ) == -1 )
#endif
		throw system_error( errno, system_category(), "can't initialize monitor object" );
#endif
}

// Destroy the monitor. Windows handles are released by the XHANDLE RAII
// wrappers; the SysV semaphore set must be removed explicitly.
monitor::~monitor()
{
#if defined(__unix__)
	// [[maybe_unused]] silences the unused-variable warning in NDEBUG builds,
	// where the assert compiles away.
	[[maybe_unused]] int ret = semctl( m_sems, 0, IPC_RMID );
	assert(ret != -1);
#endif
}

// Acquire the monitor's mutex part; recursive locking by the owner is allowed.
// Throws system_error if the recursion count or the locker count would
// overflow. (Also fixes the "montor's" typo in both error messages.)
void monitor::lock()
{
	// Fast path: we already own the lock -> just bump the recursion count.
	if( m_threadId.load( memory_order_relaxed ) == &t_dummy )
	{
		uint32_t oldRecCount = m_recCount.load( memory_order_relaxed );
		if( oldRecCount == (uint32_t)-1 )
			throw system_error( (int)errc::result_out_of_range,
				generic_category(), "monitor's recursion count saturated" );
		m_recCount.store( oldRecCount + 1, memory_order_relaxed );
		return;
	}
	// Register as one more thread contending for the mutex part (lower half).
	atomic_word_t ref = m_atomic.load( memory_order_relaxed );
	do
	{
		if( (ref & ENTER_MASK) == ENTER_MASK )
			throw system_error( (int)errc::result_out_of_range,
				generic_category(), "monitor's locker count saturated" );
		// Invariant: #mutex waiters >= #notification waiters.
		assert((ref & ENTER_MASK) >= ref >> BITS);
	} while( !m_atomic.compare_exchange_strong( ref, ref + 1,
		memory_order_acquire, memory_order_relaxed ) );
	auto initThread = [&]()
	{
		m_threadId.store( &t_dummy, memory_order_relaxed );
		m_recCount.store( 0, memory_order_relaxed );
	};
	// If every previously registered thread is merely waiting for a
	// notification, the mutex part was free: we own it without a kernel call.
	if( (ref & ENTER_MASK) == ref >> BITS ) [[likely]]
		return initThread();
	// Otherwise block until an unlocking thread wakes us.
#if defined(_WIN32)
	if( WaitForSingleObject( m_xhEnterEvt.get(), INFINITE ) != WAIT_OBJECT_0 )
		terminate();
#elif defined(__unix__)
	if( semop( { { 0, -1, 0 } } ) )
		terminate();
#endif
	initThread();
}

// Release the monitor's mutex part, honouring recursive acquisitions.
void monitor::unlock()
{
// Recursive unlock by the owner: just decrement the recursion count.
if( uint32_t rc; m_threadId.load( memory_order_relaxed ) == &t_dummy &&
(rc = m_recCount.load( memory_order_relaxed )) )
{
m_recCount.store( rc - 1, memory_order_relaxed );
return;
}
atomic_word_t ref = m_atomic.load( memory_order_relaxed );
// Only the owning thread may unlock.
assert((ref & ENTER_MASK) && m_threadId == &t_dummy);
m_threadId.store( nullptr, memory_order_relaxed );
// Deregister from the enter count (lower half).
do
// Invariant: #mutex waiters >= #notification waiters.
assert((ref & ENTER_MASK) >= ref >> BITS);
while( !m_atomic.compare_exchange_strong( ref, ref - 1,
memory_order_release, memory_order_relaxed ) );
// If we were the only registered thread nobody needs a wakeup.
if( (ref & ENTER_MASK) == 1 ) [[likely]]
return;
// Wake one thread blocked on the mutex part.
#if defined(_WIN32)
if( !SetEvent( m_xhEnterEvt.get() ) )
terminate();
#elif defined(__unix__)
if( semop( { { 0, 1, IPC_NOWAIT } } ) )
terminate();
#endif
}

// Atomically release the mutex part and wait to be notified; on return the
// calling thread owns the lock again. The notification wait and the re-lock
// happen in ONE kernel wait (WaitForMultipleObjects / a two-element semop).
void monitor::wait()
{
// Caller must own the lock non-recursively.
assert(m_threadId == &t_dummy && !m_recCount);
m_threadId.store( nullptr, memory_order_relaxed );
atomic_word_t ref = m_atomic.load( memory_order_relaxed );
// Move ourselves into the "waiting to be notified" upper half; our slot in
// the lower (enter) half is kept so re-locking is part of the same wait.
do
assert((ref & ENTER_MASK) > ref >> BITS);
while( !m_atomic.compare_exchange_strong( ref, ref + SIGNAL_VALUE,
memory_order_release, memory_order_relaxed ) );
// If other threads are blocked purely on the mutex part, hand the lock on.
if( (ref & ENTER_MASK) - (ref >> BITS) > 1 )
{
#if defined(_WIN32)
if( !SetEvent( m_xhEnterEvt.get() ) )
terminate();
#elif defined(__unix__)
if( semop( { { 0, 1, IPC_NOWAIT } } ) )
terminate();
#endif
}
// Wait for the notification AND lock ownership in a single step.
#if defined(_WIN32)
HANDLE waitFor[2] { m_xhEnterEvt.get(), m_xhSignalSem.get() };
if( WaitForMultipleObjects( 2, waitFor, TRUE, INFINITE ) != WAIT_OBJECT_0 )
terminate();
#elif defined(__unix__)
if( semop( { { 0, -1, 0 }, { 1, -1, 0 } } ) )
terminate();
#endif
// We own the lock again; re-initialize ownership bookkeeping.
m_threadId.store( &t_dummy, memory_order_relaxed );
m_recCount.store( 0, memory_order_relaxed );
}

// Wake one notification waiter, if any. Must be called with the lock held.
// BUGFIX (confirmed by the author's follow-up in this thread): semop()
// returns -1 and sets errno, so the retry condition must test
// `ret == -1 && errno == EAGAIN`, not `ret == EAGAIN`.
void monitor::notify()
{
	atomic_word_t ref = m_atomic.load( memory_order_relaxed );
	assert((ref & ENTER_MASK) > ref >> BITS && m_threadId == &t_dummy);
	uint32_t n;
	// Move one thread from the "waiting to be notified" upper half back to
	// plain mutex contention; bail out below if nobody is waiting.
	while( (n = (uint32_t)(ref >> BITS)) &&
		!m_atomic.compare_exchange_strong( ref, ref - SIGNAL_VALUE,
			memory_order_relaxed, memory_order_relaxed ) );
	if( !(ref >> BITS) )
		return;
#if defined(_WIN32)
	if( !ReleaseSemaphore( m_xhSignalSem.get(), 1, nullptr ) )
		terminate();
#elif defined(__unix__)
	int ret;
	do
		ret = semop( { { 1, 1, IPC_NOWAIT } } );
	while( ret == -1 && errno == EAGAIN );   // retry while the semaphore is saturated
	if( ret )
		terminate();
#endif
}

// Wake every current notification waiter. Must be called with the lock held.
// BUGFIX (matching the author's own correction posted later in the thread):
// the EAGAIN retry must test `ret == -1 && errno == EAGAIN`; semop() never
// returns EAGAIN directly.
void monitor::notify_all()
{
	atomic_word_t ref = m_atomic.load( memory_order_relaxed );
	assert((ref & ENTER_MASK) > ref >> BITS && m_threadId == &t_dummy);
	uint32_t n;
	// Atomically claim all signal waiters by clearing the upper half.
	while( (n = (uint32_t)(ref >> BITS)) &&
		!m_atomic.compare_exchange_strong( ref, ref & ENTER_MASK,
			memory_order_relaxed, memory_order_relaxed ) );
	// Release the claimed waiters, at most SEM_MAX per kernel call; halve
	// the batch size whenever the semaphore would overflow.
	while( n )
	{
		uint32_t nRelease = n <= SEM_MAX ? n : SEM_MAX;
#if defined(_WIN32)
		BOOL succ;
		for( ; !(succ = ReleaseSemaphore( m_xhSignalSem.get(), nRelease,
			nullptr )) && GetLastError() == ERROR_TOO_MANY_POSTS;
			nRelease = nRelease > 1 ? nRelease / 2 : nRelease );
		if( !succ )
			terminate();
#elif defined(__unix__)
		int ret;
		for( ; (ret = semop( { { 1, (short)nRelease, IPC_NOWAIT } } )) == -1
			&& errno == EAGAIN;
			nRelease = nRelease > 1 ? nRelease / 2 : nRelease );
		if( ret )
			terminate();
#endif
		n -= nRelease;
	}
}

#if defined(__unix__)
// semop(2) wrapper that retries when the call is interrupted by a signal.
// Returns 0 on success, -1 with errno set on failure.
// BUGFIX: ::semop() reports EINTR via errno with a -1 return; comparing the
// return value against EINTR never retried and could spuriously terminate().
int monitor::semop( initializer_list<sembuf> sems )
{
	int ret;
	while( (ret = ::semop( m_sems, const_cast<sembuf *>(sems.begin()),
		sems.size() )) == -1 && errno == EINTR );
	return ret;
}
#endif

Bonita Montero

unread,
Nov 8, 2023, 10:28:23 PM11/8/23
to
Am 08.11.2023 um 19:16 schrieb Kaz Kylheku:

> POSIX-style mutexes and condition variables are actually Mesa-style
> monitors.

A monitor is different because the mutex and condition variable
is joined in a monitor which allows the shown optimization while
waiting to be notified.

> That's an internal detail. In the POSIX API, you have pthread_cond_wait,
> which looks like one operation to the caller.

The mentioned optimization isn't possible if you don't join the
mutex with the condition variable as I've shown; or more precisely
there's a limit on the number of conditions as explained below.

> The problem is that you often want multiple condition variables with
> one monitor. So this is a nonstarter.

If that would be often Java and .net would provide that.

> I suggest you make an API where the wait operation has an "int cond_index"
> parameter to select a condition variable.

I've got a value which is usually 64 bit where the lower half is the
number of threads which want to have exclusive access to the mutex. The
upper half is the number of threads that want to be notified. I thought
I could split the 64 bit value in three parts, one for the first threads
and two for the latter two types of threads. But from my experience with
Java and .net I thought that it doesn't happen often that you need two
separate monitors to have two conditions, so I dropped this idea.

> The monitor object can be told at construction time how large a vector
> of condition variables is required.

Then I'd had to make my atomic even more split and the number of
threads being able to register in the bitfields of the atomic would
be too limited. My code is to take advantage of the one-step approach
while waiting to be notified and if you need more conditions beyond
that you'd have to stick with the two kernel calls used with a normal
mutex and condition variable.


Bonita Montero

unread,
Nov 8, 2023, 10:28:36 PM11/8/23
to
while( ret == -1 && errno == EAGAIN );
>     if( ret )
>         terminate();
> #endif
> }
>
> void monitor::notify_all()
> {
>     atomic_word_t ref = m_atomic.load( memory_order_relaxed );
>     assert((ref & ENTER_MASK) > ref >> BITS && m_threadId == &t_dummy);
>     uint32_t n;
>     while( (n = (uint32_t)(ref >> BITS)) &&
> !m_atomic.compare_exchange_strong( ref, ref & ENTER_MASK,
> memory_order_relaxed, memory_order_relaxed ) );
>     while( n )
>     {
>         uint32_t nRelease = n <= SEM_MAX ? n : SEM_MAX;
> #if defined(_WIN32)
>         BOOL succ;
>         for( ; !(succ = ReleaseSemaphore( m_xhSignalSem.get(),
> nRelease, nullptr )) && GetLastError() == ERROR_TOO_MANY_POSTS;
>             nRelease = nRelease > 1 ? nRelease / 2 : nRelease );
>         if( !succ )
>             terminate();
> #elif defined(__unix__)
>         int ret;
>         for( ; (ret = semop( { { 1, (short)nRelease, IPC_NOWAIT } } ))
> == EAGAIN;
for( ; (ret = semop( { { 1, (short)nRelease, IPC_NOWAIT } } )) == -1 &&
errno == EAGAIN;

Kaz Kylheku

unread,
Nov 8, 2023, 10:28:45 PM11/8/23
to
On 2023-11-08, Bonita Montero <Bonita....@gmail.com> wrote:
> Java and .NET have monitor objects instead of a combination of mutexes
> and condition variables. The advantage of a monitor object is that when

POSIX-style mutexes and condition variables are actually Mesa-style
monitors.

Monitors were invented by C. A. R. Hoare ("Quicksort Guy") and another
collaborator whose name escapes me, in the context of some concurrent
Pascal experiment.

Hoare monitors make some ordering guarantees that the "Mesa semantics"
monitors do not. Something like that when you signal a condition
variable, a waiting thread gets in, and when it releases the mutex, the
original thread get back in (no competititon).

The paradigm is that there is one monitor and multiple conditions.

> you wait for a monitor to be signalled you can wait for that and for the
> mutexe's semaphore to be unlocked in _one_ step. With a condition varia-
> ble you have first to wait for the notify()-semaphore to be signalled
> and for the mutexe's lock in two steps.

That's an internal detail. In the POSIX API, you have pthread_cond_wait,
which looks like one operation to the caller.

It is not required to be implemented with semaphores.

> struct monitor
> {
> monitor();
> ~monitor();
> void lock();
> void unlock();
> void wait();
> void notify();
> void notify_all();

The problem is that you often want multiple condition variables with
one monitor. So this is a nonstarter.

I suggest you make an API where the wait operation has an "int cond_index"
parameter to select a condition variable.

The monitor object can be told at construction time how large a vector
of condition variables is required.

That still doesn't provide all the flexibility, but fits the common use
cases where you have a monitor plus a fixed number of condition
variables.

It doesn't work where you have a dynamic number of condition variables;
e.g. a dynamic set data structure has one monitor, plus a condition on
each node it contains.

--
TXR Programming Language: http://nongnu.org/txr
Cygnal: Cygwin Native Application Library: http://kylheku.com/cygnal
Mastodon: @Kazi...@mstdn.ca
NOTE: If you use Google Groups, I don't see you, unless you're whitelisted.

Bonita Montero

unread,
Nov 8, 2023, 10:28:46 PM11/8/23
to
Am 08.11.2023 um 20:49 schrieb Kaz Kylheku:

> No, the "monitor" idea you're proposing is different in this
> way.

That's not true. That spurious wakeups may happen with a mutex and
a condition variable is a consequence of both being separate enti-
ties. Spurious wakeups never happen with my implementation, but
stolen wakeups are still possible.

> Monitors as they are understood in computer science (first described by
> C. A. R. Hoare) do not combine the monitor and condition variables into
> one object; they are distinct entities: one monitor, zero to many
> conditions.

The way monitors work does not suggest an implementation, or they can
be based internally on a mutex and a condition variable, but if you
have a monitor that never has spurious wakeups, it is implemented
like mine.

> To avoid muddying the debate with nonstandard terminology, you might
> want to call your cockamamie idea "bonitor".

I've implemented a monitor without spurious wakeups.

Kaz Kylheku

unread,
Nov 8, 2023, 10:29:01 PM11/8/23
to
On 2023-11-08, Bonita Montero <Bonita....@gmail.com> wrote:
> Am 08.11.2023 um 19:16 schrieb Kaz Kylheku:
>
>> POSIX-style mutexes and condition variables are actually Mesa-style
>> monitors.
>
> A monitor is different because the mutex and condition variable
> is joined in a monitor which allows the shown optimization while
> waiting to be notified.

No, the "monitor" idea you're proposing is different in this
way.

Monitors as they are understood in computer science (first described by
C. A. R. Hoare) do not combine the monitor and condition variables into
one object; they are distinct entities: one monitor, zero to many
conditions.

To avoid muddying the debate with nonstandard terminology, you might
want to call your cockamamie idea "bonitor".

Chris M. Thomasson

unread,
Nov 8, 2023, 10:29:07 PM11/8/23
to
On 11/8/2023 1:41 PM, Chris M. Thomasson wrote:
> On 11/8/2023 9:16 AM, Bonita Montero wrote:
>> Am 08.11.2023 um 15:56 schrieb Bonita Montero:
>>> Java and .NET have monitor objects instead of a combination of mutexes
>>> and condition variables. The advantage of a monitor object is that when
>>> you wait for a monitor to be signalled you can wait for that and for the
>>> mutexe's semaphore to be unlocked in _one_ step. With a condition varia-
>>> ble you have first to wait for the notify()-semaphore to be signalled
>>> and for the mutexe's lock in two steps.
>>> The below code simply has a 32 or 64 bit atomic (depening on if the 64
>>> bit variant is lock-free or not) and the lower half is the number of
>>> threads waiting to enter the mutex part and the upper half is the num-
>>> ber of threads waiting to be notified. As all threads wanting to be
>>> notified are also waiting for the mutex the lower half is always >=
>>> the upper half, which I check for in several asserts.
>>> On Windows waiting to be notified and waiting to lock the mutex part
>>> to be unlocked is done by WaitForMultipleObjects(). On Linux there's
>>> no way to wait for mutliple kernel handles to be signalled, but there
>>> are SystemV semaphore sets which may consist of several semaphores and
>>> you may have multiple operations to proceed atomically on this set.
>>> The drawback of combining the mutex and condition-variable parts is
>>> that you can't have multiple conditions associated with the same mutex.
>
> [snip code]
>
> Model it through Relacy Race Detector first, if you get any issues, we
> can work through them. ;^)
>
> https://github.com/dvyukov/relacy
>
>

There is no shame in using a race detector. If you want me to debug your
work, well, its not going to be for free. Believe it or not its not
exactly trivial. You already had to make corrections to your own code.

> while( ret == -1 && errno == EAGAIN );

Keep EINTR in mind.

Chris M. Thomasson

unread,
Nov 8, 2023, 10:29:11 PM11/8/23
to
On 11/8/2023 9:16 AM, Bonita Montero wrote:
> Am 08.11.2023 um 15:56 schrieb Bonita Montero:
>> Java and .NET have monitor objects instead of a combination of mutexes
>> and condition variables. The advantage of a monitor object is that when
>> you wait for a monitor to be signalled you can wait for that and for the
>> mutexe's semaphore to be unlocked in _one_ step. With a condition varia-
>> ble you have first to wait for the notify()-semaphore to be signalled
>> and for the mutexe's lock in two steps.
>> The below code simply has a 32 or 64 bit atomic (depening on if the 64
>> bit variant is lock-free or not) and the lower half is the number of
>> threads waiting to enter the mutex part and the upper half is the num-
>> ber of threads waiting to be notified. As all threads wanting to be
>> notified are also waiting for the mutex the lower half is always >=
>> the upper half, which I check for in several asserts.
>> On Windows waiting to be notified and waiting to lock the mutex part
>> to be unlocked is done by WaitForMultipleObjects(). On Linux there's
>> no way to wait for mutliple kernel handles to be signalled, but there
>> are SystemV semaphore sets which may consist of several semaphores and
>> you may have multiple operations to proceed atomically on this set.
>> The drawback of combining the mutex and condition-variable parts is
>> that you can't have multiple conditions associated with the same mutex.

Kaz Kylheku

unread,
Nov 8, 2023, 10:29:30 PM11/8/23
to
On 2023-11-08, Bonita Montero <Bonita....@gmail.com> wrote:
> Am 08.11.2023 um 20:49 schrieb Kaz Kylheku:
>
>> No, the "monitor" idea you're proposing is different in this
>> way.
>
> That's not true. That spurious wakesups may happen with a mutex and
> a condition variable are constituted in that both are separate enti-
> ties.

That doesn't follow. Hoare's original monitor implementation had
separate condition variables, yet no spurious wakeups.

In fact, the condition wait did not require loops! Just:

if (!whatever_condition)
monitor.wait(whatever_cond_var);

The thread waiting on the condition was guaranteed to get into
the monitor with the condition still true!

The reason we accept spurious wakeups is that the guarantee is not
efficient on systems with multiple processors, not because
we don't know how to code up the guarantee.

Spurious wakesup are part of the "Mesa semantics" of monitors
and condition variables, in contrast to the "Hoare semantics".

> I've implemented a monitor without spurious wakeups.

It doesn't look like a monitor, if it doesn't have multiple condition
variables. Maybe your approach can support that.

Kaz Kylheku

unread,
Nov 8, 2023, 10:29:31 PM11/8/23
to
On 2023-11-08, Chris M. Thomasson <chris.m.t...@gmail.com> wrote:
> On 11/8/2023 11:56 AM, Bonita Montero wrote:
>> I've implemented a monitor without spurious wakeups.
>
> Yawn.

Funnier:

Amine Moulay Ramdane has written seven, likewise dead in the water.

Chris M. Thomasson

unread,
Nov 8, 2023, 10:29:38 PM11/8/23
to
On 11/8/2023 11:56 AM, Bonita Montero wrote:
> Am 08.11.2023 um 20:49 schrieb Kaz Kylheku:
>
>> No, the "monitor" idea you're proposing is different in this
>> way.
>
> That's not true. That spurious wakesups may happen with a mutex and
> a condition variable are constituted in that both are separate enti-
> ties. Spuriuos wakesups never happen with my implementation, but
> stolen wakeups are still possible.
>
>> Monitors as they are understood in computer science (first described by
>> C. A. R. Hoare) do not combine the monitor and condition variables into
>> one object; they are distinct entities: one monitor, zero to many
>> conditions.

mutex and condition variables happen to be intimately interconnected.
Look up wait morphing...

Chris M. Thomasson

unread,
Nov 8, 2023, 10:29:44 PM11/8/23
to
On 11/8/2023 11:56 AM, Bonita Montero wrote:
Yawn.

Bonita Montero

unread,
Nov 8, 2023, 11:34:13 PM11/8/23
to
Am 09.11.2023 um 00:25 schrieb Kaz Kylheku:

> Spurious wakesup are part of the "Mesa semantics" of monitors
> and condition variables, in contrast to the "Hoare semantics".

Hoare monitors suck since they are less efficient.

Bonita Montero

unread,
Nov 8, 2023, 11:36:06 PM11/8/23
to
Am 09.11.2023 um 00:32 schrieb Chris M. Thomasson:

> mutex and condition variables happen to be intimately interconnected.
> Look up wait morphing...

With my implementation registering as a thread wanting to enter the
mutex and waiting to be notified is one atomic step. That's only
possible if they're one part.

Chris M. Thomasson

unread,
Nov 8, 2023, 11:37:11 PM11/8/23
to
Humm... Are you okay Bonita? Anything wrong with you?

Bonita Montero

unread,
Nov 8, 2023, 11:37:16 PM11/8/23
to
Am 08.11.2023 um 22:41 schrieb Chris M. Thomasson:

> Model it through Relacy Race Detector first, if you get any issues, we
> can work through them. ;^)
> https://github.com/dvyukov/relacy

You'd suggest Relacy for a hello world.


Bonita Montero

unread,
Nov 8, 2023, 11:38:11 PM11/8/23
to
Am 08.11.2023 um 22:49 schrieb Chris M. Thomasson:

> Keep EINTR in mind.

EINTR is handled if you inspect my own semop overload function.

Bonita Montero

unread,
Nov 8, 2023, 11:39:12 PM11/8/23
to
Am 09.11.2023 um 05:36 schrieb Chris M. Thomasson:

> Humm... Are you okay Bonita? Anything wrong with you?

Hoare monitors relase a waiting thread immediately after a notify()
and that's less efficient.

Chris M. Thomasson

unread,
Nov 8, 2023, 11:40:42 PM11/8/23
to
Wrt your code:

https://youtu.be/0R6WIbx8ysE

Chris M. Thomasson

unread,
Nov 8, 2023, 11:41:55 PM11/8/23
to
I actually might have some free time to blow on it later on tonight.
Humm... You are not exactly a fun person to work for.

Chris M. Thomasson

unread,
Nov 8, 2023, 11:42:29 PM11/8/23
to
:^D

Hello world! Try to get it passing a Relacy test, if you are having
trouble, I can help you.

Chris M. Thomasson

unread,
Nov 8, 2023, 11:42:46 PM11/8/23
to
Yawn.

Bonita Montero

unread,
Nov 9, 2023, 12:09:14 AM11/9/23
to
Re-acquiring the mutex part of a monitor after notify()
is an superfluous extra part that takes CPU time.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:11:43 AM11/9/23
to
On 11/8/2023 3:51 PM, Kaz Kylheku wrote:
> On 2023-11-08, Chris M. Thomasson <chris.m.t...@gmail.com> wrote:
>> On 11/8/2023 11:56 AM, Bonita Montero wrote:
>>> I've implemented a monitor without spurious wakeups.
>>
>> Yawn.
>
> Funnier:
>
> Amine Moulay Ramdane has written seven, likewise dead in the water.
>

Actually, Amine had a couple of interesting ideas from years ago.
Therefore, I think that Amine just might be "smarter" than Bonita?

Still, wrt Anime, not sure if the ideas are original or from reading a
white paper.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:12:26 AM11/9/23
to
Look up wait morphing.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:13:41 AM11/9/23
to
Well, I am referring to times of contention.

Bonita Montero

unread,
Nov 9, 2023, 12:18:26 AM11/9/23
to
Wait morphing isn't implemented with glibc's condition variables.
My code doen't need that because I'm sleeping on the condvar part
and on the mutex part in *one* step.

Kaz Kylheku

unread,
Nov 9, 2023, 12:18:31 AM11/9/23
to
Hoare gave us the concept of monitors and condition variables,
which deserves respect.

The original variant is semantically useful; the guarantees that it
provides can make it easier to reason about correctness.

It's something to know about as part of a well-rounded education
in concurrent programming.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:19:05 AM11/9/23
to
Humm... Sounds good. However, I need to try it out. Also, if you don't
mind I might actually model it in relacy.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:19:44 AM11/9/23
to
On 11/8/2023 9:17 PM, Kaz Kylheku wrote:
> On 2023-11-09, Bonita Montero <Bonita....@gmail.com> wrote:
>> Am 09.11.2023 um 00:25 schrieb Kaz Kylheku:
>>
>>> Spurious wakesup are part of the "Mesa semantics" of monitors
>>> and condition variables, in contrast to the "Hoare semantics".
>>
>> Hoare monitors suck since they are less efficient.
>
> Hoare gave us the concept of monitors and condition variables,
> which deserves respect.
>
> The original variant is semantically useful; the guarantees that it
> provides can make it easier to reason about correctness.
>
> It's something to know about as part of a well-rounded education
> in concurrent programming.
>

I concur with that assessment.

Bonita Montero

unread,
Nov 9, 2023, 12:20:58 AM11/9/23
to
Am 09.11.2023 um 06:17 schrieb Kaz Kylheku:

> Hoare gave us the concept of monitors and condition variables,
> which deserves respect.

Hoare monitors are less efficient since they give up ownership
of the mutex part while notifying. That are two kernel calls
which could be prevented.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:21:17 AM11/9/23
to
I wonder if Bontia is pushing things to a borderline. Heck, he/she is
almost making me want to work on it!!!

https://youtu.be/rSaC-YbSDpo

Chris M. Thomasson

unread,
Nov 9, 2023, 12:21:49 AM11/9/23
to
On 11/8/2023 9:19 PM, Bonita Montero wrote:
Avoiding Kernel calls is great.com.

Bonita Montero

unread,
Nov 9, 2023, 12:23:09 AM11/9/23
to
Am 09.11.2023 um 06:17 schrieb Chris M. Thomasson:

> Humm... Sounds good. However, I need to try it out. Also, if you don't
> mind I might actually model it in relacy.

I've written my own unit test. The Win32 code worked immediately,
but the SysV-code didn't work immediately also because I forgot
to have IPC_NOWAIT while releasing a semaphore. Why is there a
way to wait for the release of a mutex to be accepted by another
thread ? Who comes up with that ?

Chris M. Thomasson

unread,
Nov 9, 2023, 12:28:42 AM11/9/23
to
Well, invvvho, it might be prudent of me to model it in Relacy. The act
of me porting your work over into its logic base is going to get me
really intimate with your code.

Bonita Montero

unread,
Nov 9, 2023, 12:30:33 AM11/9/23
to
Am 09.11.2023 um 06:27 schrieb Chris M. Thomasson:

> Well, invvvho, it might be prudent of me to model it in Relacy.
> The act of me porting your work over into its logic base is
> going to get me really intimate with your code.

Just reading the code is easier.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:31:12 AM11/9/23
to
Can you feel me? lol. ;^)

I have to work on some of my fractal IFS right now, but, I will try to
port your work over to Relacy. Fwiw, here is a taste of some work I ave
to do right now:

https://paulbourke.net/fractals/multijulia

I am trying to create a nice volumetric form of it.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:31:51 AM11/9/23
to
Yup. Porting your code to Relacy is going to force me to read every damn
line of your code. So, touche?

Chris M. Thomasson

unread,
Nov 9, 2023, 12:33:32 AM11/9/23
to

Bonita Montero

unread,
Nov 9, 2023, 12:34:57 AM11/9/23
to
Am 09.11.2023 um 06:31 schrieb Chris M. Thomasson:

> Yup. Porting your code to Relacy is going to force me to read every damn
> line of your code. So, touche?

Reading the code doesn't hurt since the functions are short.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:36:35 AM11/9/23
to
Wait morphing is a way that shows how interconnected a mutex actually is
with a condition variable...

Chris M. Thomasson

unread,
Nov 9, 2023, 12:37:28 AM11/9/23
to
Porting your code to Relacy makes me read every damn line. You masking
is interesting.

Bonita Montero

unread,
Nov 9, 2023, 12:40:28 AM11/9/23
to
Am 09.11.2023 um 06:35 schrieb Chris M. Thomasson:

> Wait morphing is a way that shows how interconnected a mutex actually is
> with a condition variable...

As you can derive from what I said I know what wait morphing is.
I think wait morphing could be prevented under systems supporting
SysV semaphores by allocating a semaphore set of two semaphores
for each mutex and leaving the second unused until you have a
condition variable.

Bonita Montero

unread,
Nov 9, 2023, 12:41:33 AM11/9/23
to
Am 09.11.2023 um 06:36 schrieb Chris M. Thomasson:

> Porting your code to Relacy makes me read every damn line.
> You masking is interesting.

My code is understandable if you know MT-primitives
and SysV-IPC. There's nothing "damn" with my code.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:41:34 AM11/9/23
to
Can you move waitsets over from mutex to futex and vise versa?

Chris M. Thomasson

unread,
Nov 9, 2023, 12:42:41 AM11/9/23
to
This is in the kernel...

Chris M. Thomasson

unread,
Nov 9, 2023, 12:42:53 AM11/9/23
to
Oh well, like I said, you seem to be a fun person to work with...

Bonita Montero

unread,
Nov 9, 2023, 12:43:05 AM11/9/23
to
Am 09.11.2023 um 06:40 schrieb Chris M. Thomasson:

> Can you move waitsets over from mutex to futex and vise versa?

glibc doesn't do this either.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:44:43 AM11/9/23
to
Wait morphing is not in the realm of the compiler. It's in the kernel.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:45:24 AM11/9/23
to
OOPS! I thought you were talking about gcc. Sorry Bonita!

Bonita Montero

unread,
Nov 9, 2023, 12:46:14 AM11/9/23
to
Am 09.11.2023 um 06:44 schrieb Chris M. Thomasson:

> Wait morphing is not in the realm of the compiler. It's in the kernel.

Read this:
https://stackoverflow.com/questions/45163701/which-os-platforms-implement-wait-morphing-optimization

Chris M. Thomasson

unread,
Nov 9, 2023, 12:49:03 AM11/9/23
to
Wait morphing can be highly beneficial.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:51:13 AM11/9/23
to
I have to go to work on my fractals right now, will get back to you. I
will mostly have time to port your code into a Relacy unit test sometime
later on tonight or tomorrow. This work will be for free for you. Will
you even appreciate it in any way shape or form? Or mock me?

Bonita Montero

unread,
Nov 9, 2023, 12:51:43 AM11/9/23
to
Sorry, this doesn't work beyond one condvar per mutex.

Bonita Montero

unread,
Nov 9, 2023, 12:53:21 AM11/9/23
to
Wait morphing isn't necessary under Win32 since you can wait
for the mutexe's binary semaphore and for the condvar's counting
semaphore in one step with WaitForMultipleObjects. Unfortunately
there's nothing under Linux like that.

Chris M. Thomasson

unread,
Nov 9, 2023, 12:59:01 AM11/9/23
to
the funny thing is that I need to model one of my new wait-free queue
experiments in Relacy for use in my rendering engine.

Bonita Montero

unread,
Nov 9, 2023, 1:32:33 AM11/9/23
to
If you were here we would go through the code together
and you would immediately understand it.

Chris M. Thomasson

unread,
Nov 9, 2023, 1:47:16 AM11/9/23
to
An example of my main experiment:

https://youtu.be/n13GHyYEfLA

All of my own GLSL shaders, pure C++ and openGL.

Chris M. Thomasson

unread,
Nov 9, 2023, 1:54:28 AM11/9/23
to
Since I have to model one of my experimental algorithms in Relacy
anyway, well, I will be right up in it. Wrt my code, well, its trying to
make some fractals go volumetric and I need to highly efficient and
specialized LIFO/FIFO stack/queue system for it. They are running on the
CPU, I might even be able to get it run in shaders, but for now, I need
to work on modeling my sketch of my code in Relacy, create some test
units, and give it a go. Fwiw, here is one of my vector fields:

https://youtu.be/poXeq5V0dso

This used an older queue of mine to help distribute the field processing
across multiple processors.

Chris M. Thomasson

unread,
Nov 9, 2023, 2:01:30 AM11/9/23
to
Fwiw, this one is basically embarrassingly parallel to create each
frame. Well, that is kind of cheating wrt embarrassingly parallel, but,
oh well:

https://youtu.be/DrPp6xfLe4Q

This one is from a recursive algorithm of mine, not too efficient wrt
the generation part that gives me my field points to work with. It takes
a while to render an animation in 4k. The recursive nature of it can
blow a threads stack if I get too detailed. So, I need to refine my
current quick and dirty proof of concept code, so to speak. It is kind
of embarrassingly parallel...

Bonita Montero

unread,
Nov 9, 2023, 2:01:46 AM11/9/23
to
Am 09.11.2023 um 06:48 schrieb Chris M. Thomasson:

> Wait morphing can be highly beneficial.


I just checked how many voluntary context switches I have under Linux
when having a ping-pong between two threads sharing a common mutex and
individual condvars.

#include <iostream>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <atomic>
#include <sys/resource.h>

using namespace std;

// Counts the voluntary context switches of a two-thread condvar ping-pong:
// each thread waits for its own flag under a shared mutex, then signals the
// peer's flag/condvar, ROUNDS times per thread.
int main()
{
    mutex mtx;
    // Per-direction state: the flag a thread spins/waits on plus its condvar.
    struct wait_t
    {
        bool signal = false; // FIX: was uninitialized — waitB.signal was read
                             // by thread B before anyone ever wrote it (UB)
        condition_variable cond;
    } waitA, waitB;
    constexpr size_t ROUNDS = 100'000;
    atomic<uint64_t> switches( 0 );
    auto thr = [&]( wait_t &waitMe, wait_t &waitYou )
    {
        for( size_t r = ROUNDS; r--; )
        {
            unique_lock<mutex> lock( mtx );
            while( !waitMe.signal )
                waitMe.cond.wait( lock );
            waitMe.signal = false;
            waitYou.signal = true;
            waitYou.cond.notify_one();
        }
        rusage ru;
        // RUSAGE_THREAD is Linux-specific: per-thread voluntary switch count.
        if( getrusage( RUSAGE_THREAD, &ru ) )
            terminate();
        switches += ru.ru_nvcsw;
    };
    waitA.signal = true;      // start the ping-pong with thread A
    waitA.cond.notify_one();  // harmless: no waiter exists yet, the flag suffices
    jthread
        thrA( thr, ref( waitA ), ref( waitB ) ),
        thrB( thr, ref( waitB ), ref( waitA ) );
    thrA.join();
    thrB.join();
    cout << switches << endl;
}

The code prints about ROUNDS * context switches, that's great.

Bonita Montero

unread,
Nov 9, 2023, 2:02:11 AM11/9/23
to
* 2

Chris M. Thomasson

unread,
Nov 9, 2023, 2:03:33 AM11/9/23
to
oh shit, I forgot the damn link:

https://youtu.be/YsAkm0VlCsw
(should be 4k, damn it!)

Also, this one is ripe for for improvement. I just know that my new
queue system is going to work for it wrt generating its frames.

https://youtu.be/HwIkk9zENcg

Will create another thread to continue this.

Bonita Montero

unread,
Nov 9, 2023, 4:08:12 AM11/9/23
to
I did a test of my monitor against two mutexes and two condition
variables, playing pingpong with each other. On Windows my imple-
mentation is about 12 times faster than the mentioned mutex with
condvar on a AMD 7950X Zen4 16 core system. Under WSL2 on the
same machine the Linux implementation is 12% faster than my code.
On Linux bare metal with a Zen2 3990X 64 core system my code is
about 8.5% faster.
As I recently found in this that a wait() incurs only one context
switch back and forth I thought my code wouldn't be faster, but
on bare metal it actually is faster. And I was really surprised
that the MS condvar implementation is that extremely slow compared
to my monitor.

#include <iostream>
#include <thread>
#include <chrono>
#include <vector>
#include <mutex>
#include <condition_variable>
#include "monitor.h"

using namespace std;
using namespace chrono;

int main( int argc, char **argv )
{
atomic_int32_t tSum;
auto time = [&]( auto fn )
{
tSum = 0;
auto start = high_resolution_clock::now();
fn();
tSum += (int64_t)duration_cast<nanoseconds>(
high_resolution_clock::now() - start ).count();
};
constexpr size_t ROUNDS = 100'000;
struct not_t
{
monitor mon;
bool notifiy;
} notA { {}, true }, notB { {}, false };
notA.notifiy = true;
auto monThr = [&]( not_t &me, not_t &you )
{
time( [&]()
{
for( size_t r = ROUNDS; r; --r )
{
me.mon.lock();
for( ; !me.notifiy; me.mon.wait() );
me.notifiy = false;
me.mon.unlock();
you.mon.lock();
you.notifiy = true;
you.mon.notify();
you.mon.unlock();
}
} );
};
vector<jthread> threads;
threads.reserve( 0 );
threads.emplace_back( monThr, ref( notA ), ref( notB ) ),
threads.emplace_back( monThr, ref( notB ), ref( notA ) );
threads.resize( 0 );
cout << tSum / ((double)ROUNDS * 2) << endl;
struct cv_t
{
mutex mtx;
condition_variable cv;
bool signal;
} cvA = { {}, {}, true }, cvB = { {}, {}, false };
auto cvThr = [&]( cv_t &cvMe, cv_t &cvYou )
{
time( [&]()
{
for( size_t r = ROUNDS; r; --r )
{
unique_lock<mutex> lockMe( cvMe.mtx );
for( ; !cvMe.signal; cvMe.cv.wait( lockMe ) );
cvMe.signal = false;
lockMe.unlock();
unique_lock<mutex> lockYou( cvYou.mtx );
cvYou.signal = true;
cvYou.cv.notify_one();
}
} );
};
threads.emplace_back( cvThr, ref( cvA ), ref( cvB ) );
threads.emplace_back( cvThr, ref( cvB ), ref( cvA ) );
threads.resize( 0 );
cout << tSum / ((double)ROUNDS * 2) << endl;
}

Chris M. Thomasson

unread,
Nov 9, 2023, 4:11:29 AM11/9/23
to
On 11/9/2023 1:07 AM, Bonita Montero wrote:
> I did a test of my monitor against two mutexes and two condition
> variables, playing pingpong with each other. On Windows my imple-
> mentation is about 12 times faster than the mentioned mutex with
> condvar on a AMD 7950X Zen4 16 core system. Under WSL2 on the
> same machine the Linux implementation is 12% faster than my code.
> On Linux bare metal with a Zen2 3990X 64 core system my code is
> about 8.5% faster.
> As I recently found in this that a wait() incurs only one context
> switch back and forth I thought my code woudln't be faster, but
> on bare metal it actually is faster. And I was really surprised
> that the MS condvar implementation is that extremely slow compared
> to my monitor.
[...]

I am just starting to model some of my queue code. Its going to fun to
model your monitor and see if its bites the dust.

Bonita Montero

unread,
Nov 9, 2023, 4:17:57 AM11/9/23
to
Am 09.11.2023 um 10:11 schrieb Chris M. Thomasson:

> I am just starting to model some of my queue code. Its going to fun to
> model your monitor and see if its bites the dust.

The advantage under Linux bare metal is only 12%, if you do additional
things in userspace the effect should become smaller. So measuring a
simple bool ping pong shows almost the sole performance of my code.
But you would get a noticeable difference with Windows.



Chris M. Thomasson

unread,
Nov 9, 2023, 4:22:34 AM11/9/23
to
Modeling it is not about sheer performance, it is about correctness.

Chris M. Thomasson

unread,
Nov 9, 2023, 4:23:20 AM11/9/23
to
Make sure it is sound and correct first, then we can sit back and think
about how to make it much faster...

Branimir Maksimovic

unread,
Nov 10, 2023, 8:56:46 AM11/10/23
to
On 2023-11-09, Bonita Montero <Bonita....@gmail.com> wrote:
> I did a test of my monitor against two mutexes and two condition
> variables, playing pingpong with each other. On Windows my imple-
> mentation is about 12 times faster than the mentioned mutex with
> condvar on a AMD 7950X Zen4 16 core system. Under WSL2 on the
> same machine the Linux implementation is 12% faster than my code.
> On Linux bare metal with a Zen2 3990X 64 core system my code is
> about 8.5% faster.
> As I recently found in this that a wait() incurs only one context
> switch back and forth I thought my code woudln't be faster, but
> on bare metal it actually is faster. And I was really surprised
> that the MS condvar implementation is that extremely slow compared
> to my monitor.
>
here is what i got on macOS:
(lldb) bt
* thread #1, queue = 'com.apple.main-thread', stop reason = signal SIGABRT
* frame #0: 0x000000018c33d0bc libsystem_kernel.dylib`__pthread_kill + 8
frame #1: 0x000000018c374cc0 libsystem_pthread.dylib`pthread_kill + 288
frame #2: 0x000000018c280a40 libsystem_c.dylib`abort + 180
frame #3: 0x000000018c32c070 libc++abi.dylib`abort_message + 132
frame #4: 0x000000018c31c004 libc++abi.dylib`demangling_terminate_handler() + 52
frame #5: 0x000000018c32b434 libc++abi.dylib`std::__terminate(void (*)()) + 16
frame #6: 0x000000018c32b390 libc++abi.dylib`std::terminate() + 36
frame #7: 0x000000018c2aa714 libc++.1.dylib`std::__1::thread::~thread() + 32
frame #8: 0x0000000100007cd0 cond_var`void std::__1::__destroy_at[abi:v160006]<std::__1::thread, 0>(__loc=0x00006000036c4028) at construct_at.h:66:13
frame #9: 0x0000000100007cac cond_var`void std::__1::destroy_at[abi:v160006]<std::__1::thread, 0>(__loc=0x00006000036c4028) at construct_at.h:101:5
frame #10: 0x0000000100007c10 cond_var`void std::__1::allocator_traits<std::__1::allocator<std::__1::thread>>::destroy[abi:v160006]<std::__1::thread, void, void>((null)=0x000000016fdff040, __p=0x00006000036c4028) at allocator_traits.h:323:9
frame #11: 0x000000010000a090 cond_var`std::__1::vector<std::__1::thread, std::__1::allocator<std::__1::thread>>::__base_destruct_at_end[abi:v160006](this=0x000000016fdff030 size=2, __new_last=0x00006000036c4020) at vector:836:9
frame #12: 0x0000000100009d10 cond_var`std::__1::vector<std::__1::thread, std::__1::allocator<std::__1::thread>>::__destruct_at_end[abi:v160006](this=0x000000016fdff030 size=2, __new_last=0x00006000036c4020) at vector:724:9
frame #13: 0x00000001000063d4 cond_var`std::__1::vector<std::__1::thread, std::__1::allocator<std::__1::thread>>::resize(this=0x000000016fdff030 size=2, __sz=0) at vector:1912:15
frame #14: 0x0000000100005eec cond_var`main(argc=1, argv=0x000000016fdff420) at test_cond.cpp:52:10
frame #15: 0x000000018bff50e0 dyld`start + 2360


--

7-77-777, Evil Sinner!
https://www.linkedin.com/in/branimir-maksimovic-6762bbaa/

Bonita Montero

unread,
Nov 10, 2023, 9:09:08 AM11/10/23
to
Am 10.11.2023 um 14:56 schrieb Branimir Maksimovic:

> frame #13: 0x00000001000063d4 cond_var`std::__1::vector<std::__1::thread, std::__1::allocator<std::__1::thread>>::resize(this=0x000000016fdff030 size=2, __sz=0) at vector:1912:15
> frame #14: 0x0000000100005eec cond_var`main(argc=1, argv=0x000000016fdff420) at test_cond.cpp:52:10
> frame #15: 0x000000018bff50e0 dyld`start + 2360

It seems that resizing the thread-vector while doing an emplace_back(),
which itself seems to be inlined, fails. I don't know why.

Bonita Montero

unread,
Nov 10, 2023, 9:15:53 AM11/10/23
to
I think I've got it: I'm using C++20 jthreads which are joined on the
destruction of the jthread object. I'm just resizing the jthread vector
to join both threads. But your jthread-implementation seems to behave
like a normal C++11 thread which calls abort() on destruction when a
thread which is joinable and not joined.
You may verify that with this code.

#include <thread>

using namespace std;

// Demonstrates jthread's auto-join: the temporary's destructor runs at the
// end of the full expression and joins, so main blocks until the sleep ends
// (a plain C++11 std::thread would instead call std::terminate here).
int main()
{
(void)jthread( []() { this_thread::sleep_for( 1s ); } );
}

The temporary is very likely to be destroyed before the thread
terminates.

Chris M. Thomasson

unread,
Nov 10, 2023, 3:45:10 PM11/10/23
to
You should of modeled in a race-detector first!

Branimir Maksimovic

unread,
Nov 10, 2023, 10:00:03 PM11/10/23
to
yes, i don't have jthread. Will try with g++ rather clang...
Yeeee, real g++ has jthread:

--
la@MacBook-Air News % g++-13 -O3 cond_var.cpp test_cond.cpp -o cond_var -std=c++20 -D__unix__
ld: warning: ignoring duplicate libraries: '-lgcc'
Lola@MacBook-Air News % ./cond_var
3881.83
2984.36
Lola@MacBook-Air News %

Bonita Montero

unread,
Nov 10, 2023, 10:54:15 PM11/10/23
to
To find bugs inside his jthread-implementation ?

Chris M. Thomasson

unread,
Nov 10, 2023, 11:10:17 PM11/10/23
to
To help find a bug, yup. Think it up, draft it out, create test units,
model them with a race detector, Get it working... Then, we can think
about improving performance.

Branimir Maksimovic

unread,
Nov 10, 2023, 11:29:26 PM11/10/23
to
There is no bug. I simply used thread instead of jthread
as Apple's g++ implementation does not have it.
Since thread destructor throws exception
if thread is still running, it calls terminate
handler.
Real g++ implementation works:
Lola@MacBook-Air News % ./cond_var
3566.26
3292.95
Lola@MacBook-Air News %
Same thing is happening with all other
I showed previously.
I placed switch -std=c++20 in Apple's g++,
but anyway it does not have jthread.
take a look:
Lola@MacBook-Air News % clang++ -O3 cond_var.cpp test_cond.cpp -o cond_var -std=c++20 -D__unix__
test_cond.cpp:48:9: error: unknown type name 'jthread'; did you mean 'thread'?
vector<jthread> threads;
^~~~~~~
thread
/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/c++/v1/thread:225:24: note: 'thread' declared here
class _LIBCPP_TYPE_VIS thread
^
1 error generated.
Lola@MacBook-Air News %

Bonita Montero

unread,
Nov 11, 2023, 1:08:10 AM11/11/23
to
Am 11.11.2023 um 05:25 schrieb Branimir Maksimovic:

> Lola@MacBook-Air News % ./cond_var
> 3566.26
> 3292.95

Same as on my 3990X Linux PC: 8% faster.


Bonita Montero

unread,
Nov 11, 2023, 5:41:47 AM11/11/23
to
I think I've put the finishing touches to the code now. For the mutex
part I introduced spinning, which I adopted from glibc. Spinning usually
makes little sense for producer-consumer relationships because the time
it takes to put an item in the queue or take it out of it is usually
very short, while the time it takes to produce the item beforehand and
consume it afterwards is usually orders of magnitude higher; therefore,
a collision during locking occurs quite rarely.
Nevertheless, I can also imagine cases where items are produced and
consumed with high frequency, and spinning could make sense there.

So, here are the two changed files:

// monitor.h

#pragma once
#if defined(_WIN32)
#define NOMINMAX
#include <Windows.h>
#elif defined(__unix__)
#include <sys/types.h>
#include <sys/sem.h>
#include <sys/stat.h>
#else
#error unsupported platform
#endif
#include <atomic>
#include <type_traits>
#if defined(_WIN32)
#include "xhandle.h"
#endif

// A Java-/.NET-style monitor: a recursive mutex fused with one condition,
// so wait() can re-acquire the lock and wait for the signal in a single
// kernel operation (WaitForMultipleObjects / one SysV semop over two sems).
// The drawback (per the author) is a single condition per mutex.
struct monitor
{
// maxSpin: upper bound for userspace spinning before blocking (0 = none).
monitor( uint16_t maxSpin = 0 );
~monitor();
void lock();
void unlock();
bool try_lock();
// Caller must hold the monitor; releases it and blocks until notified.
void wait();
// "Signal and continue": caller keeps holding the monitor while notifying.
void notify();
void notify_all();
// Returns the previous spin limit; a non-negative argument installs a new one.
uint16_t maxSpin( int16_t maxSpin );
private:
// thread_local: the ADDRESS of t_dummy uniquely identifies a thread,
// serving as the owner id for recursion detection.
inline static thread_local char t_dummy;
// Use a 64-bit counter word when lock-free, else fall back to 32 bit.
static constexpr bool USE64 = std::atomic_int64_t::is_always_lock_free;
using aword_t = std::conditional_t<USE64, uint64_t, uint32_t>;
// The counter word is split in two halves:
// low half = threads entered/entering the mutex part,
// high half = threads among them that wait to be notified.
static constexpr unsigned BITS = sizeof(aword_t) * 8 / 2;
static constexpr aword_t
ENTER_VALUE = 1,
SIGNAL_VALUE = 1ull << BITS,
ENTER_MASK = SIGNAL_VALUE - 1,
SIGNAL_MASK = ENTER_MASK << BITS;
std::atomic<aword_t> m_atomic;
// Owner id (&t_dummy of the owning thread) or nullptr when unowned.
std::atomic<void *> m_threadId;
// Recursion depth of the current owner (0 = locked once).
uint32_t m_recCount;
bool spinLock( aword_t &ref, bool once );
// NOTE(review): std::numeric_limits is used below but this header does not
// include <limits> — verify it compiles standalone on all toolchains.
#if defined(_WIN32)
static constexpr uint32_t SEM_MAX = std::numeric_limits<LONG>::max();
XHANDLE
m_xhEnterEvt,
m_xhSignalSem;
#elif defined(__unix__)
static constexpr uint32_t SEM_MAX = std::numeric_limits<short>::max();
// SysV semaphore set id: sem 0 = mutex part, sem 1 = signal part.
int m_sems;
int semop( std::initializer_list<sembuf> sems );
#endif
// m_maxSpin: configured ceiling; m_spinLimit: adaptive current limit.
std::atomic_uint16_t m_maxSpin, m_spinLimit;
};

// monitor.cpp

#include <iostream>
#include <limits>
#include <system_error>
#include <cassert>
#if defined(__x86_64__) || defined(__i386__)
#include <immintrin.h>
#endif
#include "monitor.h"

using namespace std;

// Creates the kernel objects backing the monitor: on Windows an auto-reset
// event (mutex part) plus a semaphore (signal part); on POSIX a SysV
// semaphore set of two semaphores serving both purposes.
// maxSpin: upper bound for userspace spinning before blocking (0 = none).
// Throws std::system_error if the kernel objects can't be created.
monitor::monitor( uint16_t maxSpin ) :
	m_atomic( 0 ),
	m_threadId( nullptr ),
#if defined(_WIN32)
	m_xhEnterEvt( CreateEventA( nullptr, FALSE, FALSE, nullptr ) ),
	m_xhSignalSem( CreateSemaphoreA( nullptr, 0, SEM_MAX, nullptr ) ),
#elif defined(__unix__)
	m_sems( semget( IPC_PRIVATE, 2, S_IRUSR | S_IWUSR ) ),
#endif
	m_maxSpin( maxSpin ),
	m_spinLimit( 0 )
{
#if defined(_WIN32)
	if( !m_xhEnterEvt.get() || !m_xhSignalSem.get() )
		throw system_error( GetLastError(), system_category(),
			"can't initialize monitor object" );
#elif defined(__unix__)
	// The code relies on Linux zero-initializing the new semaphores;
	// elsewhere they're explicitly set to zero via SETALL.
	auto zeroSem = [&]() -> bool
	{
#if defined(__linux__)
		return true;
#else
		short vals[2] = { 0, 0 };
		return !semctl( m_sems, 0, SETALL, vals );
#endif
	};
	// FIX: was "|| zeroSem()" — that treated *successful* zeroing as an
	// error, i.e. every construction on non-Linux systems would throw
	// (confirmed by the author's follow-up correction in this thread).
	if( m_sems == -1 || !zeroSem() )
	{
		int errNo = errno;
		// Remove the semaphore set directly instead of the original
		// explicit "this->~monitor()": an explicit destructor call from a
		// failing constructor risks double destruction of the members.
		if( m_sems != -1 )
			semctl( m_sems, 0, IPC_RMID );
		throw system_error( errNo, system_category(),
			"can't initialize monitor object" );
	}
#endif
}

// Removes the SysV semaphore set: SysV IPC objects persist beyond process
// lifetime unless explicitly removed, so this prevents a system-wide leak.
monitor::~monitor()
{
#if defined(__unix__)
	int ret = semctl( m_sems, 0, IPC_RMID );
	assert(ret != -1);
	(void)ret; // FIX: silence the unused-variable warning in NDEBUG builds
#endif
}

void monitor::lock()
{
if( m_threadId.load( memory_order_relaxed ) == &t_dummy )
{
if( m_recCount == (uint32_t)-1 )
throw system_error( (int)errc::result_out_of_range,
generic_category(), "montor's recursion count saturated" );
++m_recCount;
return;
}
aword_t ref = m_atomic.load( memory_order_relaxed );
if( spinLock( ref, false ) )
return;
do
{
if( (ref & ENTER_MASK) == ENTER_MASK )
throw system_error( (int)errc::result_out_of_range,
generic_category(), "montor's locker count saturated" );
assert((ref & ENTER_MASK) >= ref >> BITS);
} while( !m_atomic.compare_exchange_strong( ref, ref + 1,
memory_order_acquire, memory_order_relaxed ) );
if( (ref & ENTER_MASK) != ref >> BITS ) [[likely]]
{
#if defined(_WIN32)
if( WaitForSingleObject( m_xhEnterEvt.get(), INFINITE ) != WAIT_OBJECT_0 )
terminate();
#elif defined(__unix__)
if( semop( { { 0, -1, 0 } } ) == -1 )
terminate();
#endif
}
m_threadId.store( &t_dummy, memory_order_relaxed );
m_recCount = 0;
}

// Releases the monitor: unwinds one recursion level if recursive, else
// drops our enterer slot and wakes one blocked enterer if any remain.
void monitor::unlock()
{
// Recursive unlock: just pop one recursion level, keep ownership.
if( m_threadId.load( memory_order_relaxed ) == &t_dummy && m_recCount )
return (void)--m_recCount;
aword_t ref = m_atomic.load( memory_order_relaxed );
// Must be locked and owned by this thread (t_dummy is thread_local).
assert((ref & ENTER_MASK) && m_threadId == &t_dummy);
m_threadId.store( nullptr, memory_order_relaxed );
// Deregister as enterer (decrement the low half of the counter word).
do
// enterers (low half) must exceed waiters-to-be-notified (high half)
assert((ref & ENTER_MASK) > ref >> BITS);
while( !m_atomic.compare_exchange_strong( ref, ref - 1,
memory_order_release, memory_order_relaxed ) );
// If we were the only enterer not parked in wait(), nobody needs a wakeup.
if( (ref & ENTER_MASK) - (ref >> BITS) == 1 ) [[likely]]
return;
// Hand the mutex part to one blocked enterer via the kernel object.
#if defined(_WIN32)
if( !SetEvent( m_xhEnterEvt.get() ) )
terminate();
#elif defined(__unix__)
if( semop( { { 0, 1, IPC_NOWAIT } } ) == -1 )
terminate();
#endif
}

bool monitor::try_lock()
{
aword_t ref = m_atomic.load( memory_order_relaxed );
return spinLock( ref, true );
}

// Releases the mutex part and waits to be notified; on return the mutex
// part has been re-acquired. The re-acquisition and the wait happen in
// ONE kernel operation (WaitForMultipleObjects / a two-sembuf semop) —
// the design goal stated at the top of this thread.
void monitor::wait()
{
// Caller must hold the monitor non-recursively.
assert(m_threadId == &t_dummy && !m_recCount);
m_threadId.store( nullptr, memory_order_relaxed );
aword_t ref = m_atomic.load( memory_order_relaxed );
// Become a "waiting" enterer: bump the high half; our low-half enterer
// slot stays registered while we wait.
do
assert((ref & ENTER_MASK) > ref >> BITS);
while( !m_atomic.compare_exchange_strong( ref, ref + SIGNAL_VALUE,
memory_order_release, memory_order_relaxed ) );
// If other enterers are blocked on the lock, pass the mutex part on.
if( (ref & ENTER_MASK) - (ref >> BITS) > 1 )
{
#if defined(_WIN32)
if( !SetEvent( m_xhEnterEvt.get() ) )
terminate();
#elif defined(__unix__)
if( semop( { { 0, 1, IPC_NOWAIT } } ) == -1 )
terminate();
#endif
}
// Wait for lock ownership AND the notification in a single step:
// Windows waits for both handles (bWaitAll = TRUE), POSIX performs one
// atomic semop over both semaphores of the SysV set.
#if defined(_WIN32)
HANDLE waitFor[2] { m_xhEnterEvt.get(), m_xhSignalSem.get() };
if( WaitForMultipleObjects( 2, waitFor, TRUE, INFINITE ) != WAIT_OBJECT_0 )
terminate();
#elif defined(__unix__)
if( semop( { { 0, -1, 0 }, { 1, -1, 0 } } ) == -1 )
terminate();
#endif
// Woken and re-locked: take ownership again.
m_threadId.store( &t_dummy, memory_order_relaxed );
m_recCount = 0;
}

// Wakes one waiter, if any. "Signal and continue" (Mesa-style): the
// caller keeps holding the monitor while notifying.
void monitor::notify()
{
aword_t ref = m_atomic.load( memory_order_relaxed );
// Caller must own the monitor.
assert((ref & ENTER_MASK) > ref >> BITS && m_threadId == &t_dummy);
// Claim one waiter by decrementing the high half; nothing to do if zero.
do
if( !(ref >> BITS) )
return;
while( !m_atomic.compare_exchange_strong( ref, ref - SIGNAL_VALUE,
memory_order_relaxed, memory_order_relaxed ) );
// Release one unit of the signal semaphore to wake a single waiter.
#if defined(_WIN32)
if( !ReleaseSemaphore( m_xhSignalSem.get(), 1, nullptr ) )
terminate();
#elif defined(__unix__)
if( semop( { { 1, 1, IPC_NOWAIT } }) == -1 )
terminate();
#endif
}

// Wakes all current waiters: atomically clears the waiter half of the
// counter word, then releases that many units of the signal semaphore.
void monitor::notify_all()
{
aword_t ref = m_atomic.load( memory_order_relaxed );
// Caller must own the monitor.
assert((ref & ENTER_MASK) > ref >> BITS && m_threadId == &t_dummy);
uint32_t n;
// Snapshot-and-clear the waiter count (high half); bail out if zero.
do
if( !(n = (uint32_t)(ref >> BITS)) )
return;
while( !m_atomic.compare_exchange_strong( ref, ref & ENTER_MASK,
memory_order_relaxed, memory_order_relaxed ) );
#if defined(_WIN32)
// NOTE(review): with more than SEM_MAX waiters this terminates instead of
// chunking the release like the POSIX branch below — confirm intended.
if( n > SEM_MAX || !ReleaseSemaphore( m_xhSignalSem.get(), n, nullptr ) )
terminate();
#elif defined(__unix__)
// SysV sem_op is a short, so release in chunks of at most SEM_MAX.
for( uint32_t nRelease; n; n -= nRelease )
if( semop( { { 1, (short)(nRelease = n <= SEM_MAX ? n : SEM_MAX),
IPC_NOWAIT } } ) == -1 )
terminate();
#endif
}

// Returns the previously configured spin ceiling; a non-negative argument
// additionally installs a new one (pass a negative value to just query).
uint16_t monitor::maxSpin( int16_t maxSpin )
{
	uint16_t const previous = m_maxSpin.load( memory_order_relaxed );
	if( maxSpin >= 0 )
		m_maxSpin.store( (uint16_t)maxSpin, memory_order_relaxed );
	return previous;
}

// Userspace spin-acquisition attempt; returns true if the lock was taken.
// once=true (try_lock): a single attempt when spinning is disabled.
// The spin limit adapts (glibc-style moving average) towards the number of
// iterations recently needed to get the lock.
bool monitor::spinLock( aword_t &ref, bool once )
{
	// spinning algorithm taken from glibc
	uint32_t maxSpin = m_maxSpin.load( memory_order_relaxed );
	once = once && !maxSpin;
	maxSpin = !once ? maxSpin : 1;
	if( !maxSpin )
		return false;
	uint32_t
		prevSpinLimit = m_spinLimit.load( memory_order_relaxed ),
		spinLimit = prevSpinLimit * 2u + 10u,
		spinCount = 0;
	spinLimit = spinLimit <= maxSpin ? spinLimit : maxSpin;
	bool locked = false;
	for( ; ; ref = m_atomic.load( memory_order_relaxed ) )
	{
		assert((ref & ENTER_MASK) >= ref >> BITS);
		// The lock is free when every registered enterer is parked in
		// wait(); then try to slip in as an additional enterer.
		if( uint32_t enterers = ref & ENTER_MASK;
			enterers == ref >> BITS && enterers != ENTER_MASK
			&& m_atomic.compare_exchange_strong( ref, ref + 1,
				memory_order_acquire, memory_order_relaxed ) )
		{
			m_threadId.store( &t_dummy, memory_order_relaxed );
			m_recCount = 0;
			locked = true;
			break;
		}
		if( ++spinCount == spinLimit )
			break;
// FIX: the two #elif conditions below had been line-wrapped (as posted)
// without continuation backslashes, which is ill-formed; each directive
// must occupy a single logical line.
#if defined(_WIN32)
		YieldProcessor();
#elif (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
		_mm_pause();
#elif (defined(__GNUC__) || defined(__clang__)) && (defined(__arm__) || defined(__aarch64__))
		// NOTE(review): __yield() is an ARM ACLE intrinsic; plain GCC may
		// need asm volatile("yield") instead — verify on the target toolchain.
		__yield();
#else
#error "need platform-specific pause-instruction"
#endif
	}
	// Exponential moving average (weight 1/8) of the spin count needed.
	if( !once ) [[likely]]
		m_spinLimit.store( (uint16_t)(prevSpinLimit
			+ (int32_t)(spinCount - prevSpinLimit) / 8), memory_order_relaxed );
	return locked;
}

#if defined(__unix__)
// Thin wrapper around ::semop() on the monitor's semaphore set that
// transparently retries when the call is interrupted by a signal (EINTR).
inline int monitor::semop( initializer_list<sembuf> sems )
{
	int result;
	do
		result = ::semop( m_sems, const_cast<sembuf *>(sems.begin()), sems.size() );
	while( result == -1 && errno == EINTR );
	return result;
}
#endif

Bonita Montero

unread,
Nov 11, 2023, 10:39:24 AM11/11/23
to
Am 11.11.2023 um 11:41 schrieb Bonita Montero:

>     if( m_sems == -1 || zeroSem() )
if( m_sems == -1 || !zeroSem() )


Chris M. Thomasson

unread,
Nov 11, 2023, 2:41:38 PM11/11/23
to
On 11/11/2023 2:41 AM, Bonita Montero wrote:
> I think I've put the finishing touches to the code now. For the mutex
> part I introduced spinning, which I adopted from glibc. Spinning usually
[...]

Food for thought... I learned something really neat over on comp.arch
wrt Lynn Wheeler many years ago. Basically, why spin doing nothing? Oh,
you use a yield... Well, that is still doing nothing. Think of a spin
along the lines of:

we try to use accomplished work as a backoff/yield for a spin...


<quick pseudo-code>
______________
// Pseudo-code sketch (from the post above): instead of idling in a spin,
// each failed trylock attempts to complete some *other* pending work as
// the backoff, and only falls back to a blocking lock when none exists.
void lock()
{
while (! mutex_trylock())
{
// try to do some "other" work as a
// yield, in a sense... Hummm.... ;^)
if (! try_to_do_some__other__work())
{
// failed to do some other work, lock it...
mutex_lock();
break;
}
}

// we are locked! =^D
}

// Plain pass-through; the scheme only changes acquisition, not release.
void unlock()
{
mutex_unlock();
}
______________


Well, this can be beneficial in certain setups...

Chris M. Thomasson

unread,
Nov 11, 2023, 2:42:41 PM11/11/23
to
Is that yet another bug correction? Remember my advice: get it working,
then try to make it faster.

Pavel

unread,
Nov 11, 2023, 5:45:57 PM11/11/23
to
Bonita Montero wrote:
> Am 09.11.2023 um 05:42 schrieb Chris M. Thomasson:
>> On 11/8/2023 8:38 PM, Bonita Montero wrote:
>>> Am 09.11.2023 um 05:36 schrieb Chris M. Thomasson:
>>>
>>>> Humm... Are you okay Bonita? Anything wrong with you?
>>>
>>> Hoare monitors relase a waiting thread immediately after a notify()
>>> and that's less efficient.
>
>> Yawn.
>
> Re-acquiring the mutex part of a monitor after notify()
is exactly what Java does -- and it does it for reason.
> is an superfluous extra part that takes CPU time.


Pavel

unread,
Nov 11, 2023, 5:49:19 PM11/11/23
to
Chris, I think you are preaching to the deaf. I would give up 5 times
already. Your patience is angelic.

Bonita Montero

unread,
Nov 11, 2023, 11:41:07 PM11/11/23
to
Am 11.11.2023 um 20:42 schrieb Chris M. Thomasson:

> Is that yet another bug correction? Remember my advise, get it working
> then try to make it faster.

The code immediately crashed because of an exception;
easiest debugging.

Bonita Montero

unread,
Nov 11, 2023, 11:43:35 PM11/11/23
to
No, Java and .net keep holding the mutex while doing a notify().
That's called a Mesa monitor.

Kaz Kylheku

unread,
Nov 12, 2023, 12:02:28 AM11/12/23
to
On 2023-11-12, Bonita Montero <Bonita....@gmail.com> wrote:
> No, Java and .net keep holding the mutex while doing a notify().
> That's called a Mesa monitor.

I don't suspect that is part of Mesa semantics. (It's definitely part of
Hoare semantics.) Do you have a reference?

Since under Mesa semantics, threads re-acquire the mutex with fewer
guarantees and must re-test the desired condition, Mesa semantics
supports the more efficient protocol of signaling outside of the
monitor.

If you're using POSIX mutexes and conditions, you should call
pthread_cond_signal and pthread_cond_broadcast outside of the mutex,
whenever possible.

(In a nutshell, if you're going to be telling some thread(s) to go ahead
and grab a mutex, get the hell out of their way first).

--
TXR Programming Language: http://nongnu.org/txr
Cygnal: Cygwin Native Application Library: http://kylheku.com/cygnal
Mastodon: @Kazi...@mstdn.ca
NOTE: If you use Google Groups, I don't see you, unless you're whitelisted.

Bonita Montero

unread,
Nov 12, 2023, 5:00:53 AM11/12/23
to
Am 12.11.2023 um 06:02 schrieb Kaz Kylheku:

> I don't suspect that is part of Mesa semantics. (It's definitely part of
> Hoare semantics.) Do you have a reference?

Wikipedia (https://en.wikipedia.org/wiki/Monitor_(synchronization)):
"With nonblocking condition variables (also called "Mesa style"
condition variables or "signal and continue" condition variables),
signaling does not cause the signaling thread to lose occupancy
of the monitor. Instead the signaled threads are moved to the e
queue."

> Since under Mesa semantics, threads re-acquire the mutex with fewer
> guarantees and must re-test the desired condition, Mesa semantics
> supports the more efficient protocol of signaling outside of the
> monitor.

This is a theoretical advantage. In fact, a combination of mutex
and condition variable, like a monitor implicitly is, is intended
for procuder-consumer patterns. And at this point it never happens
that you want to signal something but don't modify a common state.


> If you're using POSIX mutexes and conditions, you should call
> pthread_cond_signal and pthread_cond_broadcast outside of the
> mutex, whenever possible.

That actually never happens because you have to lock the mutex
anyway.

Kaz Kylheku

unread,
Nov 12, 2023, 12:18:19 PM11/12/23
to
On 2023-11-12, Bonita Montero <Bonita....@gmail.com> wrote:
> Am 12.11.2023 um 06:02 schrieb Kaz Kylheku:
>
>> I don't suspect that is part of Mesa semantics. (It's definitely part of
>> Hoare semantics.) Do you have a reference?
>
> Wikipedia (https://en.wikipedia.org/wiki/Monitor_(synchronization):
> "With nonblocking condition variables (also called "Mesa style"
> condition variables or "signal and continue" condition variables),
> signaling does not cause the signaling thread to lose occupancy
> of the monitor. Instead the signaled threads are moved to the e
> queue."

That doesn't say that signaling *requires* the monitor to be held,
though!

>> Since under Mesa semantics, threads re-acquire the mutex with fewer
>> guarantees and must re-test the desired condition, Mesa semantics
>> supports the more efficient protocol of signaling outside of the
>> monitor.
>
> This is a theoretical advantage.

No it isn't. The signal operation can trap into a kernel, which can
require hundreds of cycles. The mutex/monitor should ideally be only
held only as long as necessary to protect the shared variables, and not
be extended over unrelated, lengthy operations. That encourages
unnecessary contention.

>> If you're using POSIX mutexes and conditions, you should call
>> pthread_cond_signal and pthread_cond_broadcast outside of the
>> mutex, whenever possible.
>
> That actually never happens because you have to lock the mutex
> anyway.

Pardon me, what is it that you believe doesn't actually happen? People
coding this:

// ...

pthread_mutex_unlock(&obj->mtx);
pthread_cond_signal(&obj->foo_cond);

rather than this:

// ...

pthread_cond_signal(&obj->foo_cond);
pthread_mutex_unlock(&obj->mtx);

?

Scott Lurndal

unread,
Nov 12, 2023, 12:54:08 PM11/12/23
to
No, you don't. If you updated the predicate that the condition
variable is monitoring while holding the mutex, you should release
the mutex before signaling or broadcasting the condition variable
to avoid unnecessary context switches.

Chris M. Thomasson

unread,
Nov 12, 2023, 3:46:52 PM11/12/23
to
Yet, you missed it? Actually, I am not quite sure how to parse your
response. I have not had the free time to port your code over to a
Relacy test unit, yet...

:^)

Chris M. Thomasson

unread,
Nov 12, 2023, 3:49:13 PM11/12/23
to
On 11/12/2023 2:00 AM, Bonita Montero wrote:
> Am 12.11.2023 um 06:02 schrieb Kaz Kylheku:
>
>> I don't suspect that is part of Mesa semantics. (It's definitely part of
>> Hoare semantics.) Do you have a reference?
>
> Wikipedia (https://en.wikipedia.org/wiki/Monitor_(synchronization):
> "With nonblocking condition variables (also called "Mesa style"
> condition variables or "signal and continue" condition variables),
> signaling does not cause the signaling thread to lose occupancy
> of the monitor. Instead the signaled threads are moved to the e
> queue."

Humm... A wait morph? ;^)

Chris M. Thomasson

unread,
Nov 12, 2023, 3:51:45 PM11/12/23
to
Yup. Actually, there was an old discussion about this around 20 ish
years ago back on comp.programming.threads. David Butenhof was involved.

Pavel

unread,
Nov 12, 2023, 7:17:42 PM11/12/23
to
Bonita Montero wrote:
> Am 11.11.2023 um 23:45 schrieb Pavel:
>> Bonita Montero wrote:
>>> Am 09.11.2023 um 05:42 schrieb Chris M. Thomasson:
>>>> On 11/8/2023 8:38 PM, Bonita Montero wrote:
>>>>> Am 09.11.2023 um 05:36 schrieb Chris M. Thomasson:
>>>>>
>>>>>> Humm... Are you okay Bonita? Anything wrong with you?
>>>>>
>>>>> Hoare monitors relase a waiting thread immediately after a notify()
>>>>> and that's less efficient.
>>>
>>>> Yawn.
>>>
>>> Re-acquiring the mutex part of a monitor after notify()
>> is exactly what Java does -- and it does it for reason.
>
> No, Java and .net keep holding the mutex while doing a notify().

Don't change the subject. Java releases lock *on waiting thread* (which
is the behavior of a Hoare monitor by design) while waiting and then
reacquires it after it was notified.

RTFM for once (although, I know you won't):

"
public final void wait()
throws InterruptedException

Causes the current thread to wait until another thread invokes the
notify() method or the notifyAll() method for this object. In other
words, this method behaves exactly as if it simply performs the call
wait(0).

The current thread must own this object's monitor. The thread releases
ownership of this monitor and waits until another thread notifies
threads waiting on this object's monitor to wake up either through a
call to the notify method or the notifyAll method. The thread then waits
until it can re-obtain ownership of the monitor and resumes execution.

Pavel

unread,
Nov 12, 2023, 7:29:39 PM11/12/23
to
Kaz Kylheku wrote:
> On 2023-11-12, Bonita Montero <Bonita....@gmail.com> wrote:
>> No, Java and .net keep holding the mutex while doing a notify().
>> That's called a Mesa monitor.
>
> I don't suspect that is part of Mesa semantics. (It's definitely part of
> Hoare semantics.) Do you have a reference?
She doesn't. See my citation in response to her post for the reference
to the contrary.

>
> Since under Mesa semantics, threads re-acquire the mutex with fewer
> guarantees and must re-test the desired condition, Mesa semantics
> supports the more efficient protocol of signaling outside of the
> monitor.
>
> If you're using POSIX mutexes and conditions, you should call
> pthread_cond_signal and pthread_cond_broadcast outside of the mutex,
> whenever possible.
This is recommended against by the standard for the same reason why Java
implements Hoare monitor behavior. Citation:

"
The pthread_cond_broadcast() or pthread_cond_signal() functions may be
called by a thread whether or not it currently owns the mutex that
threads calling pthread_cond_wait() or pthread_cond_timedwait() have
associated with the condition variable during their waits; however, if
predictable scheduling behavior is required, then that mutex shall be
locked by the thread calling pthread_cond_broadcast() or
pthread_cond_signal().

Pavel

unread,
Nov 12, 2023, 7:33:26 PM11/12/23
to
Why would you have additional context switches if you signal before
releasing the lock?

Chris M. Thomasson

unread,
Nov 12, 2023, 7:44:50 PM11/12/23
to
Basically, if you signal while locked then another waiting thread is
going to try to acquire the mutex that is already locked by the
signalling thread, instant contention. However, wait morphing techniques
can be used to handle this since a mutex and a condvar are very intimate
with each other anyway. Usually, signalling outside of the mutex is ideal.

Kaz Kylheku

unread,
Nov 12, 2023, 11:48:50 PM11/12/23
to
On 2023-11-13, Pavel <pauldont...@removeyourself.dontspam.yahoo> wrote:
> Kaz Kylheku wrote:
>> If you're using POSIX mutexes and conditions, you should call
>> pthread_cond_signal and pthread_cond_broadcast outside of the mutex,
>> whenever possible.
> This is recommended against by the standard for the same reason why Java
> implements Hoare monitor behavior. Citation:
>
> "
> The pthread_cond_broadcast() or pthread_cond_signal() functions may be
> called by a thread whether or not it currently owns the mutex that
> threads calling pthread_cond_wait() or pthread_cond_timedwait() have
> associated with the condition variable during their waits; however, if
> predictable scheduling behavior is required, then that mutex shall be
> locked by the thread calling pthread_cond_broadcast() or
> pthread_cond_signal().
> "

But that text is stupid/defective, because you will not actually get
predictable scheduling behavior just by doing that.

Signal the condition while still holding the mutex doesn't give you any
guarantees about which thread will get the mutex next.

Suppose:

1. The signal operation wakes up the next waiting thread.

2. The signaler then gives up the mutex.

3. Before that awoken next-waiting thread gets the mutex, some
other thread comes along and seizes the mutex.

Signaling in the mutex can blow up the critical region from "handful of
instructions" to "hundreds of instructions".

If we compare:

mutex_lock(&stack->lock);
node->next = stack->top;
stack->top = node;
mutex_unlock(&stack->lock);

cond_signal(&stack->item_pushed);

All that is in the critical region are the list
manipulation instructions, plus some of the mutex code itself.

If we move cond_signal before mutex_unlock, everything done by that
function, including potentially going into the kernel to wake up a
thread, is now in the mutex.

That's a lot to pay for some vague, unfulfillable promise of
"predictable scheduling behavior", on which you can base approximately
nothing.

Hoare semantics gives you something: that if there are waiting tasks
queued on a condition, the monitor is transferred to the first waiting
one. *And* (I seem to recall) when that thread leaves the monitor, the
original signaler gets it again!

Kaz Kylheku

unread,
Nov 12, 2023, 11:50:26 PM11/12/23
to
On 2023-11-13, Pavel <pauldont...@removeyourself.dontspam.yahoo> wrote:
> Scott Lurndal wrote:
>> No, you don't. If you updated the predicate that the condition
>> variable is monitoring while holding the mutex, you should release
>> the mutex before signaling or broadcasting the condition variable
>> to avoid unnecessary context switches.
>>
> Why would you have additional context switches if you signal before
> releasing the lock?

Because of the situation that the thread which was signaled is
trying to acquire the mutex, which, stupidly, the signaling thread
is still holding. So, oops, back it goes into suspended state, and we
have to context switch to the mutex holder which has to release the
mutex and then switch to that signaled thread again.

Bonita Montero

unread,
Nov 13, 2023, 1:33:53 AM11/13/23
to
Am 12.11.2023 um 18:18 schrieb Kaz Kylheku:

>> This is a theoretical advantage.

> No it isn't. ...

With this code signalling from inside is about 50% faster on
my 3990X Zen2 64 core PC under Ubuntu.

#include <iostream>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <vector>

using namespace std;

int main( int argc, char ** )
{
mutex mtx;
condition_variable cv;
constexpr uint64_t N_ITEMS = 100'000'000;
atomic_uint64_t itemOutput( 0 );
auto producer = [&]()
{
static atomic_uint64_t itemCounter( 0 );
while( itemCounter.fetch_add( 1, memory_order_relaxed ) < N_ITEMS )
{
{
unique_lock lock( mtx );
itemOutput.fetch_add( 1, memory_order_relaxed );
if( argc <= 1 )
cv.notify_one();
}
if( argc > 1 )
cv.notify_one();
}
};
unsigned nProducers = jthread::hardware_concurrency() - 1;
vector<jthread> procuders;
procuders.reserve( nProducers );
for( unsigned t = 0; t != nProducers; ++t )
procuders.emplace_back( producer );
uint64_t nextItem = 0;
for( ; ; )
{
unique_lock lock( mtx );
uint64_t lastItem;
while( (lastItem = itemOutput.load( memory_order_relaxed )) < nextItem )
cv.wait( lock );
if( lastItem >= N_ITEMS )
break;
nextItem = lastItem;
}
}


Bonita Montero

unread,
Nov 13, 2023, 1:35:20 AM11/13/23
to
Am 12.11.2023 um 18:53 schrieb Scott Lurndal:

> No, you don't. If you updated the predicate that the condition
> variable is monitoring while holding the mutex, you should release
> the mutex before signaling or broadcasting the condition variable
> to avoid unnecessary context switches.

The context switch occurs only if _both_ conditions are met:
the mutex is unlocked and the condition variable is signalled.
It is loading more messages.
0 new messages