Simple producer/consumer copy of 18080 512 byte record file
with 40 buffers.
ru_nvcsw - voluntary context switches
ru_nivcsw - involuntary context switches
cvwait_r - # times producer thread did a cond_wait
cvwait_w - # times consumer thread did a cond_wait
NPTL condvars:
time = 1.359141
user time = 0.189971
system time = 1.165823
ru_nvcsw = 54063
ru_nivcsw = 54106
cvwait_r = 18021
cvwait_w = 0
fastcv w/o sched_yield:
time = 1.332471
user time = 0.199970
system time = 1.129828
ru_nvcsw = 36084
ru_nivcsw = 36133
cvwait_r = 18041
cvwait_w = 0
fastcv w/ sched_yield:
time = 0.572876
user time = 0.047993
system time = 0.514922
ru_nvcsw = 582
ru_nivcsw = 1694
cvwait_r = 561
cvwait_w = 6
NPTL semaphores:
time = 1.103147
user time = 0.095986
system time = 1.004847
ru_nvcsw = 17948
ru_nivcsw = 17990
cvwait_r = 0
cvwait_w = 0
NPTL semaphores w/ sched_yield after sem_wait:
time = 0.966496
user time = 0.098985
system time = 0.865868
ru_nvcsw = 460
ru_nivcsw = 37079
cvwait_r = 0
cvwait_w = 0
fast semaphores w/ sched_yield after sem_wait:
time = 0.555656
user time = 0.047993
system time = 0.505923
ru_nvcsw = 557
ru_nivcsw = 1664
cvwait_r = 0
cvwait_w = 0
Single thread read/write single buffer loop copy:
time = 0.559268
user time = 0.017998
system time = 0.539918
ru_nvcsw = 0
ru_nivcsw = 39
cvwait_r = 0
cvwait_w = 0
NPTL condvar w/ sched_yield after pthread_mutex_unlock:
time = 0.927346
user time = 0.099985
system time = 0.825875
ru_nvcsw = 27
ru_nivcsw = 38978
cvwait_r = 0
cvwait_w = 1
nreads = 18080
That last item is kind of strange. Combining it with
sched_yields in other places slows things down. I don't
know what is going on in that case.
Joe Seigh
Joe Seigh
fastcv.h
--------------------------------------------
#ifndef _FASTCV_H
#define _FASTCV_H
#include <pthread.h>
// Experimental futex-based condition variable ("fastcv").
// The macros below shadow the NPTL condvar type and entry points, so
// existing pthread code can be rebuilt against this implementation
// without source changes.
typedef struct {
int waiters; // atomic
int eventcount; // broadcast events (futex)
} pthread_cond_x_t;
#define pthread_cond_t pthread_cond_x_t
#undef PTHREAD_COND_INITIALIZER
#define PTHREAD_COND_INITIALIZER {0, 0}
// pthread_cond_wait is expressed as a timedwait with a NULL deadline.
#define pthread_cond_wait(c, m) pthread_cond_timedwait_x(c, m, NULL)
#define pthread_cond_timedwait pthread_cond_timedwait_x
#define pthread_cond_signal pthread_cond_signal_x
#define pthread_cond_broadcast pthread_cond_broadcast_x
#define pthread_cond_init pthread_cond_init_x
#define pthread_cond_destroy pthread_cond_destroy_x
extern int pthread_cond_timedwait_x(pthread_cond_t *, pthread_mutex_t *, struct timespec *);
extern int pthread_cond_signal_x(pthread_cond_t *);
extern int pthread_cond_broadcast_x(pthread_cond_t *);
extern int pthread_cond_init_x(pthread_cond_t *, pthread_condattr_t *);
extern int pthread_cond_destroy_x(pthread_cond_t *);
#endif /* _FASTCV_H */
--------------------------------------------
fastcv.c
--------------------------------------------
#include <pthread.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sched.h>
#include <sys/syscall.h>
#include <linux/futex.h>
#include <asm/atomic.h>
#define atomic_inc_x(x) atomic_inc((atomic_t *)(x))
#define atomic_add_negative_x(i, x) atomic_add_negative((i), (atomic_t *)(x))
#include "fastcv.h"
//---------------------------------------------------------------------
// futex syscall definitions
//
//---------------------------------------------------------------------
// futex_wait -- block while *futex still equals val; ts is a *relative*
// timeout (NULL = wait forever). Returns the raw syscall result.
#define futex_wait(futex, val, ts) \
syscall(SYS_futex, (unsigned long)futex, FUTEX_WAIT, val, (unsigned long)ts, 0)
// futex_wake -- wake up to nwake threads blocked on futex; returns the
// number of threads actually woken, or -1 on error.
#define futex_wake(futex, nwake) \
syscall(SYS_futex, (unsigned long)futex, FUTEX_WAKE, nwake, 0, 0)
//---------------------------------------------------------------------
// cancelation cleanup handler buffer
//
// Passed to cvar_cleanup via pthread_cleanup_push so a canceled waiter
// can fix up the waiters count and reacquire the caller's mutex.
//---------------------------------------------------------------------
struct cbuffer {
pthread_cond_t *cvar; // condvar the thread was waiting on
pthread_mutex_t *mutex; // user mutex to reacquire on cancelation
};
//---------------------------------------------------------------------
// abstime_to_reltime --
//
// returns ETIMEDOUT if abstime has passed
//---------------------------------------------------------------------
int abstime_to_reltime(struct timespec *reltime, const struct timespec *abstime) {
    struct timespec now;
    int borrow = 0;

    clock_gettime(CLOCK_REALTIME, &now);

    // Nanoseconds first, borrowing one second on underflow.
    if ((reltime->tv_nsec = abstime->tv_nsec - now.tv_nsec) < 0) {
        borrow = 1;
        reltime->tv_nsec += 1000000000;
    }

    // abstime < now: deadline already expired
    if ((reltime->tv_sec = (abstime->tv_sec - now.tv_sec) - borrow) < 0)
        return ETIMEDOUT;

    // abstime == now: expires immediately
    if (reltime->tv_sec == 0 && reltime->tv_nsec == 0)
        return ETIMEDOUT;

    // abstime > now: reltime holds the remaining time
    return 0;
}
//---------------------------------------------------------------------
// dec_waiters -- atomically decrement waiters count
// If result is -1, condvar is being destroyed, signal
// thread waiting on cond_destroy.
//
//---------------------------------------------------------------------
int dec_waiters(pthread_cond_t *cvar, int count) {
// Atomically subtract count from waiters. atomic_add_negative_x reports
// whether the result went negative, i.e. the count hit -1 because
// pthread_cond_destroy has already claimed the condvar.
// if (atomic_dec(&(cvar->waiters)) == 0)
if (atomic_add_negative_x(-count, &(cvar->waiters))) {
// signal if condvar pending deletion (waiters == -1)
cvar->eventcount++; // need not be atomic at this point
futex_wake(&(cvar->eventcount), 1);
return 1;
}
else
return 0;
}
//---------------------------------------------------------------------
// cvar_cleanup -- handle canceled condition wait
//
//---------------------------------------------------------------------
void cvar_cleanup(void *arg) {
struct cbuffer *cbuf = (struct cbuffer *)arg;
// Drop this thread's waiter slot. If destroy was not pending
// (dec_waiters returned 0), re-broadcast -- presumably so a wakeup that
// had been aimed at this now-canceled waiter is not lost.
// NOTE(review): confirm the broadcast-on-cancel rationale.
if (dec_waiters(cbuf->cvar, 1) == 0)
pthread_cond_broadcast(cbuf->cvar);
// POSIX requires the mutex to be reacquired when cancelation cleanup
// for a condition wait runs.
if (pthread_mutex_lock(cbuf->mutex) != 0)
abort();
}
//---------------------------------------------------------------------
// pthread_cond_timedwait --
//
//---------------------------------------------------------------------
int pthread_cond_timedwait_x(pthread_cond_t *cvar, pthread_mutex_t *mutex, struct timespec *abstime) {
int rc, rc2;
int waitval;
int canceltype;
struct cbuffer cbuf;
struct timespec reltime;
struct timespec *rtp = NULL; // relative timeout for futex_wait; NULL = infinite
// Cleanup context in case the thread is canceled while waiting.
cbuf.cvar = cvar;
cbuf.mutex = mutex;
atomic_inc_x(&(cvar->waiters)); // increment waiters count
// Snapshot the event generation *before* releasing the mutex: futex_wait
// below only blocks if eventcount still equals this value, so a
// signal/broadcast landing between unlock and wait is not lost.
waitval = cvar->eventcount; // atomic
if ((rc = pthread_mutex_unlock(mutex)) != 0) {
dec_waiters(cvar, 1);
return rc;
}
pthread_cleanup_push(&cvar_cleanup, &cbuf);
// Async cancelation so a cancel delivered while blocked in the futex
// syscall still runs cvar_cleanup.
pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &canceltype);
rc = 0;
if (abstime) {
// Futexes take relative timeouts; the conversion may itself report
// that the deadline has already passed.
rtp = &reltime;
rc = abstime_to_reltime(rtp, abstime);
}
if (rc == 0) {
rc = futex_wait(&(cvar->eventcount), waitval, rtp);
if (rc < 0)
rc = errno; // e.g. ETIMEDOUT, EINTR, or eventcount moved
#ifdef _FASTCV_YIELD
else
sched_yield();
#endif
}
pthread_setcanceltype(canceltype, NULL);
pthread_cleanup_pop(0);
// If not woken by futex_wakeup, decrement waiters count
// (a successful wake had its slot released by the waking thread).
if (rc != 0)
dec_waiters(cvar, 1);
// Reacquire the caller's mutex before returning, per condvar semantics.
if ((rc2 = pthread_mutex_lock(mutex)) != 0)
return rc2;
else if (rc != ETIMEDOUT)
return 0; // non-timeout errors surface as spurious wakeups
else
return ETIMEDOUT;
}
//---------------------------------------------------------------------
// pthread_cond_signal --
//
//---------------------------------------------------------------------
int pthread_cond_signal_x(pthread_cond_t *cvar) {
int rc;
// Zero-overhead check: skip the syscall entirely when nobody is waiting.
if (cvar->waiters > 0) {
// wake up a single waiter
// (eventcount is not bumped, so only a thread already parked in
// futex_wait can be woken by this)
rc = futex_wake(&(cvar->eventcount), 1);
if (rc == 0) {
// none woken up,
// do broadcast instead
// (a waiter must be between its eventcount snapshot and futex_wait;
// bumping the count makes its futex_wait return immediately)
atomic_inc_x(&(cvar->eventcount));
rc = futex_wake(&(cvar->eventcount), INT_MAX);
}
// Release the waiter slot of every thread actually woken.
if (rc > 0)
dec_waiters(cvar, rc);
}
return 0;
}
//---------------------------------------------------------------------
// pthread_cond_broadcast --
//
//---------------------------------------------------------------------
int pthread_cond_broadcast_x(pthread_cond_t *cvar) {
    int nwoken;

    // Zero-overhead fast path: no waiters, nothing to do.
    if (cvar->waiters <= 0)
        return 0;

    // Advance the event generation so any waiter that snapshotted the old
    // value falls straight through futex_wait, then wake everyone parked.
    atomic_inc_x(&(cvar->eventcount));
    nwoken = futex_wake(&(cvar->eventcount), INT_MAX);

    // Release the waiter slots of the threads actually woken.
    if (nwoken > 0)
        dec_waiters(cvar, nwoken);
    return 0;
}
//---------------------------------------------------------------------
// pthread_cond_init --
//
//---------------------------------------------------------------------
//
// Initialize a fastcv condvar: both counters start at zero, matching
// PTHREAD_COND_INITIALIZER. Attributes are accepted for API
// compatibility and ignored.
int pthread_cond_init_x(pthread_cond_t *cvar, pthread_condattr_t *attr) {
    (void)attr;
    memset(cvar, 0, sizeof *cvar);
    return 0;
}
//---------------------------------------------------------------------
// pthread_cond_destroy --
//
//---------------------------------------------------------------------
//
// Mark the condvar dead and wait for the last waiter to leave.
// (Removes the unused local `rc` from the original.)
int pthread_cond_destroy_x(pthread_cond_t *cvar) {
    int waitval;

    // Atomically drop waiters by one, pushing an idle condvar from 0 to
    // -1. A negative result means no waiters remain; otherwise block
    // until the last waiter's dec_waiters() drives the count to -1 and
    // wakes us through the eventcount futex.
    if (!atomic_add_negative_x(-1, &(cvar->waiters))) {
        // waiters still active -- wait for the count to reach -1.
        for (;;) {
            waitval = cvar->eventcount;
            // rmb();  // read barrier between eventcount and waiters reads
            if (cvar->waiters == -1) break;
            futex_wait(&(cvar->eventcount), waitval, NULL);
        }
    }
    // No kernel-side state to release.
    return 0;
}
/*-*/
That was a quick and dirty fast semaphore written with atomic
increment and decrement. You can write it with sem_trywait()
instead. You get a slight time increase to around .65.
Joe Seigh
Nice! I am looking forward to playing around with this code. It will teach
me more about these damn futexs, I don't quite understand them fully.
It just uses futex_wait and futex_wake which are fairly straightforward. An
earlier version looped on EINTR if the eventcount was unchanged. NPTL's
version can't loop because they use wait morphing and the waiting thread
can't know which futex it woke up on. futex_cmp_requeue is a little strange.
You sort of have to do a broadcast if the compare fails because you might have
lost a signal when the threads were temporarily dequeued. I don't know why
they didn't just hard code this into futex_cmp_requeue. It would be fairly
straightforward to add waitmorphing to my code. I just got tired of messing
with it.
Also the cancelation logic could be made simpler. I suspect that thread cancelation
is done with a built-in signal. If that was so, instead of setting up cancelation
handlers and enabling async cancelation, just call pthread_testcancel after reacquiring
the lock if futex_wait had exited with EINTR.
But enough fooling around with condvars. It's not my job, there are people who are
paid to do it and it's their job. I'll probably do a "zero overhead" eventcount
based on the futex since it is an eventcount of sorts.
Joe Seigh
I was wondering how futex_wait atomically checks if the futex value is the
same as the expected value. It kind of seems like a cas operation.
Something like this:
int futex_wait( int *futex, int cmp )
{
hash_futex_lock( futex );
if ( atomic_cas( futex, cmp, cmp ) == cmp )
{
ret = queue_me_and_wait( futex );
} else { ret = EWOULDBLOCK; }
hash_futex_unlock( futex );
}
?
No, it just has to queue (suspend) the waiting thread, fetch the futex
value, and dequeue (resume) the waiting thread if the futex value is
different than the cmp value.
There has to be a store-load barrier between the queue and futex fetch.
If dequeue fails because thread resumed by something else that's ok.
If thread has been dequeued and subsequently requeued before you attempted
the dequeue, that's ok even if the new cmp value has changed from the one
you are using. It's just a spurious wakeup.
There's currently a discussion about this on the kernel mailing list.
Joe
Would the following pseudo-code implement a zero-overhead eventcount?
It should, hummm...
#define EC_WAITERS 1
#define EC_LOCKED 2
#define EC_LOCK_WAITERS 4
typedef struct ec_
{
volatile ULONG count;
HANDLE waitset; // auto-reset event
per_thread_t *front;
per_thread_t *back;
} ec_t;
/* Locks and increments event count */
void prv_ec_lock_inc( ec_t *_this )
{
register int waited = 0;
register LONG cmp, xchg, old = _this->count;
for ( ;; )
{
if ( ! ( old & EC_LOCKED ) )
{
/* lock and inc */
xchg = ( old | EC_LOCKED ) + 0x00000010;
}
else { xchg = old | EC_LOCK_WAITERS; }
if ( waited ) { xchg |= EC_LOCK_WAITERS; }
cmp = old;
old = InterlockedCompareExchange
( &_this->count,
xchg & ~EC_WAITERS,
cmp );
if ( cmp == old )
{
if ( ! ( old & EC_LOCKED ) ) { break; }
if ( WaitForSingleObject
( _this->waitset,
INFINITE ) != WAIT_OBJECT_0 )
{
abort();
}
waited = 1;
old = _this->count;
}
}
}
/* Compares count, and locks if equal */
BOOL prv_ec_lock_cmp( ec_t *_this, LONG cmp )
{
register int waited = 0;
register LONG cmp, xchg, old = _this->count;
for ( ;; )
{
/* compare event count */
if ( ( old & 0xFFFFFFF0 ) == cmp )
{
if ( ! ( old & EC_LOCKED ) )
{
/* lock */
xchg = old | EC_LOCKED;
}
else { xchg = old | EC_LOCK_WAITERS; }
}
else { xchg = old; }
if ( waited ) { xchg |= EC_LOCK_WAITERS; }
cmp = old;
old = InterlockedCompareExchange
( &_this->count,
xchg,
cmp );
if ( cmp == old )
{
if ( ( old & 0xFFFFFFF0 ) != cmp ) { break; }
else if ( ! ( old & EC_LOCKED ) ) { return TRUE; }
if ( WaitForSingleObject
( _this->waitset,
INFINITE ) != WAIT_OBJECT_0 )
{
abort();
}
waited = 1;
old = _this->count;
}
}
return FALSE;
}
/* unlocks */
void prv_ec_unlock( ec_t *_this )
{
register LONG cmp, old = _this->count;
do
{
cmp = old;
old = InterlockedCompareExchange
( &_this->count,
old & ~( EC_LOCKED | EC_LOCK_WAITERS ),
cmp );
} while ( cmp != old );
if ( old & EC_LOCK_WAITERS )
{
if ( ! SetEvent( _this->waitset ) ) { abort(); }
}
}
LONG ec_get( ec_t *_this )
{
register LONG cmp, old = _this->count;
do
{
cmp = old;
/* set ec waiters */
old = InterlockedCompareExchange
( &_this->count,
old | EC_WAITERS,
cmp );
} while ( cmp != old );
return old & 0xFFFFFFF0;
}
void ec_wait( ec_t *_this, LONG cmp )
{
if ( prv_ec_lock_cmp( _this, cmp ) )
{
per_thread_t *thread = per_thread_self();
InterlockedIncrement( &thread->refs );
if ( ! _this->front )
{
_this->front = thread;
_this->back = thread;
}
else
{
_this->back->next = thread;
_this->back = thread;
}
prv_ec_unlock( _this );
if ( WaitForSingleObject
( thread->waiter,
INFINITE ) != WAIT_OBJECT_0 )
{
abort();
}
}
}
void ec_broadcast( ec_t *_this )
{
register LONG old = _this->count;
/* zero-overhead check for waiters??? */
if ( old & EC_WAITERS )
{
per_thread_t *thread, *next;
prv_ec_lock_inc( _this );
thread = _this->front;
_this->front = 0;
_this->back = 0;
prv_ec_unlock( _this );
while ( thread )
{
next = thread->next;
thread->next = 0;
if ( ! SetEvent( thread->waiter ) ) { abort(); }
per_thread_release( thread );
thread = next;
}
}
}
The eventcount and the mutex for the wait queue are combined in a single
word... Is this necessary?
I wanted to be able to acquire the wait queue lock and compare/inc the event
in one atomic operation.
I still don't understand lock-free eventcounts Joe, am I getting close, or
straying off into the wild!
;) lol
Would the following pseudo-code implement a zero-overhead eventcount?
SenderX wrote:
[...]
>
> The eventcount and the mutex for the wait queue are combined in a single
> word... Is this nessessary?
>
> I wanted to be able to acquire the wait queue lock and compare/inc the event
> in one atomic operation.
Potential for unnecessary spurious wakeups. They might not be a problem but
I wouldn't have all the resources to test and verify that.
>
> I still don't understand lock-free eventcounts Joe, am I getting close, or
> straying off into the wild!
>
> ;) lol
Actually, futexes are basically lock-free eventcounts by themselves. There's
nothing really to add except a waiter count if you want to avoid signaling when
there's no waiters. But you may not want or need a waiters count if there's
other ways to determine if there's waiters. If the wait condition is an empty
queue or stack then you don't need to signal when you enqueue onto an non-empty
queue. Enqueueing onto an empty queue doesn't necessarily mean there are waiters
but you'd have to do some testing to see how much of a problem that actually is.
Joe Seigh
Joe Seigh wrote:
>
> SenderX wrote:
> >
> > > It just uses futex_wait and futex_wake which are fairly straightforward.
> >
> > I was wondering how futex_wait atomically checks if the futex value is the
> > same as the expected value. It kind of seems like a cas operation.
> >
> > Something like this:
[...]
> No, it just has to queue (suspend) the waiting thread, fetch the futex
> value, and dequeue (resume) the waiting thread if the futex value is
> different than the cmp value.
>
> There has to be a store-load barrier between the queue and futex fetch.
>
> If dequeue failes because thread resumed by something else that's ok.
>
> If thread has been dequeued and subsequently requeued before you attempted
> the dequeue, that's ok even if the new cmp value has changed from the one
> you are using. It's just a spurious wakeup.
>
> There's currently a discussion about this on the kernel mailing list.
>
Maybe they do have a problem.
http://www.ussg.iu.edu/hypermail/linux/kernel/0411.2/0953.html
Seems you can wake a thread that was already "woken" up by the cmp value
before it had a chance to do the unqueue_me.
I think you can fix it by waking up another waiting thread if the unqueue_me
fails. Eventually you wake a thread that wasn't already "woken".
Joe Seigh
Using the eventcount algorithm I posted, you could probably do win32 convar
waits like this:
typedef ec_t cv_t;
void cv_wait( cv_t *_this, LPCRITICAL_SECTION mutex )
{
LONG cmp = ec_get( _this );
LeaveCriticalSection( mutex );
ec_wait( _this, cmp );
EnterCriticalSection( mutex );
}
Humm, looks too simple. But it should work fine...
> Maybe they do have a problem.
>
> http://www.ussg.iu.edu/hypermail/linux/kernel/0411.2/0953.html
If there are at least two threads blocking on pthread_cond_wait, and you
make two calls to pthread_cond_signal, it is not acceptable to wake the same
thread twice (assuming it doesn't come back around and call
pthread_cond_wait again).
DS
That has to be because they queue the thread before they check the cmp value?
Yeah, basically. They're going to have to go back to the earlier futex implementation
where they held the queue lock when they did the compare first before enqueueing.
Joe Seigh
I wrote:
> Maybe they do have a problem.
>
> http://www.ussg.iu.edu/hypermail/linux/kernel/0411.2/0953.html
>
> Seems you can wake a thread that was already "woken" up by the cmp value
> before it had a chance to do the unqueue_me.
>
> I think you can fix it by waking up another waiting thread if the unqueue_me
> fails. Eventually you wake a thread that wasn't already "woken".
>
If you go this route, the wait queue has to be FIFO, which it appears to be,
and the delayed woken thread need a different return code besides 0, for
instance EALREADY, unless you want to break the correspondence between the
return code from futex_wake and the number of threads getting return code 0.
Joe Seigh
> /* Compares count, and locks if equal */
> BOOL prv_ec_lock_cmp( ec_t *_this, LONG cmp )
BOOL prv_ec_lock_cmp( ec_t *_this, LONG ec_cmp )
> {
> register int waited = 0;
> register LONG cmp, xchg, old = _this->count;
>
> for ( ;; )
> {
> /* compare event count */
> if ( ( old & 0xFFFFFFF0 ) == cmp )
if ( ( old & 0xFFFFFFF0 ) == ec_cmp )
> {
> if ( ! ( old & EC_LOCKED ) )
> {
> /* lock */
> xchg = old | EC_LOCKED;
> }
>
> else { xchg = old | EC_LOCK_WAITERS; }
> }
>
> else { xchg = old; }
>
> if ( waited ) { xchg |= EC_LOCK_WAITERS; }
>
> cmp = old;
>
> old = InterlockedCompareExchange
> ( &_this->count,
> xchg,
> cmp );
>
> if ( cmp == old )
> {
> if ( ( old & 0xFFFFFFF0 ) != cmp ) { break; }
if ( ( old & 0xFFFFFFF0 ) != ec_cmp ) { break; }
Using eventcount's for cv's work great with broadcast's. However, not so
good with signals...
I am getting a race-condition when I try to add signal logic to the
eventcount. In order to signal, it seems I need to put waiting threads into
the wait queue when a request for the eventcount value is made, ec_get(
... ). But then I run into the same problem futex has with sending signal to
threads that have been signaled already due to the eventcount compare
failing.
Can efficient eventcounts be made to allow for signaling instead of pure
broadcasting?
Speaking of unnecessary spurious wakeups, it seems that all you have to do
to signal a single thread in an eventcount wrt condvars is inc + wake one.
But that "could" also signal an entire waitset of threads. Is this ok for
cv's?
> atomic_inc_x(&(cvar->eventcount));
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This can wake threads
> rc = futex_wake(&(cvar->eventcount), INT_MAX);
And this can wake threads.
Is this legal for condvar signal? I understand that these would be observed
as "simple and expected" spurious wakes by the user application. Every use
of condvars should handle extra waits anyway right?
> }
>
> if (rc > 0)
> dec_waiters(cvar, rc);
> }
>
> return 0;
> }
It seems eventcounts are legally broadcast only... ;(
Not if you get the thundering herd problem. Thread pools are probably the only
situation that would have that problem unless you used semaphores to limit the
number of threads waiting on the condvar.
Condvar signal is a bad api design decision but it's too well established at this
point. Better to create a new api.
Joe Seigh
SenderX wrote:
>
> > int pthread_cond_signal_x(pthread_cond_t *cvar) {
> > int rc;
> >
> > if (cvar->waiters > 0) {
> > // wake up a single waiter
> > rc = futex_wake(&(cvar->eventcount), 1);
> > if (rc == 0) {
> > // none woken up,
> > // do broadcast instead
>
> > atomic_inc_x(&(cvar->eventcount));
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> This can wake threads
>
> > rc = futex_wake(&(cvar->eventcount), INT_MAX);
>
> And this can wake threads.
>
> It this legal for condvar signal? I understand that these would be observed
> as "simple and expected" spurious wakes by the user application. Every use
> of condvars should handle extra waits anyway right?
Besides spurious wakeups allowed, pthread_cond_signal is defined as waking up
as least one, so actually waking up all waiting threads all the time would be
valid for pthread_cond_signal. In this implementation it almost never happens.
If you do signal only on a condvar then the eventcount value will be the number
of times it had to do a broadcast. On my uniprocessor I've only seen it happen
a couple of times maybe.
>
> > }
> >
> > if (rc > 0)
> > dec_waiters(cvar, rc);
> > }
> >
> > return 0;
> > }
>
> It seems eventcounts are legally broadcast only... ;(
Aside from weird semantics for win32 Event which makes them
complicated as well as useless.
Joe Seigh
Yeah.
>> > if (rc == 0) {
>> > // none woken up,
>> > // do broadcast instead
>>
>> > atomic_inc_x(&(cvar->eventcount));
The window is very small for a large waiter count to pile up right here.
>> > rc = futex_wake(&(cvar->eventcount), INT_MAX);
If it did get bad, wait morphing could absorb all of the contention on the
user mutex.
>>
>> It seems eventcounts are legally broadcast only... ;(
>
> Aside from weird semantics for win32 Event which makes them
> complicated as well as useless.
They also have horrible performance. Eventcount is superior in terms of
correctness and performance.
:)
So the lock-free version has wait morphing and runs faster than NPTL. 30% faster w/
wait morphing only, 225x faster w/ yield only hack and 250x faster with both. That's
for a deliberate thundering herd testcase and the really high performance factors are
due to NPTL running really slow with thundering herd.
Joe Seigh
Both are totally brain-damaged. Give it up, Joe. Futex-like beasts
(improved ones, not totally brain-dead stuff that they have currently)
are really good for locks (semas, mutexes, rwlocks), not condvars
and/or barriers. You just can't beat a condvar with an "explicit"
queue and per-thread sema-like gates with real "wait morphing."
regards,
alexander.
How much of a performance improvement did you get from that?
Joe Seigh
I'm talking theory. And you just can't beat theory. ;-)
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> [...]
> > How much of a performance improvement did you get from that?
>
> I'm talking theory. And you just can't beat theory. ;-)
>
In theory, anyway. :)
Joe Seigh
cvwaits = total number of waits by all consumers
NPTL signal:
time = 29.770862
user time = 7.161911
system time = 22.568569
ru_nvcsw = 6000001
ru_nivcsw = 6000431
cvwaits = 2000000
NPTL broadcast:
time = 173.251578
user time = 46.028003
system time = 126.859714
ru_nvcsw = 27309705
ru_nivcsw = 4993749
cvwaits = 24807747
lock-free signal:
time = 23.243099
user time = 6.329038
system time = 16.829441
ru_nvcsw = 4000020
ru_nivcsw = 4000379
cvwaits = 2000000
lock-free broadcast:
time = 286.615726
user time = 78.123124
system time = 207.901394
ru_nvcsw = 47827422
ru_nivcsw = 2850720
cvwaits = 23913022
NPTL signal after unlock:
time = 22.624151
user time = 6.106072
system time = 16.469496
ru_nvcsw = 4000014
ru_nivcsw = 4000362
cvwaits = 2000000
lock-free signal after unlock:
time = 15.190874
user time = 5.813116
system time = 9.341580
ru_nvcsw = 2000021
ru_nivcsw = 2000253
cvwaits = 2000000
NPTL broadcast after unlock:
time = 190.805691
user time = 49.233516
system time = 141.172539
ru_nvcsw = 30088078
ru_nivcsw = 1894149
cvwaits = 29934391
lock-free broadcast after unlock:
time = 244.205641
user time = 74.801628
system time = 168.898324
ru_nvcsw = 38410421
ru_nivcsw = 2159221
cvwaits = 38270044
lock-free signal w/ sched_yield:
time = 0.793973
user time = 0.775882
system time = 0.015997
ru_nvcsw = 14253
ru_nivcsw = 19942
cvwaits = 9371
lock-free broadcast w/ sched_yield:
time = 0.796090
user time = 0.761884
system time = 0.028996
ru_nvcsw = 14608
ru_nivcsw = 11375
cvwaits = 9423
lock-free signal after unlock w/ sched_yield:
time = 0.789347
user time = 0.757885
system time = 0.029996
ru_nvcsw = 13515
ru_nivcsw = 19924
cvwaits = 9401
lock-free broadcast after unlock w/ sched_yield
time = 0.781465
user time = 0.711892
system time = 0.063990
ru_nvcsw = 13526
ru_nivcsw = 11098
cvwaits = 9385
lock-free broadcast w/ wait morphing
time = 104.396387
user time = 30.187411
system time = 73.954758
ru_nvcsw = 15088934
ru_nivcsw = 2289617
cvwaits = 14259217
lock-free broadcast after unlock w/ wait morphing
time = 139.794501
user time = 43.773346
system time = 95.617464
ru_nvcsw = 20290971
ru_nivcsw = 1287170
cvwaits = 20290364
lock-free broadcast w/ sched_yield & wait morphing:
time = 0.693834
user time = 0.690895
system time = 0.000000
ru_nvcsw = 1439
ru_nivcsw = 2058
cvwaits = 891
lock-free broadcast after unlock w/ sched_yield & wait morphing:
time = 0.685087
user time = 0.677897
system time = 0.006999
ru_nvcsw = 1608
ru_nivcsw = 2226
cvwaits = 1075
Joe Seigh
No response in the kernel mailing list since this problem was stated. I don't
think they realize there's a problem.
Joe Seigh
Yeah, the fast win32 condvar sketch I posted really screams on 4-way
hyperthreaded xeon under win32 or linux. It beats current NPTL condvar.
explicit managed queue on win32 SMP seems to run fine, without any apparent
performance problems from SCHED_OTHER. Hummm...
This could easily be solved with Joes idea of using an explicit semaphore
for each eventcount generation...
Joe Seigh
>
> Well, so much for Linux. GCC suddenly decided to segfault for all programs
> including HelloWorld. Nothing to see here. Move along.
>
Ah, nothing. It was just the Linux equivalent of BSOD.
Well, now that I got GCC to work again, I have definitive proof that futex wait morphing
is somewhat suboptimal. Lucky thing that being suboptimal is not a problem for Linux.
Joe Seigh
> > If you go this route, the wait queue has to be FIFO, which it appears to be,
> > and the delayed woken thread need a different return code besides 0, for
> > instance EALREADY, unless you want to break the correspondence between the
> > return code from futex_wake and the number of threads getting return code 0.
> >
>
> No response in the kernel mailing list since this problem was stated. I don't
> think they realize there's a problem.
>
It looks like they picked it up again. They'll probably try to "fix" it. What do
you think the odds are that if there was a bug from using the simpler interface
there won't be a bug caused by a much more complicated and subtle interface?
Whatever they do will probably break the code I did, so it's probably a good thing
it was just experimental.
Joe Seigh
You mean that "drowsy" thing? My, this is fun. ;-)
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> [...]
> > It looks like they picked it up again.
>
> You mean that "drowsy" thing? My, this is fun. ;-)
>
Actually, no. I'm sorry I did the alternate condvar
implementation now. It's better if you believe all the
POSIX threads implemetations are optimal so you can
concentrate on the logic of your programs rather than
using performance hacks. Now that they all run on the
same platforms as fully supported, can you imagine if
someone actually did pthread performance measurements
for Linux, Solaris and windows on the same hardware.
How could you write portable code if you thought performance
was important?
Joe Seigh
Why? Your algorithm is extremely straight forward. Even the sched_yield
thing... Its so simple!
Linux maintainers need to understand that.
>It's better if you believe all the
>POSIX threads implemetations are optimal so you can
>concentrate on the logic of your programs rather than
>using performance hacks.
> Now that they all run on the
>same platforms as fully supported, can you imagine if
>someone actually did pthread performance measurements
>for Linux, Solaris and windows on the same hardware.
>How could you write portable code if you though performance
>was important?
I think the answer is to the problem is, link against an alternate pthread
library...
:O
SenderX wrote:
>
> > Actually, no. I'm sorry I did the alternate condvar
> > implementation now.
>
> Why? Your algorithm is extremely straight forward. Even the sched_yield
> thing... Its so simple!
>
> Linux maintainers need to understand that.
You need to understand Linux kernel developers. They exist in their own
self contained universe. Anyway the tail wagging the dog api design is
fun to watch. You can see where they think some aspect of their implementation
is neat and they will attempt to preserve it no matter what the cost in
complexity and actual performance.
Joe Seigh
Agreed.
> You can see where they think some aspect of their implementation
> is neat and they will attempt to preserve it no matter what the cost in
> complexity and actual performance.
I have no idea what they think, but their "requeue" stuff is awfully
silly with or without "compare." Condvars aside for a moment (futex
based pigs won't fly) they can't get those futex beasts (lock queues)
right for locks, to begin with. I mean kernel maintained waiters bit
for semas, mutexes, and read-write locks; primarily.
Just look at their pthread barrier thing... what's the point of
doing all sorts of utterly ugly stuff in the user space if all
threads always end up in the kernel?!
<quote>
Barriers pseudocode
===================
int pthread_barrier_wait(barrier_t *barrier);
struct barrier_t {
unsigned int lock;
- internal mutex
unsigned int left;
- current barrier count, # of threads still needed.
unsigned int init_count;
- number of threads needed for the barrier to continue.
unsigned int curr_event;
- generation count
}
pthread_barrier_wait(barrier_t *barrier)
{
unsigned int event;
result = 0;
lll_lock(barrier->lock);
if (!--barrier->left) {
barrier->curr_event++;
futex_wake(&barrier->curr_event, INT_MAX)
result = BARRIER_SERIAL_THREAD;
} else {
event = barrier->curr_event;
lll_unlock(barrier->lock);
do {
futex_wait(&barrier->curr_event, event)
} while (event == barrier->curr_event);
}
if (atomic_increment_val (barrier->left) == barrier->init_count)
lll_unlock(barrier->lock);
return result;
}
</quote>
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> [...]
> > fun to watch.
>
> Agreed.
>
> > You can see where they think some aspect of their implementation
> > is neat and they will attempt to preserve it no matter what the cost in
> > complexity and actual performance.
>
> I have no idea what they think, but their "requeue" stuff is awfully
> silly with or without "compare." Condvars aside for a moment (futex
> based pigs won't fly) they can't get those futex beasts (lock queues)
> right for locks, to begin with. I mean kernel maintained waiters bit
> for semas, mutexes, and read-write locks; primarily.
They'd need a way to set the bit on all platforms. And they'd need to
lock the queue or block dequeue operations while they examined or set
the futex, something they don't do currently.
>
> Just look at their pthread barrier thing... what's the point of
> doing all sorts of utterly ugly stuff in the user space if all
> threads always end up in the kernel?!
>
(barrier code snipped)
Well, the lock is entirely redundant but probably doesn't hurt anything
as a barrier usually blocks ((n-1)/n) of the time. They'd be better off
if the barrier split out the signal finish and the wait sections so
threads could do something else and be more likely to avoid entering
the kernel and blocking. Maybe do iterations based on the (n-2)th result
rather than (n-1)th result and use alternating staggered barriers.
Joe Seigh
Sure. What's the problem?
> And they'd need to
> lock the queue or block dequeue operations while they examined or set
> the futex, something they don't do currently.
I don't think that their current futex impl is lock-free.
[... barrier ...]
> They'd be better off
> if the barrier split out the signal finish and the wait sections so
> threads could do something else and be more likely to avoid entering
> the kernel and blocking. Maybe do iterations based on the (n-2)th result
> rather than (n-1)th result and use alternating staggered barriers.
Or maybe simply count waiters in the kernel (per-thread gates with fast
path approach aside for a moment).
https://listman.redhat.com/archives/phil-list/2003-October/msg00031.html
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> [...]
> > They'd need a way to set the bit on all platforms.
>
> Sure. What's the problem?
None. They can't use the current routines to examine user storage, they'd
have to write a new one with unique implementations for all platforms.
Just extra work, that's all.
>
> > And they'd need to
> > lock the queue or block dequeue operations while they examined or set
> > the futex, something they don't do currently.
>
> I don't think that their current futex impl is lock-free.
They currently don't hold the queue lock while examining the futex value.
I'm guessing this is because they don't want the overhead of pinning and
unpinning the futex memory or something like that. It opens up a race
condition where a futex_wake can wake up a thread that has observably
already been "woken" by the futex value. Since they've become rather fond
of not having to pin memory, they're entertaining some rather contrived logic
to continue that way. I don't know but if they managed to get an error
with some rather simple logic, the odds for errors in much more complicated
logic are probably a bit more.
Anyhow, Linux has managed to get their own equivalent of Microsoft's
PulseEvent problem. That's progress.
Joe Seigh
Ah that. It's a similar problem as with "stolen" signals due to
timeouts (under unreasonable Patrick TJ McPhee's interpretation,
hello Patrick ;-) )
See <http://www.terekhov.de/DESIGN-futex-CV.cpp>.
I mean "stolen_sig = this_waiter.is_reset()" in timedwait() and
"m_futex.wake(ALL)" in leave_wait() as circumvention. But even
with all such things fixed (and with added optimizations for fast
signaling with no waiters), such futex based condition variables
are still pigs only good for some mental gymnastics, so to say.
regards,
alexander.
Well, futexes are all you have to work with, warts and all, so
I guess that make them warthogs. :)
Unless you want to write your own sycall. I haven't considered that
because it's too much trouble trying to figure out the kernel api
from reading scattered code fragments, given that the api to do that
should conceptually be very simple. Kind of like what application
programming would be if there were no man pages and everybody was
expected to look at libc source code to figure out what and how
to use it.
Joe Seigh
Lowering the "barrier to entry" (with respect to kernel programming)
isn't in the {financial, etc.} interests of the kernel gang. ;-) ;-)
regards,
alexander.
Well, you don't want too many kernel developers. You might argue there
are too many already. So that's one way of doing it, by keeping it
difficult to learn. But the downside to that is you may get all the
wrong sort of kernel developers that way, ones who because they can
deal with complexity think nothing of keeping it that way and writing
even more complex code which ensures the competition is kept out.
Probably one of the reasons microkernels are disliked so much by the
Linux establishment.
Joe Seigh
Alexander Terekhov wrote:
>
> ... But even
> with all such things fixed (and with added optimizations for fast
> signaling with no waiters), such futex based condition variables
> are still pigs only good for some mental gymnastics, so to say.
>
But being a pig, it can be cured. Hmm... I could fix the futex problem
plus make it run even faster. Interesting.
Joe Seigh
I posted an eventcount algorithm sketch that basically embedded the mutex
for its explicit waitqueue in the eventcount variable itself:
http://groups.google.com/groups?selm=Sp0nd.422326%24D%25.325220%40attbi_s51
( 2 bits reserved by the waitqueue, and 1 waiters bit reserved by the
eventcount )
The algorithm I came up with can acquire the explicit waitqueue lock "and"
compare and/or set the eventcount atomically. That totally eliminates any
race-conditions wrt setting the eventcount and waking threads. Do you see
any problems with this schema? I think that the ability to lock the waitset
and compare/set the eventcount is critical.
It seems if a futex impl could be heavily improved if it had a bit or two
reserved for a mutex that protects it's explicit waitqueue...
You can't have the lock that protects the queue, or any kernel data structure,
in user space. Deliberate or inadvertent damage to the lockwork by a user
program could cause corruption of the kernel. The present futex logic of
just comparing against the futex word is ok since any value is ok. The
worst that could happen due to corruption of the futex word is spurious
or lost wakeups, not a kernel panic.
There's better ways of implementing futexes but none of us are in a position
to do that. I suspect that if they realize that the current futex api allows
alternate pthread implementations, they will change it so that the only possible
implementation is theirs so as to not suffer in comparison.
Joe Seigh
Unfortunately you're correct. So I guess any sort of waiter's bit maintained by
the kernel is out of the question, since the bit would imply some sort of
sync between the user-space futex and its hashed kernel waitqueue. Or is
Alex talking about something else when he refers to a waiters bit?
Unfortunately you're correct. So I guess any sort of waiter's bit maintained by
As long as the kernel doesn't do anything with the waiters bit except set or
unset it to reflect the queue status it should be ok. If a user process
clobbered the waiters bit, it should eventually get set again to the right
value by the kernel. Some user logic might get screwed up but that shouldn't
affect kernel integrity.
Joe Seigh
I don't quite understand. It seems that the waiters bit would be critical.
If the waiters bit got clobbered right before a call to futex_wake, would
that not be a potential problem?
futex_wake wouldn't use the waiters bit as input. In fact it would mask it
out to do the compare. When it's finished it sets the waiters bit to reflect
the queue status. The waiters bit is only used by user code to "see" what
the queue status is.
Joe Seigh
I think I understand now. futex_wait/wake would atomically set the waiters
bit if there were still any waiters in the queue, or clear the bit if the
queue was empty, before unlocking the queue mutex...
So, I guess a mutex impl could use the waiters bit as a simple flag to call
futex_wake, just as you mentioned...
#define WAITERS_BIT 0x80000000
mutex lock
-------------
while ( atomic_xchg( &futex, 1 ) )
{
futex_wait( &futex, 1 )
{
lock_wait_queue();
if ( *futex == 1 )
{
enqueue_us;
}
if ( there_are_any_waiters )
{
atomic_bit_set( futex, 0x80000000 );
}
else { atomic_bit_clear( futex, 0x80000000 ); }
unlock_wait_queue();
if ( we_enqueued )
{
wait on our gate...
}
}
}
mutex unlock
-------------
if ( atomic_xchg( &futex, 0 ) & WAITERS_BIT )
{
futex_wake( &futex, 1 )
{
lock_wait_queue();
dequeue_a_waiter;
if ( there_are_any_waiters_left )
{
atomic_bit_set( futex, 0x80000000 );
}
else { atomic_bit_clear( futex, 0x80000000 ); }
unlock_wait_queue();
if ( we_dequeued_a_waiter )
{
signal waiter's gate...
}
}
}
Is that about it?
BTW, thanks for bearing with me so far!
;)
Here is the fixed lock function
mutex lock
-------------
while ( ( atomic_xchg( &futex, 1 ) & ~WAITERS_BIT ) )
{
futex_wait( &futex, 1 )
{
lock_wait_queue();
if ( ( *futex & ~WAITERS_BIT ) == 1 )
SenderX wrote:
>
> Arghrghrg. I totally forgot to mask off the fu#$% bit!
>
> Here is the fixed lock function
>
> mutex lock
> -------------
>
> while ( ( atomic_xchg( &futex, 1 ) & ~WAITERS_BIT ) )
> {
> futex_wait( &futex, 1 )
> {
pin_memory_page(futex);
> lock_wait_queue();
>
> if ( ( *futex & ~WAITERS_BIT ) == 1 )
> {
> enqueue_us;
> }
>
> if ( there_are_any_waiters )
> {
> atomic_bit_set( futex, 0x80000000 );
> }
>
> else { atomic_bit_clear( futex, 0x80000000 ); }
>
> unlock_wait_queue();
unpin_memory_page(futex);
>
> if ( we_enqueued )
> {
> wait on our gate...
> }
> }
> }
Of course it's a little bit more complicated. Currently the only routines for
accessing user storage are things like get_user() and put_user(). You'd have
to write a set of interlocked user access routines -- for all platforms.
Plus the pin/unpin memory routines probably aren't that cheap. They're required
to make the user memory access routines safe to use while holding a spin lock.
That's what they're trying to avoid. It sounds like that's what the original
2.4 futex did. Anybody know what the performance difference was between the
old futex and the current one?
Joe Seigh
WHILE
atomic_bit_test_set_ccacq(&futex, 1)
lock_queue_wait(&futex, 1) // wait if locked bit is set
> > {
> > futex_wait( &futex, 1 )
> > {
>
> pin_memory_page(futex);
> > lock_wait_queue();
> >
> > if ( ( *futex & ~WAITERS_BIT ) == 1 )
> > {
> > enqueue_us;
> > }
> >
> > if ( there_are_any_waiters )
> > {
> > atomic_bit_set( futex, 0x80000000 );
> > }
No. Waiters bit must be set *before* lock state check or "together"
with lock state check (I mean CAS LL/LR-SC).
> >
> > else { atomic_bit_clear( futex, 0x80000000 ); }
If "realtime" is not an issue, it can be cleared in the user-space
by mutex_unlock.
IF atomic_swap_rel(lock_queue = &futex, 0) < 0 // or > 1
THEN lock_queue_wake(lock_queue, 1)
lock_queue_wake() (or wake exit path in lock_queue_wait()) would
have to set waiters bit again if queue remains not empty.
regards,
alexander.
Joe Seigh
What's the issue with pinning and unpinning (or lack thereof)?
regards,
alexander.
Taking a page fault while holding a spin lock.
Joe Seigh
Well, whatever. Waiters bit can be set without holding a lock.
And "lost signals" that they're fighting now is not a problem for
locks.
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> >
> > Alexander Terekhov wrote:
> > >
> > > Joe Seigh wrote:
> > > >
> > > > What's the advantage of the waiters bit that is going to outweigh
> > > > the disavantage of pinning and unpinning memory?
> > >
> > > What's the issue with pinning and unpinning (or lack thereof)?
> > >
> >
> > Taking a page fault while holding a spin lock.
>
> Well, whatever. Waiters bit can be set without holding a lock.
Without race conditions with regard to the state of the wait queue
which is protected by a spin lock?
>
> And "lost signals" that they're fighting now is not a problem for
> locks.
But lost waiters bit would be.
Joe Seigh
False positives won't affect correctness.
> >
> > And "lost signals" that they're fighting now is not a problem for
> > locks.
>
> But lost waiters bit would be.
Again, consider:
mutex_lock {
WHILE
atomic_bit_test_set_ccacq(&futex, 1)
lock_queue_wait(&futex, 1) // wait if locked bit is set
}
mutex_unlock {
IF atomic_swap_rel(lock_queue = &futex, 0) < 0 // or > 1
THEN lock_queue_wake(lock_queue, 1)
}
And I still don't quite understand the problem of page faults while
holding locks associated with futexes (I'm talking about per-futex
locks).
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> [...]
> > > Well, whatever. Waiters bit can be set without holding a lock.
> >
> > Without race conditions with regard to the state of the wait queue
> > which is protected by a spin lock?
>
> False positives won't affect correctness.
False negatives would affect correctness.
>
> > >
> > > And "lost signals" that they're fighting now is not a problem for
> > > locks.
> >
> > But lost waiters bit would be.
>
> Again, consider:
>
> mutex_lock {
> WHILE
> atomic_bit_test_set_ccacq(&futex, 1)
> lock_queue_wait(&futex, 1) // wait if locked bit is set
> }
>
> mutex_unlock {
> IF atomic_swap_rel(lock_queue = &futex, 0) < 0 // or > 1
> THEN lock_queue_wake(lock_queue, 1)
> }
>
> And I still don't quite understand the problem of page faults while
> holding locks associated with futexes (I'm talking about per-futex
> locks).
>
Well, they couldn't use spin locks, it would have to be another form
of blocking.
Anyway you haven't answered why a waiters bit would be useful enough
to outweigh its disadvantages.
Joe Seigh
Consider the sketch below. Suppose that you've got multiple waiters.
mutex_unlock would clear waiters bit and there will be a time window
with "false negative." But that's harmless provided that
"lock_queue_wake(lock_queue, 1)" would set waiters bit again (before
"releasing" a waiter).
> >
> > > >
> > > > And "lost signals" that they're fighting now is not a problem for
> > > > locks.
> > >
> > > But lost waiters bit would be.
> >
> > Again, consider:
> >
> > mutex_lock {
> > WHILE
> > atomic_bit_test_set_ccacq(&futex, 1)
> > lock_queue_wait(&futex, 1) // wait if locked bit is set
> > }
> >
> > mutex_unlock {
> > IF atomic_swap_rel(lock_queue = &futex, 0) < 0 // or > 1
> > THEN lock_queue_wake(lock_queue, 1)
> > }
> >
> > And I still don't quite understand the problem of page faults while
> > holding locks associated with futexes (I'm talking about per-futex
> > locks).
> >
>
> Well, they couldn't use spin locks, it would have to be another form
> of blocking.
Whatever.
>
> Anyway you haven't answered why a waiters bit would be useful enough
> to outweigh its disadvantages.
Again, what disadvantages?
regards,
alexander.
As for some advantages, think of real (not the idiocy they're
entertaining currently) wait-morphing. Both pthread_cond_signal()
and pthread_cond_broadcast() would do it nicely with waiters bit
on a mutex. No need to always release one waiter and make him set
"contention state" on a mutex. That waiter would instead go
directly to the queue of the locked mutex with waiters bit set
by the kernel.
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> >
> > Alexander Terekhov wrote:
> > >
Skip the race condition issue for now since it may depend on how
locking is performed.
> > >
> > > And I still don't quite understand the problem of page faults while
> > > holding locks associated with futexes (I'm talking about per-futex
> > > locks).
> > >
> >
> > Well, they couldn't use spin locks, it would have to be another form
> > of blocking.
>
> Whatever.
The other form of blocking is likely to be more expensive, otherwise why
have spin locks which have a more limited application.
>
> >
> > Anyway you haven't answered why a waiters bit would be useful enough
> > to outweigh its disadvantages.
>
> Again, what disadvantages?
>
The lack of any advantages, apparently. Plus, most people would consider that
it's not implemented to be a distinct disadvantage also.
Joe Seigh
Actually, that's not the reason they wake up that one waiter. I found
that out the hard way. Given the way adaptive mutexes work and from
what I've measured, I doubt you could even measure any effect from the
spurious wakeups.
Joe Seigh
As I mentioned one or two times before, RT-NPTL project has already
implemented (modulu bugs) something essentially similar to lock
futexes with waiters bits. They call them "fulocks". The kernel
indicates the presence of waiters by modifying the user-space state.
They set the lock state to "VFULOCK_WP" (WP stands for "waiters
present" I guess).
regards,
alexander.
What are you talking about? Details and "actual reason", please.
regards,
alexander.
Alexander Terekhov wrote:
>
> >
> > Actually, that's not the reason they wake up that one waiter. I found
> > that out the hard way. Given the way adaptive mutexes work and from
> > what I've measured, I doubt you could even measure any effect from the
> > spurious wakeups.
>
> What are you talking about? Details and "actual reason", please.
>
Well, the testcase I ran was .8 sec for broadcast w/ thundering herd
and .7 sec for broadcast w/ wait morphing. If waking up all of the
threads didn't cause a huge difference then I don't think the extra
spurious wake per broadcast is going to be that significant. Why?
Have you measured different results?
Joe Seigh
Does this sketch look a bit more correct...
int futex_wait( int *futex, int futex_cmp )
{
register int cmp, old;
pin_memory_page( futex );
lock_wait_queue( futex );
old = *futex;
do
{
if ( old != futex_cmp && there_are_no_waiters ) { goto bye; }
cmp = old;
old = atomic_cas( futex, old | WAITERS_BIT, cmp );
} while ( cmp != old );
if ( old == futex_cmp )
{
enqueue_us;
}
bye:
unlock_wait_queue( futex );
unpin_memory_page( futex );
if ( we_enqueued )
{
wait_on_gate( ... );
}
return 0;
}
;(......
int futex_wait( int *futex, int futex_cmp )
{
register int cmp, old;
pin_memory_page( futex );
lock_wait_queue( futex );
old = *futex;
do
{
if ( ( old & ~WAITERS_BIT ) != futex_cmp && there_are_no_waiters )
{ goto bye; }
cmp = old;
old = atomic_cas( futex, old | WAITERS_BIT, cmp );
} while ( cmp != old );
if ( ( old & ~WAITERS_BIT ) == futex_cmp )
old = atomic_cas( futex, cmp, old | WAITERS_BIT );
It "seems" that CAS/LL-SC could be used to do just that, along with the
futex compare, before you pin and lock the futex queue... Because an unequal
futex comparand is basically treated like a broadcast, so there really would
be no deadlocks?
They're probably FIFO or some other scheduler controled service order
lock rather than an adaptive lock, so they could do that with a simple
put_user(). Of course there's other ways to get the same effect without
having to do a put_user. And there's ways that don't even require that
information.
I should patent some of the ways that haven't been disclosed yet. With
the fixation on suboptimal solutions, they might be worth something.
Joe Seigh