Simple producer/consumer copy of 18080 512 byte record file
with 40 buffers.
ru_nvcsw - voluntary context switches
ru_nivcsw - involuntary context switches
cvwait_r - # times producer thread did a cond_wait
cvwait_w - # times consumer thread did a cond_wait
NPTL condvars:
time = 1.359141
user time = 0.189971
system time = 1.165823
ru_nvcsw = 54063
ru_nivcsw = 54106
cvwait_r = 18021
cvwait_w = 0
fastcv w/o sched_yield:
time = 1.332471
user time = 0.199970
system time = 1.129828
ru_nvcsw = 36084
ru_nivcsw = 36133
cvwait_r = 18041
cvwait_w = 0
fastcv w/ sched_yield:
time = 0.572876
user time = 0.047993
system time = 0.514922
ru_nvcsw = 582
ru_nivcsw = 1694
cvwait_r = 561
cvwait_w = 6
NPTL semaphores:
time = 1.103147
user time = 0.095986
system time = 1.004847
ru_nvcsw = 17948
ru_nivcsw = 17990
cvwait_r = 0
cvwait_w = 0
NPTL semaphores w/ sched_yield after sem_wait:
time = 0.966496
user time = 0.098985
system time = 0.865868
ru_nvcsw = 460
ru_nivcsw = 37079
cvwait_r = 0
cvwait_w = 0
fast semaphores w/ sched_yield after sem_wait:
time = 0.555656
user time = 0.047993
system time = 0.505923
ru_nvcsw = 557
ru_nivcsw = 1664
cvwait_r = 0
cvwait_w = 0
Single thread read/write single buffer loop copy:
time = 0.559268
user time = 0.017998
system time = 0.539918
ru_nvcsw = 0
ru_nivcsw = 39
cvwait_r = 0
cvwait_w = 0
NPTL condvar w/ sched_yield after pthread_mutex_unlock:
time = 0.927346
user time = 0.099985
system time = 0.825875
ru_nvcsw = 27
ru_nivcsw = 38978
cvwait_r = 0
cvwait_w = 1
nreads = 18080
That last item is kind of strange. Combining it with
sched_yields in other places slows things down. I don't
know what is going on in that case.
Joe Seigh
Joe Seigh
fastcv.h
--------------------------------------------
#ifndef _FASTCV_H
#define _FASTCV_H
#include <pthread.h>
// Experimental futex-based condition variable ("fastcv").
// The macros below shadow the NPTL condvar type and entry points, so
// existing pthread code can be rebuilt against this implementation
// without source changes.
typedef struct {
int waiters; // atomic
int eventcount; // broadcast events (futex)
} pthread_cond_x_t;
#define pthread_cond_t pthread_cond_x_t
#undef PTHREAD_COND_INITIALIZER
#define PTHREAD_COND_INITIALIZER {0, 0}
// pthread_cond_wait is expressed as a timedwait with a NULL deadline.
#define pthread_cond_wait(c, m) pthread_cond_timedwait_x(c, m, NULL)
#define pthread_cond_timedwait pthread_cond_timedwait_x
#define pthread_cond_signal pthread_cond_signal_x
#define pthread_cond_broadcast pthread_cond_broadcast_x
#define pthread_cond_init pthread_cond_init_x
#define pthread_cond_destroy pthread_cond_destroy_x
extern int pthread_cond_timedwait_x(pthread_cond_t *, pthread_mutex_t *, struct timespec *);
extern int pthread_cond_signal_x(pthread_cond_t *);
extern int pthread_cond_broadcast_x(pthread_cond_t *);
extern int pthread_cond_init_x(pthread_cond_t *, pthread_condattr_t *);
extern int pthread_cond_destroy_x(pthread_cond_t *);
#endif /* _FASTCV_H */
--------------------------------------------
fastcv.c
--------------------------------------------
#include <pthread.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sched.h>
#include <sys/syscall.h>
#include <linux/futex.h>
#include <asm/atomic.h>
#define atomic_inc_x(x) atomic_inc((atomic_t *)(x))
#define atomic_add_negative_x(i, x) atomic_add_negative((i), (atomic_t *)(x))
#include "fastcv.h"
//---------------------------------------------------------------------
// futex syscall definitions
//
//---------------------------------------------------------------------
// futex_wait -- block while *futex still equals val; ts is a *relative*
// timeout (NULL = wait forever). Returns the raw syscall result.
#define futex_wait(futex, val, ts) \
syscall(SYS_futex, (unsigned long)futex, FUTEX_WAIT, val, (unsigned long)ts, 0)
// futex_wake -- wake up to nwake threads blocked on futex; returns the
// number of threads actually woken, or -1 on error.
#define futex_wake(futex, nwake) \
syscall(SYS_futex, (unsigned long)futex, FUTEX_WAKE, nwake, 0, 0)
//---------------------------------------------------------------------
// cancelation cleanup handler buffer
//
// Passed to cvar_cleanup via pthread_cleanup_push so a canceled waiter
// can fix up the waiters count and reacquire the caller's mutex.
//---------------------------------------------------------------------
struct cbuffer {
pthread_cond_t *cvar; // condvar the thread was waiting on
pthread_mutex_t *mutex; // user mutex to reacquire on cancelation
};
//---------------------------------------------------------------------
// abstime_to_reltime --
//
// returns ETIMEDOUT if abstime has passed
//---------------------------------------------------------------------
int abstime_to_reltime(struct timespec *reltime, const struct timespec *abstime) {
    struct timespec now;
    int borrow = 0;

    clock_gettime(CLOCK_REALTIME, &now);

    // Nanoseconds first, borrowing one second on underflow.
    if ((reltime->tv_nsec = abstime->tv_nsec - now.tv_nsec) < 0) {
        borrow = 1;
        reltime->tv_nsec += 1000000000;
    }

    // abstime < now: deadline already expired
    if ((reltime->tv_sec = (abstime->tv_sec - now.tv_sec) - borrow) < 0)
        return ETIMEDOUT;

    // abstime == now: expires immediately
    if (reltime->tv_sec == 0 && reltime->tv_nsec == 0)
        return ETIMEDOUT;

    // abstime > now: reltime holds the remaining time
    return 0;
}
//---------------------------------------------------------------------
// dec_waiters -- atomically decrement waiters count
// If result is -1, condvar is being destroyed, signal
// thread waiting on cond_destroy.
//
//---------------------------------------------------------------------
int dec_waiters(pthread_cond_t *cvar, int count) {
// Atomically subtract count from waiters. atomic_add_negative_x reports
// whether the result went negative, i.e. the count hit -1 because
// pthread_cond_destroy has already claimed the condvar.
// if (atomic_dec(&(cvar->waiters)) == 0)
if (atomic_add_negative_x(-count, &(cvar->waiters))) {
// signal if condvar pending deletion (waiters == -1)
cvar->eventcount++; // need not be atomic at this point
futex_wake(&(cvar->eventcount), 1);
return 1;
}
else
return 0;
}
//---------------------------------------------------------------------
// cvar_cleanup -- handle canceled condition wait
//
//---------------------------------------------------------------------
void cvar_cleanup(void *arg) {
struct cbuffer *cbuf = (struct cbuffer *)arg;
// Drop this thread's waiter slot. If destroy was not pending
// (dec_waiters returned 0), re-broadcast -- presumably so a wakeup that
// had been aimed at this now-canceled waiter is not lost.
// NOTE(review): confirm the broadcast-on-cancel rationale.
if (dec_waiters(cbuf->cvar, 1) == 0)
pthread_cond_broadcast(cbuf->cvar);
// POSIX requires the mutex to be reacquired when cancelation cleanup
// for a condition wait runs.
if (pthread_mutex_lock(cbuf->mutex) != 0)
abort();
}
//---------------------------------------------------------------------
// pthread_cond_timedwait --
//
//---------------------------------------------------------------------
int pthread_cond_timedwait_x(pthread_cond_t *cvar, pthread_mutex_t *mutex, struct timespec *abstime) {
int rc, rc2;
int waitval;
int canceltype;
struct cbuffer cbuf;
struct timespec reltime;
struct timespec *rtp = NULL; // relative timeout for futex_wait; NULL = infinite
// Cleanup context in case the thread is canceled while waiting.
cbuf.cvar = cvar;
cbuf.mutex = mutex;
atomic_inc_x(&(cvar->waiters)); // increment waiters count
// Snapshot the event generation *before* releasing the mutex: futex_wait
// below only blocks if eventcount still equals this value, so a
// signal/broadcast landing between unlock and wait is not lost.
waitval = cvar->eventcount; // atomic
if ((rc = pthread_mutex_unlock(mutex)) != 0) {
dec_waiters(cvar, 1);
return rc;
}
pthread_cleanup_push(&cvar_cleanup, &cbuf);
// Async cancelation so a cancel delivered while blocked in the futex
// syscall still runs cvar_cleanup.
pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &canceltype);
rc = 0;
if (abstime) {
// Futexes take relative timeouts; the conversion may itself report
// that the deadline has already passed.
rtp = &reltime;
rc = abstime_to_reltime(rtp, abstime);
}
if (rc == 0) {
rc = futex_wait(&(cvar->eventcount), waitval, rtp);
if (rc < 0)
rc = errno; // e.g. ETIMEDOUT, EINTR, or eventcount moved
#ifdef _FASTCV_YIELD
else
sched_yield();
#endif
}
pthread_setcanceltype(canceltype, NULL);
pthread_cleanup_pop(0);
// If not woken by futex_wakeup, decrement waiters count
// (a successful wake had its slot released by the waking thread).
if (rc != 0)
dec_waiters(cvar, 1);
// Reacquire the caller's mutex before returning, per condvar semantics.
if ((rc2 = pthread_mutex_lock(mutex)) != 0)
return rc2;
else if (rc != ETIMEDOUT)
return 0; // non-timeout errors surface as spurious wakeups
else
return ETIMEDOUT;
}
//---------------------------------------------------------------------
// pthread_cond_signal --
//
//---------------------------------------------------------------------
int pthread_cond_signal_x(pthread_cond_t *cvar) {
int rc;
// Zero-overhead check: skip the syscall entirely when nobody is waiting.
if (cvar->waiters > 0) {
// wake up a single waiter
// (eventcount is not bumped, so only a thread already parked in
// futex_wait can be woken by this)
rc = futex_wake(&(cvar->eventcount), 1);
if (rc == 0) {
// none woken up,
// do broadcast instead
// (a waiter must be between its eventcount snapshot and futex_wait;
// bumping the count makes its futex_wait return immediately)
atomic_inc_x(&(cvar->eventcount));
rc = futex_wake(&(cvar->eventcount), INT_MAX);
}
// Release the waiter slot of every thread actually woken.
if (rc > 0)
dec_waiters(cvar, rc);
}
return 0;
}
//---------------------------------------------------------------------
// pthread_cond_broadcast --
//
//---------------------------------------------------------------------
int pthread_cond_broadcast_x(pthread_cond_t *cvar) {
    int nwoken;

    // Zero-overhead fast path: no waiters, nothing to do.
    if (cvar->waiters <= 0)
        return 0;

    // Advance the event generation so any waiter that snapshotted the old
    // value falls straight through futex_wait, then wake everyone parked.
    atomic_inc_x(&(cvar->eventcount));
    nwoken = futex_wake(&(cvar->eventcount), INT_MAX);

    // Release the waiter slots of the threads actually woken.
    if (nwoken > 0)
        dec_waiters(cvar, nwoken);
    return 0;
}
//---------------------------------------------------------------------
// pthread_cond_init --
//
//---------------------------------------------------------------------
//
// Initialize a fastcv condvar: both counters start at zero, matching
// PTHREAD_COND_INITIALIZER. Attributes are accepted for API
// compatibility and ignored.
int pthread_cond_init_x(pthread_cond_t *cvar, pthread_condattr_t *attr) {
    (void)attr;
    memset(cvar, 0, sizeof *cvar);
    return 0;
}
//---------------------------------------------------------------------
// pthread_cond_destroy --
//
//---------------------------------------------------------------------
//
// Mark the condvar dead and wait for the last waiter to leave.
// (Removes the unused local `rc` from the original.)
int pthread_cond_destroy_x(pthread_cond_t *cvar) {
    int waitval;

    // Atomically drop waiters by one, pushing an idle condvar from 0 to
    // -1. A negative result means no waiters remain; otherwise block
    // until the last waiter's dec_waiters() drives the count to -1 and
    // wakes us through the eventcount futex.
    if (!atomic_add_negative_x(-1, &(cvar->waiters))) {
        // waiters still active -- wait for the count to reach -1.
        for (;;) {
            waitval = cvar->eventcount;
            // rmb();  // read barrier between eventcount and waiters reads
            if (cvar->waiters == -1) break;
            futex_wait(&(cvar->eventcount), waitval, NULL);
        }
    }
    // No kernel-side state to release.
    return 0;
}
/*-*/
That was a quick and dirty fast semaphore written with atomic
increment and decrement. You can write it with sem_trywait()
instead. You get a slight time increase to around .65.
Joe Seigh
Nice! I am looking forward to playing around with this code. It will teach
me more about these damn futexs, I don't quite understand them fully.
It just uses futex_wait and futex_wake which are fairly straightforward. An
earlier version looped on EINTR if the eventcount was unchanged. NPTL's
version can't loop because they use wait morphing and the waiting thread
can't know which futex it woke up on. futex_cmp_requeue is a little strange.
You sort of have to do a broadcast if the compare fails because you might have
lost a signal when the threads were temporarily dequeued. I don't know why
they didn't just hard code this into futex_cmp_requeue. It would be fairly
straightforward to add waitmorphing to my code. I just got tired of messing
with it.
Also the cancelation logic could be made simpler. I suspect that thread cancelation
is done with a built-in signal. If that was so, instead of setting up cancelation
handlers and enabling async cancelation, just call pthread_testcancel after reacquiring
the lock if futex_wait had exited with EINTR.
But enough fooling around with condvars. It's not my job, there are people who are
paid to do it and it's their job. I'll probably do a "zero overhead" eventcount
based on the futex since it is an eventcount of sorts.
Joe Seigh
I was wondering how futex_wait atomically checks if the futex value is the
same as the expected value. It kind of seems like a cas operation.
Something like this:
int futex_wait( int *futex, int cmp )
{
hash_futex_lock( futex );
if ( atomic_cas( futex, cmp, cmp ) == cmp )
{
ret = queue_me_and_wait( futex );
} else { ret = EWOULDBLOCK; }
hash_futex_unlock( futex );
}
?
No, it just has to queue (suspend) the waiting thread, fetch the futex
value, and dequeue (resume) the waiting thread if the futex value is
different than the cmp value.
There has to be a store-load barrier between the queue and futex fetch.
If dequeue fails because thread resumed by something else that's ok.
If thread has been dequeued and subsequently requeued before you attempted
the dequeue, that's ok even if the new cmp value has changed from the one
you are using. It's just a spurious wakeup.
There's currently a discussion about this on the kernel mailing list.
Joe
Would the following pseudo-code implement a zero-overhead eventcount?
It should, hummm...
#define EC_WAITERS 1
#define EC_LOCKED 2
#define EC_LOCK_WAITERS 4
typedef struct ec_
{
volatile ULONG count;
HANDLE waitset; // auto-reset event
per_thread_t *front;
per_thread_t *back;
} ec_t;
/* Locks and increments event count */
void prv_ec_lock_inc( ec_t *_this )
{
register int waited = 0;
register LONG cmp, xchg, old = _this->count;
for ( ;; )
{
if ( ! ( old & EC_LOCKED ) )
{
/* lock and inc */
xchg = ( old | EC_LOCKED ) + 0x00000010;
}
else { xchg = old | EC_LOCK_WAITERS; }
if ( waited ) { xchg |= EC_LOCK_WAITERS; }
cmp = old;
old = InterlockedCompareExchange
( &_this->count,
xchg & ~EC_WAITERS,
cmp );
if ( cmp == old )
{
if ( ! ( old & EC_LOCKED ) ) { break; }
if ( WaitForSingleObject
( _this->waitset,
INFINITE ) != WAIT_OBJECT_0 )
{
abort();
}
waited = 1;
old = _this->count;
}
}
}
/* Compares count, and locks if equal */
BOOL prv_ec_lock_cmp( ec_t *_this, LONG cmp )
{
register int waited = 0;
register LONG cmp, xchg, old = _this->count;
for ( ;; )
{
/* compare event count */
if ( ( old & 0xFFFFFFF0 ) == cmp )
{
if ( ! ( old & EC_LOCKED ) )
{
/* lock */
xchg = old | EC_LOCKED;
}
else { xchg = old | EC_LOCK_WAITERS; }
}
else { xchg = old; }
if ( waited ) { xchg |= EC_LOCK_WAITERS; }
cmp = old;
old = InterlockedCompareExchange
( &_this->count,
xchg,
cmp );
if ( cmp == old )
{
if ( ( old & 0xFFFFFFF0 ) != cmp ) { break; }
else if ( ! ( old & EC_LOCKED ) ) { return TRUE; }
if ( WaitForSingleObject
( _this->waitset,
INFINITE ) != WAIT_OBJECT_0 )
{
abort();
}
waited = 1;
old = _this->count;
}
}
return FALSE;
}
/* unlocks */
void prv_ec_unlock( ec_t *_this )
{
register LONG cmp, old = _this->count;
do
{
cmp = old;
old = InterlockedCompareExchange
( &_this->count,
old & ~( EC_LOCKED | EC_LOCK_WAITERS ),
cmp );
} while ( cmp != old );
if ( old & EC_LOCK_WAITERS )
{
if ( ! SetEvent( _this->waitset ) ) { abort(); }
}
}
LONG ec_get( ec_t *_this )
{
register LONG cmp, old = _this->count;
do
{
cmp = old;
/* set ec waiters */
old = InterlockedCompareExchange
( &_this->count,
old | EC_WAITERS,
cmp );
} while ( cmp != old );
return old & 0xFFFFFFF0;
}
void ec_wait( ec_t *_this, LONG cmp )
{
if ( prv_ec_lock_cmp( _this, cmp ) )
{
per_thread_t *thread = per_thread_self();
InterlockedIncrement( &thread->refs );
if ( ! _this->front )
{
_this->front = thread;
_this->back = thread;
}
else
{
_this->back->next = thread;
_this->back = thread;
}
prv_ec_unlock( _this );
if ( WaitForSingleObject
( thread->waiter,
INFINITE ) != WAIT_OBJECT_0 )
{
abort();
}
}
}
void ec_broadcast( ec_t *_this )
{
register LONG old = _this->count;
/* zero-overhead check for waiters??? */
if ( old & EC_WAITERS )
{
per_thread_t *thread, *next;
prv_ec_lock_inc( _this );
thread = _this->front;
_this->front = 0;
_this->back = 0;
prv_ec_unlock( _this );
while ( thread )
{
next = thread->next;
thread->next = 0;
if ( ! SetEvent( thread->waiter ) ) { abort(); }
per_thread_release( thread );
thread = next;
}
}
}
The eventcount and the mutex for the wait queue are combined in a single
word... Is this necessary?
I wanted to be able to acquire the wait queue lock and compare/inc the event
in one atomic operation.
I still don't understand lock-free eventcounts Joe, am I getting close, or
straying off into the wild!
;) lol
Would the following pseudo-code implement a zero-overhead eventcount?
SenderX wrote:
[...]
>
> The eventcount and the mutex for the wait queue are combined in a single
> word... Is this nessessary?
>
> I wanted to be able to acquire the wait queue lock and compare/inc the event
> in one atomic operation.
Potential for unnecessary spurious wakeups. They might not be a problem but
I wouldn't have all the resources to test and verify that.
>
> I still don't understand lock-free eventcounts Joe, am I getting close, or
> straying off into the wild!
>
> ;) lol
Actually, futexes are basically lock-free eventcounts by themselves. There's
nothing really to add except a waiter count if you want to avoid signaling when
there's no waiters. But you may not want or need a waiters count if there's
other ways to determine if there's waiters. If the wait condition is an empty
queue or stack then you don't need to signal when you enqueue onto an non-empty
queue. Enqueueing onto an empty queue doesn't necessarily mean there are waiters
but you'd have to do some testing to see how much of a problem that actually is.
Joe Seigh
Joe Seigh wrote:
>
> SenderX wrote:
> >
> > > It just uses futex_wait and futex_wake which are fairly straightforward.
> >
> > I was wondering how futex_wait atomically checks if the futex value is the
> > same as the expected value. It kind of seems like a cas operation.
> >
> > Something like this:
[...]
> No, it just has to queue (suspend) the waiting thread, fetch the futex
> value, and dequeue (resume) the waiting thread if the futex value is
> different than the cmp value.
>
> There has to be a store-load barrier between the queue and futex fetch.
>
> If dequeue failes because thread resumed by something else that's ok.
>
> If thread has been dequeued and subsequently requeued before you attempted
> the dequeue, that's ok even if the new cmp value has changed from the one
> you are using. It's just a spurious wakeup.
>
> There's currently a discussion about this on the kernel mailing list.
>
Maybe they do have a problem.
http://www.ussg.iu.edu/hypermail/linux/kernel/0411.2/0953.html
Seems you can wake a thread that was already "woken" up by the cmp value
before it had a chance to do the unqueue_me.
I think you can fix it by waking up another waiting thread if the unqueue_me
fails. Eventually you wake a thread that wasn't already "woken".
Joe Seigh
Using the eventcount algorithm I posted, you could probably do win32 convar
waits like this:
typedef ec_t cv_t;
void cv_wait( cv_t *_this, LPCRITICAL_SECTION mutex )
{
LONG cmp = ec_get( _this );
LeaveCriticalSection( mutex );
ec_wait( _this, cmp );
EnterCriticalSection( mutex );
}
Humm, looks too simple. But it should work fine...
> Maybe they do have a problem.
>
> http://www.ussg.iu.edu/hypermail/linux/kernel/0411.2/0953.html
If there are at least two threads blocking on pthread_cond_wait, and you
make two calls to pthread_cond_signal, it is not acceptable to wake the same
thread twice (assuming it doesn't come back around and call
pthread_cond_wait again).
DS
That has to be because they queue the thread before they check the cmp value?
Yeah, basically. They're going to have to go back to the earlier futex implementation
where they held the queue lock when they did the compare first before enqueueing.
Joe Seigh
I wrote:
> Maybe they do have a problem.
>
> http://www.ussg.iu.edu/hypermail/linux/kernel/0411.2/0953.html
>
> Seems you can wake a thread that was already "woken" up by the cmp value
> before it had a chance to do the unqueue_me.
>
> I think you can fix it by waking up another waiting thread if the unqueue_me
> fails. Eventually you wake a thread that wasn't already "woken".
>
If you go this route, the wait queue has to be FIFO, which it appears to be,
and the delayed woken thread need a different return code besides 0, for
instance EALREADY, unless you want to break the correspondence between the
return code from futex_wake and the number of threads getting return code 0.
Joe Seigh
> /* Compares count, and locks if equal */
> BOOL prv_ec_lock_cmp( ec_t *_this, LONG cmp )
BOOL prv_ec_lock_cmp( ec_t *_this, LONG ec_cmp )
> {
> register int waited = 0;
> register LONG cmp, xchg, old = _this->count;
>
> for ( ;; )
> {
> /* compare event count */
> if ( ( old & 0xFFFFFFF0 ) == cmp )
if ( ( old & 0xFFFFFFF0 ) == ec_cmp )
> {
> if ( ! ( old & EC_LOCKED ) )
> {
> /* lock */
> xchg = old | EC_LOCKED;
> }
>
> else { xchg = old | EC_LOCK_WAITERS; }
> }
>
> else { xchg = old; }
>
> if ( waited ) { xchg |= EC_LOCK_WAITERS; }
>
> cmp = old;
>
> old = InterlockedCompareExchange
> ( &_this->count,
> xchg,
> cmp );
>
> if ( cmp == old )
> {
> if ( ( old & 0xFFFFFFF0 ) != cmp ) { break; }
if ( ( old & 0xFFFFFFF0 ) != ec_cmp ) { break; }
Using eventcount's for cv's work great with broadcast's. However, not so
good with signals...
I am getting a race-condition when I try to add signal logic to the
eventcount. In order to signal, it seems I need to put waiting threads into
the wait queue when a request for the eventcount value is made, ec_get(
... ). But then I run into the same problem futex has with sending signal to
threads that have been signaled already due to the eventcount compare
failing.
Can efficient eventcounts be made to allow for signaling instead of pure
broadcasting?
Speaking of unnecessary spurious wakeups, it seems that all you have to do
to signal a single thread in an eventcount wrt condvars is inc + wake one.
But that "could" also signal an entire waitset of threads. Is this ok for
cv's?
> atomic_inc_x(&(cvar->eventcount));
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This can wake threads
> rc = futex_wake(&(cvar->eventcount), INT_MAX);
And this can wake threads.
Is this legal for condvar signal? I understand that these would be observed
as "simple and expected" spurious wakes by the user application. Every use
of condvars should handle extra waits anyway right?
> }
>
> if (rc > 0)
> dec_waiters(cvar, rc);
> }
>
> return 0;
> }
It seems eventcounts are legally broadcast only... ;(
Not if you get the thundering herd problem. Thread pools are probably the only
situation that would have that problem unless you used semaphores to limit the
number of threads waiting on the condvar.
Condvar signal is a bad api design decision but it's too well established at this
point. Better to create a new api.
Joe Seigh
SenderX wrote:
>
> > int pthread_cond_signal_x(pthread_cond_t *cvar) {
> > int rc;
> >
> > if (cvar->waiters > 0) {
> > // wake up a single waiter
> > rc = futex_wake(&(cvar->eventcount), 1);
> > if (rc == 0) {
> > // none woken up,
> > // do broadcast instead
>
> > atomic_inc_x(&(cvar->eventcount));
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> This can wake threads
>
> > rc = futex_wake(&(cvar->eventcount), INT_MAX);
>
> And this can wake threads.
>
> It this legal for condvar signal? I understand that these would be observed
> as "simple and expected" spurious wakes by the user application. Every use
> of condvars should handle extra waits anyway right?
Besides spurious wakeups allowed, pthread_cond_signal is defined as waking up
as least one, so actually waking up all waiting threads all the time would be
valid for pthread_cond_signal. In this implementation it almost never happens.
If you do signal only on a condvar then the eventcount value will be the number
of times it had to do a broadcast. On my uniprocessor I've only seen it happen
a couple of times maybe.
>
> > }
> >
> > if (rc > 0)
> > dec_waiters(cvar, rc);
> > }
> >
> > return 0;
> > }
>
> It seems eventcounts are legally broadcast only... ;(
Aside from weird semantics for win32 Event which makes them
complicated as well as useless.
Joe Seigh
Yeah.
>> > if (rc == 0) {
>> > // none woken up,
>> > // do broadcast instead
>>
>> > atomic_inc_x(&(cvar->eventcount));
The window is very small for a large waiter count to pile up right here.
>> > rc = futex_wake(&(cvar->eventcount), INT_MAX);
If it did get bad, wait morphing could absorb all of the contention on the
user mutex.
>>
>> It seems eventcounts are legally broadcast only... ;(
>
> Aside from weird semantics for win32 Event which makes them
> complicated as well as useless.
They also have horrible performance. Eventcount is superior in terms of
correctness and performance.
:)
So the lock-free version has wait morphing and runs faster than NPTL. 30% faster w/
wait morphing only, 225x faster w/ yield only hack and 250x faster with both. That's
for a deliberate thundering herd testcase and the really high performance factors are
due to NPTL running really slow with thundering herd.
Joe Seigh
Both are totally brain-damaged. Give it up, Joe. Futex-like beasts
(improved ones, not totally brain-dead stuff that they have currently)
are really good for locks (semas, mutexes, rwlocks), not condvars
and/or barriers. You just can't beat a condvar with an "explicit"
queue and per-thread sema-like gates with real "wait morphing."
regards,
alexander.
How much of a performance improvement did you get from that?
Joe Seigh
I'm talking theory. And you just can't beat theory. ;-)
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> [...]
> > How much of a performance improvement did you get from that?
>
> I'm talking theory. And you just can't beat theory. ;-)
>
In theory, anyway. :)
Joe Seigh
cvwaits = total number of waits by all consumers
NPTL signal:
time = 29.770862
user time = 7.161911
system time = 22.568569
ru_nvcsw = 6000001
ru_nivcsw = 6000431
cvwaits = 2000000
NPTL broadcast:
time = 173.251578
user time = 46.028003
system time = 126.859714
ru_nvcsw = 27309705
ru_nivcsw = 4993749
cvwaits = 24807747
lock-free signal:
time = 23.243099
user time = 6.329038
system time = 16.829441
ru_nvcsw = 4000020
ru_nivcsw = 4000379
cvwaits = 2000000
lock-free broadcast:
time = 286.615726
user time = 78.123124
system time = 207.901394
ru_nvcsw = 47827422
ru_nivcsw = 2850720
cvwaits = 23913022
NPTL signal after unlock:
time = 22.624151
user time = 6.106072
system time = 16.469496
ru_nvcsw = 4000014
ru_nivcsw = 4000362
cvwaits = 2000000
lock-free signal after unlock:
time = 15.190874
user time = 5.813116
system time = 9.341580
ru_nvcsw = 2000021
ru_nivcsw = 2000253
cvwaits = 2000000
NPTL broadcast after unlock:
time = 190.805691
user time = 49.233516
system time = 141.172539
ru_nvcsw = 30088078
ru_nivcsw = 1894149
cvwaits = 29934391
lock-free broadcast after unlock:
time = 244.205641
user time = 74.801628
system time = 168.898324
ru_nvcsw = 38410421
ru_nivcsw = 2159221
cvwaits = 38270044
lock-free signal w/ sched_yield:
time = 0.793973
user time = 0.775882
system time = 0.015997
ru_nvcsw = 14253
ru_nivcsw = 19942
cvwaits = 9371
lock-free broadcast w/ sched_yield:
time = 0.796090
user time = 0.761884
system time = 0.028996
ru_nvcsw = 14608
ru_nivcsw = 11375
cvwaits = 9423
lock-free signal after unlock w/ sched_yield:
time = 0.789347
user time = 0.757885
system time = 0.029996
ru_nvcsw = 13515
ru_nivcsw = 19924
cvwaits = 9401
lock-free broadcast after unlock w/ sched_yield
time = 0.781465
user time = 0.711892
system time = 0.063990
ru_nvcsw = 13526
ru_nivcsw = 11098
cvwaits = 9385
lock-free broadcast w/ wait morphing
time = 104.396387
user time = 30.187411
system time = 73.954758
ru_nvcsw = 15088934
ru_nivcsw = 2289617
cvwaits = 14259217
lock-free broadcast after unlock w/ wait morphing
time = 139.794501
user time = 43.773346
system time = 95.617464
ru_nvcsw = 20290971
ru_nivcsw = 1287170
cvwaits = 20290364
lock-free broadcast w/ sched_yield & wait morphing:
time = 0.693834
user time = 0.690895
system time = 0.000000
ru_nvcsw = 1439
ru_nivcsw = 2058
cvwaits = 891
lock-free broadcast after unlock w/ sched_yield & wait morphing:
time = 0.685087
user time = 0.677897
system time = 0.006999
ru_nvcsw = 1608
ru_nivcsw = 2226
cvwaits = 1075
Joe Seigh
No response in the kernel mailing list since this problem was stated. I don't
think they realize there's a problem.
Joe Seigh
Yeah, the fast win32 condvar sketch I posted really screams on 4-way
hyperthreaded xeon under win32 or linux. It beats current NPTL condvar.
explicit managed queue on win32 SMP seems to run fine, without any apparent
performance problems from SCHED_OTHER. Hummm...
This could easily be solved with Joes idea of using an explicit semaphore
for each eventcount generation...
Joe Seigh
>
> Well, so much for Linux. GCC suddenly decided to segfault for all programs
> including HelloWorld. Nothing to see here. Move along.
>
Ah, nothing. It was just the Linux equivalent of BSOD.
Well, now that I got GCC to work again, I have definitive proof that futex wait morphing
is somewhat suboptimal. Lucky thing that being suboptimal is not a problem for Linux.
Joe Seigh
> > If you go this route, the wait queue has to be FIFO, which it appears to be,
> > and the delayed woken thread need a different return code besides 0, for
> > instance EALREADY, unless you want to break the correspondence between the
> > return code from futex_wake and the number of threads getting return code 0.
> >
>
> No response in the kernel mailing list since this problem was stated. I don't
> think they realize there's a problem.
>
It looks like they picked it up again. They'll probably try to "fix" it. What do
you think the odds are that if there was a bug from using the simpler interface
there won't be a bug caused by a much more complicated and subtle interface?
Whatever they do will probably break the code I did, so it's probably a good thing
it was just experimental.
Joe Seigh
You mean that "drowsy" thing? My, this is fun. ;-)
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> [...]
> > It looks like they picked it up again.
>
> You mean that "drowsy" thing? My, this is fun. ;-)
>
Actually, no. I'm sorry I did the alternate condvar
implementation now. It's better if you believe all the
POSIX threads implemetations are optimal so you can
concentrate on the logic of your programs rather than
using performance hacks. Now that they all run on the
same platforms as fully supported, can you imagine if
someone actually did pthread performance measurements
for Linux, Solaris and windows on the same hardware.
How could you write portable code if you thought performance
was important?
Joe Seigh
Why? Your algorithm is extremely straight forward. Even the sched_yield
thing... Its so simple!
Linux maintainers need to understand that.
>It's better if you believe all the
>POSIX threads implemetations are optimal so you can
>concentrate on the logic of your programs rather than
>using performance hacks.
> Now that they all run on the
>same platforms as fully supported, can you imagine if
>someone actually did pthread performance measurements
>for Linux, Solaris and windows on the same hardware.
>How could you write portable code if you though performance
>was important?
I think the answer is to the problem is, link against an alternate pthread
library...
:O
SenderX wrote:
>
> > Actually, no. I'm sorry I did the alternate condvar
> > implementation now.
>
> Why? Your algorithm is extremely straight forward. Even the sched_yield
> thing... Its so simple!
>
> Linux maintainers need to understand that.
You need to understand Linux kernel developers. They exist in their own
self contained universe. Anyway the tail wagging the dog api design is
fun to watch. You can see where they think some aspect of their implementation
is neat and they will attempt to preserve it no matter what the cost in
complexity and actual performance.
Joe Seigh
Agreed.
> You can see where they think some aspect of their implementation
> is neat and they will attempt to preserve it no matter what the cost in
> complexity and actual performance.
I have no idea what they think, but their "requeue" stuff is awfully
silly with or without "compare." Condvars aside for a moment (futex
based pigs won't fly) they can't get those futex beasts (lock queues)
right for locks, to begin with. I mean kernel maintained waiters bit
for semas, mutexes, and read-write locks; primarily.
Just look at their pthread barrier thing... what's the point of
doing all sorts of utterly ugly stuff in the user space if all
threads always end up in the kernel?!
<quote>
Barriers pseudocode
===================
int pthread_barrier_wait(barrier_t *barrier);
struct barrier_t {
unsigned int lock;
- internal mutex
unsigned int left;
- current barrier count, # of threads still needed.
unsigned int init_count;
- number of threads needed for the barrier to continue.
unsigned int curr_event;
- generation count
}
pthread_barrier_wait(barrier_t *barrier)
{
unsigned int event;
result = 0;
lll_lock(barrier->lock);
if (!--barrier->left) {
barrier->curr_event++;
futex_wake(&barrier->curr_event, INT_MAX)
result = BARRIER_SERIAL_THREAD;
} else {
event = barrier->curr_event;
lll_unlock(barrier->lock);
do {
futex_wait(&barrier->curr_event, event)
} while (event == barrier->curr_event);
}
if (atomic_increment_val (barrier->left) == barrier->init_count)
lll_unlock(barrier->lock);
return result;
}
</quote>
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> [...]
> > fun to watch.
>
> Agreed.
>
> > You can see where they think some aspect of their implementation
> > is neat and they will attempt to preserve it no matter what the cost in
> > complexity and actual performance.
>
> I have no idea what they think, but their "requeue" stuff is awfully
> silly with or without "compare." Condvars aside for a moment (futex
> based pigs won't fly) they can't get those futex beasts (lock queues)
> right for locks, to begin with. I mean kernel maintained waiters bit
> for semas, mutexes, and read-write locks; primarily.
They'd need a way to set the bit on all platforms. And they'd need to
lock the queue or block dequeue operations while they examined or set
the futex, something they don't do currently.
>
> Just look at their pthread barrier thing... what's the point of
> doing all sorts of utterly ugly stuff in the user space if all
> threads always end up in the kernel?!
>
(barrier code snipped)
Well, the lock is entirely redundant but probably doesn't hurt anything
as a barrier usually blocks ((n-1)/n) of the time. They'd be better off
if the barrier split out the signal finish and the wait sections so
threads could do something else and be more likely to avoid entering
the kernel and blocking. Maybe do iterations based on the (n-2)th result
rather than (n-1)th result and use alternating staggered barriers.
Joe Seigh
Sure. What's the problem?
> And they'd need to
> lock the queue or block dequeue operations while they examined or set
> the futex, something they don't do currently.
I don't think that their current futex impl is lock-free.
[... barrier ...]
> They'd be better off
> if the barrier split out the signal finish and the wait sections so
> threads could do something else and be more likely to avoid entering
> the kernel and blocking. Maybe do iterations based on the (n-2)th result
> rather than (n-1)th result and use alternating staggered barriers.
Or maybe simply count waiters in the kernel (per-thread gates with fast
path approach aside for a moment).
https://listman.redhat.com/archives/phil-list/2003-October/msg00031.html
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> [...]
> > They'd need a way to set the bit on all platforms.
>
> Sure. What's the problem?
None. They can't use the current routines to examine user storage, they'd
have to write a new one with unique implementations for all platforms.
Just extra work, that's all.
>
> > And they'd need to
> > lock the queue or block dequeue operations while they examined or set
> > the futex, something they don't do currently.
>
> I don't think that their current futex impl is lock-free.
They currently don't hold the queue lock while examining the futex value.
I'm guessing this is because they don't want the overhead of pinning and
unpinning the futex memory or something like that. It opens up a race
condition where a futex_wake can wake up a thread that has observably
already been "woken" by the futex value. Since they've become rather fond
of not having to pin memory, they're entertaining some rather contrived logic
to continue that way. I don't know but if they managed to get an error
with some rather simple logic, the odds for errors in much more complicated
logic are probably a bit more.
Anyhow, Linux has managed to get their own equivalent of Microsoft's
PulseEvent problem. That's progress.
Joe Seigh
Ah that. It's a similar problem as with "stolen" signals due to
timeouts (under unreasonable Patrick TJ McPhee's interpretation,
hello Patrick ;-) )
See <http://www.terekhov.de/DESIGN-futex-CV.cpp>.
I mean "stolen_sig = this_waiter.is_reset()" in timedwait() and
"m_futex.wake(ALL)" in leave_wait() as circumvention. But even
with all such things fixed (and with added optimizations for fast
signaling with no waiters), such futex based condition variables
are still pigs only good for some mental gymnastics, so to say.
regards,
alexander.
Well, futexes are all you have to work with, warts and all, so
I guess that make them warthogs. :)
Unless you want to write your own sycall. I haven't considered that
because it's too much trouble trying to figure out the kernel api
from reading scattered code fragments, given that the api to do that
should conceptually be very simple. Kind of like what application
programming would be if there were no man pages and everybody was
expected to look at libc source code to figure out what and how
to use it.
Joe Seigh
Lowering the "barrier to entry" (with respect to kernel programming)
isn't in the {financial, etc.} interests of the kernel gang. ;-) ;-)
regards,
alexander.
Well, you don't want too many kernel developers. You might argue there
are too many already. So that's one way of doing it, by keeping it
difficult to learn. But the downside to that is you may get all the
wrong sort of kernel developers that way, ones who because they can
deal with complexity think nothing of keeping it that way and writing
even more complex code which ensures the competition is kept out.
Probably one of the reasons microkernels are disliked so much by the
Linux establishment.
Joe Seigh
Alexander Terekhov wrote:
>
> ... But even
> with all such things fixed (and with added optimizations for fast
> signaling with no waiters), such futex based condition variables
> are still pigs only good for some mental gymnastics, so to say.
>
But being a pig, it can be cured. Hmm... I could fix the futex problem
plus make it run even faster. Interesting.
Joe Seigh
I posted an eventcount algorithm sketch that basically embedded the mutex
for its explicit waitqueue in the eventcount variable itself:
http://groups.google.com/groups?selm=Sp0nd.422326%24D%25.325220%40attbi_s51
( 2 bits reserved by the waitqueue, and 1 waiters bit reserved by the
eventcount )
The algorithm I came up with can acquire the explicit waitqueue lock "and"
compare and/or set the eventcount atomically. That totally eliminates any
race-conditions wrt setting the eventcount and waking threads. Do you see
any problems with this schema? I think that the ability to lock the waitset
and compare/set the eventcount is critical.
It seems if a futex impl could be heavily improved if it had a bit or two
reserved for a mutex that protects it's explicit waitqueue...
You can't have the lock that protects the queue, or any kernel data structure,
in user space. Deliberate or inadvertent damage to the lockwork by a user
program could cause corruption of the kernel. The present futex logic of
just comparing against the futex word is ok since any value is ok. The
worst that could happen due to corruption of the futex word is spurious
or lost wakeups, not a kernel panic.
There's better ways of implementing futexes but none of us are in a position
to do that. I suspect that if they realize that the current futex api allows
alternate pthread implementations, they will change it so that the only possible
implementation is theirs so as to not suffer in comparison.
Joe Seigh
Unfortunately you're correct. So I guess any sort of waiter's bit maintained by
the kernel is out of the question, since the bit would imply some sort of
sync between the user-space futex and its hashed kernel waitqueue. Or is
Alex talking about something else when he refers to a waiters bit?
Unfortunately you're correct. So I guess any sort of waiter's bit maintained by
As long as the kernel doesn't do anything with the waiters bit except set or
unset it to reflect the queue status it should be ok. If a user process
clobbered the waiters bit, it should eventually get set again to the right
value by the kernel. Some user logic might get screwed up but that shouldn't
affect kernel integrity.
Joe Seigh
I don't quite understand. It seems that the waiters bit would be critical.
If the waiters bit got clobbered right before a call to futex_wake, would
that not be a potential problem?
futex_wake wouldn't use the waiters bit as input. In fact it would mask it
out to do the compare. When it's finished it sets the waiters bit to reflect
the queue status. The waiters bit is only used by user code to "see" what
the queue status is.
Joe Seigh
I think I understand now. futex_wait/wake would atomically set the waiters
bit if there were still any waiters in the queue, or clear the bit if the
queue was empty, before unlocking the queue mutex...
So, I guess a mutex impl could use the waiters bit as a simple flag to call
futex_wake, just as you mentioned...
#define WAITERS_BIT 0x80000000
mutex lock
-------------
while ( atomic_xchg( &futex, 1 ) )
{
futex_wait( &futex, 1 )
{
lock_wait_queue();
if ( *futex == 1 )
{
enqueue_us;
}
if ( there_are_any_waiters )
{
atomic_bit_set( futex, 0x80000000 );
}
else { atomic_bit_clear( futex, 0x80000000 ); }
unlock_wait_queue();
if ( we_enqueued )
{
wait on our gate...
}
}
}
mutex unlock
-------------
if ( atomic_xchg( &futex, 0 ) & WAITERS_BIT )
{
futex_wake( &futex, 1 )
{
lock_wait_queue();
dequeue_a_waiter;
if ( there_are_any_waiters_left )
{
atomic_bit_set( futex, 0x80000000 );
}
else { atomic_bit_clear( futex, 0x80000000 ); }
unlock_wait_queue();
if ( we_dequeued_a_waiter )
{
signal waiter's gate...
}
}
}
Is that about it?
BTW, thanks for bearing with me so far!
;)
Here is the fixed lock function
mutex lock
-------------
while ( ( atomic_xchg( &futex, 1 ) & ~WAITERS_BIT ) )
{
futex_wait( &futex, 1 )
{
lock_wait_queue();
if ( ( *futex & ~WAITERS_BIT ) == 1 )
SenderX wrote:
>
> Arghrghrg. I totally forgot to mask off the fu#$% bit!
>
> Here is the fixed lock function
>
> mutex lock
> -------------
>
> while ( ( atomic_xchg( &futex, 1 ) & ~WAITERS_BIT ) )
> {
> futex_wait( &futex, 1 )
> {
pin_memory_page(futex);
> lock_wait_queue();
>
> if ( ( *futex & ~WAITERS_BIT ) == 1 )
> {
> enqueue_us;
> }
>
> if ( there_are_any_waiters )
> {
> atomic_bit_set( futex, 0x80000000 );
> }
>
> else { atomic_bit_clear( futex, 0x80000000 ); }
>
> unlock_wait_queue();
unpin_memory_page(futex);
>
> if ( we_enqueued )
> {
> wait on our gate...
> }
> }
> }
Of course it's a little bit more complicated. Currently the only routines for
accessing user storage are things like get_user() and put_user(). You'd have
to write a set of interlocked user access routines -- for all platforms.
Plus the pin/unpin memory routines probably aren't that cheap. They're required
to make the user memory access routines safe to use while holding a spin lock.
That's what they're trying to avoid. It sounds like that's what the original
2.4 futex did. Anybody know what the performance difference was between the
old futex and the current one?
Joe Seigh
WHILE
atomic_bit_test_set_ccacq(&futex, 1)
lock_queue_wait(&futex, 1) // wait if locked bit is set
> > {
> > futex_wait( &futex, 1 )
> > {
>
> pin_memory_page(futex);
> > lock_wait_queue();
> >
> > if ( ( *futex & ~WAITERS_BIT ) == 1 )
> > {
> > enqueue_us;
> > }
> >
> > if ( there_are_any_waiters )
> > {
> > atomic_bit_set( futex, 0x80000000 );
> > }
No. Waiters bit must be set *before* lock state check or "together"
with lock state check (I mean CAS LL/LR-SC).
> >
> > else { atomic_bit_clear( futex, 0x80000000 ); }
If "realtime" is not an issue, it can be cleared in the user-space
by mutex_unlock.
IF atomic_swap_rel(lock_queue = &futex, 0) < 0 // or > 1
THEN lock_queue_wake(lock_queue, 1)
lock_queue_wake() (or wake exit path in lock_queue_wait()) would
have to set waiters bit again if queue remains not empty.
regards,
alexander.
Joe Seigh
What's the issue with pinning and unpinning (or lack thereof)?
regards,
alexander.
Taking a page fault while holding a spin lock.
Joe Seigh
Well, whatever. Waiters bit can be set without holding a lock.
And "lost signals" that they're fighting now is not a problem for
locks.
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> >
> > Alexander Terekhov wrote:
> > >
> > > Joe Seigh wrote:
> > > >
> > > > What's the advantage of the waiters bit that is going to outweigh
> > > > the disavantage of pinning and unpinning memory?
> > >
> > > What's the issue with pinning and unpinning (or lack thereof)?
> > >
> >
> > Taking a page fault while holding a spin lock.
>
> Well, whatever. Waiters bit can be set without holding a lock.
Without race conditions with regard to the state of the wait queue
which is protected by a spin lock?
>
> And "lost signals" that they're fighting now is not a problem for
> locks.
But lost waiters bit would be.
Joe Seigh
False positives won't affect correctness.
> >
> > And "lost signals" that they're fighting now is not a problem for
> > locks.
>
> But lost waiters bit would be.
Again, consider:
mutex_lock {
WHILE
atomic_bit_test_set_ccacq(&futex, 1)
lock_queue_wait(&futex, 1) // wait if locked bit is set
}
mutex_unlock {
IF atomic_swap_rel(lock_queue = &futex, 0) < 0 // or > 1
THEN lock_queue_wake(lock_queue, 1)
}
And I still don't quite understand the problem of page faults while
holding locks associated with futexes (I'm talking about per-futex
locks).
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> [...]
> > > Well, whatever. Waiters bit can be set without holding a lock.
> >
> > Without race conditions with regard to the state of the wait queue
> > which is protected by a spin lock?
>
> False positives won't affect correctness.
False negatives would affect correctness.
>
> > >
> > > And "lost signals" that they're fighting now is not a problem for
> > > locks.
> >
> > But lost waiters bit would be.
>
> Again, consider:
>
> mutex_lock {
> WHILE
> atomic_bit_test_set_ccacq(&futex, 1)
> lock_queue_wait(&futex, 1) // wait if locked bit is set
> }
>
> mutex_unlock {
> IF atomic_swap_rel(lock_queue = &futex, 0) < 0 // or > 1
> THEN lock_queue_wake(lock_queue, 1)
> }
>
> And I still don't quite understand the problem of page faults while
> holding locks associated with futexes (I'm talking about per-futex
> locks).
>
Well, they couldn't use spin locks, it would have to be another form
of blocking.
Anyway you haven't answered why a waiters bit would be useful enough
to outweigh its disadvantages.
Joe Seigh
Consider the sketch below. Suppose that you've got multiple waiters.
mutex_unlock would clear waiters bit and there will be a time window
with "false negative." But that's harmless provided that
"lock_queue_wake(lock_queue, 1)" would set waiters bit again (before
"releasing" a waiter).
> >
> > > >
> > > > And "lost signals" that they're fighting now is not a problem for
> > > > locks.
> > >
> > > But lost waiters bit would be.
> >
> > Again, consider:
> >
> > mutex_lock {
> > WHILE
> > atomic_bit_test_set_ccacq(&futex, 1)
> > lock_queue_wait(&futex, 1) // wait if locked bit is set
> > }
> >
> > mutex_unlock {
> > IF atomic_swap_rel(lock_queue = &futex, 0) < 0 // or > 1
> > THEN lock_queue_wake(lock_queue, 1)
> > }
> >
> > And I still don't quite understand the problem of page faults while
> > holding locks associated with futexes (I'm talking about per-futex
> > locks).
> >
>
> Well, they couldn't use spin locks, it would have to be another form
> of blocking.
Whatever.
>
> Anyway you haven't answered why a waiters bit would be useful enough
> to outweigh its disadvantages.
Again, what disadvantages?
regards,
alexander.
As for some advantages, think of real (not the idiocy they're
entertaining currently) wait-morphing. Both pthread_cond_signal()
and pthread_cond_broadcast() would do it nicely with waiters bit
on a mutex. No need to always release one waiter and make him set
"contention state" on a mutex. That waiter would instead go
directly to the queue of the locked mutex with waiters bit set
by the kernel.
regards,
alexander.
Alexander Terekhov wrote:
>
> Joe Seigh wrote:
> >
> > Alexander Terekhov wrote:
> > >
Skip the race condition issue for now since it may depend on how
locking is performed.
> > >
> > > And I still don't quite understand the problem of page faults while
> > > holding locks associated with futexes (I'm talking about per-futex
> > > locks).
> > >
> >
> > Well, they couldn't use spin locks, it would have to be another form
> > of blocking.
>
> Whatever.
The other form of blocking is likely to be more expensive, otherwise why
have spin locks which have a more limited application.
>
> >
> > Anyway you haven't answered why a waiters bit would be useful enough
> > to outweigh its disadvantages.
>
> Again, what disadvantages?
>
The lack of any advantages, apparently. Plus, most people would consider that
it's not implemented to be a distinct disadvantage also.
Joe Seigh
Actually, that's not the reason they wake up that one waiter. I found
that out the hard way. Given the way adaptive mutexes work and from
what I've measured, I doubt you could even measure any effect from the
spurious wakeups.
Joe Seigh
As I mentioned one or two times before, RT-NPTL project has already
implemented (modulu bugs) something essentially similar to lock
futexes with waiters bits. They call them "fulocks". The kernel
indicates the presence of waiters by modifying the user-space state.
They set the lock state to "VFULOCK_WP" (WP stands for "waiters
present" I guess).
regards,
alexander.
What are you talking about? Details and "actual reason", please.
regards,
alexander.
Alexander Terekhov wrote:
>
> >
> > Actually, that's not the reason they wake up that one waiter. I found
> > that out the hard way. Given the way adaptive mutexes work and from
> > what I've measured, I doubt you could even measure any effect from the
> > spurious wakeups.
>
> What are you talking about? Details and "actual reason", please.
>
Well, the testcase I ran was .8 sec for broadcast w/ thundering herd
and .7 sec for broadcast w/ wait morphing. If waking up all of the
threads didn't cause a huge difference then I don't think the extra
spurious wake per broadcast is going to be that significant. Why?
Have you measured different results?
Joe Seigh
Does this sketch look a bit more correct...
int futex_wait( int *futex, int futex_cmp )
{
register int cmp, old;
pin_memory_page( futex );
lock_wait_queue( futex );
old = *futex;
do
{
if ( old != futex_cmp && there_are_no_waiters ) { goto bye; }
cmp = old;
old = atomic_cas( futex, old | WAITERS_BIT, cmp );
} while ( cmp != old );
if ( old == futex_cmp )
{
enqueue_us;
}
bye:
unlock_wait_queue( futex );
unpin_memory_page( futex );
if ( we_enqueued )
{
wait_on_gate( ... );
}
return 0;
}
;(......
int futex_wait( int *futex, int futex_cmp )
{
register int cmp, old;
pin_memory_page( futex );
lock_wait_queue( futex );
old = *futex;
do
{
if ( ( old & ~WAITERS_BIT ) != futex_cmp && there_are_no_waiters )
{ goto bye; }
cmp = old;
old = atomic_cas( futex, old | WAITERS_BIT, cmp );
} while ( cmp != old );
if ( ( old & ~WAITERS_BIT ) == futex_cmp )
old = atomic_cas( futex, cmp, old | WAITERS_BIT );
It "seems" that CAS/LL-SC could be used to do just that, along with the
futex compare, before you pin and lock the futex queue... Because an unequal
futex comparand is basically treated like a broadcast, so there really would
be no deadlocks?
They're probably FIFO or some other scheduler controled service order
lock rather than an adaptive lock, so they could do that with a simple
put_user(). Of course there's other ways to get the same effect without
having to do a put_user. And there's ways that don't even require that
information.
I should patent some of the ways that haven't been disclosed yet. With
the fixation on suboptimal solutions, they might be worth something.
Joe Seigh