
Lock-free hash table


Josh Dybnis

Nov 10, 2008, 1:57:53 AM
C implementation of Cliff Click's lock-free hash table. I built it
with current generation x86 systems as a target, so it uses full-
fenced versions of the atomic ops. Comments appreciated. The code is
in the public domain. The full source with makefile and supporting
libraries are at http://code.google.com/p/nbds/

#define COPIED_VALUE (-1)
#define TOMBSTONE STRIP_TAG(COPIED_VALUE)

#define ENTRIES_PER_BUCKET (CACHE_LINE_SIZE/sizeof(entry_t))
#define ENTRIES_PER_COPY_CHUNK (ENTRIES_PER_BUCKET * 2)
#define MIN_SCALE (__builtin_ctz(ENTRIES_PER_BUCKET) + 2) // min 4 buckets
#define MAX_BUCKETS_TO_PROBE 250
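
The helper macros and functions used below (CACHE_LINE_SIZE, TRACE,
nbd_malloc/nbd_free/nbd_defer_free, murmur32, the SYNC_* atomic wrappers, and
the tag/mask macros) live in the supporting libraries at the link above and
are not part of this post. What follows is only a plausible sketch of some of
them, consistent with how the code below uses them; the real definitions may
differ:

#include <stdint.h>
#include <string.h>
#include <assert.h>

#define TRUE  1
#define FALSE 0
#define DOES_NOT_EXIST 0 // empty keys/values are 0 because tables are calloc'd

#define MASK(n) ((1ULL << (n)) - 1)

// The high bit of a value marks "copy to the next table in progress".
#define TAG          (1ULL << 63)
#define TAG_VALUE(v) ((v) |  TAG)
#define IS_TAGGED(v) ((v) &  TAG)
#define STRIP_TAG(v) ((v) & ~TAG)

#define EXPECT_FALSE(x) __builtin_expect(!!(x), 0)

// Full-fence atomic ops via GCC builtins, matching the x86 target noted above.
#define SYNC_CAS(addr, old, new)   __sync_val_compare_and_swap(addr, old, new)
#define SYNC_ADD(addr, n)          __sync_add_and_fetch(addr, n)
#define SYNC_FETCH_AND_OR(addr, x) __sync_fetch_and_or(addr, x)

// <hash_table_t> is an indirection to the current internal table.
typedef struct hash_table_i *hash_table_t;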

typedef struct ht_entry {
    int64_t key;
    int64_t value;
} entry_t;

typedef struct string {
    uint32_t len;
    char val[];
} string_t;

typedef struct hash_table_i {
    volatile entry_t *table;
    hash_table_t *ht; // parent ht;
    struct hash_table_i *next;
    struct hash_table_i *next_free;
    unsigned int scale;
    int max_probe;
    int count; // TODO: make these counters distributed
    int num_entries_copied;
    int scan;
} hash_table_i_t;

static int hti_copy_entry (hash_table_i_t *old_hti, volatile entry_t *e, uint32_t e_key_hash,
                           hash_table_i_t *new_hti);

// Choose the next bucket to probe using the high-order bits of <key_hash>.
static inline int get_next_ndx(int old_ndx, uint32_t key_hash, int ht_scale) {
    int incr = (key_hash >> (32 - ht_scale));
    incr += !incr; // If the increment is 0, make it 1.
    return (old_ndx + incr) & MASK(ht_scale);
}

// Compare two keys.
//
// A key is made up of two parts. The 48 low-order bits are a pointer to a null terminated string.
// The high-order 16 bits are taken from the hash of that string. The bits from the hash are used
// as a quick check to rule out non-equal keys without doing a complete string compare.
static inline int ht_key_equals (uint64_t a, uint32_t b_hash, const char *b_value, uint32_t b_len) {
    if ((b_hash >> 16) != (a >> 48)) // high-order 16 bits are from the hash value
        return FALSE;
    const string_t *a_key = (string_t *)(a & MASK(48)); // low-order 48 bits is a pointer
    assert(a_key);
    return a_key->len == b_len && memcmp(a_key->val, b_value, b_len) == 0;
}

// Lookup <key> in <hti>.
//
// Return the entry that <key> is in, or if <key> isn't in <hti> return the entry that it would be
// in if it were inserted into <hti>. If there is no room for <key> in <hti> then return NULL, to
// indicate that the caller should look in <hti->next>.
//
// Record if the entry being returned is empty. Otherwise the caller will have to waste time with
// ht_key_equals() to confirm that it did not lose a race to fill an empty entry.
static volatile entry_t *hti_lookup (hash_table_i_t *hti, uint32_t key_hash, const char *key_val,
                                     uint32_t key_len, int *is_empty) {
    TRACE("h0", "hti_lookup(key \"%s\" in hti %p)", key_val, hti);
    *is_empty = 0;

    // Probe one cache line at a time
    int ndx = key_hash & MASK(hti->scale); // the first entry to search
    int i;
    for (i = 0; i < hti->max_probe; ++i) {

        // The start of the bucket is the first entry in the cache line.
        volatile entry_t *bucket = hti->table + (ndx & ~(ENTRIES_PER_BUCKET-1));

        // Start searching at the indexed entry. Then loop around to the beginning of the cache line.
        int j;
        for (j = 0; j < ENTRIES_PER_BUCKET; ++j) {
            volatile entry_t *e = bucket + ((ndx + j) & (ENTRIES_PER_BUCKET-1));

            uint64_t e_key = e->key;
            if (e_key == DOES_NOT_EXIST) {
                TRACE("h0", "hti_lookup: empty entry %p found on probe %d", e, i*ENTRIES_PER_BUCKET+j+1);
                // record that the entry is empty so the caller can avoid an expensive ht_key_equals()
                *is_empty = 1;
                return e;
            }

            if (ht_key_equals(e_key, key_hash, key_val, key_len)) {
                TRACE("h0", "hti_lookup: entry %p found on probe %d", e, i*ENTRIES_PER_BUCKET+j+1);
                TRACE("h0", "hti_lookup: with key \"%s\" value %p",
                      ((string_t *)(e_key & MASK(48)))->val, e->value);
                return e;
            }
        }

        ndx = get_next_ndx(ndx, key_hash, hti->scale);
    }

    // maximum number of probes exceeded
    TRACE("h0", "hti_lookup: maximum number of probes exceeded returning 0x0", 0, 0);
    return NULL;
}

// Allocate and initialize a hash_table_i_t with 2^<scale> entries.
static hash_table_i_t *hti_alloc (hash_table_t *parent, int scale) {
    // Include enough slop to align the actual table on a cache line boundary
    size_t n = sizeof(hash_table_i_t)
             + sizeof(entry_t) * (1 << scale)
             + (CACHE_LINE_SIZE - 1);
    hash_table_i_t *hti = (hash_table_i_t *)calloc(n, 1);

    // Align the table of hash entries on a cache line boundary.
    hti->table = (entry_t *)(((uint64_t)hti + sizeof(hash_table_i_t) + (CACHE_LINE_SIZE-1))
                            & ~(CACHE_LINE_SIZE-1));

    hti->scale = scale;

    // When searching for a key probe a maximum of 1/4 of the buckets up to 1000 buckets.
    hti->max_probe = ((1 << (hti->scale - 2)) / ENTRIES_PER_BUCKET) + 2;
    if (hti->max_probe > MAX_BUCKETS_TO_PROBE) {
        hti->max_probe = MAX_BUCKETS_TO_PROBE;
    }

    hti->ht = parent;

    assert(hti->scale >= MIN_SCALE && hti->scale < 63); // size must be a power of 2
    assert(sizeof(entry_t) * ENTRIES_PER_BUCKET % CACHE_LINE_SIZE == 0); // divisible into cache
    assert((size_t)hti->table % CACHE_LINE_SIZE == 0); // cache aligned

    return hti;
}

// Called when <hti> runs out of room for new keys.
//
// Initiates a copy by creating a larger hash_table_i_t and installing it in <hti->next>.
static void hti_start_copy (hash_table_i_t *hti) {
    TRACE("h0", "hti_start_copy(hti %p hti->next %p)", hti, hti->next);
    if (hti->next)
        return; // another thread beat us to it

    // heuristics to determine the size of the new table
    uint64_t count = ht_count(hti->ht);
    unsigned int new_scale = hti->scale;
    new_scale += (count > (1 << (new_scale - 2))); // double size if more than 1/4 full
    new_scale += (count > (1 << (new_scale - 2))); // double size again if more than 1/2 full

    // Allocate the new table and attempt to install it.
    hash_table_i_t *next = hti_alloc(hti->ht, new_scale);
    hash_table_i_t *old_next = SYNC_CAS(&hti->next, NULL, next);
    if (old_next != NULL) {
        TRACE("h0", "hti_start_copy: lost race to install new hti; found %p", old_next, 0);
        // Another thread beat us to it.
        nbd_free(next);
        return;
    }
    TRACE("h0", "hti_start_copy: new hti is %p", next, 0);
}

// Copy the key and value stored in <old_e> (which must be an entry in <old_hti>) to <new_hti>.
//
// Return 1 unless <old_e> is already copied (then return 0), so the caller can account for the total
// number of entries left to copy.
static int hti_copy_entry (hash_table_i_t *old_hti, volatile entry_t *old_e, uint32_t key_hash,
                           hash_table_i_t *new_hti) {
    TRACE("h0", "hti_copy_entry(copy entry from %p to %p)", old_hti, new_hti);
    assert(old_hti);
    assert(old_hti->next);
    assert(new_hti);
    assert(old_e >= old_hti->table && old_e < old_hti->table + (1 << old_hti->scale));

    int64_t old_e_value = old_e->value;
    TRACE("h0", "hti_copy_entry: entry %p current value %p", old_e, old_e_value);
    if (EXPECT_FALSE(old_e_value == COPIED_VALUE))
        return FALSE; // already copied

    // Kill empty entries.
    if (EXPECT_FALSE(old_e_value == DOES_NOT_EXIST)) {
        old_e_value = SYNC_CAS(&old_e->value, DOES_NOT_EXIST, COPIED_VALUE);
        if (old_e_value == DOES_NOT_EXIST) {
            TRACE("h0", "hti_copy_entry: old entry killed", 0, 0);
            return TRUE;
        }
        if (old_e_value == COPIED_VALUE) {
            TRACE("h0", "hti_copy_entry: lost race to kill empty entry in old hti", 0, 0);
            return FALSE; // another thread beat us to it
        }
        TRACE("h0", "hti_copy_entry: lost race to kill empty entry in old hti; "
              "the entry is now being used", 0, 0);
    }

    // Tag the value in the old entry to indicate a copy is in progress.
    old_e_value = SYNC_FETCH_AND_OR(&old_e->value, TAG_VALUE(0));
    TRACE("h0", "hti_copy_entry: tagged the value %p in old entry %p", old_e_value, old_e);
    if (old_e_value == COPIED_VALUE)
        return FALSE; // <value> was already copied by another thread.

    // Deleted entries don't need to be installed into the new table, but their keys do need to
    // be freed.
    assert(COPIED_VALUE == TAG_VALUE(TOMBSTONE));
    if (old_e_value == TOMBSTONE) {
        nbd_defer_free((string_t *)(old_e->key & MASK(48)));
        return TRUE;
    }
    old_e_value = STRIP_TAG(old_e_value);

    // Install the key in the new table.
    uint64_t old_e_key = old_e->key;
    string_t *key = (string_t *)(old_e_key & MASK(48));
    TRACE("h0", "hti_copy_entry: key %p is %s", old_e_key, key->val);

    // We use 0 to indicate that <key_hash> isn't initialized. Occasionally the <key_hash> will
    // really be 0 and we will waste time recomputing it. That is rare enough that it is OK.
    if (key_hash == 0) {
        key_hash = murmur32(key->val, key->len);
    }

    int is_empty;
    volatile entry_t *new_e = hti_lookup(new_hti, key_hash, key->val, key->len, &is_empty);

    // it is possible that there is not any room in the new table either
    if (EXPECT_FALSE(new_e == NULL)) {
        hti_start_copy(new_hti); // initiate nested copy, if not already started
        return hti_copy_entry(old_hti, old_e, key_hash, new_hti->next); // recursive tail-call
    }

    // If hti_lookup() found an empty entry, install the key (another thread may race us).
    if (is_empty) {
        uint64_t new_e_key = SYNC_CAS(&new_e->key, DOES_NOT_EXIST, old_e_key);
        if (new_e_key != DOES_NOT_EXIST) {
            TRACE("h0", "hti_copy_entry: lost race to CAS key %p into new entry; found %p",
                  old_e_key, new_e_key);
            return hti_copy_entry(old_hti, old_e, key_hash, new_hti); // recursive tail-call
        }
    }
    assert(ht_key_equals(new_e->key, key_hash, key->val, key->len));
    TRACE("h0", "hti_copy_entry: key %p installed in new hti %p", key->val, new_hti);

    // Copy the value to the entry in the new table.
    uint64_t new_e_value = SYNC_CAS(&new_e->value, DOES_NOT_EXIST, old_e_value);

    // If there is a nested copy in progress, we might have installed the key into a dead entry.
    if (new_e_value == COPIED_VALUE)
        return hti_copy_entry(old_hti, old_e, key_hash, new_hti->next); // recursive tail-call

    // Mark the old entry as dead.
    old_e->value = COPIED_VALUE;

    // Update the count if we were the one that completed the copy.
    if (new_e_value == DOES_NOT_EXIST) {
        TRACE("h0", "hti_copy_entry: value %p installed in new hti %p", old_e_value, new_hti);
        SYNC_ADD(&old_hti->count, -1);
        SYNC_ADD(&new_hti->count, 1);
        return TRUE;
    }

    TRACE("h0", "hti_copy_entry: lost race to CAS value %p in new hti; found %p",
          old_e_value, new_e_value);
    return FALSE; // another thread completed the copy
}

// Compare <expected> with the existing value associated with <key>. If the values match then
// replace the existing value with <new>. If <new> is TOMBSTONE, delete the value associated with
// the key by replacing it with a TOMBSTONE.
//
// Return the previous value associated with <key>, or DOES_NOT_EXIST if <key> is not in the table
// or associated with a TOMBSTONE. If a copy is in progress and <key> has been copied to the next
// table then return COPIED_VALUE.
//
// NOTE: the returned value matches <expected> iff the set succeeds
//
// Certain values of <expected> have special meaning. If <expected> is HT_EXPECT_EXISTS then any
// real value matches (i.e. not a TOMBSTONE or DOES_NOT_EXIST) as long as <key> is in the table. If
// <expected> is HT_EXPECT_WHATEVER then skip the test entirely.
//
static int64_t hti_compare_and_set (hash_table_i_t *hti, uint32_t key_hash, const char *key_val,
                                    uint32_t key_len, int64_t expected, int64_t new) {
    TRACE("h0", "hti_compare_and_set(hti %p key \"%s\")", hti, key_val);
    TRACE("h0", "hti_compare_and_set(new value %p; caller expects value %p)", new, expected);
    assert(hti);
    assert(new != DOES_NOT_EXIST && !IS_TAGGED(new));
    assert(key_val);

    int is_empty;
    volatile entry_t *e = hti_lookup(hti, key_hash, key_val, key_len, &is_empty);

    // There is no room for <key>, grow the table and try again.
    if (e == NULL) {
        hti_start_copy(hti);
        return COPIED_VALUE;
    }

    // Install <key> in the table if it doesn't exist.
    if (is_empty) {
        TRACE("h0", "hti_compare_and_set: entry %p is empty", e, 0);
        if (expected != HT_EXPECT_WHATEVER && expected != HT_EXPECT_NOT_EXISTS)
            return DOES_NOT_EXIST;

        // No need to do anything, <key> is already deleted.
        if (new == TOMBSTONE)
            return DOES_NOT_EXIST;

        // allocate <key>.
        string_t *key = nbd_malloc(sizeof(uint32_t) + key_len);
        key->len = key_len;
        memcpy(key->val, key_val, key_len);

        // CAS <key> into the table.
        uint64_t e_key = SYNC_CAS(&e->key, DOES_NOT_EXIST,
                                  ((uint64_t)(key_hash >> 16) << 48) | (uint64_t)key);

        // Retry if another thread stole the entry out from under us.
        if (e_key != DOES_NOT_EXIST) {
            TRACE("h0", "hti_compare_and_set: key in entry %p is \"%s\"", e, e_key & MASK(48));
            TRACE("h0", "hti_compare_and_set: lost race to install key \"%s\" in %p", key->val, e);
            nbd_free(key);
            return hti_compare_and_set(hti, key_hash, key_val, key_len, expected, new); // tail-call
        }
        TRACE("h0", "hti_compare_and_set: installed key \"%s\" in entry %p", key_val, e);
    }

    // If the entry is in the middle of a copy, the copy must be completed first.
    int64_t e_value = e->value;
    TRACE("h0", "hti_compare_and_set: value in entry %p is %p", e, e_value);
    if (EXPECT_FALSE(IS_TAGGED(e_value))) {
        int did_copy = hti_copy_entry(hti, e, key_hash, ((volatile hash_table_i_t *)hti)->next);
        if (did_copy) {
            SYNC_ADD(&hti->num_entries_copied, 1);
        }
        return COPIED_VALUE;
    }

    // Fail if the old value is not consistent with the caller's expectation.
    int old_existed = (e_value != TOMBSTONE && e_value != DOES_NOT_EXIST);
    if (EXPECT_FALSE(expected != HT_EXPECT_WHATEVER && expected != e_value)) {
        if (EXPECT_FALSE(expected != (old_existed ? HT_EXPECT_EXISTS : HT_EXPECT_NOT_EXISTS))) {
            TRACE("h0", "hti_compare_and_set: value expected by caller for key \"%s\" not found; "
                  "found value %p", key_val, e_value);
            return e_value;
        }
    }

    // CAS the value into the entry. Retry if it fails.
    uint64_t v = SYNC_CAS(&e->value, e_value, new);
    if (EXPECT_FALSE(v != e_value)) {
        TRACE("h0", "hti_compare_and_set: value CAS failed; expected %p found %p", e_value, v);
        return hti_compare_and_set(hti, key_hash, key_val, key_len, expected, new); // recursive tail-call
    }

    // The set succeeded. Adjust the value count.
    if (old_existed && new == TOMBSTONE) {
        SYNC_ADD(&hti->count, -1);
    } else if (!old_existed && new != TOMBSTONE) {
        SYNC_ADD(&hti->count, 1);
    }

    // Return the previous value.
    TRACE("h0", "hti_compare_and_set: CAS succeeded; old value %p new value %p", e_value, new);
    return e_value;
}

//
static int64_t hti_get (hash_table_i_t *hti, uint32_t key_hash, const char *key_val, uint32_t key_len) {
    assert(key_val);

    int is_empty;
    volatile entry_t *e = hti_lookup(hti, key_hash, key_val, key_len, &is_empty);

    // When hti_lookup() returns NULL it means we hit the reprobe limit while
    // searching the table. In that case, if a copy is in progress the key
    // might exist in the copy.
    if (EXPECT_FALSE(e == NULL)) {
        if (((volatile hash_table_i_t *)hti)->next != NULL)
            return hti_get(hti->next, key_hash, key_val, key_len); // recursive tail-call
        return DOES_NOT_EXIST;
    }

    if (is_empty)
        return DOES_NOT_EXIST;

    // If the entry is being copied, finish the copy and retry on the next table.
    int64_t e_value = e->value;
    if (EXPECT_FALSE(IS_TAGGED(e_value))) {
        if (EXPECT_FALSE(e_value != COPIED_VALUE)) {
            int did_copy = hti_copy_entry(hti, e, key_hash, ((volatile hash_table_i_t *)hti)->next);
            if (did_copy) {
                SYNC_ADD(&hti->num_entries_copied, 1);
            }
        }
        return hti_get(((volatile hash_table_i_t *)hti)->next, key_hash, key_val, key_len); // tail-call
    }

    return (e_value == TOMBSTONE) ? DOES_NOT_EXIST : e_value;
}

//
int64_t ht_get (hash_table_t *ht, const char *key_val, uint32_t key_len) {
    return hti_get(*ht, murmur32(key_val, key_len), key_val, key_len);
}

//
int64_t ht_compare_and_set (hash_table_t *ht, const char *key_val, uint32_t key_len,
                            int64_t expected_val, int64_t new_val) {

    assert(key_val);
    assert(!IS_TAGGED(new_val) && new_val != DOES_NOT_EXIST);

    hash_table_i_t *hti = *ht;

    // Help with an ongoing copy.
    if (EXPECT_FALSE(hti->next != NULL)) {

        // Reserve some entries for this thread to copy. There is a race condition here because the
        // fetch and add isn't atomic, but that is ok.
        int x = hti->scan;
        hti->scan = x + ENTRIES_PER_COPY_CHUNK;

        // <hti->scan> might be larger than the size of the table, if some thread stalls while
        // copying. In that case we just wrap around to the beginning and make another pass through
        // the table.
        volatile entry_t *e = hti->table + (x & MASK(hti->scale));

        // Copy the entries
        int num_copied = 0, i;
        for (i = 0; i < ENTRIES_PER_COPY_CHUNK; ++i) {
            num_copied += hti_copy_entry(hti, e++, 0, hti->next);
            assert(e <= hti->table + (1 << hti->scale));
        }
        if (num_copied != 0) {
            SYNC_ADD(&hti->num_entries_copied, num_copied);
        }
    }

    // Dispose of fully copied tables.
    while (hti->num_entries_copied == (1 << hti->scale)) {
        assert(hti->next);
        if (SYNC_CAS(ht, hti, hti->next) == hti) {
            nbd_defer_free(hti);
        }
        hti = *ht;
    }

    int64_t old_val;
    uint32_t key_hash = murmur32(key_val, key_len);
    while ((old_val = hti_compare_and_set(hti, key_hash, key_val, key_len, expected_val, new_val))
           == COPIED_VALUE) {
        assert(hti->next);
        hti = hti->next;
    }

    return old_val == TOMBSTONE ? DOES_NOT_EXIST : old_val;
}

// Remove the value in <ht> associated with <key_val>. Returns the value removed, or
// DOES_NOT_EXIST if there was no value for that key.
int64_t ht_remove (hash_table_t *ht, const char *key_val, uint32_t key_len) {
    hash_table_i_t *hti = *ht;
    int64_t val;
    uint32_t key_hash = murmur32(key_val, key_len);
    do {
        val = hti_compare_and_set(hti, key_hash, key_val, key_len, HT_EXPECT_WHATEVER, TOMBSTONE);
        if (val != COPIED_VALUE)
            return val == TOMBSTONE ? DOES_NOT_EXIST : val;
        assert(hti->next);
        hti = hti->next;
        assert(hti);
    } while (1);
}

// Returns the number of key-value pairs in <ht>
uint64_t ht_count (hash_table_t *ht) {
    hash_table_i_t *hti = *ht;
    uint64_t count = 0;
    while (hti) {
        count += hti->count;
        hti = hti->next;
    }
    return count;
}

// Allocate and initialize a new hash table.
hash_table_t *ht_alloc (void) {
    hash_table_t *ht = nbd_malloc(sizeof(hash_table_t));
    *ht = (hash_table_i_t *)hti_alloc(ht, MIN_SCALE);
    return ht;
}

// Free <ht> and its internal structures.
void ht_free (hash_table_t *ht) {
    hash_table_i_t *hti = *ht;
    do {
        hash_table_i_t *next = hti->next;
        nbd_free(hti);
        hti = next;
    } while (hti);
    nbd_free(ht);
}
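
A minimal single-threaded usage sketch of the public API above (assumptions:
any runtime initialization the supporting nbds library needs is omitted, and
<key_len> here includes the terminating NUL, since keys are treated as
null-terminated strings):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main (void) {
    hash_table_t *ht = ht_alloc();

    const char *key = "hello";
    uint32_t len = (uint32_t)(strlen(key) + 1);

    // Insert (or overwrite) regardless of the current value.
    ht_compare_and_set(ht, key, len, HT_EXPECT_WHATEVER, 42);
    printf("get: %lld\n", (long long)ht_get(ht, key, len)); // prints 42

    ht_remove(ht, key, len);
    printf("get: %lld\n", (long long)ht_get(ht, key, len)); // prints DOES_NOT_EXIST

    ht_free(ht);
    return 0;
}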

Chris M. Thomasson

Nov 12, 2008, 2:53:56 AM

"Josh Dybnis" <jdy...@gmail.com> wrote in message
news:347303c8-4446-4a7b...@d36g2000prf.googlegroups.com...

> C implementation of Cliff Click's lock-free hash table. I built it
> with current generation x86 systems as a target, so it uses full-
> fenced versions of the atomic ops. Comments appreciated. The code is
> in the public domain. The full source with makefile and supporting
> libraries are at http://code.google.com/p/nbds/
[...]

One quick question, can you quickly explain how your user-space RCU
algorithm efficiently detects per-thread quiescent states? Can a thread
enter and exit a non-quiescent state without using any locked RMW and/or
MFENCE instructions?

Josh Dybnis

Nov 12, 2008, 4:16:45 AM
On Nov 11, 11:53 pm, "Chris M. Thomasson" <n...@spam.invalid> wrote:
> "Josh Dybnis" <jdyb...@gmail.com> wrote in message

>
> news:347303c8-4446-4a7b...@d36g2000prf.googlegroups.com...
> > C implementation of Cliff Click's lock-free hash table. I built it
> > with current generation x86 systems as a target, so it uses full-
> > fenced versions of the atomic ops. Comments appreciated. The code is
> > in the public domain. The full source with makefile and supporting
> > libraries are at http://code.google.com/p/nbds/

>
> [...]
>
> One quick question, can you quickly explain how your user-space RCU
> algorithm efficiently detects per-thread quiescent states? Can a thread
> enter and exit a non-quiescent state without using any locked RMW and/or
> MFENCE instructions?

I think the answer to your question is yes...but let me elaborate. The
threads are arranged in a circle. When a thread wants to defer a free
it creates a new token and passes it to its successor. A thread
announces it is quiescent by passing the token it last received from
its predecessor, on to its successor. When a thread sees one of its
tokens come all the way back around it knows every thread must have
been in a quiescent state at least once. Therefore it is safe to free
whatever object(s) was associated with the token (the token is
actually a pointer into an array of objects waiting to be freed). All
the token passing between threads is single reader / single writer, so
no locked RMW instructions or mfence instructions are necessary.
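
A minimal sketch of that token-passing scheme (not the actual nbds code;
names and sizes are illustrative, and overflow of the deferral queue is
ignored). Each mailbox slot has exactly one writer (the predecessor in the
ring) and one reader (the owner), so on x86 plain aligned loads and stores
are enough -- no locked RMW ops, no MFENCE:

#include <stdlib.h>

#define MAX_THREADS 8
#define QUEUE_SIZE  256

typedef struct {
    void    *defer[QUEUE_SIZE]; // objects this thread has deferred
    unsigned head;              // next slot to fill; doubles as the token we originate
    unsigned tail;              // everything below this has already been freed
} rcu_thread_t;

static rcu_thread_t rcu_[MAX_THREADS];

// mailbox_[i][j] is the latest token originated by thread j that has reached
// thread i. Slot [i][j] is written only by thread i's predecessor and read
// only by thread i.
static volatile unsigned mailbox_[MAX_THREADS][MAX_THREADS];

// Defer freeing <x>; the "token" is simply our new queue head.
void rcu_defer_free (int id, void *x) {
    rcu_thread_t *self = &rcu_[id];
    self->defer[self->head % QUEUE_SIZE] = x;
    self->head++;
}

// Called periodically while this thread holds no shared pointers.
void rcu_quiescent (int id, int nthreads) {
    int succ = (id + 1) % nthreads;
    int j;
    for (j = 0; j < nthreads; j++) {
        // Originate our own token; forward everyone else's as last received.
        mailbox_[succ][j] = (j == id) ? rcu_[id].head : mailbox_[id][j];
    }
    // Our own tokens that came all the way around the ring are safe: every
    // thread announced quiescence after those frees were deferred.
    unsigned safe = mailbox_[id][id];
    while (rcu_[id].tail != safe) {
        free(rcu_[id].defer[rcu_[id].tail % QUEUE_SIZE]);
        rcu_[id].tail++;
    }
}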

-Josh

Chris M. Thomasson

Nov 12, 2008, 7:05:13 AM
"Josh Dybnis" <jdy...@gmail.com> wrote in message
news:ed0bde58-67a6-4c6d...@a17g2000prm.googlegroups.com...

So, when a thread "leaves" a non-quiescent region, it explicitly flushes its
local "token" queue, and passes tokens that it does not own to its
successor? And, you're saying that all dequeued tokens that the thread does
indeed "own" are in a persistent quiescent state because they would have
come "full circle"; right?


> All
> the token passing between threads is single reader / single writer, so
> no locked RMW instructions or mfence instructions are necessary.

Okay. Does this not require participation from all threads such that if one
is blocked, it will hold up an entire grace-period? How do you detect
grace-periods if N RCU threads are blocking, in say, I/O operations, or
something else? Also, how do you reliably confine RCU memory visibility
within a thread's non-quiescent region?

Josh Dybnis

Nov 12, 2008, 6:27:00 PM

Yes.

> > All
> > the token passing between threads is single reader / single writer, so
> > no locked RMW instructions or mfence instructions are necessary.
>
> Okay. Does this not require participation from all threads such that if one
> is blocked, it will hold up an entire grace-period?

Yes.

> How do you detect grace-periods if N RCU threads are blocking, in say,
> I/O operations, or something else?

In this implementation the system just gets backed up as you've
pointed out above. No memory gets reclaimed until the blocked threads
return. This is not insurmountable in a real-world implementation.
I'll describe why.

Any thread can detect if another takes too long to make progress and
holds up the RCU mechanism. The entire RCU state is visible to every
thread, so they could all see it if another thread were to stop
passing on tokens. If the system as a whole is robust enough, threads
can crash (or be killed) without damaging the integrity of the system.
In that case a thread can kill a successor that is taking too long. I
think this level of robustness is a safe assumption for any system
that is highly reliable.

In practice, extreme measures such as killing a thread are very very
rare. In the implementation I've provided there is only one function
in the RCU API. Every thread is always part of the RCU token-passing
circle. Threads call the one function to periodically announce that
they are quiescent (i.e. not holding onto any shared pointers).
However, an alternative API consists of a pair of functions that a
thread calls when it enters and exits a non-quiescent state,
respectively. These functions dynamically add and remove the thread
from the token-passing circle. When a thread is not in the circle it
can not hold up the RCU mechanism. Then it is reasonable to make rules
like, a thread can only make blocking system calls when it is
quiescent. In practice, this is enough to restrict the cases where a
thread holds up the RCU mechanism to true crashes and infinite loops.

> Also, how do you reliably confine RCU memory visibility
> within a threads non-quiescent region?

There is nothing restricting visibility of a memory object after its
grace period expires. If a thread lies about being quiescent then all
bets are off. AFAIK this is a problem with every safe memory
reclamation system except for garbage collection. (E.g. you can fail
to register a hazard pointer, you can decrement a pointer's reference
count arbitrarily, etc.)

-Josh

Dmitriy V'jukov

Nov 13, 2008, 8:36:42 AM
On Nov 10, 9:57 am, Josh Dybnis <jdyb...@gmail.com> wrote:
> C implementation of Cliff Click's lock-free hash table. I built it
> with current generation x86 systems as a target, so it uses full-
> fenced versions of the atomic ops. Comments appreciated. The code is
> in the public domain. The full source with makefile and supporting
> libraries are at http://code.google.com/p/nbds/


I've noticed you are using RCU for memory reclamation. It's nice to
see a like-minded person here ;)

Btw, the problematic point in Click's hash map is that it has to do
constant "resizes" even if the hash map itself is no longer growing.
Resizes are needed to reclaim unused cells, because keys in cells must
be constant in his design.
I was thinking about the following possibility. If we have a manual
quiescence-detection mechanism (RCU), then we can reclaim individual
unused cells via that mechanism too. Something like this: when a cell
is removed it's marked as (Key, Removed), then we enqueue a callback
with RCU; when the callback fires it can change (Key, Removed) -> (Key,
Null) or (Null, Null). This means that this cell can be reused for a
different Key.
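
A rough sketch of the shape of that idea, in terms of the entry_t /
TOMBSTONE / DOES_NOT_EXIST definitions from Josh's post, with a hypothetical
rcu_defer_callback() primitive (nbds itself only exposes deferred frees). As
the follow-ups below discuss, the two plain stores are not obviously safe
under Click's original constraints -- this only illustrates the proposal:

// Hypothetical primitive: run <fn>(<arg>) after the current grace period.
void rcu_defer_callback (void (*fn)(void *arg), void *arg);

// Enqueued by whichever thread CASed the entry's value to TOMBSTONE.
static void reclaim_cell_cb (void *arg) {
    volatile entry_t *e = (volatile entry_t *)arg;
    // After the grace period no reader can still be relying on the old key.
    // Only reset the cell if it is still deleted; if a writer resurrected it
    // in the meantime, leave it alone.
    if (e->value == TOMBSTONE) {
        e->key   = DOES_NOT_EXIST; // (Key, Deleted) -> (Null, Null): the cell
        e->value = DOES_NOT_EXIST; // can now be claimed for a different key
    }
}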


--
Dmitriy V'jukov

Relacy Race Detector: Make your synchronization correct!
http://groups.google.ru/group/relacy/web

Dmitriy V'jukov

Nov 13, 2008, 9:30:48 AM
On Nov 13, 2:42 am, Josh Dybnis <jdyb...@gmail.com> wrote:

> In practice, extreme measures such as killing a thread are very very
> rare. In the implementation I've provided there is only one function
> in the RCU API. Every thread is always part of the RCU token-passing
> circle. Threads call the one function to periodically announce that
> they are quiescent (i.e. not holding onto any shared pointers).
> However, an alternative API consists of a pair of functions that a
> thread calls when it enters and exits a non-quiescent state,
> respectively. These functions dynamically add and remove the thread
> from the token-passing circle.

When I was messing around with an amortized proxy-collector (similar
to the user-space RCU thing, with the same problems in particular) I
decided on the following API:

void enter_collected_region(); // relatively heavy
void leave_collected_region(); // relatively heavy
void i_am_in_quiescence(); // lightweight

There is only a single requirement: when a thread is in a collected
region, it has to call i_am_in_quiescence() periodically, otherwise
system-wide progress is blocked.

Usage example:

void thread_func()
{
    enter_collected_region();
    while (working)
    {
        do
        {
            do_some_processing();
            i_am_in_quiescence();
        }
        while (something);

        leave_collected_region();
        do_some_blocking_IO();
        enter_collected_region();
    }
    leave_collected_region();
}

Or with nice C++ wrappers:

void thread_func()
{
    collected_region cr;
    while (working)
    {
        do
        {
            do_some_processing();
            i_am_in_quiescence();
        }
        while (something);

        {
            non_collected_region ncr;
            do_some_blocking_IO();
        }
    }
}

Josh Dybnis

Nov 13, 2008, 7:02:56 PM

> I've noticed you are using RCU for memory reclamation. It's nice to
> see like-minded person here ;)

I like RCU because it doesn't clutter up the code that uses it. Also
it's low overhead.

> Btw, the problematic moment in Click's hash map is that it has to do
> constant "resizes" even if hash map itself is already not growing.
> Resizes are needed to reclaim unused cells, because keys in cells must
> be constant in his design.
> I was thinking about following possibility. If we have manual
> quiescence-detection mechanism (RCU), then we can reclaim individual
> unused cells via that mechanism too. Something like this: when cell
> removed it's marked as (Key, Removed), then we enqueue callback to
> RCU, when the callback is fired it can change (Key, Removed) -> (Key,
> Null) or (Null, Null). This means that this cell can be reused for
> different Key.

I hadn't thought of trying to fix that limitation. It's an interesting
avenue to pursue. I'm not sure RCU would be enough though. There are
several places in the algorithm that depend on the keys being
constant. One of the problems is that you could be setting a key to
NULL at the same time as another thread is putting down a value in
that slot. Then you have a value sitting there without a key. I don't
think there is a way around that given Click's original constraints.

Click assumes a very weak memory model. Keys and values can appear in
any order. However, in my x86 only implementation we can assume that
keys are always going to appear before their corresponding values.
Maybe that could be exploited.

-Josh

Chris M. Thomasson

Nov 14, 2008, 6:12:37 AM

"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:aa590092-bd04-4237...@a26g2000prf.googlegroups.com...

> On Nov 10, 9:57 am, Josh Dybnis <jdyb...@gmail.com> wrote:
> > C implementation of Cliff Click's lock-free hash table. I built it
> > with current generation x86 systems as a target, so it uses full-
> > fenced versions of the atomic ops. Comments appreciated. The code is
> > in the public domain. The full source with makefile and supporting
> > libraries are at http://code.google.com/p/nbds/


> I've noticed you are using RCU for memory reclamation. It's nice to
> see like-minded person here ;)

:^D


> Btw, the problematic moment in Click's hash map is that it has to do
> constant "resizes" even if hash map itself is already not growing.
> Resizes are needed to reclaim unused cells, because keys in cells must
> be constant in his design.

Yes. That problem is mentioned in the Google Talks video. However, there is
another fairly significant problem I observed. His algorithm relies on a CAS
function that can __atomically__ return the failure value. AFAICT, this is
simply not possible in Java where CAS returns a boolean... x86/SPARC, any arch
that atomically returns CAS failure works. However, what do you do with
LL/SC? I can think of a way that leaves a dangling LL... As you know, I
posted example code on `comp.programming'.


> I was thinking about following possibility. If we have manual
> quiescence-detection mechanism (RCU), then we can reclaim individual
> unused cells via that mechanism too. Something like this: when cell
> removed it's marked as (Key, Removed), then we enqueue callback to
> RCU, when the callback is fired it can change (Key, Removed) -> (Key,
> Null) or (Null, Null). This means that this cell can be reused for
> different Key.

Sure; that should work out just fine...

Dmitriy V'jukov

Nov 14, 2008, 6:24:16 AM


Because of the RCU there will be no "old" mutators relying on an old
key value.
Assume the cell's initial value is (K1, V1).
A thread tries to update (K1, V1) to (K1, V2), but it's preempted
before the CAS.
Now another thread updates the cell to (K1, Deleted).
Then the RCU callback updates it to (Null, Null).
Then another thread reuses this cell and writes (K2, V1).
Now the first thread wakes up, executes its CAS, and writes (K2, V2).
This is impossible: RCU will ensure that there are no such "old"
writers.

So the problem is when (K1, Deleted) is updated back to (K1, V2) and
then again to (K1, Deleted), and then the RCU callback updates (K1,
Deleted) -> (Null, Null). The RCU grace period has NOT aged for that
last (K1, Deleted).
The easy solution is to prohibit updating (K1, Deleted) to (K1, V2).
I.e. writers have to wait until the RCU callback updates (K1, Deleted)
to (Null, Null), and only then can a writer update (Null, Null) to
(K1, V2).
By "writers have to wait until the RCU callback" I mean they just must
use another cell to put (K1, V2) in. So the hash map can contain
(K1, V) and an arbitrary number of (K1, Deleted) (probably this will
raise a new wave of problems :) ).
This will also solve the scenario you've described.

Though I haven't thought out all the details, and there will
definitely be some subtle moments, I believe it's possible to
implement such cell reclamation with RCU.

Dmitriy V'jukov

Nov 14, 2008, 6:41:08 AM
On Nov 14, 2:12 pm, "Chris M. Thomasson" <n...@spam.invalid> wrote:

> However, the is
> another fairly significant problem I observed. His algorithm relies on a CAS
> function that can __atomically__ return failure value.

I remember you posting this on comp.programming, but I never
understood why it's so critical for Click's algorithm. The algo is
based on a state machine, so if CAS doesn't return the value
atomically, it must not be critical. It will just result in another
unnecessary "CAS loop". No?
I am thinking about it this way:
1. Thread loads some value (possibly not the current one)
2. Thread tries to execute CAS, the CAS fails, and the CAS returns
some value (possibly not the current one)
3. Thread tries to execute CAS again...
We can remove step 2, and nothing changes. We can just assume that the
value the thread gets in step 2 was loaded in step 1.
What am I missing? In what way does the algo rely on CAS returning the
value atomically, exactly?
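
For concreteness, the reload-and-retry shape described above, sketched with
GCC's boolean __sync_bool_compare_and_swap (illustrative names, not Click's
or Josh's code). The value a failed CAS would have returned is never needed,
because the thread simply reloads; the cost is an extra loop iteration, i.e.
lock-free rather than strictly wait-free:

#include <stdint.h>

#define DOES_NOT_EXIST 0

// Try to claim an empty cell for <key>; return whatever key ends up there.
int64_t install_key (volatile int64_t *cell, int64_t key) {
    for (;;) {
        int64_t old = *cell;                 // step 1: load (possibly stale)
        if (old != DOES_NOT_EXIST)
            return old;                      // someone else's key is installed
        if (__sync_bool_compare_and_swap(cell, DOES_NOT_EXIST, key))
            return key;                      // step 2: our CAS committed
        // step 3: CAS failed -- loop and reload instead of relying on a
        // value returned by the failed CAS.
    }
}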

--
Dmitriy V'jukov

Chris M. Thomasson

Nov 14, 2008, 8:02:42 PM
"Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
news:c6f679e3-abe8-4c0d...@h23g2000prf.googlegroups.com...

On Nov 14, 2:12 pm, "Chris M. Thomasson" <n...@spam.invalid> wrote:

> > However, the is
> > another fairly significant problem I observed. His algorithm relies on a
> > CAS
> > function that can __atomically__ return failure value.

> I remember you were posting this on comp.programming, but I never
> understood why it's so critical for Click's algorithm. The algo is
> based on state machine, so if CAS will return value not atomically, it
> must be not critical. It will just result in another unnecessary "CAS
> loop". Not?

Doesn't he claim his algorithm is wait-free such that there are no
CAS-loops?


http://www.azulsystems.com/events/stanford_2007/2007_LockFreeHash.pdf
(refer to page 4...)


AFAICT, he says that the reason it's wait-free is because there are no
CAS-loops, and that CAS failure determines the next action in the
state-machine. Also, he depends on CAS not failing spuriously. His
state-machine needs logic like:


atomicword value = g_shared;

atomicword nvalue = CAS(&g_shared, value, new_value);

if (value == nvalue) {
    // state shift committed by this thread
} else {
    // state shift committed by other thread
    // `nvalue' determines next action
}


> I am thinking about it this way:
> 1. Thread loads some value (probably not actual)
> 2. Thread tries to execute CAS, CAS fails, CAS returns some value
> (probably not actual)

CAS on Java returns only true or false, right? I still think he needs the
actual value which caused the CAS to fail.


> 3. Thread tries to execute CAS again...

But he says that his algorithm executes no CAS-loops... Humm, interesting.


> We can remove step 2, and nothing changes. We can just assume that
> value, which thread get on step 2, was loaded on step 1.
> What I am missing? In what way algo relies exactly on CAS returning
> value atomically?

AFAICT, his algorithm needs the actual "real" reason for CAS failure.
x86/SPARC provides this functionality. So there is no problem on those
archs. Perhaps I am taking his word too literally. I need to study the
algorithm some more. However, he does say that there are no CAS-loops, and
that the reason for CAS failure dictates the next action. AFAICT, this reason
represents the new value that got swapped in and caused the CAS to fail.

Chris M. Thomasson

Nov 14, 2008, 8:07:12 PM

"Josh Dybnis" <jdy...@gmail.com> wrote in message
news:3d3fbaa1-7687-4c66...@x16g2000prn.googlegroups.com...
[...]

Okay. Well, AFAICT, this should work out just fine indeed; if all the rules
are followed...

;^D

Chris M. Thomasson

Nov 14, 2008, 8:41:42 PM
"Chris M. Thomasson" <n...@spam.invalid> wrote in message
news:vapTk.7231$us4....@newsfe14.iad...

> "Dmitriy V'jukov" <dvy...@gmail.com> wrote in message
> news:c6f679e3-abe8-4c0d...@h23g2000prf.googlegroups.com...
> On Nov 14, 2:12 pm, "Chris M. Thomasson" <n...@spam.invalid> wrote:
>
>> > However, the is
>> > another fairly significant problem I observed. His algorithm relies on
>> > a
>> > CAS
>> > function that can __atomically__ return failure value.
>
>> I remember you were posting this on comp.programming, but I never
>> understood why it's so critical for Click's algorithm. The algo is
>> based on state machine, so if CAS will return value not atomically, it
>> must be not critical. It will just result in another unnecessary "CAS
>> loop". Not?
>
> Doesn't he claim his algorithm is wait-free such that there are no
> CAS-loops?

[...]

I have to go, but one question before I do:

If the next action is not atomically acquired on CAS failure, will that lead
the thread down a false path within the state machine? Is a "false state"
harmless?

Josh Dybnis

Nov 15, 2008, 5:57:41 AM
On Nov 14, 3:24 am, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:

[...]

> Assume initial cell's value is (K1, V1).
> Thread tries to update (K1, V1) to (K1, V2). But before CAS it's
> preempted.
> Now another thread updates to (K1, Deleted).

All OK.

> Then RCU callback updates to (Null, Null).

That step is problematic. The RCU callback is not performing an atomic
operation. With the memory model Click is assuming it is possible for
another thread to come along and see (K1,Null) while the callback is
"in progress". Now we have something that looks like it is in the
progress of being written for the first time. The new thread could
update the value, thinking the key is valid, then the RCU's update
completes and we get something like (Null,V2).

> The easy solution is to prohibit updating (K1, Deleted) to (K1, V2).
> I.e. writers have to wait until RCU callback will update (K1, Deleted)
> to (Null, Null), and only then writer can update (Null, Null) to (K1,
> V2).
> By "writers have to wait until RCU callback" I mean they just must use
> another cell to put (K1, V2). So hash map can contain (K1, V) and
> arbitrary number of (K1, Deleted) (probably this will rise new wave of
> problems :) )

The problem with inserting a key in an alternate slot is that now
readers can't stop probing when they hit an empty slot or deleted
value. They have to probe the maximum distance before they can
conclude a key is not in the table.

> Though I don't think out all the details, definitely there will be
> some subtle moments. But I believe it's possible to implement such
> cell reclamation with RCU.

There are too many subtleties for my intuition to be effective. I'm
not going to place a bet either way.
