C implementation of Cliff Click's lock-free hash table. I built it with current generation x86 systems as a target, so it uses full- fenced versions of the atomic ops. Comments appreciated. The code is in the public domain. The full source with makefile and supporting libraries are at http://code.google.com/p/nbds/
typedef struct hash_table_i { volatile entry_t *table; hash_table_t *ht; // parent ht; struct hash_table_i *next; struct hash_table_i *next_free; unsigned int scale; int max_probe; int count; // TODO: make these counters distributed int num_entries_copied; int scan;
// Choose the next bucket to probe using the high-order bits of <key_hash>. static inline int get_next_ndx(int old_ndx, uint32_t key_hash, int ht_scale) { int incr = (key_hash >> (32 - ht_scale)); incr += !incr; // If the increment is 0, make it 1. return (old_ndx + incr) & MASK(ht_scale);
}
// Compare two keys. // // A key is made up of two parts. The 48 low-order bits are a pointer to a null terminated string. // The high-order 16 bits are taken from the hash of that string. The bits from the hash are used // as a quick check to rule out non-equal keys without doing a complete string compare. static inline int ht_key_equals (uint64_t a, uint32_t b_hash, const char *b_value, uint32_t b_len) { if ((b_hash >> 16) != (a >> 48)) // high-order 16 bits are from the hash value return FALSE; const string_t *a_key = (string_t *)(a & MASK(48)); // low-order 48 bits is a pointer assert(a_key); return a_key->len == b_len && memcmp(a_key->val, b_value, b_len) == 0;
}
// Lookup <key> in <hti>. // // Return the entry that <key> is in, or if <key> isn't in <hti> return the entry that it would be // in if it were inserted into <hti>. If there is no room for <key> in <hti> then return NULL, to // indicate that the caller should look in <hti->next>. // // Record if the entry being returned is empty. Otherwise the caller will have to waste time with // ht_key_equals() to confirm that it did not lose a race to fill an empty entry. static volatile entry_t *hti_lookup (hash_table_i_t *hti, uint32_t key_hash, const char *key_val, uint32_t key_len, int *is_empty) { TRACE("h0", "hti_lookup(key \"%s\" in hti %p)", key_val, hti); *is_empty = 0;
// Probe one cache line at a time int ndx = key_hash & MASK(hti->scale); // the first entry to search int i; for (i = 0; i < hti->max_probe; ++i) {
// The start of the bucket is the first entry in the cache line. volatile entry_t *bucket = hti->table + (ndx & ~(ENTRIES_PER_BUCKET-1));
// Start searching at the indexed entry. Then loop around to the begining of the cache line. int j; for (j = 0; j < ENTRIES_PER_BUCKET; ++j) { volatile entry_t *e = bucket + ((ndx + j) & (ENTRIES_PER_BUCKET-1));
uint64_t e_key = e->key; if (e_key == DOES_NOT_EXIST) { TRACE("h0", "hti_lookup: empty entry %p found on probe %d", e, i*ENTRIES_PER_BUCKET+j+1); // we tag the pointer so the caller can avoid an expensive ht_key_equals() *is_empty = 1; return e; }
if (ht_key_equals(e_key, key_hash, key_val, key_len)) { TRACE("h0", "hti_lookup: entry %p found on probe %d", e, i*ENTRIES_PER_BUCKET+j+1); TRACE("h0", "hti_lookup: with key \"%s\" value %p", ((string_t *)(e_key & MASK(48)))->val, e-
>value);
return e; } }
ndx = get_next_ndx(ndx, key_hash, hti->scale); }
// maximum number of probes exceeded TRACE("h0", "hti_lookup: maximum number of probes exceeded returning 0x0", 0, 0); return NULL;
}
// Allocate and initialize a hash_table_i_t with 2^<scale> entries. static hash_table_i_t *hti_alloc (hash_table_t *parent, int scale) { // Include enough slop to align the actual table on a cache line boundry size_t n = sizeof(hash_table_i_t) + sizeof(entry_t) * (1 << scale) + (CACHE_LINE_SIZE - 1); hash_table_i_t *hti = (hash_table_i_t *)calloc(n, 1);
// Align the table of hash entries on a cache line boundry. hti->table = (entry_t *)(((uint64_t)hti + sizeof(hash_table_i_t) + (CACHE_LINE_SIZE-1)) & ~(CACHE_LINE_SIZE-1));
hti->scale = scale;
// When searching for a key probe a maximum of 1/4 of the buckets up to 1000 buckets. hti->max_probe = ((1 << (hti->scale - 2)) / ENTRIES_PER_BUCKET) + 2; if (hti->max_probe > MAX_BUCKETS_TO_PROBE) { hti->max_probe = MAX_BUCKETS_TO_PROBE; }
hti->ht = parent;
assert(hti->scale >= MIN_SCALE && hti->scale < 63); // size must be a power of 2 assert(sizeof(entry_t) * ENTRIES_PER_BUCKET % CACHE_LINE_SIZE == 0); // divisible into cache assert((size_t)hti->table % CACHE_LINE_SIZE == 0); // cache aligned
return hti;
}
// Called when <hti> runs out of room for new keys. // // Initiates a copy by creating a larger hash_table_i_t and installing it in <hti->next>. static void hti_start_copy (hash_table_i_t *hti) { TRACE("h0", "hti_start_copy(hti %p hti->next %p)", hti, hti-
>next);
if (hti->next) return; // another thread beat us to it
// heuristics to determine the size of the new table uint64_t count = ht_count(hti->ht); unsigned int new_scale = hti->scale; new_scale += (count > (1 << (new_scale - 2))); // double size if more than 1/4 full new_scale += (count > (1 << (new_scale - 2))); // double size again if more than 1/2 full
// Allocate the new table and attempt to install it. hash_table_i_t *next = hti_alloc(hti->ht, hti->scale + 1); hash_table_i_t *old_next = SYNC_CAS(&hti->next, NULL, next); if (old_next != NULL) { TRACE("h0", "hti_start_copy: lost race to install new hti; found %p", old_next, 0); // Another thread beat us to it. nbd_free(next); return; } TRACE("h0", "hti_start_copy: new hti is %p", next, 0);
}
// Copy the key and value stored in <old_e> (which must be an entry in <old_hti>) to <new_hti>. // // Return 1 unless <old_e> is already copied (then return 0), so the caller can account for the total // number of entries left to copy. static int hti_copy_entry (hash_table_i_t *old_hti, volatile entry_t *old_e, uint32_t key_hash, hash_table_i_t *new_hti) { TRACE("h0", "hti_copy_entry(copy entry from %p to %p)", old_hti, new_hti); assert(old_hti); assert(old_hti->next); assert(new_hti); assert(old_e >= old_hti->table && old_e < old_hti->table + (1 << old_hti->scale));
int64_t old_e_value = old_e->value; TRACE("h0", "hti_copy_entry: entry %p current value %p", old_e, old_e_value); if (EXPECT_FALSE(old_e_value == COPIED_VALUE)) return FALSE; // already copied
// Kill empty entries. if (EXPECT_FALSE(old_e_value == DOES_NOT_EXIST)) { old_e_value = SYNC_CAS(&old_e->value, DOES_NOT_EXIST, COPIED_VALUE); if (old_e_value == DOES_NOT_EXIST) { TRACE("h0", "hti_copy_entry: old entry killed", 0, 0); return TRUE; } if (old_e_value == COPIED_VALUE) { TRACE("h0", "hti_copy_entry: lost race to kill empty entry in old hti", 0, 0); return FALSE; // another thread beat us to it } TRACE("h0", "hti_copy_entry: lost race to kill empty entry in old hti; " "the entry is now being used", 0, 0); }
// Tag the value in the old entry to indicate a copy is in progress. old_e_value = SYNC_FETCH_AND_OR(&old_e->value, TAG_VALUE(0)); TRACE("h0", "hti_copy_entry: tagged the value %p in old entry %p", old_e_value, old_e); if (old_e_value == COPIED_VALUE) return FALSE; // <value> was already copied by another thread.
// Deleted entries don't need to be installed into to the new table, but their keys do need to // be freed. assert(COPIED_VALUE == TAG_VALUE(TOMBSTONE)); if (old_e_value == TOMBSTONE) { nbd_defer_free((string_t *)(old_e->key & MASK(48))); return TRUE; } old_e_value = STRIP_TAG(old_e_value);
// Install the key in the new table. uint64_t old_e_key = old_e->key; string_t *key = (string_t *)(old_e_key & MASK(48)); TRACE("h0", "hti_copy_entry: key %p is %s", old_e_key, key->val);
// We use 0 to indicate that <key_hash> isn't initiallized. Occasionally the <key_hash> will // really be 0 and we will waste time recomputing it. That is rare enough that it is OK. if (key_hash == 0) { key_hash = murmur32(key->val, key->len); }
// it is possible that there is not any room in the new table either if (EXPECT_FALSE(new_e == NULL)) { hti_start_copy(new_hti); // initiate nested copy, if not already started return hti_copy_entry(old_hti, old_e, key_hash, new_hti-
>next); // recursive tail-call
}
// a tagged entry returned from hti_lookup() means it is either empty or has a new key if (is_empty) { uint64_t new_e_key = SYNC_CAS(&new_e->key, DOES_NOT_EXIST, old_e_key); if
...
>C implementation of Cliff Click's lock-free hash table. I built it > with current generation x86 systems as a target, so it uses full- > fenced versions of the atomic ops. Comments appreciated. The code is > in the public domain. The full source with makefile and supporting > libraries are at http://code.google.com/p/nbds/
[...]
One quick question, can you quickly explain how your user-space RCU algorithm efficiently detects per-thread quiescent states? Can a thread enter and exit a non-quiescent state without using any locked RMW and/or MFENCE instructions?
> One quick question, can you quickly explain how your user-space RCU > algorithm efficiently detects per-thread quiescent states? Can a thread > enter and exit a non-quiescent state without using any locked RMW and/or > MFENCE instructions?
I think the answer to your question is yes...but let me elaborate. The threads are arranged in a circle. When a thread wants to defer a free it creates a new token and passes it to its successor. A thread announces it is quiescent by passing the token it last received from its predecessor, on to its successor. When a thread sees one of its tokens come all the way back around it knows every thread must have been in a quiescent state at least once. Therefore it is safe to free whatever object(s) was associated with the token (the token is actually a pointer into an array of objects waiting to be freed). All the token passing between threads is single reader / single writer, so no locked RMW instructions or mfence instructions are necessary.
> > "Josh Dybnis" <jdyb...@gmail.com> wrote in message
> > news:347303c8-4446-4a7b-8a39-d413b013a3e5@d36g2000prf.googlegroups.com...>C > > > implementation of Cliff Click's lock-free hash table. I built it > > > with current generation x86 systems as a target, so it uses full- > > > fenced versions of the atomic ops. Comments appreciated. The code is > > > in the public domain. The full source with makefile and supporting > > > libraries are athttp://code.google.com/p/nbds/
> > [...]
> > One quick question, can you quickly explain how your user-space RCU > > algorithm efficiently detects per-thread quiescent states? Can a thread > > enter and exit a non-quiescent state without using any locked RMW and/or > > MFENCE instructions? > I think the answer to your question is yes...but let me elaborate. The > threads are arranged in a circle. When a thread wants to defer a free > it creates a new token and passes it to its successor. A thread > announces it is quiescent by passing the token it last received from > its predecessor, on to its successor. When a thread sees one of its > tokens come all the way back around it knows every thread must have > been in a quiescent state at least once. Therefore it is safe to free > whatever object(s) was associated with the token (the token is > actually a pointer into an array of objects waiting to be freed).
So, when a thread "leaves" a non-quiescent region, it explicitly flushes its local "token" queue, and passes tokens that it does not own to its successor? And, your saying that all dequeued tokens that the thread does indeed "own" are in a persistent quiescent state because they would have come "full circle"; right?
> All > the token passing between threads is single reader / single writer, so > no locked RMW instructions or mfence instructions are necessary.
Okay. Does this not require participation from all threads such that if one is blocked, it will hold up an entire grace-period? How do you detect grace-periods if N RCU threads are blocking, in say, I/O operations, or something else? Also, how do you reliably confine RCU memory visibility within a threads non-quiescent region?
> > > "Josh Dybnis" <jdyb...@gmail.com> wrote in message
> > >news:347303c8-4446-4a7b-8a39-d413b013a3e5@d36g2000prf.googlegroups.com...>C> > > > implementation of Cliff Click's lock-free hash table. I built it > > > > with current generation x86 systems as a target, so it uses full- > > > > fenced versions of the atomic ops. Comments appreciated. The code is > > > > in the public domain. The full source with makefile and supporting > > > > libraries are athttp://code.google.com/p/nbds/
> > > [...]
> > > One quick question, can you quickly explain how your user-space RCU > > > algorithm efficiently detects per-thread quiescent states? Can a thread > > > enter and exit a non-quiescent state without using any locked RMW and/or > > > MFENCE instructions? > > I think the answer to your question is yes...but let me elaborate. The > > threads are arranged in a circle. When a thread wants to defer a free > > it creates a new token and passes it to its successor. A thread > > announces it is quiescent by passing the token it last received from > > its predecessor, on to its successor. When a thread sees one of its > > tokens come all the way back around it knows every thread must have > > been in a quiescent state at least once. Therefore it is safe to free > > whatever object(s) was associated with the token (the token is > > actually a pointer into an array of objects waiting to be freed).
> So, when a thread "leaves" a non-quiescent region, it explicitly flushes its > local "token" queue, and passes tokens that it does not own to its > successor? And, your saying that all dequeued tokens that the thread does > indeed "own" are in a persistent quiescent state because they would have > come "full circle"; right?
Yes.
> > All > > the token passing between threads is single reader / single writer, so > > no locked RMW instructions or mfence instructions are necessary.
> Okay. Does this not require participation from all threads such that if one > is blocked, it will hold up an entire grace-period?
Yes.
> How do you detect grace-periods if N RCU threads are blocking, in say, > I/O operations, or something else?
In this implementation the system just gets backed up as you've pointed out above. No memory gets reclaimed until the blocked threads return. This is not insurmountable in a real-world implementation. I'll describe why.
Any thread can detect if another takes too long to make progress and holds up the RCU mechanism. The entire RCU state is visible to every thread, so they could all see it if another thread were to stop passing on tokens. If the system as a whole is robust enough, threads can crash (or be killed) without damaging the integrity of the system. In that case a thread can kill a successor that is taking too long. I think this level of robustness is a safe assumption for any system that is highly reliable.
In practice, extreme measures such as killing a thread are very very rare. In the implementation I've provided there is only one function in the RCU API. Every thread is always part of the RCU token-passing circle. Threads call the one function to periodically announce that they are quiescent (i.e. not holding onto any shared pointers). However, an alternative API consists of a pair of functions that a thread calls when it enters and exits a non-quiescent state, respectively. These functions dynamically add and remove the thread from the token-passing circle. When a thread is not in the circle it can not hold up the RCU mechanism. Then it is reasonable to make rules like, a thread can only make blocking system calls when it is quiescent. In practice, this is enough to restrict the cases where a thread holds up the RCU mechanism to true crashes and infinite loops.
> Also, how do you reliably confine RCU memory visibility > within a threads non-quiescent region?
There is nothing restricting visibility of a memory object after it's grace period expires. If a thread lies about being quiescent then all bets are off. AFAIK this is a problem with every safe memory reclamation system except for garbage collection. (E.g. you can fail to register a hazard pointer, you can decrement a pointer's reference count arbitrarily, etc.)
> > > "Josh Dybnis" <jdyb...@gmail.com> wrote in message
> > >news:347303c8-4446-4a7b-8a39-d413b013a3e5@d36g2000prf.googlegroups.com...>C> > > > implementation of Cliff Click's lock-free hash table. I built it > > > > with current generation x86 systems as a target, so it uses full- > > > > fenced versions of the atomic ops. Comments appreciated. The code is > > > > in the public domain. The full source with makefile and supporting > > > > libraries are athttp://code.google.com/p/nbds/
> > > [...]
> > > One quick question, can you quickly explain how your user-space RCU > > > algorithm efficiently detects per-thread quiescent states? Can a thread > > > enter and exit a non-quiescent state without using any locked RMW and/or > > > MFENCE instructions? > > I think the answer to your question is yes...but let me elaborate. The > > threads are arranged in a circle. When a thread wants to defer a free > > it creates a new token and passes it to its successor. A thread > > announces it is quiescent by passing the token it last received from > > its predecessor, on to its successor. When a thread sees one of its > > tokens come all the way back around it knows every thread must have > > been in a quiescent state at least once. Therefore it is safe to free > > whatever object(s) was associated with the token (the token is > > actually a pointer into an array of objects waiting to be freed).
> So, when a thread "leaves" a non-quiescent region, it explicitly flushes its > local "token" queue, and passes tokens that it does not own to its > successor? And, your saying that all dequeued tokens that the thread does > indeed "own" are in a persistent quiescent state because they would have > come "full circle"; right?
Yes.
> > All > > the token passing between threads is single reader / single writer, so > > no locked RMW instructions or mfence instructions are necessary.
> Okay. Does this not require participation from all threads such that if one > is blocked, it will hold up an entire grace-period?
Yes.
> How do you detect grace-periods if N RCU threads are blocking, in say, > I/O operations, or something else?
In this implementation the system just gets backed up as you've pointed out above. No memory gets reclaimed until the blocked threads return. This is not insurmountable in a real-world implementation. I'll describe why.
Any thread can detect if another takes too long to make progress and holds up the RCU mechanism. The entire RCU state is visible to every thread, so they could all see it if another thread were to stop passing on tokens. If the system as a whole is robust enough, threads can crash (or be killed) without damaging the integrity of the system. In that case a thread can kill a successor that is taking too long. I think this level of robustness is a safe assumption for any system that is highly reliable.
In practice, extreme measures such as killing a thread are very very rare. In the implementation I've provided there is only one function in the RCU API. Every thread is always part of the RCU token-passing circle. Threads call the one function to periodically announce that they are quiescent (i.e. not holding onto any shared pointers). However, an alternative API consists of a pair of functions that a thread calls when it enters and exits a non-quiescent state, respectively. These functions dynamically add and remove the thread from the token-passing circle. When a thread is not in the circle it can not hold up the RCU mechanism. Then it is reasonable to make rules like, a thread cannot make blocking system calls when it is not quiescent. In practice, this is enough to restrict the cases where a thread holds up the RCU mechanism to true crashes and infinite loops.
> Also, how do you reliably confine RCU memory visibility > within a threads non-quiescent region?
There is nothing restricting visibility of a memory object after it's grace period expires. If a thread lies about being quiescent then all bets are off. AFAIK this is a problem with every safe memory reclamation system except for garbage collection. (E.g. you can fail to register a hazard pointer, you can decrement a pointer's reference count arbitrarily, etc.)
On Nov 10, 9:57 am, Josh Dybnis <jdyb...@gmail.com> wrote:
> C implementation of Cliff Click's lock-free hash table. I built it > with current generation x86 systems as a target, so it uses full- > fenced versions of the atomic ops. Comments appreciated. The code is > in the public domain. The full source with makefile and supporting > libraries are athttp://code.google.com/p/nbds/
I've noticed you are using RCU for memory reclamation. It's nice to see like-minded person here ;)
Btw, the problematic moment in Click's hash map is that it has to do constant "resizes" even if hash map itself is already not growing. Resizes are needed to reclaim unused cells, because keys in cells must be constant in his design. I was thinking about following possibility. If we have manual quiescence-detection mechanism (RCU), then we can reclaim individual unused cells via that mechanism too. Something like this: when cell removed it's marked as (Key, Removed), then we enqueue callback to RCU, when the callback is fired it can change (Key, Removed) -> (Key, Null) or (Null, Null). This means that this cell can be reused for different Key.
On Nov 13, 2:42 am, Josh Dybnis <jdyb...@gmail.com> wrote:
> In practice, extreme measures such as killing a thread are very very > rare. In the implementation I've provided there is only one function > in the RCU API. Every thread is always part of the RCU token-passing > circle. Threads call the one function to periodically announce that > they are quiescent (i.e. not holding onto any shared pointers). > However, an alternative API consists of a pair of functions that a > thread calls when it enters and exits a non-quiescent state, > respectively. These functions dynamically add and remove the thread > from the token-passing circle.
When I was messing around with amortized proxy-collector (similar to user-space RCU thing, particularly with the same problems) I've decided on following API:
void enter_collected_region(); // relatively heavy void leave_collected_region(); // relatively heavy void i_am_in_quiescence(); // lightweight
Only single requirement: when thread is in collected region, it has to call i_am_in_quiescence() periodically, otherwise system-wide progress is blocked.
Usage example:
void thread_func() { enter_collected_region(); while (working) { do { do_some_processing(); i_am_in_quiescence(); } while (something);
> I've noticed you are using RCU for memory reclamation. It's nice to > see like-minded person here ;)
I like RCU because it doesn't clutter up the code that uses it. Also it's low overhead.
> Btw, the problematic moment in Click's hash map is that it has to do > constant "resizes" even if hash map itself is already not growing. > Resizes are needed to reclaim unused cells, because keys in cells must > be constant in his design. > I was thinking about following possibility. If we have manual > quiescence-detection mechanism (RCU), then we can reclaim individual > unused cells via that mechanism too. Something like this: when cell > removed it's marked as (Key, Removed), then we enqueue callback to > RCU, when the callback is fired it can change (Key, Removed) -> (Key, > Null) or (Null, Null). This means that this cell can be reused for > different Key.
I hadn't thought of trying to fix that limitation. It's an interesting avenue to pursue. I'm not sure RCU would be enough though. There are several places in the algorithm that depend on the keys being constant. One of the problems is that you could be setting a key to NULL at the same time as another thread is putting down a value in that slot. Then you have a value sitting there without a key. I don't think there is a way around that given Click's original constraints.
Click assumes a very weak memory model. Key's and values can appear in any order. However, in my x86 only implementation we can assume that keys are always going to appear before their corresponding values. Maybe that could be exploited.
> On Nov 10, 9:57 am, Josh Dybnis <jdyb...@gmail.com> wrote: > > C implementation of Cliff Click's lock-free hash table. I built it > > with current generation x86 systems as a target, so it uses full- > > fenced versions of the atomic ops. Comments appreciated. The code is > > in the public domain. The full source with makefile and supporting > > libraries are athttp://code.google.com/p/nbds/ > I've noticed you are using RCU for memory reclamation. It's nice to > see like-minded person here ;)
:^D
> Btw, the problematic moment in Click's hash map is that it has to do > constant "resizes" even if hash map itself is already not growing. > Resizes are needed to reclaim unused cells, because keys in cells must > be constant in his design.
Yes. That problem is mentioned in the Google Talks video. However, the is another fairly significant problem I observed. His algorithm relies on a CAS function that can __atomically__ return failure value. AFAICT, this is simply not possible in Java where CAS returns boolean... x86/SPARC, any arch that atomically returns CAS failure works. However, what do you do with LL/SC? I can think of a way that leaves a dangling LL... As you know, I posted example code on `comp.programming'.
> I was thinking about following possibility. If we have manual > quiescence-detection mechanism (RCU), then we can reclaim individual > unused cells via that mechanism too. Something like this: when cell > removed it's marked as (Key, Removed), then we enqueue callback to > RCU, when the callback is fired it can change (Key, Removed) -> (Key, > Null) or (Null, Null). This means that this cell can be reused for > different Key.
> > I've noticed you are using RCU for memory reclamation. It's nice to > > see like-minded person here ;)
> I like RCU because it doesn't clutter up the code that uses it. Also > it's low overhead.
> > Btw, the problematic moment in Click's hash map is that it has to do > > constant "resizes" even if hash map itself is already not growing. > > Resizes are needed to reclaim unused cells, because keys in cells must > > be constant in his design. > > I was thinking about following possibility. If we have manual > > quiescence-detection mechanism (RCU), then we can reclaim individual > > unused cells via that mechanism too. Something like this: when cell > > removed it's marked as (Key, Removed), then we enqueue callback to > > RCU, when the callback is fired it can change (Key, Removed) -> (Key, > > Null) or (Null, Null). This means that this cell can be reused for > > different Key.
> I hadn't thought of trying to fix that limitation. It's an interesting > avenue to pursue. I'm not sure RCU would be enough though. There are > several places in the algorithm that depend on the keys being > constant. One of the problems is that you could be setting a key to > NULL at the same time as another thread is putting down a value in > that slot. Then you have a value sitting there without a key. I don't > think there is a way around that given Click's original constraints.
> Click assumes a very weak memory model. Key's and values can appear in > any order. However, in my x86 only implementation we can assume that > keys are always going to appear before their corresponding values. > Maybe that could be exploited.
Because of the RCU there will be no "old" mutators, which are relying on old key value. Assume initial cell's value is (K1, V1). Thread tries to update (K1, V1) to (K1, V2). But before CAS it's preempted. Now another thread updates to (K1, Deleted). Then RCU callback updates to (Null, Null). Then another thread reuses this cell and writes (K2, V1). Now first thread wakes-up and executes CAS and writes (K2, V2). This is impossible, RCU will assure that there are no such "old" writers.
So the problem is when (K1, Deleted) is updated back to (K1, V2) and then again to (K1, Deleted). And then RCU callback updates (K1, Deleted) -> (Null, Null). RCU grace-period is NOT aged for that last (K1, Deleted). The easy solution is to prohibit updating (K1, Deleted) to (K1, V2). I.e. writers have to wait until RCU callback will update (K1, Deleted) to (Null, Null), and only then writer can update (Null, Null) to (K1, V2). By "writers have to wait until RCU callback" I mean they just must use another cell to put (K1, V2). So hash map can contain (K1, V) and arbitrary number of (K1, Deleted) (probably this will rise new wave of problems :) ) This will also solve scenario you've described.
Though I don't think out all the details, definitely there will be some subtle moments. But I believe it's possible to implement such cell reclamation with RCU.
On Nov 14, 2:12 pm, "Chris M. Thomasson" <n...@spam.invalid> wrote:
> However, the is > another fairly significant problem I observed. His algorithm relies on a CAS > function that can __atomically__ return failure value.
I remember you were posting this on comp.programming, but I never understood why it's so critical for Click's algorithm. The algo is based on state machine, so if CAS will return value not atomically, it must be not critical. It will just result in another unnecessary "CAS loop". Not? I am thinking about it this way: 1. Thread loads some value (probably not actual) 2. Thread tries to execute CAS, CAS fails, CAS returns some value (probably not actual) 3. Thread tries to execute CAS again... We can remove step 2, and nothing changes. We can just assume that value, which thread get on step 2, was loaded on step 1. What I am missing? In what way algo relies exactly on CAS returning value atomically?
> > However, the is > > another fairly significant problem I observed. His algorithm relies on a > > CAS > > function that can __atomically__ return failure value. > I remember you were posting this on comp.programming, but I never > understood why it's so critical for Click's algorithm. The algo is > based on state machine, so if CAS will return value not atomically, it > must be not critical. It will just result in another unnecessary "CAS > loop". Not?
Doesn't he claim his algorithm is wait-free such that there are no CAS-loops?
AFAICT, he says that the reason its wait-free is because there are no CAS-loops such that CAS failure determines the next action in the state-machine. Also, he depends on CAS not failing spuriously. His state-macine needs logic like:
if (value == nvalue) { // state shift committed by this thread
} else {
// state shift committed by other thread // `nvalue' determines next action
} > I am thinking about it this way: > 1. Thread loads some value (probably not actual) > 2. Thread tries to execute CAS, CAS fails, CAS returns some value > (probably not actual)
CAS on Java returns only true or false right? I still think he needs the actual value which caused the CAS to fail.
> 3. Thread tries to execute CAS again...
But he says that his algorithm executes no CAS-loops... Humm, interesting.
> We can remove step 2, and nothing changes. We can just assume that > value, which thread get on step 2, was loaded on step 1. > What I am missing? In what way algo relies exactly on CAS returning > value atomically?
AFAICT, his algorithm needs the actual "real" reason for CAS failure. x86/SPARC provides this functionality. So there is no problem on those archs. Perhaps I am taking his word to literally. I need to study the algorithm some more. However, he does say that there are no CAS-loops, and that reason for CAS failure dictates next action. AFAICT, this reason represents the new value that got swapped in and caused the CAS to fail.
>> > > "Josh Dybnis" <jdyb...@gmail.com> wrote in message
>> > >news:347303c8-4446-4a7b-8a39-d413b013a3e5@d36g2000prf.googlegroups.com...>C> >> > > implementation of Cliff Click's lock-free hash table. I built it >> > > > with current generation x86 systems as a target, so it uses full- >> > > > fenced versions of the atomic ops. Comments appreciated. The code >> > > > is >> > > > in the public domain. The full source with makefile and supporting >> > > > libraries are athttp://code.google.com/p/nbds/
>> > > [...]
>> > > One quick question, can you quickly explain how your user-space RCU >> > > algorithm efficiently detects per-thread quiescent states? Can a >> > > thread >> > > enter and exit a non-quiescent state without using any locked RMW >> > > and/or >> > > MFENCE instructions? >> > I think the answer to your question is yes...but let me elaborate. The >> > threads are arranged in a circle. When a thread wants to defer a free >> > it creates a new token and passes it to its successor. A thread >> > announces it is quiescent by passing the token it last received from >> > its predecessor, on to its successor. When a thread sees one of its >> > tokens come all the way back around it knows every thread must have >> > been in a quiescent state at least once. Therefore it is safe to free >> > whatever object(s) was associated with the token (the token is >> > actually a pointer into an array of objects waiting to be freed).
>> So, when a thread "leaves" a non-quiescent region, it explicitly flushes >> its >> local "token" queue, and passes tokens that it does not own to its >> successor? And, your saying that all dequeued tokens that the thread does >> indeed "own" are in a persistent quiescent state because they would have >> come "full circle"; right?
> Yes.
>> > All >> > the token passing between threads is single reader / single writer, so >> > no locked RMW instructions or mfence instructions are necessary.
>> Okay. Does this not require participation from all threads such that if >> one >> is blocked, it will hold up an entire grace-period?
> Yes.
>> How do you detect grace-periods if N RCU threads are blocking, in say, >> I/O operations, or something else?
> In this implementation the system just gets backed up as you've > pointed out above. No memory gets reclaimed until the blocked threads > return. This is not insurmountable in a real-world implementation. > I'll describe why.
[...]
Okay. Well, AFAICT, this should work out just fine indeed; if all the rules are followed...
>> > However, the is >> > another fairly significant problem I observed. His algorithm relies on >> > a >> > CAS >> > function that can __atomically__ return failure value.
>> I remember you were posting this on comp.programming, but I never >> understood why it's so critical for Click's algorithm. The algo is >> based on state machine, so if CAS will return value not atomically, it >> must be not critical. It will just result in another unnecessary "CAS >> loop". Not?
> Doesn't he claim his algorithm is wait-free such that there are no > CAS-loops?
[...]
I have to go, but one question before I do:
If the next action is not atomically acquired on CAS failure, will that lead the thread down a false path within the state machine? Is a "false state" is harmless?
On Nov 14, 3:24 am, "Dmitriy V'jukov" <dvyu...@gmail.com> wrote:
[...]
> Assume initial cell's value is (K1, V1). > Thread tries to update (K1, V1) to (K1, V2). But before CAS it's > preempted. > Now another thread updates to (K1, Deleted).
All OK.
> Then RCU callback updates to (Null, Null).
That step is problematic. The RCU callback is not performing an atomic operation. With the memory model Click is assuming it is possible for another thread to come along and see (K1,Null) while the callback is "in progress". Now we have something that looks like it is in the progress of being written for the first time. The new thread could update the value, thinking the key is valid, then the RCU's update completes and we get something like (Null,V2).
> The easy solution is to prohibit updating (K1, Deleted) to (K1, V2). > I.e. writers have to wait until RCU callback will update (K1, Deleted) > to (Null, Null), and only then writer can update (Null, Null) to (K1, > V2). > By "writers have to wait until RCU callback" I mean they just must use > another cell to put (K1, V2). So hash map can contain (K1, V) and > arbitrary number of (K1, Deleted) (probably this will rise new wave of > problems :) )
The problem with inserting a key in an alternate slot, is that now readers can't stop probing when they hit an empty slot or deleted value. They have to probe the maximum distance before they can conclude a key is not in the table.
> Though I don't think out all the details, definitely there will be > some subtle moments. But I believe it's possible to implement such > cell reclamation with RCU.
There are too many subtleties for my intuition to be effective. I'm not going to place a bet either way.