/* Simple, crude read/write mutex test
by: Chris M. Thomasson
__________________________________________*/

#include <thread>
#include <atomic>
#include <shared_mutex>
#include <condition_variable>
#include <iostream>
#include <functional>
#include <cassert>
#include <cstdlib>
#include <ctime>

#define THREADS 16UL
#define ITERS 10000000UL
#define COUNT (THREADS * ITERS)

// undefine to test std::shared_mutex
#define CT_TEST_FAST_MUTEX 1

// bare bones mutex/condvar based semaphore
struct ct_slow_semaphore
{
    unsigned long m_state;
    std::mutex m_mutex;
    std::condition_variable m_cond;

    ct_slow_semaphore(unsigned long state) : m_state(state) {}

    void inc()
    {
        {
            std::unique_lock<std::mutex> lock(m_mutex);
            ++m_state;
        }
        m_cond.notify_one();
    }

    void add(unsigned long addend)
    {
        {
            std::unique_lock<std::mutex> lock(m_mutex);
            m_state += addend;
        }
        m_cond.notify_all();
    }

    void dec()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        while (m_state == 0) m_cond.wait(lock);
        --m_state;
    }
};

// bin-sema
struct ct_auto_reset_event
{
    bool m_state;
    std::mutex m_mutex;
    std::condition_variable m_cond;

    ct_auto_reset_event() : m_state(false) {}

    void signal()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_state = true;
        m_cond.notify_one();
    }

    void wait()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        while (m_state == false) m_cond.wait(lock);
        m_state = false; // auto-reset
    }
};

// just a layer over an auto-reset event
struct ct_fast_mutex
{
    std::atomic<unsigned int> m_state;
    ct_auto_reset_event m_waitset;

    ct_fast_mutex() : m_state(0) {}

    void lock()
    {
        if (m_state.exchange(1, std::memory_order_acquire))
        {
            while (m_state.exchange(2, std::memory_order_acquire))
            {
                m_waitset.wait();
            }
        }
    }

    void unlock()
    {
        if (m_state.exchange(0, std::memory_order_release) == 2)
        {
            m_waitset.signal();
        }
    }
};

// Chris M. Thomassons Experimental Read/Write Mutex
// Yeah, it is pretty damn fat wrt the state, however
// it has some interesting properties...
// The state can be compressed a bit...
// btw, it has no loops...
// Take a look at the lock_shared and unlock_shared functions

#define RWMUTEX_COUNT_MAX LONG_MAX

struct ct_rwmutex
{
    // shared state
    std::atomic<long> m_wrstate;
    std::atomic<long> m_count;
    std::atomic<long> m_rdwake;

    ct_slow_semaphore m_rdwset;
    ct_slow_semaphore m_wrwset;
    ct_fast_mutex m_wrlock;

    ct_rwmutex() :
        m_wrstate(1),
        m_count(RWMUTEX_COUNT_MAX),
        m_rdwake(0),
        m_rdwset(0),
        m_wrwset(0)
    {
    }

    // READ, pretty slim...
    void lock_shared()
    {
        if (m_count.fetch_add(-1, std::memory_order_acquire) < 1)
        {
            m_rdwset.dec();
        }
    }

    void unlock_shared()
    {
        if (m_count.fetch_add(1, std::memory_order_release) < 0)
        {
            if (m_rdwake.fetch_add(-1, std::memory_order_acq_rel) == 1)
            {
                m_wrwset.inc();
            }
        }
    }

    // WRITE, more hefty
    void lock()
    {
        m_wrlock.lock();

        long count = m_count.fetch_add(-RWMUTEX_COUNT_MAX, std::memory_order_acquire);

        if (count < RWMUTEX_COUNT_MAX)
        {
            long rdwake = m_rdwake.fetch_add(RWMUTEX_COUNT_MAX - count, std::memory_order_acquire);

            if (rdwake + RWMUTEX_COUNT_MAX - count)
            {
                m_wrwset.dec();
            }
        }
    }

    // write unlock
    void unlock()
    {
        long count = m_count.fetch_add(RWMUTEX_COUNT_MAX, std::memory_order_release);

        if (count < 0)
        {
            m_rdwset.add(-count);
        }

        m_wrlock.unlock();
    }
};

struct ct_shared
{
    std::atomic<unsigned long> m_state;

#if defined (CT_TEST_FAST_MUTEX)
    ct_rwmutex m_std_rwmutex;
#else
    std::shared_mutex m_std_rwmutex;
#endif

    ct_shared() : m_state(0) {}
};

void ct_thread(ct_shared& shared, std::size_t index)
{
    for (unsigned int i = 0; i < ITERS; ++i)
    {
        shared.m_std_rwmutex.lock();
        if (i % 256 == 0) std::this_thread::yield();
        shared.m_state += 1;
        shared.m_std_rwmutex.unlock();

        shared.m_std_rwmutex.lock_shared();
        if (i % 512 == 0) std::this_thread::yield();
        //shared.m_state += 1;
        shared.m_std_rwmutex.unlock_shared();
    }
}

int main()
{
    ct_shared shared;

    std::cout << "Testing: ";

#if defined (CT_TEST_FAST_MUTEX)
    std::cout << "Chris M. Thomasson's Experimental Read/Write Mutex\n\n";
#else
    std::cout << "std::shared_mutex\n\n";
#endif

    {
        std::thread threads[THREADS];

        std::clock_t start = std::clock();

        for (std::size_t i = 0; i < THREADS; ++i)
        {
            threads[i] = std::thread(ct_thread, std::ref(shared), i);
        }

        for (std::size_t i = 0; i < THREADS; ++i)
        {
            threads[i].join();
        }

        std::clock_t diff = clock() - start;

        unsigned long msec = diff * 1000 / CLOCKS_PER_SEC;

        std::cout << "msec = " << msec << "\n";
    }

    std::cout << "shared.m_state = " << shared.m_state << "\n";
    std::cout << "\n\nFin!\n";

    assert(shared.m_state == COUNT);

    return 0;
}
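For reference, a plausible GCC build line for the test above (the file name is my own placeholder, not from the original post); MSVC 2017 should just need /std:c++17:
___________________________
g++ -std=c++17 -O2 -pthread ct_rwmutex_bench.cpp -o ct_rwmutex_bench
___________________________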
On Sun, Feb 17, 2019 at 12:34 AM Chris M. Thomasson <cri...@charter.net> wrote:
>
> Fwiw, here is a simple benchmark for an older read/write algorithm I invented:
>
>
> https://pastebin.com/raw/xCBHY9qd
>
>
> It should compile fine on MSVC 2017 and GCC with -std=c++17. There is a macro in the code called CT_TEST_FAST_MUTEX. If it is defined, the code will test my algorithm; if not defined, it will test against std::shared_mutex. I am wondering if anybody can compile and run the code. Test it out on as many operating systems as you can. Can you please show the output?
>
>
> Thanks everybody. :^)
>
> Here is the code:
>
> /* Simple, crude read/write mutex test
> by: Chris M. Thomasson
> __________________________________________*/
[...]
Hey Chris!
Here are my results on an i7-8650U (4 HT cores), Debian 4.19.16-1, gcc 7.3.0:
std::shared_mutex
msec = 56118
msec = 49149
msec = 69416
msec = 68737
msec = 59174
msec = 65915
ct_rwmutex
msec = 62772
msec = 71139
msec = 68110
msec = 66038
msec = 60436
msec = 74238
I can also test on a 2-CPU system with 88 hardware threads in total, but
the bench hardcodes 16 threads (there is std::hardware_concurrency()
or something like that).
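A minimal sketch of sizing the thread count dynamically along those lines (the helper name and the fallback value are mine):
___________________________
unsigned long ct_thread_count()
{
    // std::thread::hardware_concurrency() may return 0 when the count
    // cannot be determined; fall back to the old hardcoded 16UL then.
    unsigned long n = std::thread::hardware_concurrency();
    return (n > 0) ? n : 16UL;
}
___________________________
The THREADS macro would then become a runtime value, e.g. a std::vector<std::thread> sized from ct_thread_count().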
I've found that it's critical to model the realistic/target ratio between
reads/writes and the amount of local work and work inside the critical
section in such benchmarks. Otherwise, in 100% synthetic hammering,
usually the worst mutex shows the best results. Synthetic benchmarks
produce extreme contention, so a mutex that blocks threads as much as
possible and allows as few threads as possible to work shows the best
result, because contention is minimized. The best would be to run all
threads sequentially, one by one. But that's counterproductive for
real life, because blocked threads don't do useful work either.
Also, this benchmark has a fixed amount of work per thread, so I suspect
it may be subject to load imbalance when an unlucky outlier delays the
total result. A better option would be to run all threads for X
seconds, then set a global var for all of them to stop, then join
all of them and measure the number of iterations.
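A sketch of that time-bounded scheme (all names here are mine, and the loop body stands in for the real lock/unlock work from the benchmark):
___________________________
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

std::atomic<bool> g_stop(false);

void ct_timed_worker(std::atomic<unsigned long>& iters)
{
    unsigned long local = 0;

    while (!g_stop.load(std::memory_order_relaxed))
    {
        // ... one lock/unlock iteration of the actual benchmark ...
        ++local;
    }

    // publish this thread's iteration count once, at the end
    iters.fetch_add(local, std::memory_order_relaxed);
}

int main()
{
    std::atomic<unsigned long> iters(0);
    std::vector<std::thread> threads;

    for (unsigned int i = 0; i < 16; ++i)
    {
        threads.emplace_back(ct_timed_worker, std::ref(iters));
    }

    std::this_thread::sleep_for(std::chrono::seconds(60));
    g_stop.store(true, std::memory_order_relaxed);

    for (auto& t : threads) t.join();

    std::cout << "iters = " << iters.load() << "\n";

    return 0;
}
___________________________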
Also for me it lacked #include <climits>.
On Saturday, February 16, 2019 at 11:24:42 PM UTC-8, Dmitry Vyukov wrote:
> On Sun, Feb 17, 2019 at 12:34 AM Chris M. Thomasson <cri...@charter.net> wrote:
[...]
Thank you so much! Okay, not great, but not all that "radically" hardcore terrible when compared to std::shared_mutex on your end. Also, my work can benefit from many sessions of padding and alignment therapy wrt data-structure layout.
[...]
// READ, pretty slim...
void lock_shared()
{
    if (m_count.fetch_add(-1, std::memory_order_acquire) < 1)
    {
        m_rdwset.dec();
    }
}

void unlock_shared()
{
    if (m_count.fetch_add(1, std::memory_order_release) < 0)
    {
        if (m_rdwake.fetch_add(-1, std::memory_order_acq_rel) == 1)
        {
            m_wrwset.inc();
        }
    }
}
______
// Experimental Read-Write Mutex Test
// More "realistic" test, in Relacy...
// By: Chris M. Thomasson
//_______________________________________________

//#define RL_DEBUGBREAK_ON_ASSERT
//#define RL_MSVC_OUTPUT
//#define RL_FORCE_SEQ_CST
//#define RL_GC

#include <relacy/relacy_std.hpp>
#include <iostream>

// Simple macro based redirection of the verbose std membars.
#define CT_MB_ACQ std::memory_order_acquire
#define CT_MB_REL std::memory_order_release
#define CT_MB_RLX std::memory_order_relaxed
#define CT_MB_ACQ_REL std::memory_order_acq_rel
#define CT_MB_SEQ_CST std::memory_order_seq_cst
#define CT_MB(mp_0) std::atomic_thread_fence(mp_0)

#define READERS 4
#define WRITERS 2
#define THREADS (READERS + WRITERS)
#define ITERS 3

////////////////

// bare bones mutex/condvar based semaphore
struct ct_slow_semaphore
{
    VAR_T(unsigned long) m_state;
    std::mutex m_mutex;
    std::condition_variable m_cond;

    ct_slow_semaphore(unsigned long state) : m_state(state) {}

    void inc()
    {
        m_mutex.lock($);
        ++VAR(m_state);
        m_mutex.unlock($);
        m_cond.notify_one($);
    }

    void add(unsigned long addend)
    {
        m_mutex.lock($);
        VAR(m_state) += addend;
        m_mutex.unlock($);
        m_cond.notify_all($);
    }

    void dec()
    {
        m_mutex.lock($);
        while (VAR(m_state) == 0) m_cond.wait(m_mutex, $);
        --VAR(m_state);
        m_mutex.unlock($);
    }
};

// bin-sema
struct ct_auto_reset_event
{
    bool m_state;
    std::mutex m_mutex;
    std::condition_variable m_cond;

    ct_auto_reset_event() : m_state(false) {}

    void signal()
    {
        m_mutex.lock($);
        m_state = true;
        m_cond.notify_one($);
        m_mutex.unlock($);
    }

    void wait()
    {
        m_mutex.lock($);
        while (m_state == false) m_cond.wait(m_mutex, $);
        m_state = false; // auto-reset
        m_mutex.unlock($);
    }
};

// just a layer over an auto-reset event
struct ct_fast_mutex
{
    std::atomic<unsigned int> m_state;
    ct_auto_reset_event m_waitset;

    ct_fast_mutex() : m_state(0) {}

    void lock()
    {
        if (m_state.exchange(1, std::memory_order_acquire))
        {
            while (m_state.exchange(2, std::memory_order_acquire))
            {
                m_waitset.wait();
            }
        }
    }

    void unlock()
    {
        if (m_state.exchange(0, std::memory_order_release) == 2)
        {
            m_waitset.signal();
        }
    }
};

// Chris M. Thomassons Experimental Read/Write Mutex
// Yeah, it is pretty damn fat wrt the state, however
// it has some interesting properties...
// The state can be compressed a bit...
// btw, it has no loops...
// Take a look at the lock_shared and unlock_shared functions

#define RWMUTEX_COUNT_MAX LONG_MAX

struct ct_rwmutex
{
    // shared state
    std::atomic<long> m_wrstate;
    std::atomic<long> m_count;
    std::atomic<long> m_rdwake;

    ct_slow_semaphore m_rdwset;
    ct_slow_semaphore m_wrwset;
    ct_fast_mutex m_wrlock;

    ct_rwmutex() :
        m_wrstate(1),
        m_count(RWMUTEX_COUNT_MAX),
        m_rdwake(0),
        m_rdwset(0),
        m_wrwset(0)
    {
    }

    // READ, pretty slim...
    void lock_shared()
    {
        if (m_count.fetch_add(-1, std::memory_order_acquire) < 1)
        {
            m_rdwset.dec();
        }
    }

    void unlock_shared()
    {
        if (m_count.fetch_add(1, std::memory_order_release) < 0)
        {
            if (m_rdwake.fetch_add(-1, std::memory_order_acq_rel) == 1)
            {
                m_wrwset.inc();
            }
        }
    }

    // WRITE, more hefty
    void lock()
    {
        m_wrlock.lock();

        long count = m_count.fetch_add(-RWMUTEX_COUNT_MAX, std::memory_order_acquire);

        if (count < RWMUTEX_COUNT_MAX)
        {
            long rdwake = m_rdwake.fetch_add(RWMUTEX_COUNT_MAX - count, std::memory_order_acquire);

            if (rdwake + RWMUTEX_COUNT_MAX - count)
            {
                m_wrwset.dec();
            }
        }
    }

    // write unlock
    void unlock()
    {
        long count = m_count.fetch_add(RWMUTEX_COUNT_MAX, std::memory_order_release);

        if (count < 0)
        {
            m_rdwset.add(-count);
        }

        m_wrlock.unlock();
    }
};

////////////////

struct ct_simple_stack
{
    struct node
    {
        VAR_T(node*) m_next;
        VAR_T(unsigned int) m_tid;

        node(unsigned int tid) : m_tid(tid) {}
    };

    VAR_T(node*) m_head;

    ct_simple_stack() : m_head(nullptr) {}

    void push(node* n)
    {
        VAR(n->m_next) = VAR(m_head);
        VAR(m_head) = n;
    }

    node* pop()
    {
        node* n = VAR(m_head);

        if (n)
        {
            VAR(m_head) = VAR(n->m_next);
        }

        return n;
    }

    node* flush()
    {
        node* n = VAR(m_head);
        VAR(m_head) = nullptr;
        return n;
    }
};

void ct_destroy_nodes(ct_simple_stack& s)
{
    ct_simple_stack::node* n = s.flush();

    while (n)
    {
        ct_simple_stack::node* next = VAR(n->m_next);
        delete n;
        n = next;
    }
}

// Relacy Stack Test...
struct ct_fast_mutex_test : rl::test_suite<ct_fast_mutex_test, THREADS>
{
    ct_rwmutex g_rwmutex;
    ct_simple_stack g_stack;

    void before()
    {
    }

    void after()
    {
        ct_destroy_nodes(g_stack);
    }

    // reader
    void reader(unsigned int tidx)
    {
        g_rwmutex.lock_shared();

        for (unsigned int i = 0; i < ITERS; ++i)
        {
            ct_simple_stack::node* h = VAR(g_stack.m_head);

            while (h)
            {
                ct_simple_stack::node* next = VAR(h->m_next);

                if (i % 512 == 0)
                {
                    rl::yield(1, $);
                }

                h = next;
            }
        }

        g_rwmutex.unlock_shared();
    }

    // writer
    void writer(unsigned int tidx)
    {
        g_rwmutex.lock();
        g_stack.push(new ct_simple_stack::node(tidx));
        g_stack.push(new ct_simple_stack::node(tidx));
        g_rwmutex.unlock();

        g_rwmutex.lock();
        ct_simple_stack::node* n = g_stack.pop();
        g_rwmutex.unlock();

        if (n)
        {
            // destroy
            delete n;
        }
    }

    void thread(unsigned int tidx)
    {
        if (tidx < READERS)
        {
            reader(tidx);
        }
        else
        {
            writer(tidx);
        }
    }
};

// Test away... Or fly? Humm...
int main()
{
    {
        rl::test_params p;

        p.iteration_count = 10000000;
        //p.execution_depth_limit = 33333;
        //p.search_type = rl::sched_bound;
        //p.search_type = rl::fair_full_search_scheduler_type;
        //p.search_type = rl::fair_context_bound_scheduler_type;

        rl::simulate<ct_fast_mutex_test>(p);
    }

    return 0;
}
___________________________
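Fwiw, a plausible way to build the Relacy version above (the file name and include path are my own placeholders; -I must point at the directory that contains the relacy/ header folder):
___________________________
g++ -std=c++11 -O2 -I path/to/relacy-checkout ct_rwmutex_relacy.cpp -o ct_rwmutex_relacy
___________________________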
Well, readers iterate, and writers push and pop. What do you think about this test, Dmitry? Good "enough"?
This is the basic logic of the test I am going to write up in 100% pure standard C++11. Well, C++17 when testing against std::shared_mutex... ;^)
On Sun, Feb 17, 2019 at 11:51 PM Chris M. Thomasson <cri...@charter.net> wrote:
>
> On Saturday, February 16, 2019 at 11:24:42 PM UTC-8, Dmitry Vyukov wrote:
>>
>> On Sun, Feb 17, 2019 at 12:34 AM Chris M. Thomasson <cri...@charter.net> wrote:
[...]
>> I can also test on a 2-CPU system with 88 hardware threads in total, but
>> the bench hardcodes 16 threads (there is std::hardware_concurrency()
>> or something like that).
>
>
> Indeed! Will correct this issue. Make it dynamic. :^)
>
> Although, it can be beneficial to create more threads just to see how the algorithm handles these cases of "oversubscribed" contention.
Agree. So std::hardware_concurrency(), 2*std::hardware_concurrency(),
4*std::hardware_concurrency().
>> I've found that it's critical to model the realistic/target ratio between
>> reads/writes and the amount of local work and work inside the critical
>> section in such benchmarks. Otherwise, in 100% synthetic hammering,
>> usually the worst mutex shows the best results. Synthetic benchmarks
>> produce extreme contention, so a mutex that blocks threads as much as
>> possible and allows as few threads as possible to work shows the best
>> result, because contention is minimized. The best would be to run all
>> threads sequentially, one by one. But that's counterproductive for
>> real life, because blocked threads don't do useful work either.
>
>
> Agreed. Just wondering why it performs so much better in some high contention scenarios. Load imbalance is a factor for sure.
>
> https://groups.google.com/d/msg/comp.lang.c++/g6veUOvOAu0/L7Sjs0xOBAAJ
>
> https://groups.google.com/d/msg/comp.lang.c++/g6veUOvOAu0/hMIu5VBSBAAJ
I don't need to tell you that performance of sync primitives can be
very non-linear and counter-intuitive :)
Perhaps the other mutexes were not super dumb, so they did not perform
the best under very high contention.
>> Also, this benchmark has a fixed amount of work per thread, so I suspect
>> it may be subject to load imbalance when an unlucky outlier delays the
>> total result. A better option would be to run all threads for X
>> seconds, then set a global var for all of them to stop, then join
>> all of them and measure the number of iterations.
>
>
> Indeed. A benchmark that just models a list that readers iterate, and writers push and pop from. Then we can show how many operations, reads or writes, were performed per second. So it would be the number of reads and writes per second, per thread.
>
> This reminds me of some of Joe Seigh's tests back on comp.programming.threads.
>
>
>>
>>
>> Also for me it lacked #include <climits>.
>
>
> Fixed that. Thanks again:
>
> https://pastebin.com/raw/xCBHY9qd
> (reload the page...)
>
>
> Fwiw, I am wondering what you think about the algorithm itself? Is it crap, or decent? :^)
I think it's one of the best algorithms out there.
The complete fairness for both readers and writers is notable.
And the wait-free fast-path for readers.
It still suffers from high cache line contention even on a read-only
workload, but it's as good as one can get with a centralized design
(i.e. not per-thread/per-cpu distributed stuff, which has its own
problems).
I would expect a
CAS-loop-based read lock to degrade significantly under high read
load.
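To make that concrete, here is a hypothetical CAS-loop reader acquire, for contrast with the single fetch_add in ct_rwmutex::lock_shared (purely illustrative, nobody's production code):
___________________________
// Readers may only decrement while no writer is registered (count >= 1,
// matching ct_rwmutex's convention). Every failed compare_exchange_weak
// is a wasted RMW that must be retried, and failures become the common
// case as reader traffic grows -- unlike the wait-free fetch_add path.
bool try_lock_shared_cas(std::atomic<long>& m_count)
{
    long c = m_count.load(std::memory_order_relaxed);

    while (c >= 1)
    {
        if (m_count.compare_exchange_weak(c, c - 1,
                std::memory_order_acquire, std::memory_order_relaxed))
        {
            return true; // acquired
        }
        // on failure, c now holds the freshly loaded value; retry
    }

    return false; // writer active/pending; the caller would block
}
___________________________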
Btw, your algorithm has been used as the standard Go RWMutex for the past 7+ years:
https://github.com/golang/go/commit/daaf29cf932
(I should have mentioned your authorship somewhere!)
As for potential improvements, I can only think of optimizing the
uncontended writer lock/unlock to be 1 RMW each, i.e. not offloading
writer competition resolution to a separate mutex, but rather
incorporating it into the same atomic variable readers and writers use
for synchronization with each other. Do you think it's possible? With a
CAS loop? Or maybe with some smart atomic_fetch_or? Wait,
atomic_fetch_or is compiled to a CAS loop on x86 anyway. These new
C/C++ atomic interfaces sometimes make me forget that.
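Roughly, the x86 situation (exact codegen varies by compiler and version, so treat this as a sketch):
___________________________
#include <atomic>

std::atomic<unsigned long> w(0);

void fetch_or_demo()
{
    // Result discarded: compilers can emit a single LOCK OR here.
    w.fetch_or(0x1UL, std::memory_order_acq_rel);

    // Full previous value needed: x86 has no instruction that ORs and
    // returns the old word, so this becomes a LOCK CMPXCHG retry loop.
    // (When only the affected bit is tested afterwards, some compilers
    // can emit a single LOCK BTS instead.)
    unsigned long prev = w.fetch_or(0x2UL, std::memory_order_acq_rel);
    (void)prev;
}
___________________________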
Fwiw, the Relacy test shown earlier is the more realistic benchmark, currently in the verification stage.
On Wed, Feb 20, 2019 at 7:51 AM Chris M. Thomasson <cri...@charter.net> wrote:
>
> Fwiw, I wrote a crude new benchmark that measures how many reads and writes can be performed in a given amount of time. My algorithm vs std::shared_mutex. So, we are _primarily_ looking for how many reads can be performed in this test in 60 seconds. The number of threads is variable and is determined by std::hardware_concurrency * THREADS, where THREADS is set to 8 in the test. So on my system the setup is:
> ___________________________________
> cpu_threads_n = 4
> threads_n = 32
> writers = 16
> readers = 16
> test duration = 60 seconds
> ___________________________________
[...]
Benchmarked this on 2 systems (3 alternating runs for each mutex):
[...]
Now it is your problem to interpret this :)
Another interesting data point is the time(1) output.
On the large system, for your mutex:
848.76user 10.83system 1:00.26elapsed 1426%CPU
for std mutex:
4238.27user 26.79system 1:00.18elapsed 7086%CPU
So whatever your mutex did, it used about 5 times less CPU time for it (848.76 vs 4238.27 user seconds, a ratio of roughly 5.0, over the same ~60s of wall time).
I could get my hands on an AMD EPYC 7351P system and a Xeon Phi (Knights Corner) if you are interested in more results...
On Friday, March 1, 2019 at 7:11:20 AM UTC-8, Manuel Pöter wrote:
> I could get my hands on an AMD EPYC 7351P system and a Xeon Phi (Knights Corner) if you are interested in more results...

I would be grateful if you can find the time to do this, Manuel. Thanks. :^)
Really like the "difference" between the Intel and SPARC results. Afaict, std::shared_mutex on Solaris needs to be fixed? I do not like the way it seems to "tank" the reads and obtain a shi% load of writes. Humm...
On Wed, Feb 20, 2019 at 1:31 AM Chris M. Thomasson <cri...@charter.net> wrote:
[...]
We can try with some new change :)
I've never seen any names in Go sources before. And everything is usually
very consistent; it's not that everybody can do whatever they want in
a new file just because they wrote it and nobody else will care.
People actually care a lot about consistency and common style for
everything.
But now grepping the source I've found an existing precedent:
https://github.com/golang/go/blob/master/src/go/printer/nodes.go#L662
So we may try too :)
On Tuesday, February 26, 2019 at 12:16:24 AM UTC-8, Dmitry Vyukov wrote:
> On Wed, Feb 20, 2019 at 1:31 AM Chris M. Thomasson <cri...@charter.net> wrote:
[...]
> We can try with some new change :)

Just as a first start, I changed m_count to unsigned long, and it is "integrated" with lock and unlock, and still has no loops; just a first exploratory experiment. It still only uses fetch_add. Humm, not sure what to make of it. Also, I am pissed that fetch_or goes to a CMPXCHG loop on x86. Here is the code:
_________________________________
// Chris M. Thomassons Experimental Read/Write Mutex
[...]
// WRITE, more hefty
void wrlock()
{
    unsigned long wrcount = m_count.fetch_add(WRITE, std::memory_order_acquire);

    if ((wrcount & WRITE_MASK) > 0)
    {
        m_wrmtx.dec();
    }
}
unsigned long count = m_count.fetch_add(WRITE_BIT, std::memory_order_acquire);
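Fwiw, a hypothetical reading of the single-word layout behind this fragment (the field width and the READ constant are my assumptions, not from the original posts):
___________________________
typedef unsigned long ct_state;

// Readers counted in the low bits, writers counted in the high bits,
// so a single fetch_add can register either side without a loop.
static const ct_state READ       = 1UL;             // one reader
static const ct_state WRITE      = 1UL << 24;       // one writer
static const ct_state WRITE_MASK = ~(WRITE - 1UL);  // writer-count field
___________________________
Under such a layout, the test ((wrcount & WRITE_MASK) > 0) in wrlock() asks whether another writer was already registered, in which case the new writer parks on m_wrmtx.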
On Friday, 1 March 2019 23:29:41 UTC+1, Chris M. Thomasson wrote:
> On Friday, March 1, 2019 at 7:11:20 AM UTC-8, Manuel Pöter wrote:
>> I could get my hands on an AMD EPYC 7351P system and a Xeon Phi (Knights Corner) if you are interested in more results...
>
> I would be grateful if you can find the time to do this, Manuel. Thanks. :^)

Will do! :)
> Really like the "difference" between the Intel and SPARC results. Afaict, std::shared_mutex on Solaris needs to be fixed? I do not like the way it seems to "tank" the reads and obtain a shi% load of writes. Humm...

Yes, the SPARC system is running Solaris 11.3; the binaries were built using a self-compiled GCC 6.3. I have to admit that I did not really look at the results before posting them here, but now that you pointed it out I agree - the shared_mutex results on Solaris are definitely not what I would have expected!