[Cross-post from USENET comp.programming.threads]
http://groups.google.ru/group/comp.programming.threads/browse_frm/thread/8c6c19698964d1f6
rw_mutex with costless read_lock()/read_unlock()
All overhead is moved to writer side (therefore called 'asymmetric')
Based on my previous ideas:
http://groups.google.com/group/comp.programming.threads/browse_frm/thread/1348064e4838e871
http://groups.google.com/group/comp.programming.threads/browse_frm/thread/b7d3b8c08f9ca3c6
And uses this epoch auto detection algorithm:
http://groups.google.com/group/comp.programming.threads/browse_frm/thread/f8f78ff07df6df5f
Now asymmetric rw_mutex is free of any caveats like possible
deadlocks. And not use any OS/hardware specific features.
Implementation:
int const max_reader_count = 1024;
// unique thread index (0 .. (max_reader_count - 1))
__declspec(thread) int current_thread_index;
class arw_mutex_t
{
private:
    CRITICAL_SECTION cs;
    // private flag for every reader
    bool volatile reader_inside [max_reader_count];
    bool volatile writer_pending;
public:
    arw_mutex_t()
    {
        InitializeCriticalSection(&cs);
        writer_pending = false;
        for (int i = 0; i != max_reader_count; ++i)
            reader_inside[i] = false;
    }
    void reader_lock()
    {
        reader_inside[current_thread_index] = true;
        // no explicit #StoreLoad
        // but it will be implicitly executed
        // by sys_synch() in writer_lock()
        if (writer_pending)
        {
            // if writer is pending
            // than signal that we see it
            // and wait for writer to complete
            reader_inside[current_thread_index] = false;
            EnterCriticalSection(&cs);
            reader_inside[current_thread_index] = true;
            LeaveCriticalSection(&cs);
        }
        // need to order user code inside critical section
        // acquire compiler barrier
        _ReadWriteBarrier();
    }
    void reader_unlock()
    {
        // need to order user code inside critical section
        // release compiler barrier
        _ReadWriteBarrier();
        reader_inside[current_thread_index] = false;
    }
    void writer_lock()
    {
        EnterCriticalSection(&cs);
        // set that we are waiting
        writer_pending = true;
        // all magic is here:
        // implicitly execute full memory barrier
        // on all other processors
        sys_synch();
        // here we are sure that:
        // (1) writer will see (reader_inside[i] == true)
        // or (2) reader will see (writer_pending == true)
        // so no race conditions
        for (int i = 0; i != max_reader_count; ++i)
        {
            // wait for all readers to complete
            while (reader_inside[i])
                SwitchToThread();
        }
    }
    void writer_unlock()
    {
        writer_pending = false;
        LeaveCriticalSection(&cs);
    }
};
Dmitriy V'jukov