Lock-free node allocator for fifo-queues


Dmitriy V'jukov

May 31, 2007, 3:11:41 PM5/31/07
to lock-free
Another crosspost from comp.programming.threads just to create some
initial content :)

I propose a special lock-free allocator for allocating nodes for
fifo-queues. The feature of this allocator is that allocation and
deallocation issue no atomic operations.
Here goes the code:

template<typename type>
class node_allocator
{
    struct header
    {
        header* next_;
        bool free_;
        // actually this struct can fit in one word -
        // we can use the lsb of the next pointer as the free flag
    };

    // head of the fifo queue of allocated items;
    // newly allocated items are placed here
    header* head_;

    // tail of the fifo queue of allocated items;
    // the oldest item is situated here
    header* tail_;

    // count of allocated items
    unsigned count_;

public:
    node_allocator()
        : head_()
        , tail_()
        , count_()
    {}

    type* alloc()
    {
        // this method must be called only by the single thread
        // that owns this allocator - the "producer" -
        // which allocates the nodes and enqueues them into
        // the fifo-queue
        header* h;
        if (tail_ && tail_->free_)
        {
            // the oldest node has been freed by the consumer - reuse it
            h = tail_;
            tail_ = tail_->next_;
        }
        else
        {
            // no reusable node - get a fresh one from the system allocator
            h = reinterpret_cast<header*>(
                new char[sizeof(header) + sizeof(type)]);
            ++count_;
        }
        h->free_ = false;
        h->next_ = 0;
        if (head_)
            head_->next_ = h;
        if (!tail_)
            tail_ = h;
        head_ = h;
        return reinterpret_cast<type*>(h + 1);
    }

    static void free(type* node)
    {
        // this method can be called by any thread
        // that frees a node owned by this allocator -
        // the "consumer"
        header* h = reinterpret_cast<header*>(node) - 1;
        // release_membar
        h->free_ = true;
    }
};


This version doesn't free any nodes to the system allocator and only
accumulates nodes. But it seems that freeing of nodes can be added to
the algorithm - especially since most of the data is touched by only one
thread - so the addition of freeing should not add any "CAS overhead".
Nevertheless, if we want really high performance, we must accumulate
_many_ nodes anyway.

Usage example:

template<typename type>
class mpsc_queue
{
    struct node
    {
        node* next_;
        type* data_;
    };

public:
    typedef node_allocator<node> allocator_t;

private:
    node* head_;
    node* tail_;

public:
    mpsc_queue()
        : head_()
        , tail_()
    {}

    void enqueue(allocator_t& a, type* v)
    {
        // here we allocate from the thread-local allocator
        node* n = a.alloc();
        n->data_ = v;
        // push onto the shared lifo head with a CAS loop
        do {n->next_ = head_;}
        while (n->next_ != (node*)atomic_cas((long volatile*)&head_,
            (long)n, (long)n->next_));
    }

    type* dequeue()
    {
        if (!tail_)
        {
            // grab the whole lifo chain at once
            // and reverse it into fifo order
            node* head = (node*)atomic_exchange((long volatile*)&head_, 0);
            if (!head)
                return 0;
            node* prev = 0;
            node* curr = head;
            do
            {
                node* next = curr->next_;
                curr->next_ = prev;
                prev = curr;
                curr = next;
            }
            while (curr);
            tail_ = prev;
        }
        type* v = tail_->data_;
        node* n = tail_;
        tail_ = n->next_;
        // here we free the node
        allocator_t::free(n);
        return v;
    }
};


int main()
{
    typedef mpsc_queue<int> queue_t;
    typedef queue_t::allocator_t allocator_t;

    queue_t q;
    allocator_t a;

    int i1 = 11;
    int i2 = 12;
    int i3 = 13;

    q.enqueue(a, &i1);
    q.enqueue(a, &i2);
    q.enqueue(a, &i3);

    int* ii1 = q.dequeue();
    int* ii2 = q.dequeue();

    q.enqueue(a, &i1);
    q.enqueue(a, &i2);
    q.enqueue(a, &i3);

    int* ii3 = q.dequeue();
    int* ii4 = q.dequeue();
}


I think that usage of such an allocator can be very beneficial in some
scenarios, particularly with various heavily loaded fifo-queues.

Chris Thomasson's distributed lock-free slab-allocator doesn't handle
such situations very well - it requires one CAS on deallocation and
zero CASes on allocation, amortized.
It seems that with my node_allocator the mpsc_queue::dequeue() operation
doesn't issue any atomic operations at all (amortized :) ).

Dmitriy V'jukov

Dmitriy V'jukov

May 31, 2007, 3:12:04 PM5/31/07
to lock-free
The main feature:
The allocator does not track free nodes (like the freelists in other
allocators do); instead it tracks the currently allocated, in-use nodes.
The allocator tracks the allocated nodes in some particular order
(fifo or lifo).
The allocator checks whether the tail node (wrt the fifo or lifo order)
has been freed by some other thread by reading the node's boolean
field "free".
The allocator's free() procedure is no more than setting the node's
boolean field "free" to "true".

I think it is new. And it is fast.
No single memory location is written by two threads. So no CASes at all.

Dmitriy V'jukov
