Here is the complete working code (cacheline_size and hazard_cas() are not
shown here; a sketch of both follows the class):

#include <cstdlib>   // malloc, free
#include <new>       // std::bad_alloc
#include <intrin.h>  // _ReadWriteBarrier (MSVC)

class hazard_allocator
{
public:
    hazard_allocator(unsigned record_size, unsigned block_size)
        : first_block_()
        , last_block_()
        , current_block_()
        , block_count_()
        , record_size_(record_size)
        , record_count_((block_size - sizeof(block_header)) /
              (sizeof(record_header) + record_size))
        , block_size_(block_size)
        , repair_threshold_(record_count_ * foreign_free_miss_coef)
    {
        alloc_block();
    }

    ~hazard_allocator()
    {
        for (block_header* block = first_block_, *next; block; block = next)
        {
            next = block->next_block_;
            ::free(block);
        }
    }

    void* alloc()
    {
        if (0 == current_block_->my_free_)
            return slow_alloc();
        // pop from the current block's private free-list
        record_header* rec = current_block_->my_free_;
        current_block_->my_free_ = current_block_->my_free_->next_;
        current_block_->alloc_count_ += 1;
        return to_data(rec);
    }

    void free(void* p)
    {
        record_header* rec = to_rec(p);
        block_header* block = rec->parent_;
        if (block->parent_ == this)
            my_free(rec);                      // freed by the owner thread
        else
            block->parent_->foreign_free(rec); // freed by a remote thread
    }

private:
    struct block_header;

    struct record_header
    {
        block_header* parent_;
        record_header* volatile next_;
        int volatile status_;  // 1 = remotely freed, waiting to be grabbed
        int padding_;
    };

    struct block_header
    {
        hazard_allocator* parent_;
        block_header* next_block_;
        block_header* prev_block_;
        record_header* volatile foreign_free_;  // mpsc "hint" stack for remote frees
        char pad[cacheline_size];  // keep remote writes off the owner's cache lines
        record_header* my_free_;   // owner's private free-list
        unsigned grab_count_;      // records grabbed since the last repair
        unsigned alloc_count_;     // records currently allocated from this block
    };

    block_header* first_block_;
    block_header* last_block_;
    block_header* current_block_;
    unsigned block_count_;
    unsigned const record_size_;
    unsigned const record_count_;
    unsigned const block_size_;
    unsigned const repair_threshold_;
    static unsigned const foreign_free_miss_coef = 1024;

    record_header* begin(block_header* block)
    {
        return (record_header*)(block + 1);
    }

    record_header* end(block_header* block)
    {
        return (record_header*)((char*)(block + 1) +
            (sizeof(record_header) + record_size_) * record_count_);
    }

    record_header* next(record_header* rec)
    {
        return (record_header*)((char*)rec + sizeof(record_header) +
            record_size_);
    }

    record_header* to_rec(void* p)
    {
        return (record_header*)((char*)p - sizeof(record_header));
    }

    void* to_data(record_header* r)
    {
        return r + 1;
    }

    void my_free(record_header* rec)
    {
        block_header* block = rec->parent_;
        grab_rec(block, rec);
        if (0 == block->alloc_count_ && block_count_ > 1)
            free_block(block);
    }

    void foreign_free(record_header* rec)
    {
        // push onto the owner's foreign_free_ stack with a cmpxchg that
        // carries no LOCK prefix, so the push can occasionally be lost
        block_header* block = rec->parent_;
        record_header* volatile top;
        record_header* volatile* top_p = &block->foreign_free_;
        _ReadWriteBarrier();
        do
        {
            top = *top_p;
            rec->next_ = top;
            // #StoreStore
        }
        while (!hazard_cas((void**)top_p, top, rec));
        // #StoreStore
        rec->status_ = 1;
    }

    void grab_rec(block_header* block, record_header* rec)
    {
        rec->status_ = 0;
        rec->next_ = block->my_free_;
        block->my_free_ = rec;
        block->alloc_count_ -= 1;
    }

    block_header* alloc_block()
    {
        block_header* block = (block_header*)malloc(block_size_);
        if (0 == block)
            throw std::bad_alloc();
        current_block_ = block;
        // link into the doubly-linked list of blocks
        block->next_block_ = 0;
        block->prev_block_ = last_block_;
        if (last_block_)
            last_block_->next_block_ = block;
        last_block_ = block;
        if (0 == first_block_)
            first_block_ = block;
        block_count_ += 1;
        block->parent_ = this;
        block->foreign_free_ = 0;
        block->my_free_ = begin(block);
        block->grab_count_ = 0;
        block->alloc_count_ = 0;
        // thread all records into the private free-list
        record_header* rec = begin(block);
        record_header* const e = end(block);
        record_header* nxt = 0;
        for (; rec != e; rec = nxt)
        {
            nxt = next(rec);
            rec->status_ = 0;
            rec->parent_ = block;
            rec->next_ = (nxt != e) ? nxt : 0;
        }
        return block;
    }

    void free_block(block_header* block)
    {
        if (block->prev_block_)
            block->prev_block_->next_block_ = block->next_block_;
        if (block->next_block_)
            block->next_block_->prev_block_ = block->prev_block_;
        if (first_block_ == block)
            first_block_ = block->next_block_;
        if (last_block_ == block)
            last_block_ = block->prev_block_;
        if (current_block_ == block)
            current_block_ = last_block_;
        block_count_ -= 1;
        ::free(block);
    }

    void* slow_alloc()
    {
        if (squeeze(current_block_))
            return alloc();
        // scan all blocks, newest first, for reclaimable records
        block_header* block = last_block_;
        block_header* prev = 0;
        while (block)
        {
            prev = block->prev_block_;
            if (squeeze(block))
            {
                if (0 == block->alloc_count_ && block_count_ > 1)
                {
                    free_block(block);
                }
                else
                {
                    current_block_ = block;
                    return alloc();
                }
            }
            block = prev;
        }
        alloc_block();
        return alloc();
    }

    bool squeeze(block_header* block)
    {
        if (block->my_free_)
            return true;
        if (grab(block))
            return true;
        if (repair(block, false))
            return true;
        return false;
    }

    bool grab(block_header* block)
    {
        // take the whole foreign_free_ stack and move the confirmed
        // (status_ == 1) records into the private free-list
        if (0 == block->foreign_free_)
            return false;
        record_header* volatile top;
        record_header* volatile* top_p = &block->foreign_free_;
        _ReadWriteBarrier();
        do
        {
            top = *top_p;
        }
        while (!hazard_cas((void**)top_p, top, 0));
        bool success = false;
        record_header* current = top;
        while (current)
        {
            record_header* next = current->next_;
            if (1 == current->status_)
            {
                success = true;
                block->grab_count_ += 1;
                grab_rec(block, current);
            }
            current = next;
        }
        return success;
    }

    bool repair(block_header* block, bool force)
    {
        // full scan of the block to resurrect records whose remote free
        // was lost by a racing hazard_cas
        if (0 == block->alloc_count_)
            return false;
        if (false == force && block->grab_count_ < repair_threshold_)
            return false;
        block->grab_count_ = 0;
        bool success = false;
        record_header* rec = begin(block);
        record_header* const e = end(block);
        record_header* nxt = 0;
        for (; rec != e; rec = nxt)
        {
            nxt = next(rec);
            if (1 == rec->status_)
            {
                success = true;
                grab_rec(block, rec);
            }
        }
        return success;
    }

    hazard_allocator(hazard_allocator const&);
    hazard_allocator& operator = (hazard_allocator const&);
};
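(cacheline_size and hazard_cas() are not shown in the post; here is a
minimal sketch of what they might look like, assuming 32-bit MSVC to match
the _ReadWriteBarrier() intrinsic used above. The essential point is that
the cmpxchg deliberately carries no LOCK prefix, so it is cheap but can be
raced by another processor:)

unsigned const cacheline_size = 64; // assumed typical x86 value

inline bool hazard_cas(void** dest, void* cmp, void* xchg)
{
    bool result;
    __asm
    {
        mov eax, cmp
        mov ecx, xchg
        mov edx, dest
        cmpxchg [edx], ecx   // deliberately no LOCK prefix
        setz result
    }
    return result;
}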
Dmitriy V'jukov
> Multi-threaded memory allocator w/o atomic RMW operations or heavy
> memory barriers. At all.
> Basic outline:
> An allocator is created per thread. Local free operations use a simple
> stack. Remote free operations use an mpsc-stack, but with "hazard_cas" -
> a cmpxchg w/o the LOCK prefix. So this mpsc-stack is used only as a
> "hint" - I assume that I can find some nodes in this stack twice, and
> that some nodes can be lost. To resurrect lost nodes I use a "repair"
> operation. Repair simply scans all nodes, searching for lost ones.
[...]
I can claim the following about the current vZOOM allocator:
__________________________________________________________________
Allocations
----------------------
1. Local malloc has single-thread performance
2. Local malloc to a completely exhausted local heap uses Atomic SWAP.
De-allocations
----------------------
1. Local free has single-thread performance
2. Remote free to a cache-enabled heap has single-thread performance.
3. Remote free to a non-cache-enabled heap uses Atomic CAS (the LOCK
prefix is there just to make the CAS work correctly).
__________________________________________________________________
All of that needs no membar. When I use the cache-heap thing (a thread can
put its local heap structures into cache mode), it does not even have to
use CAS, with or without the LOCK prefix.
So, if no thread in an application puts any of its heaps into cache mode,
then yours has a lower-overhead remote free function for sure, because you
omit the LOCK prefix.
I would ask about this over in the Intel threading forum.
BTW, I have not read your code closely, because it is detailed and I need
some time.
So, please correct me if I have misunderstood it at all.
Thanks Dmitriy.
> BTW, I have not read your code closely, because it is detailed and I need
> some time.
> So, please correct me if I have misunderstood it at all.
Yes, the code is a little obscured - the allocator manages a set of
superblocks, frees superblocks when they become empty, and so on.
I think you understood everything correctly.
Dmitriy V'jukov
The main question is what the loss rate will be (i.e. how often
hazard_cas() succeeds but the node is not actually enqueued into the
stack). A node is lost when two threads race on the stack top: without the
LOCK prefix both compare-exchanges can appear to succeed, and one push
silently overwrites the other. If the loss rate is high, then we have to
repair blocks frequently so as not to lose too many nodes. If the loss
rate is low, then we can repair blocks rarely. Repair is a relatively
expensive operation.
I tested the allocator under sufficiently heavy load. On a dual-core Intel
with a shared L2 cache the loss rate is about 0.01%, i.e. 1 lost node per
10000 successfully transferred nodes. So if I want to lose no more than
10 nodes, I can repair once per 100000 successful remote node free()
operations. I think these are very good results.
On a quad-core Intel with 2 separate L2 caches the loss rate is about 1%,
i.e. 1 lost node per 100 successfully transferred nodes. So if I want to
lose no more than 10 nodes, I can repair once per 1000 successful remote
node free() operations. That's not as good.
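(The back-of-envelope rule behind these numbers, as a small sketch;
repair_interval is just an illustrative name:)

// How often repair() can run for a given loss rate and loss budget
// (numbers from the measurements above).
unsigned repair_interval(double loss_rate, unsigned tolerated_losses)
{
    return (unsigned)(tolerated_losses / loss_rate);
}
// repair_interval(0.0001, 10) == 100000   (dual-core, shared L2)
// repair_interval(0.01, 10)   == 1000     (quad-core, separate L2s)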
So I think a hybrid approach can be used. For example, take a system with
2 processors and 8 hardware threads per processor. Hazard remote free
operations can be used inside a processor, and accurate (with LOCK prefix)
remote free operations can be used between processors. So the allocator
will have 3 anchors - 1 for local free, 1 for hazard remote free, and 1
for accurate, very remote free. A sketch of the dispatch is below.
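(A hypothetical sketch of the 3-anchor dispatch; heap, processor, and the
three free functions are illustrative names, not part of the allocator
above:)

struct heap
{
    int processor;  // which processor package the owning thread runs on
    void local_free(void*)           { /* plain private stack push */ }
    void hazard_remote_free(void*)   { /* cmpxchg w/o LOCK, may lose */ }
    void accurate_remote_free(void*) { /* LOCK cmpxchg, never loses */ }
};

void free_dispatch(heap* owner, heap* mine, void* p)
{
    if (owner == mine)
        owner->local_free(p);            // anchor 1: same thread
    else if (owner->processor == mine->processor)
        owner->hazard_remote_free(p);    // anchor 2: same processor, losses rare
    else
        owner->accurate_remote_free(p);  // anchor 3: cross-processor, must be exact
}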
Dmitriy V'jukov
The allocator can be made headerless.
> struct record_header
> {
> block_header* parent_;
> record_header* volatile next_;
> int volatile status_;
> int padding_;
> };
The parent_ field can be eliminated if blocks are allocated aligned to
block_size.
The next_ field is only valid while the record is free, so next_ can be
overlapped with the user object.
The status_ field must be offloaded into the user object's state. It can be
combined with some pointer or enum etc. - we only need 1 unused bit.
padding_ is not needed at all. A sketch of the first and third points
follows.
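(A sketch of the first and third points; parent_of and the status helpers
are illustrative names, and it assumes block_size is a power of two and
blocks come from an aligned allocator such as _aligned_malloc:)

#include <stdint.h>

struct block_header;

// Recompute the owning block from a record address: with blocks aligned
// to block_size, the header sits at the rounded-down address.
inline block_header* parent_of(void* rec, uintptr_t block_size)
{
    return (block_header*)((uintptr_t)rec & ~(block_size - 1));
}

// Fold status_ into an unused low bit of a pointer-sized word in the user
// object (pointers to aligned data have zero low bits).
inline uintptr_t set_status(uintptr_t word)   { return word | 1; }
inline uintptr_t clear_status(uintptr_t word) { return word & ~(uintptr_t)1; }
inline bool      test_status(uintptr_t word)  { return (word & 1) != 0; }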
Dmitriy V'jukov
Indeed! Read here:
http://groups.google.com/group/comp.programming.threads/msg/bc019dc04362be41
http://groups.google.com/group/comp.programming.threads/msg/1d3aeaa6ebf5181f
Is that a similar solution?
Yes. I think it's the same.
The only problem in my allocator is the status_ field.
Dmitriy V'jukov