http://www.gnu.org/licenses/gpl.html
My initial memory allocator version is so promising! All I can say is, WOW!
Well, here it is:
<pseudo-code VERSION 0.0.0 (Pre-Alpha)>
______________________________________________________________________
/*
Virtually Zero-Overhead Memory Allocator
Copyright (C) 2008 Christopher Michael Thomasson
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define HEAP_DEPTH 1048576
#define GLOBAL_HEAP_COUNT 128
#define ALIGN_SIZE(mp_this, mp_type, mp_align) ((mp_type)( \
(((ptrdiff_t const)(mp_this)) + ((ptrdiff_t const)(mp_align) - 1)) & \
(-((ptrdiff_t const)(mp_align))) \
))
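/* e.g., ALIGN_SIZE(13, size_t, 8) == 16 and ALIGN_SIZE(16, size_t, 8) == 16;
the macro rounds a value up to the given power-of-two boundary */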
enum heap_aligner_enum {
HEAP_ALIGNER_ENUM_1,
HEAP_ALIGNER_ENUM_2,
HEAP_ALIGNER_ENUM_3
};
union heap_aligner {
char char_;
short int short_;
int int_;
long int long_;
long long int longlong_;
unsigned char uchar_;
unsigned short int ushort_;
unsigned int uint_;
unsigned long int ulong_;
unsigned long long int ulonglong_;
double double_;
float float_;
heap_aligner* this_;
void* ptr_;
void (*fptr1_) (void);
void* (*fptr2_) (void);
void* (*fptr3_) (void*, void*, void*);
heap_aligner_enum enum_;
heap_aligner_enum* ptrenum_;
};
struct base_heap {
unsigned char base[HEAP_DEPTH];
int is_global;
};
struct thread_heap {
base_heap heap;
size_t offset;
int depth;
atomicword rdepth;
};
struct global_heap_anchor64 {
atomicword offset;
atomicword depth;
};
struct global_heap {
base_heap heap;
global_heap_anchor64 anchor;
};
/* Base Heap API
___________________________________________________________________*/
base_heap*
base_heap_from_ptr(
void const* const ptr
) {
return /* round 'ptr' down to 'HEAP_DEPTH' boundary */;
}
/* Global Heap API
___________________________________________________________________*/
global_heap*
global_heap_create(void) {
void* const ptr = calloc(1, sizeof(global_heap) + HEAP_DEPTH - 1);
if (ptr) {
global_heap* const _this = ALIGN_SIZE(ptr, global_heap*, HEAP_DEPTH);
_this->heap.is_global = 1;
return _this;
}
return NULL;
}
void*
global_heap_allocate(
global_heap* _this,
size_t sz
) {
global_heap_anchor64 xchg, cmp = *_this->anchor;
do {
if (cmp.offset + sz >= HEAP_DEPTH) {
return NULL;
}
xchg.offset = cmp.offset + sz;
xchg.depth = cmp.depth + 1;
} while (! ATOMIC_CAS64(&_this->anchor, &cmp, &xchg));
return _this->heap.base + cmp.offset;
}
void
global_heap_deallocate(
global_heap* _this
) {
global_heap_anchor64 xchg, cmp = *_this->anchor;
do {
xchg.depth = cmp.depth - 1;
xchg.offset = (! xchg.depth) ? 0 : xchg.offset;
} while (! ATOMIC_CAS64(&_this->anchor, &cmp, &xchg));
}
/* Global Heap Variables */
static global_heap* const g_allocator_gheaps[GLOBAL_HEAP_COUNT] = { NULL };
/* system startup */
void
allocator_system_startup(void) {
int i;
for (i = 0; i < GLOBAL_HEAP_COUNT; ++i) {
g_allocator_gheaps[i] = global_heap_create();
}
}
/* hash a global heap from a thread heap */
global_heap*
global_heap_from_thread_heap(
thread_heap* const _this
) {
return g_allocator_gheaps[
(((unsigned long int)(_this) >> 8) * 307) & (GLOBAL_HEAP_COUNT - 1)];
}
/* Thread Heap API
___________________________________________________________________*/
thread_heap*
thread_heap_create(void) {
void* const ptr = calloc(1, sizeof(thread_heap) + HEAP_DEPTH - 1);
if (ptr) {
thread_heap* const _this = ALIGN_SIZE(ptr, thread_heap*, HEAP_DEPTH);
return _this;
}
return NULL;
}
void
thread_heap_deallocate_local(
thread_heap* const _this
) {
if (! (--_this->depth)) {
_this->offset = 0;
}
}
void
thread_heap_deallocate_remote(
thread_heap* const _this
) {
ATOMIC_DEC(&_this->rdepth);
}
void*
thread_heap_allocate_local(
thread_heap* const _this,
size_t sz
) {
void* const ptr = _this->heap.base + _this->offset;
_this->offset += sz;
++_this->depth;
return ptr;
}
void*
thread_heap_allocate_remote(
thread_heap* const _this,
size_t sz
) {
atomicword const adjust = ATOMIC_XCHG(&_this->rdepth, 0);
_this->depth += adjust;
if (! _this->depth) {
_this->offset = 0;
return thread_heap_allocate_local(_this, sz);
}
return NULL;
}
/* C Malloc/Free API
___________________________________________________________________*/
void* malloc(size_t sz) {
thread_heap* const _this = pthread_getspecific(...);
size_t const offset = _this->offset;
sz = ALIGN_SIZE(sz, size_t, sizeof(heap_aligner));
if (offset + sz < HEAP_DEPTH - 1) {
return thread_heap_allocate_local(_this, sz);
} else {
void* const ptr = thread_heap_allocate_remote(_this, sz);
if (ptr) {
return ptr;
} else {
global_heap* const gheap = global_heap_from_thread_heap(_this);
return global_heap_allocate(gheap, sz);
}
}
}
void free(void* ptr) {
base_heap* const heap = base_heap_from_ptr(ptr);
if (! heap->is_global) {
thread_heap* const _this = pthread_getspecific(...);
thread_heap* const ptr_thread = (thread_heap*)heap;
if (ptr_thread == _this) {
thread_heap_deallocate_local(_this);
} else {
thread_heap_deallocate_remote(ptr_thread);
}
} else {
global_heap_deallocate((global_heap*)heap);
}
}
______________________________________________________________________
That's all folks!
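FWIW, the atomic primitives above (atomicword, ATOMIC_CAS64, ATOMIC_DEC,
ATOMIC_XCHG) are not spelled out in the pseudo-code; here is a rough sketch of
what I have in mind for them, using GCC __sync builtins and assuming
atomicword is a 32-bit integer so the two-field anchor fits a single 64-bit
CAS:
#include <string.h>

typedef int atomicword; /* assumed 32-bit: {offset, depth} packs into 8 bytes */

/* CAS the whole 8-byte anchor; on failure refresh *cmp with the observed
   value, which is how the CAS loops above reuse it on the next iteration */
int ATOMIC_CAS64(void* anchor, void* cmp, void const* xchg) {
    unsigned long long expected, desired, observed;
    memcpy(&expected, cmp, sizeof expected);
    memcpy(&desired, xchg, sizeof desired);
    observed = __sync_val_compare_and_swap((unsigned long long*)anchor,
                                           expected, desired);
    if (observed == expected) return 1;
    memcpy(cmp, &observed, sizeof observed); /* caller retries with fresh values */
    return 0;
}

void ATOMIC_DEC(atomicword* target) {
    (void)__sync_sub_and_fetch(target, 1);
}

atomicword ATOMIC_XCHG(atomicword* target, atomicword value) {
    return __sync_lock_test_and_set(target, value); /* acquire-only exchange */
}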
Well, what do you think? Doesn't this thing kick major as$! I am working on
a full-blown implementation. The only things I left out of this initial
pseudo-code implementation are the thread and system shutdown procedures;
they are fairly trivial. When I get this completed, I will be offering it
for commercial use under a special license.
:^D
--
Chris M. Thomasson
http://appcore.home.comcast.net
> void
> global_heap_deallocate(
> global_heap* _this
> ) {
> global_heap_anchor64 xchg, cmp = *_this->anchor;
> do {
> xchg.depth = cmp.depth - 1;
> xchg.offset = (! xchg.depth) ? 0 : xchg.offset;
> } while (! ATOMIC_CAS64(&_this->anchor, &cmp, &xchg));
> }
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Needs to be:
void
global_heap_deallocate(
global_heap* _this
) {
global_heap_anchor64 xchg, cmp = *_this->anchor;
do {
xchg.depth = cmp.depth - 1;
xchg.offset = (! xchg.depth) ? 0 : cmp.offset;
} while (! ATOMIC_CAS64(&_this->anchor, &cmp, &xchg));
}
[...]
> /* Global Heap Variables */
> static global_heap* const g_allocator_gheaps[GLOBAL_HEAP_COUNT] = {
> NULL };
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Needs to be:
static global_heap* g_allocator_gheaps[GLOBAL_HEAP_COUNT] = { NULL };
[...]
Okay... My next post will be FULL-BLOWN source-code which should compile
fine on VC++ and GCC. Please be patient! Thanks everybody. I am interested
in how this compares to Hoard and/or StreamFlow. It should be very
interesting indeed!
;^)
I have thought of many optimizations for this thing, like combining the
region allocator aspect with a slab allocator such that threads have both a
region offset and a free-list. This would still allow for remote
deallocations that only require an ATOMIC_DEC. Pretty cool.
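A crude sketch of what I mean (illustration only; the layout and the names
are just mine, not the actual vZOOM code):
struct hybrid_thread_heap {
    unsigned char base[HEAP_DEPTH]; /* bump-pointer region storage        */
    size_t offset;                  /* region offset (bump pointer)       */
    void* freelist[8];              /* per-size-class local free-lists    */
    int depth;                      /* outstanding local allocations      */
    atomicword rdepth;              /* remote frees still just ATOMIC_DEC */
};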
Would you like to consider the use of the "GNU General Public License with
runtime exception" (RGPL)?
http://gcc.gnu.org/onlinedocs/libstdc++/17_intro/license.html
What do you think about the details from the discussion "(L)GPL and C++
templates issue"?
http://groups.google.de/groups?threadm=4174046c%241_2%40news.bluewin.ch
> My initial memory allocator version is so promising!
I am also curious about its performance in comparison to other approaches like
Hoard or TLSF.
Regards,
Markus
Does this link need an update?
Are other variants applicable or more appropriate?
Examples:
http://sdcc.sourceforge.net/wiki/index.php?page=Library+License+Selection
http://algo2.iti.uni-karlsruhe.de/singler/mcstl/
Regards,
Markus
Humm... Well, I was thinking I would use the exact same license model as
Hoard. Is that a bad idea?
>> My initial memory allocator version is so promising!
>
> I am also curious about its performance in comparison to other approaches
> like Hoard or TLSF.
My initial code is almost done. I chose to use C++; anyway, I would very
much appreciate it if you could test drive it when I release it. Thanks
Markus.
Mostly not - I appreciate that you chose an open source licence now.
> My initial code is almost done. I choose to use C++, anyway I would very
> much appreciate it if you could test drive it when I release it.
I would like to point out some special consequences of the amount of source
code that will be included from C(++) header files.
http://gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01s02.html
I assume that you would also like to allow the reuse of your memory allocator
library by non-GPL software. Would a lawyer get concerned if too much of the
code and data structures ends up included/inlined?
Another use case is a header-only class template library.
http://synesis.com.au/software/stlsoft/
Regards,
Markus
> /* Base Heap API
> ___________________________________________________________________*/
> base_heap*
> base_heap_from_ptr(
> void const* const ptr
> ) {
> return /* round 'ptr' down to 'HEAP_DEPTH' boundary */;
> }
[...]
The code for this function is simple... It could look something like this:
base_heap*
base_heap_from_ptr(
void const* const ptr
) {
return (base_heap*)(ALIGN_SIZE(ptr, unsigned char*, HEAP_DEPTH) -
HEAP_DEPTH);
}
This extremely simple technique allows for highly efficient header-less
memory blocks. There is absolutely no need for per-block meta-data...
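For example, with HEAP_DEPTH at 1048576 (0x100000), a block at address
0x005A3210 maps back to the base_heap at 0x00500000 without consulting any
per-block header.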
:^)
I am thinking about releasing the entire vZOOM library under GPL.
>> My initial code is almost done. I choose to use C++, anyway I would very
>> much appreciate it if you could test drive it when I release it.
>
> I would like to point out special consequences from the amount of source
> code that will be included from C(++) header files.
> http://gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01s02.html
>
> I assume that you would also like to allow the reuse of your memory
> allocator library by non-GPL software. Does any lawyer get concerned if
> the included/inlined code and data structures will be too much?
>
> Another use case is a header-only class template library.
> http://synesis.com.au/software/stlsoft/
Humm, interesting indeed... Need to think some more on this... Thanks
Markus.
Anyway, my ultimate goal would be to allow anybody to freely use the
allocator, as long as they release all the source-code for the product which
makes any use of it... If they don't want to release source-code, they
cannot freely use the allocator. I assume that Hoard's license model tries
to enforce the same restriction: release full product source-code, or don't
use/link against the allocator.
Great!
> Humm, interesting indeed... Need to think some more on this...
I guess that more software developers should consider when to make a kind of
runtime exception to their open source licences to enable broader reuse of (C++)
function/class libraries.
This particular aspect depends on understanding, and agreeing on, the licence
consequences of inline expansion.
Regards,
Markus
> Well, what do you think? Doesn't this thing kick major as$! I am working on
> a full-blown implementation. The only things I left out of this is initial
> pseudo-code implementation are the thread and system shutdown procedures;
> they are fairly trivial. When I get this completed, I will be offering it
> for commercial use under a special license.
In full-blown implementation every thread will have several
thread_heaps and they will be allocated dynamically, and global_heaps
will be allocated dynamically. Right? Otherwise... mmm... it's too
easy to get NULL from allocation function.
Btw, I'm trying to compare it to your slab-allocator algorithm. slab-
allocator has 1 atomic rmw per remote deallocation too. If using bunch
of spsc queues then remote deallocations are "atomic-free". And slab-
allocator is more 'fine-grained', it tracks every block of memory, not
big regions. So I'm trying to figure out how this algorithm is better
than slab-allocator...
Dmitriy V'jukov
> > Well, what do you think? Doesn't this thing kick major as$! I am working
> > on a full-blown implementation. The only things I left out of this
> > initial pseudo-code implementation are the thread and system shutdown
> > procedures; they are fairly trivial. When I get this completed, I will be
> > offering it for commercial use under a special license.
> In full-blown implementation every thread will have several
> thread_heaps and they will be allocated dynamically, and global_heaps
> will be allocated dynamically. Right? Otherwise... mmm... it's too
> easy to get NULL from allocation function.
Exactly correct. Basically, per-thread heaps are kind of based on this very
crude and simplistic region allocator I did for C++:
http://groups.google.com/group/comp.lang.c++/browse_frm/thread/68bdc76f9792de1f
One difference is that there is no find_heap function. Any block can find
its region by rounding down to the heap boundary. I am going to keep actual
regions at a level higher than the global heaps. Basically I use
VirtualAlloc/mmap to reserve a large area of memory aligned on a heap
boundary, and commit and serve regions to global/thread-local heaps on demand.
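Something like this, roughly, on the POSIX side (just a sketch of the
reserve-and-trim idea, not the actual vZOOM code; the Win32 side would do the
analogous thing with VirtualAlloc and MEM_RESERVE):
#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

/* reserve (not commit) 'size' bytes aligned on 'align' (a power of two) by
   over-reserving and unmapping the misaligned head and tail */
void* reserve_aligned(size_t size, size_t align) {
    size_t span = size + align - 1;
    unsigned char* raw = (unsigned char*)mmap(NULL, span, PROT_NONE,
                                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (raw == (unsigned char*)MAP_FAILED) return NULL;
    uintptr_t aligned = ((uintptr_t)raw + align - 1) & ~(uintptr_t)(align - 1);
    size_t head = aligned - (uintptr_t)raw;
    if (head) munmap(raw, head);
    if (head != align - 1) munmap((void*)(aligned + size), align - 1 - head);
    /* pages get committed later, on demand, with
       mprotect(..., PROT_READ | PROT_WRITE) */
    return (void*)aligned;
}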
> Btw, I'm trying to compare it to your slab-allocator algorithm. slab-
> allocator has 1 atomic rmw per remote deallocation too. If using bunch
> of spsc queues then remote deallocations are "atomic-free".
I am thinking about open-sourcing the vZOOM allocator and augmenting it with
this new region design:
http://groups.google.com/group/comp.programming.threads/msg/96d1826af5a87dcb
> And slab-
> allocator is more 'fine-grained', it tracks every block of memory, not
> big regions. So I'm trying to figure out how this algorithm is better
> than slab-allocator...
Well, IMVHO, this region-based algorithm makes it a _whole_lot_easier_ to
implement a general purpose allocator. I don't need to keep size segregated
block pools, e.g:
thread::slab[0].sz = sizeof(void*);
thread::slab[1].sz = thread::slab[0].sz * 2;
thread::slab[2].sz = thread::slab[1].sz * 2;
thread::slab[3].sz = thread::slab[2].sz * 2;
thread::slab[4].sz = thread::slab[3].sz * 2;
Let's say that slab[4] is full and a subsequent memory request comes in
for that size; I cannot allocate from a lower slab[0-3] because they are
based on free-lists and I would need to be able to coalesce multiple blocks
into a larger one. The region allocator makes it so much simpler to
efficiently grant multiple requests that have very diverse size ranges.
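For completeness, picking a slab for a request would look roughly like this
(illustration only, not the actual vZOOM code):
#include <stddef.h>

/* find the smallest power-of-two size class that fits 'sz', mirroring the
   thread::slab[0..4] table above */
int slab_index_for(size_t sz) {
    int i = 0;
    size_t cls = sizeof(void*);
    while (cls < sz) {
        cls *= 2;
        ++i;
    }
    return i; /* 0..4 for the five slabs shown; larger requests miss */
}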
Anyway, the initial version 0.0.0 pre-alpha should be completed within 9-10
days. If I do open-source the vZOOM allocator, and merge the two designs,
well, that will probably come out in about 2-3 weeks. Then I will look
forward to your comments, since they are very helpful indeed.
Actually, since this is a region allocator, it can use some knowledge from
the programmer... For instance, let's say that the programmer knows that
threads A, B and C only free memory that they allocated... Well, I could add
a function called:
void
malloc_reset() {
thread_heap* const _this = pthread_getspecific(...);
/* reset all regions in thread */
for_all_regions(_this) {
region->offset = 0;
region->depth = 0;
region->rdepth = 0;
}
}
And the programmer could use it like:
void threads_a_through_c() {
int i;
for (;;) {
int depth = wait_for_request();
for (i = 0; i < depth; ++i) {
void* mem = malloc(...);
do_something(mem);
}
malloc_reset();
}
}
A bunch of memory can be allocated, and it all can be freed in a single
call. This allocator is presenting me with so many diverse optimization
opportunities that I don't quite know which ones to use. It's a little
overwhelming. I need to calm down, get this version 0.0.0 cranked out,
and listen to all the feedback...
> Lets say that the slab[4] are full and a subsequent memory request comes in
> for that size, I cannot allocate from a lower slab[0/3] because they are
> based on free-lists and I would need to be able to coalesce multiple blocks
> into a larger one. The region allocator makes it so much simpler to
> efficiently grant multiple requests that have a very diverse size ranges.
> Anyway, the initial version 0.0.0 pre-alpha should be completed within 9-10
> days. If I do open-source the vZOOM allocator, and merge the two designs,
> well, that will probably come out in about 2-3 weeks. Then I will look
> forward to your comments, since they are very helpful indeed.
I'm thinking about ways we can amortize remote deallocations
in a time- and space-efficient manner. Currently I see 2 ways.
-------------------------------------------------------
Cache list + global lock
-------------------------------------------------------
// crude sketch
int g_lock = 0;
__declspec(thread) void* t_remote_list_head = 0;
__declspec(thread) int t_remote_list_size = 0;
void free(void* p)
{
block* b = block_from_ptr(p);
if (is_my(p))
{
b->local_count += 1;
}
else
{
// push node into cache list
*(void**)p = t_remote_list_head;
t_remote_list_head = p;
t_remote_list_size += 1;
if (t_remote_list_size < 128)
return;
if (t_remote_list_size < 1024)
{
if (0 != g_lock || 0 != XCHG(&g_lock, 1))
// just retreat, will try again next time
return;
}
else
{
// we're very unlucky - failed to grab lock 896 times
while (0 != g_lock || 0 != XCHG(&g_lock, 1))
sched_yield();
}
// we grab g_lock
// flush cache list
void* cur = t_remote_list_head;
t_remote_list_head = 0;
t_remote_list_size = 0;
do
{
void* next = *(void**)cur;
block_from_ptr(cur)->remote_count += 1;
cur = next;
} while (cur);
g_lock = 0;
}
}
-------------------------------------------------------
Hashed cache array
-------------------------------------------------------
// crude sketch
block* block_from_ptr(void* p)
{
return (block*)((unsigned)p & ~0xfffu);
}
unsigned hash_from_ptr(void* p)
{
return ((unsigned)p & 0xf000) >> 12;
}
struct hash_cell
{
block* b;
int count;
};
int const hash_size = 16;
__declspec(thread) hash_cell t_hash_array [hash_size];
__declspec(thread) int t_flush_order = 0;
void free(void* p)
{
block* b = block_from_ptr(p);
if (is_my(p))
{
b->local_count += 1;
}
else
{
hash_cell* cell = &t_hash_array[hash_from_ptr(p)];
if (cell->b == b)
{
cell->count += 1;
}
else
{
if (cell->b)
// flush old value
XADD(&cell->b->remote_count, cell->count);
cell->b = b;
cell->count = 1;
}
t_flush_order += 1;
if (t_flush_order == 1024)
{
t_flush_order = 0;
// flush all values
for (int i = 0; i != hash_size; ++i)
{
hash_cell* c = &t_hash_array[i];
if (c->b)
{
XADD(&c->b->remote_count, c->count);
c->b = 0;
}
}
}
}
}
What do you think? Is it worth doing or not? Which variant do you
prefer? Or neither?
I am not sure yet whether it's worth doing or not. Your initial
variant is so straightforward and free of any caveats. Amortized
variants are not so straightforward and possibly contain some
caveats...
What bothers me in your initial variant is that it's completely
senseless to execute ATOMIC_DEC on *every* remote deallocation,
because it's senseless to know whether there are 100 or 99 outstanding
deallocations. One only has to know whether there are 0 outstanding
deallocations or not.
Dmitriy V'jukov
> > Lets say that the slab[4] are full and a subsequent memory request comes
> > in for that size, I cannot allocate from a lower slab[0/3] because they
> > are based on free-lists and I would need to be able to coalesce multiple
> > blocks into a larger one. The region allocator makes it so much simpler
> > to efficiently grant multiple requests that have a very diverse size
> > ranges.
> > Anyway, the initial version 0.0.0 pre-alpha should be completed within
> > 9-10 days. If I do open-source the vZOOM allocator, and merge the two
> > designs, well, that will probably come out in about 2-3 weeks. Then I
> > will look forward to your comments, since they are very helpful indeed.
[...]
> What do you think? Is it worth doing or not? Which variant do you
> prefer? Or neither?
> I am not sure yet whether it's worth doing or not. Your initial
> variant is so straightforward and free of any caveats. Amortized
> variants are not so straightforward and possibly contain some
> caveats...
> What bothering me in your initial variant is that it's completely
> senseless to execute ATOMIC_DEC on *every* remote deallocation.
> Because it's senseless to know whether there are 100 or 99 outstanding
> deallocations. One only has to know whether there are 0 or not
> outstanding deallocations.
Great points. I need to think this over. Perhaps since this is going to be
open source, we can do State-of-the-Art Memory Allocation by V'jukov and
Thomasson...
:^D
It may be bad timing... My house is filled with boxes and packing
material... Moving furniture from the Bay Area to the Tahoe Basin... Wow, this is
going to be FUN!!!!!!! I will try hard to keep on the ball.
:^|
Well, at least I know I can crank out version 0.0.0 before I start the major
moving work. Ack!
> > Lets say that the slab[4] are full and a subsequent memory request comes
> > in for that size, I cannot allocate from a lower slab[0/3] because they
> > are based on free-lists and I would need to be able to coalesce multiple
> > blocks into a larger one. The region allocator makes it so much simpler
> > to efficiently grant multiple requests that have a very diverse size
> > ranges.
> > Anyway, the initial version 0.0.0 pre-alpha should be completed within
> > 9-10 days. If I do open-source the vZOOM allocator, and merge the two
> > designs, well, that will probably come out in about 2-3 weeks. Then I
> > will look forward to your comments, since they are very helpful indeed.
> I'm thinking about the ways how we can amortize remote deallocations
> in time- and space-efficient manner. Currently I see 2 ways.
[...]
I like the hash method. It should work fine. I might add it into version
0.0.0. Or perhaps, I will add it in with an #ifdef so we can attempt to
measure any possible performance enhancements.
It must help in producer-consumer pattern, when thread steadily
deallocates blocks from one region of another thread.
> Or perhaps, I will add it in with an #ifdef so we can attempt to
> measure any possible performance enhancements.
It will be cool. The only problem - how you are going to measure
performance?
Dmitriy V'jukov
Right.
>> Or perhaps, I will add it in with an #ifdef so we can attempt to
>> measure any possible performance enhancements.
>
> It will be cool. The only problem - how you are going to measure
> performance?
Humm, I am not exactly sure. I think I will worry about all of that after I
release this thing.
Maybe Larson and Consume benchmarks.
;^)
umm, of course it helps to determine if it was aligned on a boundary before
the offset is rounded!
base_heap*
base_heap_from_ptr(
void const* const ptr
) {
if (((ptrdiff_t)ptr) % HEAP_DEPTH) {
return (base_heap*)(ALIGN_SIZE(ptr, unsigned char*, HEAP_DEPTH) -
HEAP_DEPTH);
}
return (base_heap*)ptr;
}
ARGH!
http://appcore.home.comcast.net/vzoom/malloc/vzmalloc_v000_cpp.html
This does not use virtual memory or global heaps. You can comment out the:
#define THREAD_MALLOC_OVERLOAD_NEW_DELETE
line to switch over to the native allocator. Barring any typos/bugs, what
time do you get when 'THREAD_MALLOC_OVERLOAD_NEW_DELETE' is defined, vs.
when it is not? The per-thread allocator should beat the allocator your
compiler is using. Once this gets cleaned up and I add virtual memory and
global heap support, well, it should perform and scale fairly well.
Any thoughts?
You can make some crude adjustments using the following macros:
// Overload global new/delete operators with per-thread allocator
#define THREAD_MALLOC_OVERLOAD_NEW_DELETE
// The size of a region's heap
#if ! defined(THREAD_ALLOCATOR_HEAP_SIZE)
# define THREAD_ALLOCATOR_HEAP_SIZE 262144
#endif
// The atomic shutdown procedure mark
#if ! defined(THREAD_ALLOCATOR_DTOR_MARK)
# define THREAD_ALLOCATOR_DTOR_MARK 0x80000000
#endif
// A thread's initial count of regions
#if ! defined(THREAD_ALLOCATOR_PRIME_COUNT)
# define THREAD_ALLOCATOR_PRIME_COUNT 1
#endif
// A thread's low-watermark max of regions
#if ! defined(THREAD_ALLOCATOR_MAX_COUNT)
# define THREAD_ALLOCATOR_MAX_COUNT 2
#endif
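For example, a couple of them can be (roughly) overridden before pulling in
the source; the file name below is just a placeholder for the code at the URL
above:
// hypothetical usage sketch
#define THREAD_ALLOCATOR_HEAP_SIZE (1024 * 1024)
#define THREAD_ALLOCATOR_MAX_COUNT 4
#include "vzmalloc_v000.cpp" /* placeholder name */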
> Any thoughts?
Hi, I wanted to benchmark it, but it does not compile. I guess the
reasons are pretty simple:
#include <pthread.h>
fails.
Looks like your code is a strange mix of Linux and Win32 code (_asm, on
the other hand, is a VC feature).
Mirek
OK, maybe I have something wrong, but it is definitely slower than
current U++ allocator on Core2 Duo.
I have tested with this code:
String text[2];
void Work(int i = 0)
{
CppBase base;
MemReadStream in(text[i], text[i].GetLength());
Parse(in, Vector<String>(), base, "", Callback2<int, const
String&>());
}
void ThreadWork(int ii)
{
for(int i = 0; i < 1000; i++)
Work(ii);
}
CONSOLE_APP_MAIN
{
StdLogSetup(LOG_FILE|LOG_COUT);
String fn = GetDataFile("x.cpp");
fn = AppendFileName(GetFileFolder(GetFileFolder(GetFileFolder(fn))),
"uppsrc/CtrlLib/ArrayCtrl.cpp");
text[0] = LoadFile(fn);
text[1] = LoadFile(fn);
TimeStop tm;
for(int i = 0; i < 1000; i++)
Work();
RLOG("Single thread: " << tm.Elapsed());
#ifdef _MULTITHREADED
tm.Reset();
Thread b;
b.Run(callback1(ThreadWork, 1));
ThreadWork(0);
b.Wait();
RLOG("Two threads: " << tm.Elapsed());
#endif
}
(sorry for using U++, I hope our threading classes are easy to
understand ;)
Well, the aim of the test is this:
We are running a heuristic C++ parser parsing a file. First it is
run in a single thread, then the same thing is run in two threads;
obviously there is NO synchronization or shared data between the two
threads.
This is intended to check the quality of the allocator, as there is no
contention. In an ideal world, IMO, both times should be the same (times
are in ms).
Results:
Standard Visual C++ allocator (times are in ms):
Single thread: 6015
Two threads: 6641
Your allocator presented in this thread:
Single thread: 5688
Two threads: 6219
U++ allocator
Single thread: 5110
Two threads: 5343
Well, you have managed to beat MS, but obviously, U++ is still far
better :) Maybe with more cores the results would be different, I have
no chance to test it today.
OTOH, you are losing almost 10% going from one to two threads, which
does not seem too promising. U++ has only a 5% drop there...
Mirek
OK, after my more "real world" test, I tested with your benchmark
code. In that case, I am getting pretty good results (4 times faster
than U++, 8 times faster than MS C). Obviously, your code is designed
to introduce as much contention to the allocator as possible :)
OTOH, the question now remains what is the real world benefit.
Mirek
The crude prototype code uses no Windows headers. I use pthreads because it
provides a portable threading substrate.
No problem. I have already figured that out. I am sorry for bothering you
with such nonsense.
Speaking of it, I also had to comment out one assert:
void* allocate(std::size_t sz) throw(std::bad_alloc) {
// assert(m_startup);
startup();
Mirek
> Well, the aim of test is this:
>
> We are running here a heurestic C++ parser parsing a file. First it is
> run in single thread, then the same thing is run in two threads;
> obviously there is NO synchronization or shared data between both
> thread.
>
> This is intended to check the quality of allocator, as there is no
> contention. In ideal world, IMO, both times should be the same (times
> are in ms).
I am not sure what the bottleneck is here. The allocator does not require
threads to use synchronization when accessing their private heap. It all
might boil down to region allocation being too coarse for
per-thread heaps. Perhaps a per-processor heap would be more in line.
> Results:
>
> Standard Visual C++ allocator (times are in ms):
>
> Single thread: 6015
> Two threads: 6641
>
> Your allocator presented in this thread:
>
> Single thread: 5688
> Two threads: 6219
>
> U++ allocator
>
> Single thread: 5110
> Two threads: 5343
>
> Well, you have managed to beat MS, but obviously, U++ is still far
> better :)
Indeed.
> Maybe with more cores the results would be different, I have
> no chance to test it today.
Perhaps. It seems to "beat" other allocators when the system is put under
load and experiences some rapid cross thread deallocations.
> OTOH, you are loosing almost 10% going from one to two threads, that
> does not seem too promising. U++ has only 5% drop there...
Weird. On the per-thread level the allocator is basically a region
allocator. Could you test the following code:
http://groups.google.com/group/comp.lang.c++/browse_frm/thread/68bdc76f9792de1f
against U++. This is the basic single-threaded allocation model of the threaded
allocator. The main difference is that the single-thread version does not
align its regions on heap-size boundaries. This means heap meta-data cannot
be extracted from a pointer to a location within the heap.
I am thinking that I should only use region allocation in the global heaps,
and use slab-allocation for thread-local heaps.
It's using a producer/consumer model at full speed. It's just an overall
scalability test under load.
> OTOH, the question now remains what is the real world benefit.
I don't know yet. This is still in highly experimental stage. Fair enough?
;^)
Definitely! And quite a useful experience too :) - I guess I know how
to fix U++ alloc now to match it in your high-contention scenario;)
(for starters, increasing per-thread cache seems to do the trick).
The question is why - I have run my original test (C++ parser) with up
to 64 threads (albeit on a dual core machine) and so far it seems to
scale linearly with our alloc.
Mirek
:^)
> The question is why - I have run my original test (C++ parser) with up
> to 64 theads (albeit on dual core machine) and so far it seems to
> scale lineary with our alloc.
Perhaps there is an issue with remote deallocations. I don't know. Anyway,
before I release allocator round 2, I think I will release a purely single
threaded version of it just to see if it competes well with U++. You
mentioned that it was beating the allocator in single-thread test by around
600 ms. I need to reduce that. BTW, could you run your parser benchmark
against this single-threaded allocator:
http://groups.google.com/group/comp.lang.c++/browse_frm/thread/68bdc76f9792de1f
When I run this on one of my old boxes (P4 3.06GHz HyperThreading, 512MB RAM) I
get:
Time: 266 ms
when 'TEST_USE_REGION_ALLOCATOR' is defined, and
Time: 2495 ms
when it is not.
Could you please run a simple single-threaded benchmark on this? I need to
focus on optimizing the region allocator before I multi-thread it again.
The prototype code I posted is just a proof of concept for the wait-free
algorithm I use to coordinate remote deallocations with thread shutdowns.
That works. It's an easy way to multi-thread a region allocator. Now I need
to fine-tune the allocation algorithm.
U++ 4672 ms, task manager shows 3100KB used during the run.
Your region allocator 5109ms, 6900KB used during the run.
(I guess memory consumption numbers are pretty imprecise....).
Mirek
BTW, not sure if it helps... U++ allocator has a nice introspection
feature that is able to produce "memory profile" - list of size bins
and actual number of blocks allocated in that bin; either current or
(on demand) "peak". For this benchmark the peak kprofile looks like
this:
16 B, 762 allocated ( 11 KB), 0 fragmented ( 0 KB)
32 B, 635 allocated ( 19 KB), 0 fragmented ( 0 KB)
48 B, 588 allocated ( 27 KB), 0 fragmented ( 0 KB)
64 B, 567 allocated ( 35 KB), 0 fragmented ( 0 KB)
80 B, 550 allocated ( 42 KB), 0 fragmented ( 0 KB)
96 B, 546 allocated ( 51 KB), 0 fragmented ( 0 KB)
112 B, 540 allocated ( 59 KB), 0 fragmented ( 0 KB)
128 B, 527 allocated ( 65 KB), 0 fragmented ( 0 KB)
144 B, 532 allocated ( 74 KB), 0 fragmented ( 0 KB)
160 B, 523 allocated ( 81 KB), 2 fragmented ( 0 KB)
192 B, 521 allocated ( 97 KB), 4 fragmented ( 0 KB)
208 B, 551 allocated ( 111 KB), 0 fragmented ( 0 KB)
224 B, 521 allocated ( 113 KB), 1 fragmented ( 0 KB)
240 B, 511 allocated ( 119 KB), 1 fragmented ( 0 KB)
256 B, 518 allocated ( 129 KB), 7 fragmented ( 1 KB)
TOTAL, 8392 allocated ( 1043 KB), 15 fragmented ( 3 KB)
Free pages 0 (0 KB)
Large block count 3, total size 120 KB
312 B, 1 allocated ( 0 KB)
58KB, 1 allocated ( 58 KB)
62KB, 1 allocated ( 62 KB)
Large fragments count 1, total size 5 KB
5336 B, 1 fragments ( 5 KB)
This is, BTW, pretty much nonstandard; usually the profile has much
bigger numbers for smaller sizes, something like
16 B, 10032 allocated ( 156 KB), 128 fragmented ( 2 KB)
32 B, 18352 allocated ( 573 KB), 444 fragmented ( 13 KB)
48 B, 3264 allocated ( 153 KB), 516 fragmented ( 24 KB)
64 B, 2707 allocated ( 169 KB), 65 fragmented ( 4 KB)
80 B, 1250 allocated ( 97 KB), 0 fragmented ( 0 KB)
96 B, 1490 allocated ( 139 KB), 64 fragmented ( 6 KB)
112 B, 1064 allocated ( 116 KB), 16 fragmented ( 1 KB)
128 B, 1209 allocated ( 151 KB), 0 fragmented ( 0 KB)
144 B, 224 allocated ( 31 KB), 0 fragmented ( 0 KB)
160 B, 171 allocated ( 26 KB), 4 fragmented ( 0 KB)
176 B, 69 allocated ( 11 KB), 23 fragmented ( 3 KB)
192 B, 79 allocated ( 14 KB), 26 fragmented ( 4 KB)
208 B, 6727 allocated ( 1366 KB), 18 fragmented ( 3 KB)
224 B, 15 allocated ( 3 KB), 21 fragmented ( 4 KB)
240 B, 68 allocated ( 15 KB), 12 fragmented ( 2 KB)
256 B, 125 allocated ( 31 KB), 10 fragmented ( 2 KB)
TOTAL, 46846 allocated ( 3059 KB), 1347 fragmented ( 74 KB)
Free pages 7 (28 KB)
Large block count 29, total size 1343 KB
312 B, 3 allocated ( 0 KB)
536 B, 1 allocated ( 0 KB)
728 B, 1 allocated ( 0 KB)
952 B, 1 allocated ( 0 KB)
1048 B, 4 allocated ( 4 KB)
1144 B, 1 allocated ( 1 KB)
1656 B, 1 allocated ( 1 KB)
2392 B, 1 allocated ( 2 KB)
3640 B, 1 allocated ( 3 KB)
4280 B, 1 allocated ( 4 KB)
8376 B, 1 allocated ( 8 KB)
15KB, 1 allocated ( 15 KB)
32KB, 3 allocated ( 96 KB)
34KB, 2 allocated ( 68 KB)
38KB, 1 allocated ( 38 KB)
44KB, 1 allocated ( 44 KB)
52KB, 2 allocated ( 104 KB)
79KB, 1 allocated ( 79 KB)
179KB, 1 allocated ( 179 KB)
687KB, 1 allocated ( 687 KB)
Large fragments count 3, total size 143 KB
15KB, 1 fragments ( 15 KB)
63KB, 2 fragments ( 127 KB)
(the profile of theide - a GUI development environment).
Mirek
What numbers do you get when you run the test program as-is?
It just allocates a bunch of ints into a couple of std::list's.
> (I guess memory consumption numbers are pretty imprecise....).
They seem like correct numbers to me. The region allocator is very
coarse-grained. The heap size per region for the single-thread allocator is
set to 4.7 MB. The heap size for the multi-thread regions is 256 KB. The
bottleneck is the region allocator itself, not the non-blocking algorithm
used in the multi-threaded variant. The difference of the overall times is
similar to your last test (e.g., the region allocator is 500-600 ms slower).
I think the answer would be to hoist the region allocator out into the
"global" allocator level.
It looks like you're doing segregated storage based on size. Something like:
per_thread.heap.bucket[0].sizeclass = 8;
per_thread.heap.bucket[1].sizeclass = 16;
per_thread.heap.bucket[2].sizeclass = 32;
[...]
This is how the vZOOM allocator is currently set up. The heaps are freelists
into slab objects which maintain freelists of the sizeclass. This is MUCH
more fine-grained than a region allocator. BTW, how are you handling remote
deallocations? I designed an algorithm a while back that can handle
free-lists efficiently:
http://groups.google.com/group/comp.programming.threads/browse_frm/thread/8245f4b48591fc69
It beats the region allocator; it's the only bottleneck I can detect, based
on your results. The vz allocator uses a tweaked version of the simple slab
algorithm above. I wonder if I should release it, or reprogram it with a
global region allocator. Humm.
> It looks like your doing a segregated storage based on size. Something like:
>
> per_thread.heap.bucket[0].sizeclass = 8;
> per_thread.heap.bucket[1].sizeclass = 16;
> per_thread.heap.bucket[2].sizeclass = 32;
> [...]
Basically, yes. The trick is how to implement 'free' and maintain zero
per block overhead :)
> BTW, how are you handling remote
> deallocations? I have designed an algorithm a while back that can handle
> free-lists efficiently:
Frankly, I do not. I am considering this, but the only benefit I can
see is reduced cache-line contention. That would be nice, but I
suspect the associated costs in code complexity would be a bad trade-off.
Mirek
P.S.: (I will test this ASAP).
:^)
vZOOM aligns a bucket's memory on a common (e.g., page) boundary. So, any
block can find a pointer to its bucket by rounding down to a boundary. Each
block is at least sizeof(aligner):
union aligner {
int i;
long l;
double d;
void* p;
[...];
};
struct block {
block* next;
};
struct slab {
block* freelist_head;
size_t sizeclass;
int count;
pthread_t id;
char heap[PAGE_SIZE];
};
If each slab object is aligned on a PAGE_SIZE boundary, then you can get a
slab pointer from a block like:
slab*
slab_from_ptr(
void const* const ptr
) {
assert(((ptrdiff_t)ptr) % PAGE_SIZE);
return (slab*)(ALIGN_SIZE(ptr, char*, PAGE_SIZE) - PAGE_SIZE);
}
So you can use this space as a link into the freelists when a block is
unused by the application. To build malloc/free, you can do:
void free(void* ptr) {
slab* const this_slab = pthread_getspecific(...);
slab* const ptr_slab = slab_from_ptr(ptr);
block* const ptr_block = (block*)ptr;
if (this_slab == ptr_slab) {
/* LOCAL FREE */
ptr_block->next = ptr_slab->freelist_head;
ptr_slab->freelist_head = ptr_block;
++ptr_slab->count;
} else {
/* REMOTE FREE; CAS loop */
}
}
This is a very simple solution for headerless blocks. If the aligner union
were not used, then requests for memory >= sizeof(void*) would not have any
wasted space. Now it's memory >= sizeof(aligner); oh well, the aligner is
basically needed for a general purpose malloc/free replacement.
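The matching allocation side could look roughly like this (a sketch only;
refilling an empty slab and requests above the sizeclass are left out):
void* slab_malloc(size_t sz) {
    slab* const this_slab = pthread_getspecific(...); /* this thread's slab */
    if (sz <= this_slab->sizeclass && this_slab->freelist_head) {
        /* pop the head block off the local free-list */
        block* const ptr_block = this_slab->freelist_head;
        this_slab->freelist_head = ptr_block->next;
        --this_slab->count;
        return ptr_block;
    }
    return NULL; /* would fall back to another slab or the global heap */
}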
>> BTW, how are you handling remote
>> deallocations? I have designed an algorithm a while back that can handle
>> free-lists efficiently:
>
> Frankly, I do not. I am considering this, but the only benefit I can
> see is reduced cache-line contention. That would be nice, but I
> suspect associated costs in code complexity would be bad trade-off.
In U++, you can allocate in thread_a and free in thread_b right? This is
what I mean by "remote" deallocations.
> slab*
> slab_from_ptr(
> void const* const ptr
> ) {
> assert(((ptrdiff_t)ptr) % PAGE_SIZE);
> return (slab*)(ALIGN_SIZE(ptr, char*, PAGE_SIZE) - PAGE_SIZE);
>
> }
What if the allocated block is > PAGE_SIZE? The only way I see is that you
are wasting quite a lot of memory here...
> In U++, you can allocate in thread_a and free in thread_b right? This is
> what I mean by "remote" deallocations.
Yes, but in U++, there is the same path used for "remote"
deallocation.
You can get the code and description by downloading U++, however very
short description:
Only "small" allocations are optimized. These are 16 - 256 bytes
blocks, size is always rounded by 16 bytes.
Like with vZoom, at the beginning of 4KB page there is block
bookkeeping info. Anyway, the main nasty trick is that >256 bytes
blocks are 16 byte *UN*aligned. So you can tell from the pointer value
whether block is small or large (you only need to test & 8 :)
You can also tell the block size without any locking (at least I
believe so :). With this info, when deallocating, you can store the
block to per-thread cache. If per-thread cache overflows / underflows,
the code to deal with it is normally serialized....
So it is not entirely lock free, but locks only quite rarely, with
current setup only once per 32 allocations in the worst case.
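Roughly, the pointer test is something like this (just a sketch of the idea,
not the actual code):
#include <stdint.h>

/* small blocks come from 16-byte aligned buckets; large blocks are handed
   out 8 bytes off a 16-byte boundary, so bit 3 classifies a block on free */
int is_large_block(void const* p) {
    return ((uintptr_t)p & 8) != 0;
}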
Mirek
U++ 200ms
Region 125ms
Mirek
I use the system to allocate blocks that are > PAGE_SIZE. There is one more
piece of logic that I did not add: the buffer passed to free is checked to
see if it's within the range of reserved virtual memory. Something like:
/* Reserve large amount of memory */
/* Point the following at the head and tail */
unsigned char* const vrange_head = ...;
unsigned char* const vrange_tail = ...;
bool vrange_check(void const* ptr) {
unsigned char const* const pos = (unsigned char const*)ptr;
return (pos >= vrange_head && pos < vrange_tail);
}
/* malloc/free */
void* malloc(size_t sz)
{
if (sz <= PAGE_SIZE)
{
return /* from custom allocator; */
}
else
{
return HeapAlloc(GetProcessHeap(), 0, sz);
}
}
void free(void* ptr)
{
if (vrange_check(ptr))
{
/* use custom allocator */
}
else
{
HeapFree(GetProcessHeap(), 0, ptr);
}
}
> The only way I see is that you are wasting quite a lot of memory here....
wrt handling blocks > PAGE_SIZE, or aligning slabs on PAGE_SIZE boundaries?
>> In U++, you can allocate in thread_a and free in thread_b right? This is
>> what I mean by "remote" deallocations.
>
> Yes, but in U++, there is the same path used for "remote"
> deallocation.
Can you store blocks allocated by thread_a in free-lists residing in
threads_b-N?
> You can get the code and description by downloading U++, however very
> short description:
>
> Only "small" allocations are optimized. These are 16 - 256 bytes
> blocks, size is always rounded by 16 bytes.
>
> Like with vZoom, at the beginning of 4KB page there is block
> bookkeeping info. Anyway, the main nasty trick is that >256 bytes
> blocks are 16 byte *UN*aligned. So you can tell from the pointer value
> whether block is small or large (you only need to test & 8 :)
I see.
> You can also tell the block size without any locking (at least I
> believe so :). With this info, when deallocating, you can store the
> block to per-thread cache. If per-thread cache overflows / underflows,
> the code to deal with it is normally serialized....
I am assuming that you can store blocks in any threads free-list regardless
of which thread allocated it in the first place. Is that correct?
> So it is not entirely lock free, but locks only quite rarely, with
> current setup only once per 32 allocations in the worst case.
Sometimes lock-free techniques are not the right tools for the job.
;^)
Weird. Humm. I would think that U++ would beat the region allocator here.
Perhaps the region allocator as-is does not perform well under certain usage
patterns. Need to think about this... Thanks for all your help Mirek, I
appreciate it.
I forgot to gradually expand the per-thread regions. Humm... Not good.
You can get around the expensive MOD by doing something like:
base_heap*
base_heap_from_ptr(
void* const ptr
) {
unsigned char* pos = (unsigned char*)ptr;
unsigned char* aligned = ALIGN_SIZE(ptr, unsigned char*, HEAP_DEPTH);
return (pos != aligned) ? (base_heap*)(aligned - HEAP_DEPTH)
: (base_heap*)pos;
}
> I use the system to allocate blocks that are > PAGE_SIZE. There is one more
> piece of logic that I did not add. The buffer passed to free checks if its
> within the range of reserved virtual memory. Something like:
>
> /* Reserve large amount of memory */
>
> /* Point the following at the head and tail */
> unsigned char* const vrange_head = ...;
> unsigned char* const vrange_tail = ...;
>
> bool vrange_check(void const* ptr) {
> unsigned char const* const pos = ptr;
> return (pos >= vrange_head && pos < vrange_tail);
>
> }
>
> /* malloc/free */
> void* malloc(size_t sz)
> {
> if (sz <= PAGE_SIZE)
> {
> return /* from custom allocator; */
> }
> else
> {
> return HeapAlloc(GetProcessHeap(), 0, sz);
> }
>
> }
>
> void free(void* ptr)
> {
> if (vrange_check(ptr))
> {
> /* use custom allocator */
> }
> else
> {
> HeapFree(GetProcessHeap(), 0, ptr);
> }
>
> }
Ah, I see. You reserve a constant amount of memory for small blocks.
But that is pretty non-generic, is it not? Especially taking into
account those nasty holes in the virtual space of win32 (meaning you cannot,
e.g., dedicate 1GB for small blocks and 1GB for large blocks). OTOH, not
a bad idea for 64 bit systems...
> wrt handling blocks > PAGE_SIZE, or aligning slabs on PAGE_SIZE boundaries?
Well, nope, it is OK, no memory wasted. My impression was that the
only way you can tell large blocks from small blocks is PAGE_SIZE
alignment. But you use that range....
> > Yes, but in U++, there is the same path used for "remote"
> > deallocation.
>
> Can you store blocks allocated by thread_a in free-lists residing in
> threads_b-N?
Sure. But it is "free-list" cache, there is much more complicated
logic behind the mutex.
> I am assuming that you can store blocks in any threads free-list regardless
> of which thread allocated it in the first place. Is that correct?
Definitely. And when there are too many blocks for a specific size in
some thread's cache, it goes to mutex-protected code and deallocates a
large number of them in a single lock (now, when there are more than 32 blocks
in the cache, it is reduced to 15). BTW, your benchmark clearly showed me
there is room for improvement here; maybe I should adjust these cache control
numbers to the block size (allow more blocks for smaller buckets).
Hard to say; at the performance numbers where I am now, even that single
table dereference (to get the maximum number for the bucket) will have
an impact...
Mirek
Ditto!
BTW, there is one thing I forgot to mention. As a "monolithic" library,
U++ can use some more advanced tricks with memory if its allocator is
used.
Namely
- String has "shortcut" access to allocator in certain cases
(basically skipping bucket-size logic).
- There is "void *MemoryAllocSz(size_t& size)" variant that adjust the
size up to the amount really allocated. This can have some very small
advantage for exponentially expanding arrays (in String or Vector).
I guess the real advantage there is less than 1%.
Mirek
This algorithm uses progressively increasing region sizes starting from
small to large, depending on current load. It uses less memory than the C++
region allocator posted here:
Just one simple tweak for enhancing the overall granularity characteristics.
BTW, do I understand correctly that region allocators cannot reuse a
block until the whole region is deallocated? (Or at least a block in the
middle of a region; I know you can tweak it for last-block
deallocation.)
If that is true, I think any attempt to use regions as a general
purpose allocator is a waste of time... You might try to use it for some
very specific applications, but even then the U++ style is still at least
close in performance...
Mirek
This is true; for a traditional region allocator, calls to free are generally
nops. There is usually a reset function which can be used to reclaim all of
the "in-use" memory requests. You can use the region allocator for C this
way:
___________________________________________________________________
int main(void) {
allocator this_allocator;
/* create allocator instance with initial region size of 8192 and
a low-watermark region count of 2
*/
if (! allocator_create(&this_allocator, 8192, 2)) {
for (;;) {
int i;
for (i = 1; i < 1024; ++i) {
allocator_request(&this_allocator, i);
}
allocator_reset(&this_allocator);
}
allocator_destroy(&this_allocator);
}
return 0;
}
___________________________________________________________________
The code will never run out of resources.
> (Or at least block in the
> middle of region, I know you can tweak it for last block
> deallocation).
If you store object meta-data you can blend regions and freelists. I think
there is an algorithm out there called "reaps" that did something like that.
> If that is true, I think any attempts trying to use regions as general
> purpose allocator is waste of time...
Perhaps. Their granularity is fairly poor.
> You might try to use it for some
> very specific applications, but then U++ style is still at least close
> in performance..
It can be nice when one can render calls to free on locally allocated memory
into nops. Think of the following scenario:
void thread() {
for (;;) {
// wait for work request
// do some lengthy work
// merge thread local allocations into single reset call.
mreset();
}
}
That can be somewhat helpful.
Yeah. vZoom has the slab allocator sort-of "built" into certain sub-systems.
> Namely
>
> - String has "shortcut" access to allocator in certain cases
> (basically skipping bucket-size logic).
Sounds good.
> - There is "void *MemoryAllocSz(size_t& size)" variant that adjust the
> size up to the amount really allocated. This can have some very small
> advantage for exponentially expanding arrays (in String or Vector).
>
> I guess the real advantage there is less than 1%.
Every little bit counts! ;^)
Humm, it can be hard to compete with that level of integration. Is that why
U++ wins the parser benchmark? Humm, I think it could also be because the
region allocator is total crap with that type of usage pattern. I think I will
just release a simple slab allocator.
One good thing about the design: The scalability factor of the non-blocking
algorithm used within the multi-threaded region allocator seems to be fairly
high. It does well in contentious scenarios...
I also think that hoard does, or used to, defer large block allocations
spanning page(s) to the operating system. Yeah, it's not really all that
generic.
>> wrt handling blocks > PAGE_SIZE, or aligning slabs on PAGE_SIZE
>> boundaries?
>
> Well, nope, it is OK, no memory wasted. My impression was that the
> only way you can tell large blocks from small blocks is PAGE_SIZE
> alignment. But you use that range....
Sometimes, using a simple range check can be fairly convenient.
>> > Yes, but in U++, there is the same path used for "remote"
>> > deallocation.
>>
>> Can you store blocks allocated by thread_a in free-lists residing in
>> threads_b-N?
>
> Sure. But it is "free-list" cache, there is much more complicated
> logic behind the mutex.
>
>> I am assuming that you can store blocks in any threads free-list
>> regardless
>> of which thread allocated it in the first place. Is that correct?
>
> Definitely. And when there is too much blocks for specific size in
> some thread's cache, it goes to mutex protected code and deallocates a
> large number of them in single lock (now when more than 32 blocks in
> cache, it is reduced to 15). BTW, your benchmark clearly showed me
> there is room for improvement here; maybe I should these cache control
> numbers adjust to block size (allow more blocks for smaller buckets).
> Hard to say, at performance numbers where I am now, even that single
> table dereference (to get maximum number for the bucket) will have
> impact...
Humm. You boiled down the problem it was initially having with my contention
benchmark to this mutex? Are you using a single lock for all threads, or can
they move large numbers of blocks between different caches concurrently?
Yes. I was using a similar style in 1990 to manage memory :)
I still miss its simplicity, but I am afraid it has only
little use with C++ (where destructors have to be called anyway).
OTOH, I now understand why you are trying this. I was just thinking
the final goal is general purpose allocator (with those overloaded new/
delete etc...)
Mirek
Sure, I was able to match you quite quickly. All I need to do is
increase per-thread cache watermarks :)
> Are you using a single lock for all threads or can
When it comes down to per-thread cache underflow/overflow, the rest of the
allocator is locked.
It is also locked for larger blocks.
Mirek
:^D
> I am still missing its simplicity, but I am afraid it has only a
> little use with C++ (where destructors have to be called anyway).
Well, that's a good point. Executing a bunch of destructors in the
'mreset()' call could end up negating any benefit from skipping all the
calls into free. That would be bad... I find that it seems to be more
appropriate in C, which is why I spent 10-15 minutes creating a version in
that language: http://pastebin.com/m52ba914.
> OTOH, I now understand why you are trying this.
IMVHO, region allocation can be _highly_ effective in "certain" scenarios.
Your parser does not seem to be one of them. Or, U++ is winning because its
providing "proprietary" fast-paths for certain objects (e.g., string) into
the system allocator...
;^)
> I was just thinking
> the final goal is general purpose allocator (with those overloaded new/
> delete etc...)
It's not really the best usage for a region allocator... ;^)
This type of memory management really works well when you can elude all
calls into free, and just use a single call to "reset" instead.