apart from the random page addresses obtained form mmap(2) malloc(3)
itself also randomizes cache en chunk operations. It uses a nibble of
randomness per call, so optimize that to not waste half the random
bits.
Please test, should be a bit faster.
-Otto
Index: malloc.c
===================================================================
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.121
diff -u -p -r1.121 malloc.c
--- malloc.c 27 Nov 2009 20:11:01 -0000 1.121
+++ malloc.c 30 Nov 2009 19:40:47 -0000
@@ -64,7 +64,7 @@
#define MALLOC_MAXCHUNK (1 << (MALLOC_PAGESHIFT-1))
#define MALLOC_MAXCACHE 256
-#define MALLOC_DELAYED_CHUNKS 16 /* should be power of 2 */
+#define MALLOC_DELAYED_CHUNKS 15 /* max of getrnibble() */
/*
* When the P option is active, we move allocations between half a page
* and a whole page towards the end, subject to alignment constraints.
@@ -110,7 +110,7 @@ struct dir_info {
/* free pages cache */
struct region_info free_regions[MALLOC_MAXCACHE];
/* delayed free chunk slots */
- void *delayed_chunks[MALLOC_DELAYED_CHUNKS];
+ void *delayed_chunks[MALLOC_DELAYED_CHUNKS + 1];
#ifdef MALLOC_STATS
size_t inserts;
size_t insert_collisions;
@@ -183,9 +183,9 @@ static int malloc_active; /* status of
static size_t malloc_guarded; /* bytes used for guards */
static size_t malloc_used; /* bytes allocated */
-static size_t rbytesused; /* random bytes used */
+static size_t rnibblesused; /* random nibbles used */
static u_char rbytes[512]; /* random bytes */
-static u_char getrbyte(void);
+static u_char getrnibble(void);
extern char *__progname;
@@ -378,6 +378,26 @@ wrterror(char *p)
abort();
}
+static void
+rbytes_init(void)
+{
+ arc4random_buf(rbytes, sizeof(rbytes));
+ rnibblesused = 0;
+}
+
+static inline u_char
+getrnibble(void)
+{
+ u_char x;
+
+ if (rnibblesused >= 2 * sizeof(rbytes))
+ rbytes_init();
+ x = rnibblesused < sizeof(rbytes) ? (rbytes[rnibblesused] & 0xf) :
+ (rbytes[rnibblesused - sizeof(rbytes)] >> 4);
+ rnibblesused++;
+ return x;
+}
+
/*
* Cache maintenance. We keep at most malloc_cache pages cached.
* If the cache is becoming full, unmap pages in the cache for real,
@@ -408,7 +428,7 @@ unmap(struct dir_info *d, void *p, size_
rsz = mopts.malloc_cache - d->free_regions_size;
if (psz > rsz)
tounmap = psz - rsz;
- offset = getrbyte();
+ offset = getrnibble();
for (i = 0; tounmap > 0 && i < mopts.malloc_cache; i++) {
r = &d->free_regions[(i + offset) & (mopts.malloc_cache - 1)];
if (r->p != NULL) {
@@ -489,7 +509,7 @@ map(struct dir_info *d, size_t sz, int z
/* zero fill not needed */
return p;
}
- offset = getrbyte();
+ offset = getrnibble();
for (i = 0; i < mopts.malloc_cache; i++) {
r = &d->free_regions[(i + offset) & (mopts.malloc_cache - 1)];
if (r->p != NULL) {
@@ -536,21 +556,6 @@ map(struct dir_info *d, size_t sz, int z
return p;
}
-static void
-rbytes_init(void)
-{
- arc4random_buf(rbytes, sizeof(rbytes));
- rbytesused = 0;
-}
-
-static u_char
-getrbyte(void)
-{
- if (rbytesused >= sizeof(rbytes))
- rbytes_init();
- return rbytes[rbytesused++];
-}
-
/*
* Initialize a dir_info, which should have been cleared by caller
*/
@@ -1010,7 +1015,7 @@ malloc_bytes(struct dir_info *d, size_t
}
/* advance a random # of positions */
- i = (getrbyte() & (MALLOC_DELAYED_CHUNKS - 1)) % bp->free;
+ i = getrnibble() % bp->free;
while (i > 0) {
u += u;
k++;
@@ -1273,7 +1278,7 @@ ofree(void *p)
if (mopts.malloc_junk && sz > 0)
memset(p, SOME_FREEJUNK, sz);
if (!mopts.malloc_freeprot) {
- i = getrbyte() & (MALLOC_DELAYED_CHUNKS - 1);
+ i = getrnibble();
tmp = p;
p = g_pool->delayed_chunks[i];
g_pool->delayed_chunks[i] = tmp;
> apart from the random page addresses obtained form mmap(2) malloc(3)
> itself also randomizes cache en chunk operations. It uses a nibble of
> randomness per call, so optimize that to not waste half the random
> bits.
arc4 random does not use entropy and is very cheap
almost the speed of memory copy. "caching" arc4 output
like this is the same as running rot13 twice (:
--
paranoic mickey (my employers have changed but, the name has remained)
> On Wed, Dec 02, 2009 at 10:38:10AM +0100, Otto Moerbeek wrote:
> > Hi,
> re
>
> > apart from the random page addresses obtained form mmap(2) malloc(3)
> > itself also randomizes cache en chunk operations. It uses a nibble of
> > randomness per call, so optimize that to not waste half the random
> > bits.
>
> arc4 random does not use entropy and is very cheap
> almost the speed of memory copy. "caching" arc4 output
> like this is the same as running rot13 twice (:
at least it saves on calls to arc4_stir() and thus on sysctl(2) and
getpid(2) calls.
-Otto
> On Wed, Dec 02, 2009 at 01:09:02PM +0000, Michael Shalayeff wrote:
>
> > On Wed, Dec 02, 2009 at 10:38:10AM +0100, Otto Moerbeek wrote:
> > > Hi,
> > re
> >
> > > apart from the random page addresses obtained form mmap(2) malloc(3)
> > > itself also randomizes cache en chunk operations. It uses a nibble of
> > > randomness per call, so optimize that to not waste half the random
> > > bits.
> >
> > arc4 random does not use entropy and is very cheap
> > almost the speed of memory copy. "caching" arc4 output
> > like this is the same as running rot13 twice (:
>
> at least it saves on calls to arc4_stir() and thus on sysctl(2) and
> getpid(2) calls.
I was a bit puzzled by mickeys comment, remenbering that the kernel
arc4 does use entropy, and indeed rc4random_buf() in the kernel which
provides the random data for stirring the userland arc4random(3) calls
arc4maybeinit() which potentially consumes entropy via arc4_stir() and
get_random_bytes(). The the trigger to stir in the kernel is timer
based, not consumption based.
-Otto
yes but that does not affect entropy consumption anyhow (:
amount of data required to reinitialise the arc4 pool is
almost nothing compared to amounts of data produced.
the pool is 256 bytes and we produce 1.6MEGAbytes from it.
what program would call malloc more than 1.6 million times?
perhaps if it does it shall be fixed (:
of course savings on getpid() might be worth fighting for.
cu
> Hi,
>
> apart from the random page addresses obtained form mmap(2) malloc(3)
> itself also randomizes cache en chunk operations. It uses a nibble of
> randomness per call, so optimize that to not waste half the random
> bits.
>
> Please test, should be a bit faster.
Anybody?
I don't really like this. It looks complicated (hard to judge). arc4
is fast enough I don't think we need to worry about conserving bits,
and I'd be surprised if it actually were faster considering how much
work you're doing per nibble compared to the work to just run arc4.
However fast it may be, I feel much safer with the current code.
Every call to arc4random_buf() causes a getpid() call, which is a
syscall with all the overhead. This saves half of those.
-Otto
Hit "send" too fast....
Yes, it is a bit more more complicated, but I think not too much.
-Otto
And that is why I suggested it in the first place.
I saw some sort of trace of a process.
I just built a recent checkout with and without this patch included. It
seems to be stable. Somewhat to my surprise, it also seems to be
measurably faster on my system.
Without patch:
$ gcc -Wall -O2 -DNDEBUG -o malloc-test malloc-test.c && time ./malloc-test && time ./malloc-test && time ./malloc-test && time ./malloc-test && time ./malloc-test && time ./malloc-test
0m32.11s real 0m28.88s user 0m2.60s system
0m32.05s real 0m28.22s user 0m3.17s system
0m44.20s real 0m40.11s user 0m3.36s system
0m32.25s real 0m28.52s user 0m3.40s system
0m31.57s real 0m28.41s user 0m3.02s system
0m31.74s real 0m28.36s user 0m2.98s system
With patch:
$ gcc -Wall -O2 -DNDEBUG -o malloc-test malloc-test.c && time ./malloc-test && time ./malloc-test && time ./malloc-test && time ./malloc-test && time ./malloc-test && time ./malloc-test
0m32.11s real 0m28.88s user 0m2.60s system
0m29.64s real 0m26.39s user 0m3.21s system
0m29.32s real 0m26.21s user 0m3.05s system
0m29.31s real 0m25.89s user 0m3.35s system
0m29.57s real 0m26.51s user 0m3.01s system
0m29.56s real 0m26.44s user 0m3.10s system
0m29.50s real 0m26.06s user 0m3.36s system
The program I used to test this is very much ad-hoc, does not use memory
in a realistic fashion, etc, so this is not a proper (hah!) benchmark.
But having some numbers may be better than nothing.
malloc-test.c:
#include <assert.h>
#include <err.h>
#include <stdlib.h>
int
main(void)
{
void *p[8][16 * 1024];
int i, j, k, l;
for (l = 0; l < 512; l++) {
for (i = 0; i < sizeof(p) / sizeof(*p); i++) {
for (j = 0; j < sizeof(p[i]) / sizeof(*p[i]); j++)
if ((p[i][j] = malloc(1 << i)) == NULL)
err(1, "malloc() failed");
for (k = i; k >= 0; k--) {
for (j = i; j < sizeof(p[i]) / sizeof(*p[i]); j += sizeof(p) / sizeof(*p)) {
assert(p[k][j] != NULL);
free(p[k][j]);
p[k][j] = NULL;
}
}
}
for (i = 0; i < sizeof(p) / sizeof(*p); i++)
for (j = 0; j < sizeof(p[i]) / sizeof(*p[i]); j++)
free(p[i][j]);
}
return 0;
}
For completeness, a dmesg may be found at
http://www.joachimschipper.nl/posts/20091215/dmesg.
Should anyone care, the program above may be used for any purpose
whatsoever, with or without attribution ("public domain").
Joachim
> On Mon, Dec 14, 2009 at 01:54:50PM +0100, Otto Moerbeek wrote:
>
> > On Mon, Dec 14, 2009 at 07:44:31AM -0500, Ted Unangst wrote:
> >
> > > On Mon, Dec 14, 2009 at 3:58 AM, Otto Moerbeek <ot...@drijf.net> wrote:
> > > >> apart from the random page addresses obtained form mmap(2) malloc(3)
> > > >> itself also randomizes cache en chunk operations. It uses a nibble of
> > > >> randomness per call, so optimize that to not waste half the random
> > > >> bits.
> > > >>
> > > >> Please test, should be a bit faster.
> > >
> > > I don't really like this. It looks complicated (hard to judge). arc4
> > > is fast enough I don't think we need to worry about conserving bits,
> > > and I'd be surprised if it actually were faster considering how much
> > > work you're doing per nibble compared to the work to just run arc4.
> > > However fast it may be, I feel much safer with the current code.
> >
> > Every call to arc4random_buf() causes a getpid() call, which is a
> > syscall with all the overhead. This saves half of those.
>
> Hit "send" too fast....
>
> Yes, it is a bit more more complicated, but I think not too much.
>
> -Otto
This is an alternative version from guenther@, which is a bit simpler
and generates samller code (as seen on i386). I intend to commit this
soon, the numbers from Joachim Schipper show a small, but significant
speedup.
-Otto
Index: malloc.c
===================================================================
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.122
diff -u -p -r1.122 malloc.c
--- malloc.c 7 Dec 2009 18:47:38 -0000 1.122
+++ malloc.c 15 Dec 2009 19:15:33 -0000
@@ -66,7 +66,7 @@
#define MALLOC_MAXCHUNK (1 << (MALLOC_PAGESHIFT-1))
#define MALLOC_MAXCACHE 256
-#define MALLOC_DELAYED_CHUNKS 16 /* should be power of 2 */
+#define MALLOC_DELAYED_CHUNKS 15 /* max of getrnibble() */
/*
* When the P option is active, we move allocations between half a page
* and a whole page towards the end, subject to alignment constraints.
@@ -112,7 +112,7 @@ struct dir_info {
/* free pages cache */
struct region_info free_regions[MALLOC_MAXCACHE];
/* delayed free chunk slots */
- void *delayed_chunks[MALLOC_DELAYED_CHUNKS];
+ void *delayed_chunks[MALLOC_DELAYED_CHUNKS + 1];
#ifdef MALLOC_STATS
size_t inserts;
size_t insert_collisions;
@@ -185,9 +185,9 @@ static int malloc_active; /* status of
static size_t malloc_guarded; /* bytes used for guards */
static size_t malloc_used; /* bytes allocated */
-static size_t rbytesused; /* random bytes used */
+static size_t rnibblesused; /* random nibbles used */
static u_char rbytes[512]; /* random bytes */
-static u_char getrbyte(void);
+static u_char getrnibble(void);
extern char *__progname;
@@ -380,6 +380,24 @@ wrterror(char *p)
abort();
}
+static void
+rbytes_init(void)
+{
+ arc4random_buf(rbytes, sizeof(rbytes));
+ rnibblesused = 0;
+}
+
+static inline u_char
+getrnibble(void)
+{
+ u_char x;
+
+ if (rnibblesused >= 2 * sizeof(rbytes))
+ rbytes_init();
+ x = rbytes[rnibblesused++ / 2];
+ return (rnibblesused & 1 ? x & 0xf : x >> 4);
+}
+
/*
* Cache maintenance. We keep at most malloc_cache pages cached.
* If the cache is becoming full, unmap pages in the cache for real,
@@ -410,7 +428,7 @@ unmap(struct dir_info *d, void *p, size_
rsz = mopts.malloc_cache - d->free_regions_size;
if (psz > rsz)
tounmap = psz - rsz;
- offset = getrbyte();
+ offset = getrnibble();
for (i = 0; tounmap > 0 && i < mopts.malloc_cache; i++) {
r = &d->free_regions[(i + offset) & (mopts.malloc_cache - 1)];
if (r->p != NULL) {
@@ -491,7 +509,7 @@ map(struct dir_info *d, size_t sz, int z
/* zero fill not needed */
return p;
}
- offset = getrbyte();
+ offset = getrnibble();
for (i = 0; i < mopts.malloc_cache; i++) {
r = &d->free_regions[(i + offset) & (mopts.malloc_cache - 1)];
if (r->p != NULL) {
@@ -538,21 +556,6 @@ map(struct dir_info *d, size_t sz, int z
return p;
}
-static void
-rbytes_init(void)
-{
- arc4random_buf(rbytes, sizeof(rbytes));
- rbytesused = 0;
-}
-
-static u_char
-getrbyte(void)
-{
- if (rbytesused >= sizeof(rbytes))
- rbytes_init();
- return rbytes[rbytesused++];
-}
-
/*
* Initialize a dir_info, which should have been cleared by caller
*/
@@ -1012,7 +1015,7 @@ malloc_bytes(struct dir_info *d, size_t
}
/* advance a random # of positions */
- i = (getrbyte() & (MALLOC_DELAYED_CHUNKS - 1)) % bp->free;
+ i = getrnibble() % bp->free;
while (i > 0) {
u += u;
k++;
@@ -1275,7 +1278,7 @@ ofree(void *p)