save some entropy

Otto Moerbeek

unread,

Dec 2, 2009, 4:38:10 AM12/2/09

to

Hi,

apart from the random page addresses obtained form mmap(2) malloc(3)
itself also randomizes cache en chunk operations. It uses a nibble of
randomness per call, so optimize that to not waste half the random
bits.

Please test, should be a bit faster.

-Otto

Index: malloc.c
===================================================================
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.121
diff -u -p -r1.121 malloc.c
--- malloc.c 27 Nov 2009 20:11:01 -0000 1.121
+++ malloc.c 30 Nov 2009 19:40:47 -0000
@@ -64,7 +64,7 @@

#define MALLOC_MAXCHUNK (1 << (MALLOC_PAGESHIFT-1))
#define MALLOC_MAXCACHE 256
-#define MALLOC_DELAYED_CHUNKS 16 /* should be power of 2 */
+#define MALLOC_DELAYED_CHUNKS 15 /* max of getrnibble() */
/*
* When the P option is active, we move allocations between half a page
* and a whole page towards the end, subject to alignment constraints.
@@ -110,7 +110,7 @@ struct dir_info {
/* free pages cache */
struct region_info free_regions[MALLOC_MAXCACHE];
/* delayed free chunk slots */
- void *delayed_chunks[MALLOC_DELAYED_CHUNKS];
+ void *delayed_chunks[MALLOC_DELAYED_CHUNKS + 1];
#ifdef MALLOC_STATS
size_t inserts;
size_t insert_collisions;
@@ -183,9 +183,9 @@ static int malloc_active; /* status of
static size_t malloc_guarded; /* bytes used for guards */
static size_t malloc_used; /* bytes allocated */

-static size_t rbytesused; /* random bytes used */
+static size_t rnibblesused; /* random nibbles used */
static u_char rbytes[512]; /* random bytes */
-static u_char getrbyte(void);
+static u_char getrnibble(void);

extern char *__progname;

@@ -378,6 +378,26 @@ wrterror(char *p)
abort();
}

+static void
+rbytes_init(void)
+{
+ arc4random_buf(rbytes, sizeof(rbytes));
+ rnibblesused = 0;
+}
+
+static inline u_char
+getrnibble(void)
+{
+ u_char x;
+
+ if (rnibblesused >= 2 * sizeof(rbytes))
+ rbytes_init();
+ x = rnibblesused < sizeof(rbytes) ? (rbytes[rnibblesused] & 0xf) :
+ (rbytes[rnibblesused - sizeof(rbytes)] >> 4);
+ rnibblesused++;
+ return x;
+}
+
/*
* Cache maintenance. We keep at most malloc_cache pages cached.
* If the cache is becoming full, unmap pages in the cache for real,
@@ -408,7 +428,7 @@ unmap(struct dir_info *d, void *p, size_
rsz = mopts.malloc_cache - d->free_regions_size;
if (psz > rsz)
tounmap = psz - rsz;
- offset = getrbyte();
+ offset = getrnibble();
for (i = 0; tounmap > 0 && i < mopts.malloc_cache; i++) {
r = &d->free_regions[(i + offset) & (mopts.malloc_cache - 1)];
if (r->p != NULL) {
@@ -489,7 +509,7 @@ map(struct dir_info *d, size_t sz, int z
/* zero fill not needed */
return p;
}
- offset = getrbyte();
+ offset = getrnibble();
for (i = 0; i < mopts.malloc_cache; i++) {
r = &d->free_regions[(i + offset) & (mopts.malloc_cache - 1)];
if (r->p != NULL) {
@@ -536,21 +556,6 @@ map(struct dir_info *d, size_t sz, int z
return p;
}

-static void
-rbytes_init(void)
-{
- arc4random_buf(rbytes, sizeof(rbytes));
- rbytesused = 0;
-}
-
-static u_char
-getrbyte(void)
-{
- if (rbytesused >= sizeof(rbytes))
- rbytes_init();
- return rbytes[rbytesused++];
-}
-
/*
* Initialize a dir_info, which should have been cleared by caller
*/
@@ -1010,7 +1015,7 @@ malloc_bytes(struct dir_info *d, size_t
}

/* advance a random # of positions */
- i = (getrbyte() & (MALLOC_DELAYED_CHUNKS - 1)) % bp->free;
+ i = getrnibble() % bp->free;
while (i > 0) {
u += u;
k++;
@@ -1273,7 +1278,7 @@ ofree(void *p)
if (mopts.malloc_junk && sz > 0)
memset(p, SOME_FREEJUNK, sz);
if (!mopts.malloc_freeprot) {
- i = getrbyte() & (MALLOC_DELAYED_CHUNKS - 1);
+ i = getrnibble();
tmp = p;
p = g_pool->delayed_chunks[i];
g_pool->delayed_chunks[i] = tmp;

Michael Shalayeff

unread,

Dec 2, 2009, 8:09:02 AM12/2/09

to

On Wed, Dec 02, 2009 at 10:38:10AM +0100, Otto Moerbeek wrote:
> Hi,
re

> apart from the random page addresses obtained form mmap(2) malloc(3)
> itself also randomizes cache en chunk operations. It uses a nibble of
> randomness per call, so optimize that to not waste half the random
> bits.

arc4 random does not use entropy and is very cheap
almost the speed of memory copy. "caching" arc4 output
like this is the same as running rot13 twice (:

--
paranoic mickey (my employers have changed but, the name has remained)

Otto Moerbeek

unread,

Dec 2, 2009, 8:25:57 AM12/2/09

to

On Wed, Dec 02, 2009 at 01:09:02PM +0000, Michael Shalayeff wrote:

> On Wed, Dec 02, 2009 at 10:38:10AM +0100, Otto Moerbeek wrote:
> > Hi,
> re
>
> > apart from the random page addresses obtained form mmap(2) malloc(3)
> > itself also randomizes cache en chunk operations. It uses a nibble of
> > randomness per call, so optimize that to not waste half the random
> > bits.
>
> arc4 random does not use entropy and is very cheap
> almost the speed of memory copy. "caching" arc4 output
> like this is the same as running rot13 twice (:

at least it saves on calls to arc4_stir() and thus on sysctl(2) and
getpid(2) calls.

-Otto

Otto Moerbeek

unread,

Dec 2, 2009, 8:44:21 AM12/2/09

to

On Wed, Dec 02, 2009 at 02:25:57PM +0100, Otto Moerbeek wrote:

> On Wed, Dec 02, 2009 at 01:09:02PM +0000, Michael Shalayeff wrote:
>
> > On Wed, Dec 02, 2009 at 10:38:10AM +0100, Otto Moerbeek wrote:
> > > Hi,
> > re
> >
> > > apart from the random page addresses obtained form mmap(2) malloc(3)
> > > itself also randomizes cache en chunk operations. It uses a nibble of
> > > randomness per call, so optimize that to not waste half the random
> > > bits.
> >
> > arc4 random does not use entropy and is very cheap
> > almost the speed of memory copy. "caching" arc4 output
> > like this is the same as running rot13 twice (:
>
> at least it saves on calls to arc4_stir() and thus on sysctl(2) and
> getpid(2) calls.

I was a bit puzzled by mickeys comment, remenbering that the kernel
arc4 does use entropy, and indeed rc4random_buf() in the kernel which
provides the random data for stirring the userland arc4random(3) calls
arc4maybeinit() which potentially consumes entropy via arc4_stir() and
get_random_bytes(). The the trigger to stir in the kernel is timer
based, not consumption based.

-Otto

Michael Shalayeff

unread,

Dec 2, 2009, 12:29:20 PM12/2/09

to

On Wed, Dec 02, 2009 at 02:25:57PM +0100, Otto Moerbeek wrote:
> On Wed, Dec 02, 2009 at 01:09:02PM +0000, Michael Shalayeff wrote:
>
> > On Wed, Dec 02, 2009 at 10:38:10AM +0100, Otto Moerbeek wrote:
> > > Hi,
> > re
> >
> > > apart from the random page addresses obtained form mmap(2) malloc(3)
> > > itself also randomizes cache en chunk operations. It uses a nibble of
> > > randomness per call, so optimize that to not waste half the random
> > > bits.
> >
> > arc4 random does not use entropy and is very cheap
> > almost the speed of memory copy. "caching" arc4 output
> > like this is the same as running rot13 twice (:
>
> at least it saves on calls to arc4_stir() and thus on sysctl(2) and
> getpid(2) calls.

yes but that does not affect entropy consumption anyhow (:

Michael Shalayeff

unread,

Dec 2, 2009, 12:36:13 PM12/2/09

to

On Wed, Dec 02, 2009 at 02:44:21PM +0100, Otto Moerbeek wrote:
> On Wed, Dec 02, 2009 at 02:25:57PM +0100, Otto Moerbeek wrote:
>
> > On Wed, Dec 02, 2009 at 01:09:02PM +0000, Michael Shalayeff wrote:
> >
> > > On Wed, Dec 02, 2009 at 10:38:10AM +0100, Otto Moerbeek wrote:
> > > > Hi,
> > > re
> > >
> > > > apart from the random page addresses obtained form mmap(2) malloc(3)
> > > > itself also randomizes cache en chunk operations. It uses a nibble of
> > > > randomness per call, so optimize that to not waste half the random
> > > > bits.
> > >
> > > arc4 random does not use entropy and is very cheap
> > > almost the speed of memory copy. "caching" arc4 output
> > > like this is the same as running rot13 twice (:
> >
> > at least it saves on calls to arc4_stir() and thus on sysctl(2) and
> > getpid(2) calls.
>

> I was a bit puzzled by mickeys comment, remenbering that the kernel
> arc4 does use entropy, and indeed rc4random_buf() in the kernel which
> provides the random data for stirring the userland arc4random(3) calls
> arc4maybeinit() which potentially consumes entropy via arc4_stir() and
> get_random_bytes(). The the trigger to stir in the kernel is timer
> based, not consumption based.

amount of data required to reinitialise the arc4 pool is
almost nothing compared to amounts of data produced.
the pool is 256 bytes and we produce 1.6MEGAbytes from it.
what program would call malloc more than 1.6 million times?
perhaps if it does it shall be fixed (:

of course savings on getpid() might be worth fighting for.
cu

Otto Moerbeek

unread,

Dec 14, 2009, 3:58:19 AM12/14/09

to

On Wed, Dec 02, 2009 at 10:38:10AM +0100, Otto Moerbeek wrote:

> Hi,
>

> apart from the random page addresses obtained form mmap(2) malloc(3)
> itself also randomizes cache en chunk operations. It uses a nibble of
> randomness per call, so optimize that to not waste half the random
> bits.
>

> Please test, should be a bit faster.

Anybody?

Ted Unangst

unread,

Dec 14, 2009, 7:44:31 AM12/14/09

to

On Mon, Dec 14, 2009 at 3:58 AM, Otto Moerbeek <ot...@drijf.net> wrote:
>> apart from the random page addresses obtained form mmap(2) malloc(3)
>> itself also randomizes cache en chunk operations. It uses a nibble of
>> randomness per call, so optimize that to not waste half the random
>> bits.
>>
>> Please test, should be a bit faster.

I don't really like this. It looks complicated (hard to judge). arc4
is fast enough I don't think we need to worry about conserving bits,
and I'd be surprised if it actually were faster considering how much
work you're doing per nibble compared to the work to just run arc4.
However fast it may be, I feel much safer with the current code.

Otto Moerbeek

unread,

Dec 14, 2009, 7:54:50 AM12/14/09

to

Every call to arc4random_buf() causes a getpid() call, which is a
syscall with all the overhead. This saves half of those.

-Otto

Otto Moerbeek

unread,

Dec 14, 2009, 7:58:34 AM12/14/09

to

Hit "send" too fast....

Yes, it is a bit more more complicated, but I think not too much.

-Otto

Theo de Raadt

unread,

Dec 14, 2009, 12:11:46 PM12/14/09

to

> > On Mon, Dec 14, 2009 at 3:58 AM, Otto Moerbeek <ot...@drijf.net> wrote:
> > >> apart from the random page addresses obtained form mmap(2) malloc(3)
> > >> itself also randomizes cache en chunk operations. It uses a nibble of
> > >> randomness per call, so optimize that to not waste half the random
> > >> bits.
> > >>
> > >> Please test, should be a bit faster.
> >
> > I don't really like this. It looks complicated (hard to judge). arc4
> > is fast enough I don't think we need to worry about conserving bits,
> > and I'd be surprised if it actually were faster considering how much
> > work you're doing per nibble compared to the work to just run arc4.
> > However fast it may be, I feel much safer with the current code.
>
> Every call to arc4random_buf() causes a getpid() call, which is a
> syscall with all the overhead. This saves half of those.

And that is why I suggested it in the first place.

I saw some sort of trace of a process.

Joachim Schipper

unread,

Dec 15, 2009, 5:34:25 AM12/15/09

to

On Mon, Dec 14, 2009 at 09:58:19AM +0100, Otto Moerbeek wrote:
> On Wed, Dec 02, 2009 at 10:38:10AM +0100, Otto Moerbeek wrote:

> > [A]part from the random page addresses obtained form mmap(2) malloc(3)

> > itself also randomizes cache en chunk operations. It uses a nibble of
> > randomness per call, so optimize that to not waste half the random
> > bits.
> >
> > Please test, should be a bit faster.
>

> Anybody?

I just built a recent checkout with and without this patch included. It
seems to be stable. Somewhat to my surprise, it also seems to be
measurably faster on my system.

Without patch:
$ gcc -Wall -O2 -DNDEBUG -o malloc-test malloc-test.c && time ./malloc-test && time ./malloc-test && time ./malloc-test && time ./malloc-test && time ./malloc-test && time ./malloc-test
0m32.11s real 0m28.88s user 0m2.60s system
0m32.05s real 0m28.22s user 0m3.17s system
0m44.20s real 0m40.11s user 0m3.36s system
0m32.25s real 0m28.52s user 0m3.40s system
0m31.57s real 0m28.41s user 0m3.02s system
0m31.74s real 0m28.36s user 0m2.98s system

With patch:
$ gcc -Wall -O2 -DNDEBUG -o malloc-test malloc-test.c && time ./malloc-test && time ./malloc-test && time ./malloc-test && time ./malloc-test && time ./malloc-test && time ./malloc-test
0m32.11s real 0m28.88s user 0m2.60s system
0m29.64s real 0m26.39s user 0m3.21s system
0m29.32s real 0m26.21s user 0m3.05s system
0m29.31s real 0m25.89s user 0m3.35s system
0m29.57s real 0m26.51s user 0m3.01s system
0m29.56s real 0m26.44s user 0m3.10s system
0m29.50s real 0m26.06s user 0m3.36s system

The program I used to test this is very much ad-hoc, does not use memory
in a realistic fashion, etc, so this is not a proper (hah!) benchmark.
But having some numbers may be better than nothing.

malloc-test.c:
#include <assert.h>
#include <err.h>
#include <stdlib.h>

int
main(void)
{
void *p[8][16 * 1024];
int i, j, k, l;

for (l = 0; l < 512; l++) {
for (i = 0; i < sizeof(p) / sizeof(*p); i++) {
for (j = 0; j < sizeof(p[i]) / sizeof(*p[i]); j++)
if ((p[i][j] = malloc(1 << i)) == NULL)
err(1, "malloc() failed");

for (k = i; k >= 0; k--) {
for (j = i; j < sizeof(p[i]) / sizeof(*p[i]); j += sizeof(p) / sizeof(*p)) {
assert(p[k][j] != NULL);
free(p[k][j]);
p[k][j] = NULL;
}
}
}

for (i = 0; i < sizeof(p) / sizeof(*p); i++)
for (j = 0; j < sizeof(p[i]) / sizeof(*p[i]); j++)
free(p[i][j]);
}

return 0;
}

For completeness, a dmesg may be found at
http://www.joachimschipper.nl/posts/20091215/dmesg.

Should anyone care, the program above may be used for any purpose
whatsoever, with or without attribution ("public domain").

Joachim

Otto Moerbeek

unread,

Dec 15, 2009, 3:45:17 PM12/15/09

to

On Mon, Dec 14, 2009 at 01:58:34PM +0100, Otto Moerbeek wrote:

> On Mon, Dec 14, 2009 at 01:54:50PM +0100, Otto Moerbeek wrote:
>
> > On Mon, Dec 14, 2009 at 07:44:31AM -0500, Ted Unangst wrote:
> >

> > > On Mon, Dec 14, 2009 at 3:58 AM, Otto Moerbeek <ot...@drijf.net> wrote:

> > > >> apart from the random page addresses obtained form mmap(2) malloc(3)

> > > >> itself also randomizes cache en chunk operations. It uses a nibble of
> > > >> randomness per call, so optimize that to not waste half the random
> > > >> bits.
> > > >>
> > > >> Please test, should be a bit faster.
> > >

> > > I don't really like this. It looks complicated (hard to judge). arc4
> > > is fast enough I don't think we need to worry about conserving bits,
> > > and I'd be surprised if it actually were faster considering how much
> > > work you're doing per nibble compared to the work to just run arc4.
> > > However fast it may be, I feel much safer with the current code.
> >
> > Every call to arc4random_buf() causes a getpid() call, which is a
> > syscall with all the overhead. This saves half of those.
>

> Hit "send" too fast....
>
> Yes, it is a bit more more complicated, but I think not too much.
>
> -Otto

This is an alternative version from guenther@, which is a bit simpler
and generates samller code (as seen on i386). I intend to commit this
soon, the numbers from Joachim Schipper show a small, but significant
speedup.

-Otto

Index: malloc.c
===================================================================
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v

retrieving revision 1.122
diff -u -p -r1.122 malloc.c
--- malloc.c 7 Dec 2009 18:47:38 -0000 1.122
+++ malloc.c 15 Dec 2009 19:15:33 -0000
@@ -66,7 +66,7 @@

#define MALLOC_MAXCHUNK (1 << (MALLOC_PAGESHIFT-1))
#define MALLOC_MAXCACHE 256
-#define MALLOC_DELAYED_CHUNKS 16 /* should be power of 2 */
+#define MALLOC_DELAYED_CHUNKS 15 /* max of getrnibble() */
/*
* When the P option is active, we move allocations between half a page
* and a whole page towards the end, subject to alignment constraints.

@@ -112,7 +112,7 @@ struct dir_info {

/* free pages cache */
struct region_info free_regions[MALLOC_MAXCACHE];
/* delayed free chunk slots */
- void *delayed_chunks[MALLOC_DELAYED_CHUNKS];
+ void *delayed_chunks[MALLOC_DELAYED_CHUNKS + 1];
#ifdef MALLOC_STATS
size_t inserts;
size_t insert_collisions;

@@ -185,9 +185,9 @@ static int malloc_active; /* status of

static size_t malloc_guarded; /* bytes used for guards */
static size_t malloc_used; /* bytes allocated */

-static size_t rbytesused; /* random bytes used */
+static size_t rnibblesused; /* random nibbles used */
static u_char rbytes[512]; /* random bytes */
-static u_char getrbyte(void);
+static u_char getrnibble(void);

extern char *__progname;

@@ -380,6 +380,24 @@ wrterror(char *p)

abort();
}

+static void
+rbytes_init(void)
+{
+ arc4random_buf(rbytes, sizeof(rbytes));
+ rnibblesused = 0;
+}
+
+static inline u_char
+getrnibble(void)
+{
+ u_char x;
+
+ if (rnibblesused >= 2 * sizeof(rbytes))
+ rbytes_init();

+ x = rbytes[rnibblesused++ / 2];
+ return (rnibblesused & 1 ? x & 0xf : x >> 4);

+}
+
/*
* Cache maintenance. We keep at most malloc_cache pages cached.
* If the cache is becoming full, unmap pages in the cache for real,

@@ -410,7 +428,7 @@ unmap(struct dir_info *d, void *p, size_

rsz = mopts.malloc_cache - d->free_regions_size;
if (psz > rsz)
tounmap = psz - rsz;
- offset = getrbyte();
+ offset = getrnibble();
for (i = 0; tounmap > 0 && i < mopts.malloc_cache; i++) {
r = &d->free_regions[(i + offset) & (mopts.malloc_cache - 1)];
if (r->p != NULL) {

@@ -491,7 +509,7 @@ map(struct dir_info *d, size_t sz, int z

/* zero fill not needed */
return p;
}
- offset = getrbyte();
+ offset = getrnibble();

for (i = 0; i < mopts.malloc_cache; i++) {

r = &d->free_regions[(i + offset) & (mopts.malloc_cache - 1)];
if (r->p != NULL) {

@@ -538,21 +556,6 @@ map(struct dir_info *d, size_t sz, int z

return p;
}

-static void
-rbytes_init(void)
-{
- arc4random_buf(rbytes, sizeof(rbytes));
- rbytesused = 0;
-}
-
-static u_char
-getrbyte(void)
-{
- if (rbytesused >= sizeof(rbytes))
- rbytes_init();
- return rbytes[rbytesused++];
-}
-
/*
* Initialize a dir_info, which should have been cleared by caller
*/

@@ -1012,7 +1015,7 @@ malloc_bytes(struct dir_info *d, size_t

}

/* advance a random # of positions */
- i = (getrbyte() & (MALLOC_DELAYED_CHUNKS - 1)) % bp->free;
+ i = getrnibble() % bp->free;
while (i > 0) {
u += u;
k++;

@@ -1275,7 +1278,7 @@ ofree(void *p)