count of bits in a long

Peter Miller

unread,

Oct 4, 1990, 10:17:33 PM10/4/90

to

/*
* A while back I posted a solution to the "count of bits set in a
* long" question, suggesting that HACKMEM 169 was a very fast way to do it.
* This program demonstrates the technique.
*
* The HACKMEM 169 method uses a modulo to sum the digits.
* A number of people have asked me "how can a division be faster than a loop?"
* This program demonstrates how. The modulo is a special case,
* and can be unwound into a tight loop if your machine does not
* have a fast divide opcode (although many modern machines have a 1 cycle
* divide opcode, except for we-really-believe-our-press-hype risc
* manufacturers).
*
* This program was run on a 386/20 under unix with the following results:
* Func sec/iter scaled
* nbits 0.00018310547 1.000 hackmem 169
* nbits1 0.0006980896 3.812 hackmem 169 no % operator
* nbits2 0.0016822815 9.188 count lsb's loop
* nbits3 0.0037384033 20.417
* nbits4 0.0034370422 18.771
* nbits5 0.0037765503 20.625
* nbits6 0.0035247803 19.250
*/

#include <stdio.h>
#include <sys/types.h>
#include <sys/times.h>

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))

/*
* This function is used to calibrate the timing mechanism.
* This way we can subtract the loop and call overheads.
*/
int
calibrate(n)
register unsigned long n;
{
return 0;
}

/*
* This function counts the number of bits in a long.
* It is limited to 63 bit longs, but a minor mod can cope with 511 bits.
*
* It is so magic, an explanation is required:
* Consider a 3 bit number as being
* 4a+2b+c
* if we shift it right 1 bit, we have
* 2a+b
* subtracting this from the original gives
* 2a+b+c
* if we shift the original 2 bits right we get
* a
* and so with another subtraction we have
* a+b+c
* which is the number of bits in the original number.
* Suitable masking allows the sums of the octal digits in a
* 32 bit bumber to appear in each octal digit. This isn't much help
* unless we can get all of them summed together.
* This can be done by modulo arithmetic (sum the digits in a number by molulo
* the base of the number minus one) the old "cating out nines" trick they
* taught in school before calculators were invented.
* Now, using mod 7 wont help us, because our number will very likely have
* more than 7 bits set. So add the octal digits together to get base64
* digits, and use modulo 63.
* (Those of you with 64 bit machines need to add 3 octal digits together to
* get base512 digits, and use mod 511.)
*
* This is HACKMEM 169, as used in X11 sources.
*/

int
nbits(n)
register unsigned long n;
{
register unsigned long tmp;

tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}

/*
* This is the same as the above, but does not use the % operator.
* Most modern machines have clockless division, and so the modulo is as
* expensive as, say, an addition.
*/
int
nbits1(n)
register unsigned long n;
{
register unsigned long tmp;

tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
tmp = (tmp + (tmp >> 3)) & 030707070707;
while (tmp > 63)
tmp = (tmp & 63) + (tmp >> 6);
return tmp;
}

/*
* This function counts the bits in a long.
*
* It removes the lsb and counting the number of times round the loop.
* The expression (n & -n) yields the lsb of a number,
* but it only works on 2's compliment machines.
*/
int
nbits2(n)
register unsigned long n;
{
register int count;

for (count = 0; n; n -= (n & -n))
count++;
return count;
}

/*
* This function counts the bits in a long.
*
* It works by shifting the number left and testing the top bit.
* On many machines shift is expensive, so it uses a cheap addition instead.
*/
int
nbits3(n)
register unsigned long n;
{
register int count;

for (count = 0; n; n += n)
if (n & ~(~(unsigned long)0 >> 1))
count++;
return count;
}

/*
* This function counts the bits in a long.
*
* It works by shifting the number down and testing the bottom bit.
*/
int
nbits4(n)
register unsigned long n;
{
register int count;

for (count = 0; n; n >>= 1)
if (n & 1)
count++;
return count;
}

/*
* This function counts the bits in a long.
*
* It works by masking each bit.
* This is the second most intuitively obvious method,
* and is independent of the number of bits in the long.
*/
int
nbits5(n)
register unsigned long n;
{
register int count;
register unsigned long mask;

count = 0;
for (mask = 1; mask; mask += mask)
if (n & mask)
count++;
return count;
}

/*
* This function counts the bits in a long.
*
* It works by masking each bit.
* This is the most intuitively obvious method,
* but how do you a priori know how many bits in the long?
* (except for ''sizeof(long) * CHAR_BITS'' expression)
*/
int
nbits6(n)
register unsigned long n;
{
register int count;
register int bit;

count = 0;
for (bit = 0; bit < 32; ++bit)
if (n & ((unsigned long)1 << bit))
count++;
return count;
}

/*
* build a long random number.
* The standard rand() returns a 15bit number.
*/
unsigned long
bigrand()
{
return
(
((unsigned long)(rand() & 0xFFF) << 24)
|
((unsigned long)(rand() & 0xFFF) << 12)
|
(unsigned long)(rand() & 0xFFF)
);
}

/*
* Run the test many times.
* You will need a running time in the 10s of seconds
* for an accurate result.
*
* To give a fair comparison, the random number generator
* is seeded the same for each measurement.
*
* Return value is seconds per iteration.
*/
#define REPEAT (1L<<18)

double
measure(func)
int (*func)();
{
long j;
struct tms start;
struct tms finish;

srand(1);
times(&start);
for (j = 0; j < REPEAT; ++j)
func(bigrand());
times(&finish);
return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);
}

struct table
{
char *name;
int (*func)();
double measurement;
int trial;
};

struct table table[] =
{
{ "nbits", nbits },
{ "nbits1", nbits1 },
{ "nbits2", nbits2 },
{ "nbits3", nbits3 },
{ "nbits4", nbits4 },
{ "nbits5", nbits3 },
{ "nbits6", nbits4 },
};

main()
{
double calibration;
double best;
int j, k, m;

/*
* run a few tests to make sure they all agree
*/
srand(getpid());
for (j = 0; j < 10; ++j)
{
unsigned long n;
int bad;

n = bigrand();
for (k = 0; k < SIZEOF(table); ++k)
table[k].trial = table[k].func(n);
bad = 0;
for (k = 0; k < SIZEOF(table); ++k)
for (m = 0; m < SIZEOF(table); ++m)
if (table[k].trial != table[m].trial)
bad = 1;
if (bad)
{
printf("wrong:\t%08lX", n);
for (k = 0; k < SIZEOF(table); ++k)
printf("\t%d", table[k].trial);
printf("\n");
}
}

/*
* calibrate the timing mechanism
*/
calibration = measure(calibrate);

/*
* time them all, keeping track of the best (smallest)
*/
for (j = 0; j < SIZEOF(table); ++j)
{
table[j].measurement = measure(table[j].func) - calibration;
if (!j || table[j].measurement < best)
best = table[j].measurement;
}
for (j = 0; j < SIZEOF(table); ++j)
{
printf
(
"%s\t%10.8g\t%6.3f\n",
table[j].name,
table[j].measurement,
table[j].measurement / best
);
}
exit(0);
}

Regards
Peter Miller UUCP uunet!munnari!pdact.pd.necisa.oz!pmiller
ARPA pmiller%pdact.pd.necisa.oz@australia
/\/\* CSNET pmiller%pdact.pd.necisa.oz@au
ACSnet pmi...@pdact.pd.necisa.oz
Disclaimer: The views expressed here are personal and do not necessarily
reflect the view of my employer or the views or my colleagues.
D

Chris Torek

unread,

Oct 6, 1990, 8:12:52 PM10/6/90

to

In article <8...@neccan.oz> pe...@neccan.oz (Peter Miller) sends out a
program to time various bit-counting functions. I decided to `improve'
the program a bit :-) and add a number of additional functions. I have
run the resulting program (which I will append below) on a number of
architectures. The `mod' version is never the fastest of the hackmem
variants. The fastest function is always table lookup, except on the
Sun-4.

Here are his results, for an i386 box:

> * Func sec/iter scaled
> * nbits 0.00018310547 1.000 hackmem 169
> * nbits1 0.0006980896 3.812 hackmem 169 no % operator
> * nbits2 0.0016822815 9.188 count lsb's loop
> * nbits3 0.0037384033 20.417
> * nbits4 0.0034370422 18.771
> * nbits5 0.0037765503 20.625
> * nbits6 0.0035247803 19.250

(I find the last two results particularly interesting since his code
as posted uses functions nbits3 and nbits4 to calculate the times
printed for nbits5 and nbits6, never evaluating 5 and 6 at all.)

Here is a short key to names:
my name (Peter's name) technique used
---------------------- --------------
hackmemmod (nbits) HACKMEM 169, using % operator
hackmemloop (nbits1) HACKMEM 169, using loop to implement %
hackmemunrolled (-) HACKMEM 169, with 5-step % (loop unrolled)
rmlsbsub (nbits2) remove lsb with `n -= (n & -n)'
rmlsbmask (-) remove lsb with `n &= n - 1'
testlsb (nbits4) test n&1, then n>>=1
testmsb (nbits3) test n&MSB, then n+=n (rather than n<<=1)
testsignandshift (-) test n<0, then n<<=1
testeachbit (nbits5, untimed) test n&mask, then mask+=mask
testeachbit1shl (nbits6, ") test n&(1<<bit) for bit in [0..32)
tableshift nbits[n>>24] + nbits[(n>>16)&255] ...
tableuchar set p=&n, nbits[p[0]] + nbits[p[1]] ...

The results:
--------------------------------------------------
DECstation 3100 (MIPS R2000, slow memory system)
function time ratio
hackmemmod 3.1029682e-06 3.179
hackmemloop 2.4697094e-06 2.531
hackmemunrolled 1.7693996e-06 1.813
rmlsbsub 4.8127670e-06 4.931
rmlsbmask 3.8405285e-06 3.935
testlsb 1.0221542e-05 10.473
testmsb 1.0899502e-05 11.168
testsignandshift 1.0780300e-05 11.046
testeachbit 1.0843626e-05 11.111
testeachbit1shl 1.6695683e-05 17.107
tableshift 1.0467396e-06 1.073
tableuchar 9.7596359e-07 1.000

--------------------------------------------------
Encore Multimax (NS32532, proprietary bus)
function time ratio
hackmemmod 6.3739853e-06 3.618
hackmemloop 3.8458633e-06 2.183
hackmemunrolled 6.4167595e-06 3.642
rmlsbsub 9.3918610e-06 5.330
rmlsbmask 9.4905777e-06 5.386
testlsb 2.4036179e-05 13.642
testmsb 2.3262550e-05 13.203
testsignandshift 2.1452625e-05 12.176
testeachbit 2.5112068e-05 14.253
testeachbit1shl 2.8207516e-05 16.010
tableshift 2.2915077e-06 1.301
tableuchar 1.7619209e-06 1.000

--------------------------------------------------
Sun-3/something (68020, ?vmebus memory?)

function time ratio
hackmemmod 0.00077819824 1.821
hackmemloop 0.00059890747 1.402
hackmemunrolled 0.00050354004 1.179
rmlsbsub 0.00069808960 1.634
rmlsbmask 0.00055313110 1.295
testlsb 0.00107955930 2.527
testmsb 0.00251007080 5.875
testsignandshift 0.00242614750 5.679
testeachbit 0.00254821780 5.964
testeachbit1shl 0.00362014770 8.473
tableshift 0.00083160400 1.946
tableuchar 0.00042724609 1.000

--------------------------------------------------
SparcStation 1+ (SPARC, private memory bus)
These times look a bit odd, and suggest that the compiler
could use some work.
function time ratio
hackmemmod 1.4972687e-06 12.077
hackmemloop 7.3432922e-07 5.923
hackmemunrolled 1.1825562e-06 9.538
rmlsbsub 1.3351440e-07 1.077
rmlsbmask 1.4305115e-07 1.154
testlsb 1.4305115e-07 1.154
testmsb 1.3351440e-07 1.077
testsignandshift 1.2397766e-07 1.000
testeachbit 6.6280365e-06 53.462
testeachbit1shl 9.2029572e-06 74.231
tableshift 7.2479248e-07 5.846
tableuchar 1.1539459e-06 9.308

--------------------------------------------------
VAX 8600 with gcc (ECL gate array, private memory bus)
function time ratio
hackmemmod 1.1291504e-05 4.290
hackmemloop 1.0757446e-05 4.087
hackmemunrolled 8.8500977e-06 3.362
rmlsbsub 1.7890930e-05 6.797
rmlsbmask 1.4801025e-05 5.623
testlsb 5.2032471e-05 19.768
testmsb 3.5400391e-05 13.449
testsignandshift 2.7008057e-05 10.261
testeachbit 3.0670166e-05 11.652
testeachbit1shl 4.7760010e-05 18.145
tableshift 5.3405762e-06 2.029
tableuchar 2.6321411e-06 1.000

--------------------------------------------------
VAX 8600 with Berkeley cc
function time ratio
hackmemmod 8.1634521e-06 3.292
hackmemloop 6.9808960e-06 2.815
hackmemunrolled 1.3885498e-05 5.600
rmlsbsub 6.2942505e-06 2.538
rmlsbmask 5.9890747e-06 2.415
testlsb 1.5754700e-05 6.354
testmsb 3.9138794e-05 15.785
testsignandshift 3.1127930e-05 12.554
testeachbit 4.1275024e-05 16.646
testeachbit1shl 6.0615540e-05 24.446
tableshift 5.9890747e-06 2.415
tableuchar 2.4795532e-06 1.000

--------------------------------------------------

There are some noteworthy points above, particularly the example
with the 8600, where the choice of compiler makes a great deal of
difference. The `testlsb' loop runs much faster under gcc, which
eliminated one `tstl' instruction, yet some of the other loops
run more slowly.

So that others can run their own tests, here is the program. I
added several functions, renamed them all, changed the timing
mechanism on BSD machines, and reformatted things a bit. I changed
the random number selection mechanism to be relatively machine
independent (assuming they all have the same `rand' function) and
to use values that are `more random'. I also changed the output
format (among other things, it uses `%#g'; the Sun SparcStation
printf does not handle this correctly, and I added trailing zeroes
manually above). Oh yes, I also had to make it run longer on the
MIPS and Sparc machines to get reliable times.

(You can tell which functions I added, as they are generally
uncommented. :-) )

--------------------------------------------------
#include <stdio.h>
#include <sys/types.h>

#ifdef FD_SETSIZE
# define USE_GETRUSAGE
# include <sys/time.h>
# include <sys/resource.h>
#else
# include <sys/times.h>
#endif

t0_hackmemmod(n)

register unsigned long n;
{
register unsigned long tmp;

tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}

/*
* This is the same as the above, but does not use the % operator.
* Most modern machines have clockless division, and so the modulo is as
* expensive as, say, an addition.
*/
int

t1_hackmemloop(n)

register unsigned long n;
{
register unsigned long tmp;

tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
tmp = (tmp + (tmp >> 3)) & 030707070707;
while (tmp > 63)
tmp = (tmp & 63) + (tmp >> 6);
return tmp;
}

/*
* Above, without using while loop. It takes at most 5 iterations of the
* loop, so we just do all 5 in-line. The final result is never 63
* (this is assumed above as well).
*/
int
t2_hackmemunrolled(n)

register unsigned long n;
{
register unsigned long tmp;

tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
tmp = (tmp + (tmp >> 3)) & 030707070707;

tmp = (tmp & 63) + (tmp >> 6);
tmp = (tmp & 63) + (tmp >> 6);
tmp = (tmp & 63) + (tmp >> 6);
tmp = (tmp & 63) + (tmp >> 6);
tmp = (tmp & 63) + (tmp >> 6);

return (tmp);
}

/*
* This function counts the bits in a long.
*
* It removes the lsb and counting the number of times round the loop.
* The expression (n & -n) yields the lsb of a number,
* but it only works on 2's compliment machines.
*/
int

t3_rmlsbsub(n)

register unsigned long n;
{
register int count;

for (count = 0; n; n -= (n & -n))
count++;
return count;
}

int
t4_rmlsbmask(n)

register unsigned long n;
{
register int count;

for (count = 0; n; count++)
n &= n - 1; /* take away lsb */
return (count);
}

/*
* This function counts the bits in a long.
*
* It works by shifting the number down and testing the bottom bit.
*/
int

t5_testlsb(n)

register unsigned long n;
{
register int count;

for (count = 0; n; n >>= 1)
if (n & 1)
count++;
return count;
}

/*
* This function counts the bits in a long.
*

* It works by shifting the number left and testing the top bit.
* On many machines shift is expensive, so it uses a cheap addition instead.
*/
int

t6_testmsb(n)

register unsigned long n;
{
register int count;

for (count = 0; n; n += n)
if (n & ~(~(unsigned long)0 >> 1))
count++;
return count;
}

int
t7_testsignandshift(n)

register unsigned long n;
{
register int count;

for (count = 0; n; n <<= 1)

if ((long)n < 0)
count++;
return (count);
}

/*
* This function counts the bits in a long.
*
* It works by masking each bit.
* This is the second most intuitively obvious method,
* and is independent of the number of bits in the long.
*/
int

t8_testeachbit(n)

register unsigned long n;
{
register int count;
register unsigned long mask;

count = 0;
for (mask = 1; mask; mask += mask)
if (n & mask)
count++;
return count;
}

/*
* This function counts the bits in a long.
*
* It works by masking each bit.
* This is the most intuitively obvious method,
* but how do you a priori know how many bits in the long?
* (except for ''sizeof(long) * CHAR_BITS'' expression)
*/
int

t9_testeachbit1shl(n)

register unsigned long n;
{
register int count;
register int bit;

count = 0;
for (bit = 0; bit < 32; ++bit)
if (n & ((unsigned long)1 << bit))
count++;
return count;
}

static char nbits[256] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

int
tA_tableshift(n)
register unsigned long n;
{

return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] +
nbits[(n >> 16) & 0xff] + nbits[n >> 24]);
}

int
tB_tableuchar(n)
unsigned long n;
{
register unsigned char *p = (unsigned char *)&n;

return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]);
}

/*
* build a long random number.

* The standard rand() returns at least a 15 bit number.
* We use the top 9 of 15 (since the lower N bits of the usual rand()
* repeat with a period of 2^N).
*/
unsigned long
bigrand()
{
#define randbits() ((unsigned long)((rand() >> 6) & 0777))
register int r;

r = randbits() << 27;
r |= randbits() << 18;
r |= randbits() << 9;
r |= randbits();
}

/*
* Run the test many times.
* You will need a running time in the 10s of seconds
* for an accurate result.
*
* To give a fair comparison, the random number generator
* is seeded the same for each measurement.
*
* Return value is seconds per iteration.
*/

#if defined(mips) || defined(sparc)
#define REPEAT (1L<<20)
#else
#define REPEAT (1L<<18)
#endif

double
measure(func)
register int (*func)();
{
#ifdef USE_GETRUSAGE
struct rusage ru0, ru1;
register long j;

srand(1);
(void) getrusage(RUSAGE_SELF, &ru0);

for (j = 0; j < REPEAT; ++j)
func(bigrand());

(void) getrusage(RUSAGE_SELF, &ru1);
ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec;
if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) {
ru1.ru_utime.tv_usec += 1000000;
ru1.ru_utime.tv_sec--;
}
return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) /
(double)REPEAT);
#else
register long j;

struct tms start;
struct tms finish;

srand(1);
times(&start);
for (j = 0; j < REPEAT; ++j)
func(bigrand());
times(&finish);
return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);

#endif
}

struct table {
char *name;
int (*func)();
double measurement;
int trial;
};

struct table table[] =
{
{ "hackmemmod", t0_hackmemmod },
{ "hackmemloop", t1_hackmemloop },
{ "hackmemunrolled", t2_hackmemunrolled },
{ "rmlsbsub", t3_rmlsbsub },
{ "rmlsbmask", t4_rmlsbmask },
{ "testlsb", t5_testlsb },
{ "testmsb", t6_testmsb },
{ "testsignandshift", t7_testsignandshift },
{ "testeachbit", t8_testeachbit },
{ "testeachbit1shl", t9_testeachbit1shl },
{ "tableshift", tA_tableshift },
{ "tableuchar", tB_tableuchar },
};

main(argc, argv)
int argc;
char **argv;
{
double calibration, v, best;
int j, k, m, verbose;

verbose = argc > 1;

/*
* run a few tests to make sure they all agree
*/
srand(getpid());
for (j = 0; j < 10; ++j) {
unsigned long n;
int bad;

n = bigrand();
for (k = 0; k < SIZEOF(table); ++k)
table[k].trial = table[k].func(n);
bad = 0;
for (k = 0; k < SIZEOF(table); ++k)
for (m = 0; m < SIZEOF(table); ++m)
if (table[k].trial != table[m].trial)
bad = 1;
if (bad) {

printf("wrong: %08lX", n);

for (k = 0; k < SIZEOF(table); ++k)

printf(" %3d", table[k].trial);
printf("\n");
}
}

/*
* calibrate the timing mechanism
*/
calibration = measure(calibrate);

if (verbose)
printf("calibration: %g\n", calibration);

/*
* time them all, keeping track of the best (smallest)
*/
for (j = 0; j < SIZEOF(table); ++j) {

v = measure(table[j].func) - calibration;
if (verbose)
printf("%s: %g\n", table[j].name, v);
table[j].measurement = v;
if (!j || v < best)
best = v;
}
(void) printf("%-24s %-14sratio\n", "function", "time");

for (j = 0; j < SIZEOF(table); ++j) {

(void) printf("%-24s %#10.8g %6.3f\n",

table[j].name,
table[j].measurement,
table[j].measurement / best);
}
exit(0);
}

--
In-Real-Life: Chris Torek, Univ of MD Comp Sci Dept (+1 301 405 2750)
Domain: ch...@cs.umd.edu Path: uunet!mimsy!chris

Gene H. Olson

unread,

Oct 12, 1990, 11:41:37 AM10/12/90

to

pe...@neccan.oz (Peter Miller) writes:
>/*
> * A while back I posted a solution to the "count of bits set in a
> * long" question, suggesting that HACKMEM 169 was a very fast way to do it.
> * This program demonstrates the technique.
> *
> * The HACKMEM 169 method uses a modulo to sum the digits.
> * A number of people have asked me "how can a division be faster than a loop?"
> * This program demonstrates how. The modulo is a special case,
> * and can be unwound into a tight loop if your machine does not
> * have a fast divide opcode (although many modern machines have a 1 cycle
> * divide opcode, except for we-really-believe-our-press-hype risc
> * manufacturers).
> *
> * This program was run on a 386/20 under unix with the following results:
> * Func sec/iter scaled
> * nbits 0.00018310547 1.000 hackmem 169
> * nbits1 0.0006980896 3.812 hackmem 169 no % operator
> * nbits2 0.0016822815 9.188 count lsb's loop
> * nbits3 0.0037384033 20.417
> * nbits4 0.0034370422 18.771
> * nbits5 0.0037765503 20.625
> * nbits6 0.0035247803 19.250
> */

> ..... A program follows this.

To this program I have added a bug fix (the function table
was wrong) and another function (nbit7) which is MY FAVORITE
WAY of counting bits. The patches are at the end of this
posting....

Anyway, I got much better results on both the 386 (which
has a multiply) and on a SPARC (which doesn't) than any of
the functions in the original program.

The results were:

386 SPARC
------------------------------ ------------------------------
nbits 0.00020980835 1.774 nbits 0.00096893311 15.875
nbits1 0.00038909912 3.290 nbits1 0.00012969971 2.125
nbits2 0.00075149536 6.355 nbits2 0.00031280518 5.125
nbits3 0.0015411377 13.032 nbits3 0.00067138672 11.000
nbits4 0.0016479492 13.935 nbits4 0.00065231323 10.688
nbits5 0.0015525818 13.129 nbits5 0.00067520142 11.062
nbits6 0.0021286011 18.000 nbits6 0.00086212158 14.125
nbits7 0.00011825562 1.000 nbits7 6.1035156e-05 1.000

The nbit7 technique is a simple routine which shifts and sums
bits in place until all bits have summed in the lower 8 bits
of the word. The implementation here requires 32 bit longs,
but it is easily extensible to 64 bits with a mask change and
the additional shift shown.

Enjoy,

------------- Remainder of Posting is a Cdiff Patchfile ----------

*** orig.c Fri Oct 12 08:56:22 1990
--- gene.c Fri Oct 12 09:23:51 1990
***************
*** 215,220 ****
--- 215,236 ----
return count;
}

+ int
+ nbits7(n)
+ register unsigned long n ;
+ {
+ n = ((n >> 1) & 0x55555555) + (n & 0x55555555) ;
+ n = ((n >> 2) & 0x33333333) + (n & 0x33333333) ;
+ n = ((n >> 4) + n) & 0x0F0F0F0F ;
+ n = ((n >> 8) + n) ;
+ #if LONG64
+ n = (n >> 16) + n ;
+ return ((n >> 32) + n) & 0xFF ;
+ #else
+ return ((n >> 16) + n) & 0xFF ;
+ #endif
+ }
+

/*
* build a long random number.
* The standard rand() returns a 15bit number.

***************
*** 275,282 ****

{ "nbits2", nbits2 },
{ "nbits3", nbits3 },
{ "nbits4", nbits4 },

! { "nbits5", nbits3 },
! { "nbits6", nbits4 },
};

main()
--- 291,299 ----

{ "nbits2", nbits2 },
{ "nbits3", nbits3 },
{ "nbits4", nbits4 },

! { "nbits5", nbits5 },
! { "nbits6", nbits6 },
! { "nbits7", nbits7 },
};

main()
_________________________________________________________________________
__
/ ) Gene H. Olson uunet!digibd!zeno!gene
/ __ _ __ _ Smartware Consulting
(__/ _(/_//_(/_ Minneapolis, MN (612) 824-9108

Tim W Smith

unread,

Oct 13, 1990, 8:53:47 AM10/13/90

to

These times were given in a recent post for some
68020 based Sun:

function time ratio
hackmemmod 0.00077819824 1.821
hackmemloop 0.00059890747 1.402
hackmemunrolled 0.00050354004 1.179
rmlsbsub 0.00069808960 1.634
rmlsbmask 0.00055313110 1.295
testlsb 0.00107955930 2.527
testmsb 0.00251007080 5.875
testsignandshift 0.00242614750 5.679
testeachbit 0.00254821780 5.964
testeachbit1shl 0.00362014770 8.473
tableshift 0.00083160400 1.946
tableuchar 0.00042724609 1.000

Something is seriously wrong with either these numbers or that
computer. I've tried a couple of these on my Mac II, using
Think C. Note that 1) Apple is not noted for squeezing the
last drop of performance out of the 68020, and 2) Think C
has not stressed optimal code generation.

I get 36 usec for rmlsub on longs with all 32 bits set, 20
usec if 16 bits are set, and 5 usec of no bits are set.
Note that the Sun is getting, in the average case for rmlsub,
20 times worse performance than I am getting for the worst
case.

Tim Smith

Chris Torek

unread,

Oct 13, 1990, 9:37:49 AM10/13/90

to

In article <26...@mimsy.umd.edu> I posted a modified version of
Peter Miller's bit-count timing program. Unfortunately, my version
had a bug that effectively makes the results worthless (depending
on compiler vagaries; it `just happens' to compile to working code
on some systems).

Remarkably few people noted the bug (or rather, sent me mail about it).
Two people sent me additional functions, however. Arthur Olson added
four (all variants on table lookup); Mike Weaver added one. Gene Olson
has also posted a different modified version of Peter Miller's original
program; this contains a better version of the latter.

So, here are new results for the new set of (now 17) functions on the
same set of machines, along with the new program. Noteworthy facts:

- hackmem invariably loses to some other technique, always including
Gene Olson's, though sometimes not by much;
- the fastest is usually (but not always) one of the table lookups;
- tiny differences in ratios (less than .1) are usually insignificant.

name technique used
---- --------------
hackmemmod HACKMEM 169, using % operator
hackmemloop HACKMEM 169, using loop to implement %
hackmemunrolled HACKMEM 169, with 5-step % (loop unrolled)
rmlsbsub remove lsb with `n -= (n & -n)'
rmlsbmask remove lsb with `n &= n - 1'
testlsb test n&1, then n>>=1
testmsb test n&MSB, then n+=n (rather than n<<=1)
testsignandshift test n<0, then n<<=1
testeachbit test n&mask, then mask+=mask
testeachbit1shl test n&(1<<bit) for bit in [0..32)

tableshift nbits[n>>24] + nbits[(n>>16)&255] ...
tableuchar set p=&n, nbits[p[0]] + nbits[p[1]] ...

tableshiftcast nbits[n>>24] + nbits[(u_char)(n>>16)] ...
itableshift same as tableshift, but table datatype is int
itableuchar ditto
itableshiftcast ditto (note, nbits table is `char')
sumbits Gene Olson's function (see code for comments)

------------------------------------------------------------
Results:

::::::::::::::
time.ds3100
::::::::::::::
function time ratio
hackmemmod 3.0992432e-06 3.250
hackmemloop 2.5330353e-06 2.656
hackmemunrolled 1.7619495e-06 1.848
rmlsbsub 4.9207935e-06 5.160
rmlsbmask 3.9522800e-06 4.145
testlsb 1.0545622e-05 11.059
testmsb 1.0608948e-05 11.125
testsignandshift 1.0497196e-05 11.008
testeachbit 1.0839901e-05 11.367
testeachbit1shl 1.6718033e-05 17.531
tableshift 1.0243893e-06 1.074
tableuchar 9.5361328e-07 1.000
tableshiftcast 1.1063404e-06 1.160
itableshift 1.2814178e-06 1.344
itableuchar 1.2031918e-06 1.262
itableshiftcast 1.3223934e-06 1.387
sumbits 1.2106419e-06 1.270
::::::::::::::
time.encore
::::::::::::::
function time ratio
hackmemmod 8.0387573e-06 4.522
hackmemloop 7.2727394e-06 4.091
hackmemunrolled 4.3494453e-06 2.447
rmlsbsub 1.0656597e-05 5.994
rmlsbmask 1.1298252e-05 6.355
testlsb 3.1122246e-05 17.506
testmsb 2.2745319e-05 12.794
testsignandshift 1.9783443e-05 11.128
testeachbit 2.3917282e-05 13.454
testeachbit1shl 2.9880619e-05 16.808
tableshift 3.9419632e-06 2.217
tableuchar 2.6934662e-06 1.515
tableshiftcast 2.3953590e-06 1.347
itableshift 3.0450325e-06 1.713
itableuchar 1.7777634e-06 1.000
itableshiftcast 2.1755104e-06 1.224
sumbits 1.9636650e-06 1.105
::::::::::::::
time.sun3
::::::::::::::
function time ratio
hackmemmod 1.0604858e-05 2.106
hackmemloop 1.2664795e-05 2.515
hackmemunrolled 1.0833740e-05 2.152
rmlsbsub 2.2430420e-05 4.455
rmlsbmask 1.7318726e-05 3.439
testlsb 4.3182373e-05 8.576
testmsb 3.8909912e-05 7.727
testsignandshift 4.0664673e-05 8.076
testeachbit 4.4479370e-05 8.833
testeachbit1shl 5.9432983e-05 11.803
tableshift 9.9945068e-06 1.985
tableuchar 1.0910034e-05 2.167
tableshiftcast 1.4877319e-05 2.955
itableshift 7.9345703e-06 1.576
itableuchar 1.1672974e-05 2.318
itableshiftcast 1.1520386e-05 2.288
sumbits 5.0354004e-06 1.000
::::::::::::::
time.sun4
::::::::::::::
function time ratio
hackmemmod 1.3427734e-05 19.288
hackmemloop 1.6689301e-06 2.397
hackmemunrolled 1.0585785e-06 1.521
rmlsbsub 3.9768219e-06 5.712
rmlsbmask 3.4236908e-06 4.918
testlsb 8.4209442e-06 12.096
testmsb 8.4686279e-06 12.164
testsignandshift 8.4018707e-06 12.068
testeachbit 8.5353851e-06 12.260
testeachbit1shl 1.1539459e-05 16.575
tableshift 6.9618225e-07 1.000
tableuchar 1.1348724e-06 1.630
tableshiftcast 7.8201294e-07 1.123
itableshift 9.7274780e-07 1.397
itableuchar 1.2016296e-06 1.726
itableshiftcast 8.8691711e-07 1.274
sumbits 8.3923340e-07 1.205
::::::::::::::
time.vax.gcc
::::::::::::::
function time ratio
hackmemmod 1.5449524e-05 7.364
hackmemloop 1.3847351e-05 6.600
hackmemunrolled 8.6975098e-06 4.145
rmlsbsub 1.8386841e-05 8.764
rmlsbmask 1.4266968e-05 6.800
testlsb 5.3215027e-05 25.364
testmsb 3.4332275e-05 16.364
testsignandshift 2.6550293e-05 12.655
testeachbit 2.9983521e-05 14.291
testeachbit1shl 4.7454834e-05 22.618
tableshift 5.3405762e-06 2.545
tableuchar 2.4414062e-06 1.164
tableshiftcast 6.0272217e-06 2.873
itableshift 4.3106079e-06 2.055
itableuchar 2.0980835e-06 1.000
itableshiftcast 5.6076050e-06 2.673
sumbits 5.7983398e-06 2.764
::::::::::::::
time.vax.ucbcc
::::::::::::::
function time ratio
hackmemmod 7.6293945e-06 3.774
hackmemloop 1.6746521e-05 8.283
hackmemunrolled 1.3504028e-05 6.679
rmlsbsub 2.0332336e-05 10.057
rmlsbmask 1.8882751e-05 9.340
testlsb 5.6457520e-05 27.925
testmsb 3.7384033e-05 18.491
testsignandshift 2.9602051e-05 14.642
testeachbit 3.8681030e-05 19.132
testeachbit1shl 5.8860779e-05 29.113
tableshift 5.6838989e-06 2.811
tableuchar 2.2888184e-06 1.132
tableshiftcast 5.1879883e-06 2.566
itableshift 4.3487549e-06 2.151
itableuchar 2.0217896e-06 1.000
itableshiftcast 4.5394897e-06 2.245
sumbits 6.1798096e-06 3.057

------------------------------------------------------------
The program (and I used lint this time...):

#ifndef lint
static char rcsid[] = "$Id: bct.c,v 1.5 90/10/13 08:44:12 chris Exp $";
#endif

/*
* bct - bitcount timing
*
* Runs a bunch of different functions-to-count-bits-in-a-longword
* (many assume 32 bits per long) with a timer around each loop, and
* tries to calcuate the time used.
*/
#include <stdio.h>
#include <sys/types.h>

* 32 bit number to appear in each octal digit. This isn't much help

* unless we can get all of them summed together.
* This can be done by modulo arithmetic (sum the digits in a number by molulo

* the base of the number minus one) the old "casting out nines" trick they

static int inbits[256] = {

int
tC_tableshiftcast(n)
register unsigned long n;
{

return nbits[(unsigned char) n] +
nbits[(unsigned char) (n >> 8)] +
nbits[(unsigned char) (n >> 16)] +
nbits[(unsigned char) (n >> 24)];
}

int
tD_itableshift(n)
register unsigned long n;
{

return (inbits[n & 0xff] + inbits[(n >> 8) & 0xff] +
inbits[(n >> 16) & 0xff] + inbits[n >> 24]);
}

int
tE_itableuchar(n)

unsigned long n;
{
register unsigned char *p = (unsigned char *)&n;

return (inbits[p[0]] + inbits[p[1]] + inbits[p[2]] + inbits[p[3]]);
}

int
tF_itableshiftcast(n)
register unsigned long n;
{

return inbits[(unsigned char) n] +
inbits[(unsigned char) (n >> 8)] +
inbits[(unsigned char) (n >> 16)] +
inbits[(unsigned char) (n >> 24)];
}

/*
* Explanation:
* First we add 32 1-bit fields to get 16 2-bit fields.
* Each 2-bit field is one of 00, 01, or 10 (binary).
* We then add all the two-bit fields to get 8 4-bit fields.
* These are all one of 0000, 0001, 0010, 0011, or 0100.
*
* Now we can do something different, becuase for the first
* time the value in each k-bit field (k now being 4) is small
* enough that adding two k-bit fields results in a value that
* still fits in the k-bit field. The result is four 4-bit
* fields containing one of {0000,0001,...,0111,1000} and four
* more 4-bit fields containing junk (sums that are uninteresting).
* Pictorially:
* n = 0aaa0bbb0ccc0ddd0eee0fff0ggg0hhh
* n>>4 = 00000aaa0bbb0ccc0ddd0eee0fff0ggg
* sum = 0aaaWWWWiiiiXXXXjjjjYYYYkkkkZZZZ
* where W, X, Y, and Z are the interesting sums (each at most 1000,
* or 8 decimal). Masking with 0x0f0f0f0f extracts these.
*
* Now we can change tactics yet again, because now we have:
* n = 0000WWWW0000XXXX0000YYYY0000ZZZZ
* n>>8 = 000000000000WWWW0000XXXX0000YYYY
* so sum = 0000WWWW000ppppp000qqqqq000rrrrr
* where p and r are the interesting sums (and each is at most
* 10000, or 16 decimal). The sum `q' is junk, like i, j, and
* k above; but it is not necessarry to discard it this time.
* One more fold, this time by sixteen bits, gives
* n = 0000WWWW000ppppp000qqqqq000rrrrr
* n>>16 = 00000000000000000000WWWW000ppppp
* so sum = 0000WWWW000ppppp000sssss00tttttt
* where s is at most 11000 and t is it most 100000 (32 decimal).
*
* Now we have t = r+p = (Z+Y)+(X+W) = ((h+g)+(f+e))+((d+c)+(b+a)),
* or in other words, t is the number of bits set in the original
* 32-bit longword. So all we have to do is return the low byte
* (or low 6 bits, but `low byte' is typically just as easy if not
* easier).
*
* This technique is also applicable to 64 and 128 bit words, but
* 256 bit or larger word sizes require at least one more masking
* step.
*/
int
tG_sumbits(n)
register unsigned long n;
{

n = (n & 0x55555555) + ((n >> 1) & 0x55555555);
n = (n & 0x33333333) + ((n >> 2) & 0x33333333);
n = (n + (n >> 4)) & 0x0f0f0f0f;
n += n >> 8;
n += n >> 16;
return (n & 0xff);
}

/*
* build a long random number.
* The standard rand() returns at least a 15 bit number.
* We use the top 9 of 15 (since the lower N bits of the usual rand()
* repeat with a period of 2^N).
*/
unsigned long
bigrand()
{
#define randbits() ((unsigned long)((rand() >> 6) & 0777))
register int r;

r = randbits() << 27;
r |= randbits() << 18;
r |= randbits() << 9;
r |= randbits();

return (r);
}

/*
* Run the test many times.
* You will need a running time in the 10s of seconds
* for an accurate result.
*
* To give a fair comparison, the random number generator
* is seeded the same for each measurement.
*
* Return value is seconds per iteration.
*/

#ifndef REPEAT

{ "tableshiftcast", tC_tableshiftcast },
{ "itableshift", tD_itableshift },
{ "itableuchar", tE_itableuchar },
{ "itableshiftcast", tF_itableshiftcast },
{ "sumbits", tG_sumbits },

Gene H. Olson

unread,

Oct 14, 1990, 2:39:32 PM10/14/90

to

ch...@mimsy.umd.edu (Chris Torek) writes:
>In article <8...@neccan.oz> pe...@neccan.oz (Peter Miller) sends out a
>program to time various bit-counting functions. I decided to `improve'
>the program a bit :-) and add a number of additional functions. I have
>run the resulting program (which I will append below) on a number of
>architectures. The `mod' version is never the fastest of the hackmem
>variants. The fastest function is always table lookup, except on the
>Sun-4.

I also posted an "improved" version of this program that got
horribly erratic and odd results.

Looking at all this closely, it seems the erratic results on this
"test" are due two 3 problems:

1) The original program had a bug in the procedure table
which tested the wrong procedures. Both Chris and I
fixed this.

2) The original program neglected to return a value in
procedure bigrand(). On the Sun4, this means all the
"random" data was zero. This kinda skewed the results.

I just fixed this by putting in the return().

3) The bigrand() procedure was called every invokation of
the test procedure. Since for the fast routines it takes
*MUCH* longer to generate these numbers than to count the
bits, the noise level is extreme.

I just fixed this by creating an array once, and then
referencing it. The test runs about 6 times faster now,
and it is much more consistent.

So the bottom line is that most of the results posted so far are
largely nonsense. So, in the spirit of throwing good net bandwidth
after bad, I am posting the blasted thing again. I hope this
discussion is useful to *someone*.

Earlier I had added a test now called "suminplace" to the list
which I believed was faster than any other way of doing it.
(Except maybe a 16-bit lookup table). It turns out, that
an 8-bit lookup table beats it.

Blast the torpedos, I still like my method best. Maybe
it will run faster on another machine or another compiler.
It certainly is smaller.

>>>>> SparcStation results:

function time ratio
hackmemmod 0.00093555450 16.082
hackmemloop 0.00013732910 2.361
hackmemunrolled 9.0599060e-05 1.557
rmlsbsub 0.00032043457 5.508
rmlsbmask 0.00026893616 4.623
testlsb 0.00064945221 11.164
testmsb 0.00065517426 11.262
testsignandshift 0.00065326691 11.230
testeachbit 0.00067424774 11.590
testeachbit1shl 0.00086975098 14.951
tableshift 5.8174133e-05 1.000
tableuchar 8.2015991e-05 1.410
suminplace 6.6757202e-05 1.148

>>>>> 386 results:

function time ratio
hackmemmod 0.00022506714 1.553
hackmemloop 0.00039672852 2.737
hackmemunrolled 0.00025939941 1.789
rmlsbsub 0.00074005127 5.105
rmlsbmask 0.00067138672 4.632
testlsb 0.0016098022 11.105
testmsb 0.0015449524 10.658
testsignandshift 0.0015525818 10.711
testeachbit 0.0015563965 10.737
testeachbit1shl 0.0020790100 14.342
tableshift 0.00014877319 1.026
tableuchar 0.00014495850 1.000
suminplace 0.00014877319 1.026

Perhaps Chris will find it worth his time to run the thing
on his list of machines.

-------------------- Yet another posting ----------------------

#include <stdio.h>
#include <sys/types.h>

#ifdef FD_SETSIZE
# define USE_GETRUSAGE
# include <sys/time.h>
# include <sys/resource.h>
#else
# include <sys/times.h>
#endif

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))

#define NUMRAND 16384
unsigned long rr[NUMRAND] ;

int
tC_suminplace(n)
register unsigned long n ;
{

n = ((n >> 1) & 0x55555555) + (n & 0x55555555) ;

n = ((n >> 2) & 0x33333333) + (n & 0x33333333) ;

n = ((n >> 4) + n) & 0x0F0F0F0F ;

n = ((n >> 8) + n) ;

return ((n >> 16) + n) & 0xFF ;
}

/*
* build a long random number.
* The standard rand() returns at least a 15 bit number.
* We use the top 9 of 15 (since the lower N bits of the usual rand()
* repeat with a period of 2^N).
*/
unsigned long
bigrand()
{
#define randbits() ((unsigned long)((rand() >> 6) & 0777))
register int r;

r = randbits() << 27;
r |= randbits() << 18;
r |= randbits() << 9;
r |= randbits();

return(r) ;
}

/*
* Run the test many times.
* You will need a running time in the 10s of seconds
* for an accurate result.
*
* To give a fair comparison, the random number generator
* is seeded the same for each measurement.
*
* Return value is seconds per iteration.
*/
#if defined(mips) || defined(sparc)
#define REPEAT (1L<<20)
#else
#define REPEAT (1L<<18)
#endif

double
measure(func)
register int (*func)();
{
#ifdef USE_GETRUSAGE
struct rusage ru0, ru1;
register long j;

(void) getrusage(RUSAGE_SELF, &ru0);

for (j = 0; j < REPEAT; ++j)

func(rr[j & (NUMRAND-1)]);

(void) getrusage(RUSAGE_SELF, &ru1);
ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec;
if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) {
ru1.ru_utime.tv_usec += 1000000;
ru1.ru_utime.tv_sec--;
}
return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) /
(double)REPEAT);
#else
register long j;
struct tms start;
struct tms finish;

times(&start);

for (j = 0; j < REPEAT; ++j)

func(rr[j & (NUMRAND-1)]);

times(&finish);
return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);
#endif
}

struct table {
char *name;
int (*func)();
double measurement;
int trial;
};

struct table table[] =
{
{ "hackmemmod", t0_hackmemmod },
{ "hackmemloop", t1_hackmemloop },
{ "hackmemunrolled", t2_hackmemunrolled },
{ "rmlsbsub", t3_rmlsbsub },
{ "rmlsbmask", t4_rmlsbmask },
{ "testlsb", t5_testlsb },
{ "testmsb", t6_testmsb },
{ "testsignandshift", t7_testsignandshift },
{ "testeachbit", t8_testeachbit },
{ "testeachbit1shl", t9_testeachbit1shl },
{ "tableshift", tA_tableshift },
{ "tableuchar", tB_tableuchar },

{ "suminplace", tC_suminplace },
};

for (k = 0 ; k < NUMRAND ; k++) rr[k] = bigrand() ;

/*
* calibrate the timing mechanism
*/
calibration = measure(calibrate);
if (verbose)
printf("calibration: %g\n", calibration);

/*
* time them all, keeping track of the best (smallest)
*/
for (j = 0; j < SIZEOF(table); ++j) {
v = measure(table[j].func) - calibration;
if (verbose)
printf("%s: %g\n", table[j].name, v);
table[j].measurement = v;
if (!j || v < best)
best = v;
}
(void) printf("%-24s %-14sratio\n", "function", "time");
for (j = 0; j < SIZEOF(table); ++j) {
(void) printf("%-24s %#10.8g %6.3f\n",
table[j].name,
table[j].measurement,
table[j].measurement / best);
}
exit(0);
}

David Moews

unread,

Oct 17, 1990, 3:45:48 AM10/17/90

to

I took Gene Olson's test program and stuck Torek's routines into it, as well
as adding routines to count bits by using a table with a 16-bit index, and
a slightly sped-up version of HAKMEM 169 (`hackmemallfold'). The 16-bit
table lookup appears to be fastest, at least on a Sun-3/50 and a
Sparcstation 1. `hackmemallfold' and `sumbits' both use a total of 16
arithmetic, logical, and shift operations. However, if gcc is used
`hackmemallfold' is apparently faster (because gcc refuses to put constants
in registers?)

name technique used
---- --------------

hackmemmod HACKMEM 169, using % operator
hackmemloop HACKMEM 169, using loop to implement %
hackmemunrolled HACKMEM 169, with 5-step % (loop unrolled)
hackmemallfold HACKMEM 169, no %, fold to the bitter end
rmlsbsub remove lsb with `n -= (n & -n)'
rmlsbmask remove lsb with `n &= n - 1'
testlsb test n&1, then n>>=1
testmsb test n&MSB, then n+=n (rather than n<<=1)
testsignandshift test n<0, then n<<=1
testeachbit test n&mask, then mask+=mask
testeachbit1shl test n&(1<<bit) for bit in [0..32)

tableshift nbits[n>>24] + nbits[(n>>16)&255] ...

tableuchar set p=&n, nbits[p[0]] + nbits[p[1]] ...
tableshiftcast nbits[n>>24] + nbits[(u_char)(n>>16)] ...
itableshift same as tableshift, but table datatype is int
itableuchar ditto
itableshiftcast ditto (note, nbits table is `char')

16tableshift nbits16[n>>16] + nbits16[n&65535]
16tableushort set p=&n, nbits16[p[0]] + nbits16[p[1]]
16tableshiftcast nbits16[n>>16] + nbits16[(u_short)(n)]
16itableshift same as 16tableshift, but table datatype is int
16itableushort ditto
16itableshiftcast ditto

sumbits Gene Olson's function (see code for comments)

Sun-3/50, SunOS 4.1, cc -O

function time ratio
hackmemmod 1.0166168e-05 1.931
hackmemloop 1.2187958e-05 2.315
hackmemunrolled 1.3732910e-05 2.609
hackmemallfold 8.2588196e-06 1.569
rmlsbsub 1.9512177e-05 3.707
rmlsbmask 1.9512177e-05 3.707
testlsb 4.6901703e-05 8.909
testmsb 4.8561096e-05 9.225
testsignandshift 4.0779114e-05 7.746
testeachbit 4.6749115e-05 8.880
testeachbit1shl 6.6032410e-05 12.543
tableshift 1.6460419e-05 3.127
tableuchar 1.3256073e-05 2.518
tableshiftcast 1.1730194e-05 2.228
itableshift 1.6918182e-05 3.214
itableuchar 9.5176697e-06 1.808
itableshiftcast 1.2359619e-05 2.348
16tableshift 1.0337830e-05 1.964
16tableushort 6.0081482e-06 1.141
16tableshiftcast 6.1416626e-06 1.167
16itableshift 5.2642822e-06 1.000
16itableushort 6.6184998e-06 1.257
16itableshiftcast 1.1196136e-05 2.127
sumbits 7.1525574e-06 1.359

Sun-3/50, SunOS 4.1, gcc -O, ver. 1.37.1

function time ratio
hackmemmod 1.0070801e-05 3.259
hackmemloop 1.1653900e-05 3.772
hackmemunrolled 1.0719299e-05 3.469
hackmemallfold 7.9727173e-06 2.580
rmlsbsub 1.9207001e-05 6.216
rmlsbmask 1.9226074e-05 6.222
testlsb 4.4364929e-05 14.358
testmsb 3.7155151e-05 12.025
testsignandshift 4.1427612e-05 13.407
testeachbit 4.8522949e-05 15.704
testeachbit1shl 5.4588318e-05 17.667
tableshift 1.1863708e-05 3.840
tableuchar 8.7928772e-06 2.846
tableshiftcast 1.6403198e-05 5.309
itableshift 9.8037720e-06 3.173
itableuchar 6.3896179e-06 2.068
itableshiftcast 1.3904572e-05 4.500
16tableshift 5.7983398e-06 1.877
16tableushort 4.4250488e-06 1.432
16tableshiftcast 5.3215027e-06 1.722
16itableshift 4.9591064e-06 1.605
16itableushort 3.0899048e-06 1.000
16itableshiftcast 9.1934204e-06 2.975
sumbits 9.6511841e-06 3.123

SparcStation 1, SunOS 4.0.3c, cc -O

function time ratio
hackmemmod 1.4197826e-05 18.551
hackmemloop 2.2125244e-06 2.891
hackmemunrolled 1.4662743e-06 1.916
hackmemallfold 1.0657310e-06 1.393
rmlsbsub 5.1355362e-06 6.710
rmlsbmask 4.2915344e-06 5.607
testlsb 1.0557175e-05 13.794
testmsb 1.0647774e-05 13.913
testsignandshift 1.0533333e-05 13.763
testeachbit 1.0833740e-05 14.156
testeachbit1shl 1.3933182e-05 18.206
tableshift 9.0837479e-07 1.187
tableuchar 1.3136864e-06 1.717
tableshiftcast 9.1314316e-07 1.193
itableshift 1.1110306e-06 1.452
itableuchar 1.5211105e-06 1.988
itableshiftcast 1.1157990e-06 1.458
16tableshift 8.2015991e-07 1.072
16tableushort 1.1849403e-06 1.548
16tableshiftcast 7.6532364e-07 1.000
16itableshift 1.6880035e-06 2.206
16itableushort 2.0956993e-06 2.738
16itableshiftcast 1.6403198e-06 2.143
sumbits 1.0561943e-06 1.380

SparcStation 1, SunOS 4.0.3c, gcc -O, ver. 1.37

function time ratio
hackmemmod 1.1713505e-05 14.241
hackmemloop 2.4437904e-06 2.971
hackmemunrolled 1.4662743e-06 1.783
hackmemallfold 1.0633469e-06 1.293
rmlsbsub 5.1999092e-06 6.322
rmlsbmask 4.3869019e-06 5.333
testlsb 1.2927055e-05 15.716
testmsb 1.6040802e-05 19.501
testsignandshift 1.2907982e-05 15.693
testeachbit 1.1508465e-05 13.991
testeachbit1shl 1.4796257e-05 17.988
tableshift 1.0609627e-06 1.290
tableuchar 1.5187263e-06 1.846
tableshiftcast 1.0633469e-06 1.293
itableshift 1.2660027e-06 1.539
itableuchar 1.7237663e-06 2.096
itableshiftcast 1.2612343e-06 1.533
16tableshift 8.7499619e-07 1.064
16tableushort 1.1301041e-06 1.374
16tableshiftcast 8.2254410e-07 1.000
16itableshift 1.7356873e-06 2.110
16itableushort 1.9979477e-06 2.429
16itableshiftcast 1.6951561e-06 2.061
sumbits 1.3136864e-06 1.597
--
David Moews mo...@math.berkeley.edu
-----------------------------cut here for program-------------------------------
#include <stdio.h>
#include <sys/types.h>

* 32 bit number to appear in each octal digit. This isn't much help

* unless we can get all of them summed together.
* This can be done by modulo arithmetic (sum the digits in a number by molulo

* the base of the number minus one) the old "casting out nines" trick they

* taught in school before calculators were invented.
* Now, using mod 7 wont help us, because our number will very likely have
* more than 7 bits set. So add the octal digits together to get base64
* digits, and use modulo 63.
* (Those of you with 64 bit machines need to add 3 octal digits together to
* get base512 digits, and use mod 511.)
*
* This is HACKMEM 169, as used in X11 sources.
*/
int
t0_hackmemmod(n)
register unsigned long n;
{
register unsigned long tmp;

tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}

/*
* This is the same as the above, but does not use the % operator.
* Most modern machines have clockless division, and so the modulo is as

* expensive as, say, [sic] an addition.

/*
* As above, but no modulus nor simulation thereof. Instead,
* we continue shifting and masking to the bitter end.
*/
int
t3_hackmemallfold(n)

register unsigned long n;
{
register unsigned long tmp;

tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
tmp = (tmp + (tmp >> 3)) & 030707070707;

tmp += tmp >> 6;
return ((tmp + (tmp >> 12) + (tmp >> 24)) & 077);
}

/*
* This function counts the bits in a long.
*
* It removes the lsb and counting the number of times round the loop.
* The expression (n & -n) yields the lsb of a number,
* but it only works on 2's compliment machines.
*/
int

t4_rmlsbsub(n)

register unsigned long n;
{
register int count;

for (count = 0; n; n -= (n & -n))
count++;
return count;
}

int
t5_rmlsbmask(n)

register unsigned long n;
{
register int count;

for (count = 0; n; count++)
n &= n - 1; /* take away lsb */
return (count);
}

/*
* This function counts the bits in a long.
*
* It works by shifting the number down and testing the bottom bit.
*/
int

t6_testlsb(n)

register unsigned long n;
{
register int count;

for (count = 0; n; n >>= 1)
if (n & 1)
count++;
return count;
}

/*
* This function counts the bits in a long.
*
* It works by shifting the number left and testing the top bit.
* On many machines shift is expensive, so it uses a cheap addition instead.
*/
int

t7_testmsb(n)

register unsigned long n;
{
register int count;

for (count = 0; n; n += n)
if (n & ~(~(unsigned long)0 >> 1))
count++;
return count;
}

int
t8_testsignandshift(n)

register unsigned long n;
{
register int count;

for (count = 0; n; n <<= 1)
if ((long)n < 0)
count++;
return (count);
}

/*
* This function counts the bits in a long.
*
* It works by masking each bit.
* This is the second most intuitively obvious method,
* and is independent of the number of bits in the long.
*/
int

t9_testeachbit(n)

register unsigned long n;
{
register int count;
register unsigned long mask;

count = 0;
for (mask = 1; mask; mask += mask)
if (n & mask)
count++;
return count;
}

/*
* This function counts the bits in a long.
*
* It works by masking each bit.
* This is the most intuitively obvious method,
* but how do you a priori know how many bits in the long?
* (except for ''sizeof(long) * CHAR_BITS'' expression)
*/
int

tA_testeachbit1shl(n)

static int inbits[256] = {

0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

int
tB_tableshift(n)
register unsigned long n;
{

return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] +
nbits[(n >> 16) & 0xff] + nbits[n >> 24]);
}

int
tC_tableuchar(n)

unsigned long n;
{
register unsigned char *p = (unsigned char *)&n;

return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]);
}

int
tD_tableshiftcast(n)
register unsigned long n;
{

return nbits[(unsigned char) n] +

nbits[(unsigned char) (n >> 8)] +
nbits[(unsigned char) (n >> 16)] +

nbits[n >> 24];
}

int
tE_itableshift(n)
register unsigned long n;
{

return (inbits[n & 0xff] + inbits[(n >> 8) & 0xff] +

inbits[(n >> 16) & 0xff] + inbits[n >> 24]);
}

int
tF_itableuchar(n)

unsigned long n;
{
register unsigned char *p = (unsigned char *)&n;

return (inbits[p[0]] + inbits[p[1]] + inbits[p[2]] + inbits[p[3]]);
}

int
tG_itableshiftcast(n)
register unsigned long n;
{

return inbits[(unsigned char) n] +

inbits[(unsigned char) (n >> 8)] +
inbits[(unsigned char) (n >> 16)] +

inbits[n >> 24];
}

static char nbits16[65536];
static int inbits16[65536];

int
tH_16tableshift(n)
register unsigned long n;
{

return (nbits16[n & 0xffff] + nbits16[n >> 16]);
}

int
tI_16tableushort(n)
unsigned long n;
{
register unsigned short *p = (unsigned short *)&n;

return (nbits16[p[0]] + nbits16[p[1]]);
}

int
tJ_16tableshiftcast(n)
register unsigned long n;
{

return nbits16[(unsigned short) n] +
nbits16[n >> 16];
}

int
tK_16itableshift(n)
register unsigned long n;
{

return (inbits16[n & 0xffff] + inbits16[n >> 16]);
}

int
tL_16itableushort(n)
unsigned long n;
{
register unsigned short *p = (unsigned short *)&n;

return (inbits16[p[0]] + inbits16[p[1]]);
}

int
tM_16itableshiftcast(n)
register unsigned long n;
{

return inbits16[(unsigned short) n] +
inbits16[n >> 16];
}

tN_sumbits(n)
register unsigned long n;
{

n = (n & 0x55555555) + ((n >> 1) & 0x55555555);

n = (n & 0x33333333) + ((n >> 2) & 0x33333333);
n = (n + (n >> 4)) & 0x0f0f0f0f;
n += n >> 8;
n += n >> 16;
return (n & 0xff);
}

/*

return(r) ;
}

#define REPEAT (1L<<22)
#else
#define REPEAT (1L<<20)
#endif

{ "hackmemallfold", t3_hackmemallfold },
{ "rmlsbsub", t4_rmlsbsub },
{ "rmlsbmask", t5_rmlsbmask },
{ "testlsb", t6_testlsb },
{ "testmsb", t7_testmsb },
{ "testsignandshift", t8_testsignandshift },
{ "testeachbit", t9_testeachbit },
{ "testeachbit1shl", tA_testeachbit1shl },
{ "tableshift", tB_tableshift },
{ "tableuchar", tC_tableuchar },
{ "tableshiftcast", tD_tableshiftcast },
{ "itableshift", tE_itableshift },
{ "itableuchar", tF_itableuchar },
{ "itableshiftcast", tG_itableshiftcast },
{ "16tableshift", tH_16tableshift },
{ "16tableushort", tI_16tableushort },
{ "16tableshiftcast", tJ_16tableshiftcast },
{ "16itableshift", tK_16itableshift },
{ "16itableushort", tL_16itableushort },
{ "16itableshiftcast", tM_16itableshiftcast },
{ "sumbits", tN_sumbits }
};

main(argc, argv)
int argc;
char **argv;
{
double calibration, v, best;
int j, k, m, verbose;

/* Initialize the 16-bit lookup table. */
for (j = 0; j < 65536; ++j) {
nbits16[j] = inbits16[j] = tN_sumbits((unsigned long)j);

Chris Torek

unread,

Oct 18, 1990, 12:19:16 AM10/18/90

to

In article <34...@cup.portal.com> t...@cup.portal.com (Tim W Smith) writes:
>... I've tried a couple of these on my Mac II, using Think C. Note

>that 1) Apple is not noted for squeezing the last drop of performance
>out of the 68020, and 2) Think C has not stressed optimal code generation.

>I get [... numeric value deleted, but] the Sun is getting ...
>20 times worse performance. [Why?]

Two obvious possibilities: you may have been running the buggy version
of the tests (no value returned from bigrand()), or---more likely---
you have a System V machine with no reasonable way of obtaining CPU
resource usage. (This is not to say that no System V machine has
a reasonable facility a la getrusage().) The `times' call used in the
original bit-counting program returns time values in `clock ticks'.
There is no reliable way to discover how long a `clock tick' is. It
is often 1/60th of a second, but often 1/50th or 1/100th or some other
value. On some system you can `#include <sys/param.h>' and get a
manifest constant called `HZ' that tells you the default divisor for
1---that is, the number such that HZ ticks elapse every second---but
this is merely the default; the actual running system might be using
some other value (and might even change it `at whim', so to speak).
On other systems you can fish the actual value of hz out of the system.
The method is highly machine-dependent, and may give you the wrong
answer (e.g., under Ultrix on a DECstation 3100, hz is normally 256,
yet the values from times() are always in 1/60th of a second, because
times() is actually implemented using getrusage() with a hardcoded
`60 hz' inside the times() function.)

This is why I put the getrusage() call into the program. getrusage()
fills in a machine- and system-independent structure with values in
seconds and microseconds. The microseconds field is typically inaccurate
(being a step function with a grain other than 1), but its units are
known, unlike those for times().

If you have a BSD-based system, the values reported by the timing
function in the bit-count timing program have some chance of being
useful; if not, they are much more chancy.

Kevin D. Quitt

unread,

Oct 18, 1990, 10:24:16 AM10/18/90

to

Something is seriously wrong with this bit-counting benchmark. I'm
running on a Motorola Delta 3600 (25MHz 68030), and my time are 2 orders
of magnitude slower than those reported for a Sun 3/50!!!!

function time ratio
hackmemmod 0.00033760071 2.810
hackmemloop 0.00034141541 2.841
hackmemunrolled 0.00017452240 1.452
hackmemallfold 0.00026226044 2.183
rmlsbsub 0.00062084198 5.167
rmlsbmask 0.00061321259 5.103
testlsb 0.0015430450 12.841
testmsb 0.0012369156 10.294
testsignandshift 0.0014562607 12.119
testeachbit 0.0016183853 13.468
testeachbit1shl 0.0018253326 15.190
tableshift 0.00029087067 2.421
tableuchar 0.00018882751 1.571
tableshiftcast 0.00039577484 3.294
itableshift 0.00022602081 1.881
itableuchar 0.00012016296 1.000
itableshiftcast 0.00032424927 2.698
16tableshift 0.00033283234 2.770
16tableushort 0.00027084351 2.254
16tableshiftcast 0.00029563904 2.460
16itableshift 0.00035190582 2.929
16itableushort 0.00033092499 2.754
16itableshiftcast 0.00042724609 3.556
sumbits 0.00023365021 1.944

In order to get this to compile on my SYSV machine, I had to add the
line:

#undef FD_SETSIZE

because there's no resource.h on this machine. Oh yes, it's gcc -O.

--
_
Kevin D. Quitt demott!kdq k...@demott.com
DeMott Electronics Co. 14707 Keswick St. Van Nuys, CA 91405-1266
VOICE (818) 988-4975 FAX (818) 997-1190 MODEM (818) 997-4496 PEP last

96.37% of all statistics are made up.

Tim W Smith

unread,

Oct 24, 1990, 2:48:43 AM10/24/90

to

When benchmarking something like this, a good thing to do
is to do a back-of-the-envelope estimate of what you
expect. For example, the Mac II runs at 16 MHz. Assume
a few wait states, and loop overhead, and stuff like that
will limit us to 1 "useful" instruction every 8 clocks,
so we get 2 "useful" MIPS. The operation x &= x-1 will
take three operations. At 2 MIPS, an operation is 1/2 uSec,
so we have 1.5 uSec per bit. The loop runs an average of
16 times, so we expect that we can do it in 24 uSec.

The measured time was something like 20 uSec, so we
pass.

If you see numbers in the hundreds of microseconds, you
can be almost sure that something is wrong with the program.

Tim Smith