count of bits in a long

Sat Oct 13 23:37:49 AEST 1990

In article <26881 at mimsy.umd.edu> I posted a modified version of
Peter Miller's bit-count timing program.  Unfortunately, my version
had a bug that effectively makes the results worthless (depending
on compiler vagaries; it `just happens' to compile to working code
on some systems).

Remarkably few people noted the bug (or rather, sent me mail about it).
Two people sent me additional functions, however.  Arthur Olson added
four (all variants on table lookup); Mike Weaver added one.  Gene Olson
has also posted a different modified version of Peter Miller's original
program; this contains a better version of the latter.

So, here are new results for the new set of (now 17) functions on the
same set of machines, along with the new program.  Noteworthy facts:

  - hackmem invariably loses to some other technique, always including
    Gene Olson's, though sometimes not by much;
  - the fastest is usually (but not always) one of the table lookups;
  - tiny differences in ratios (less than .1) are usually insignificant.

  name			  technique used
  ----			  --------------
hackmemmod		HACKMEM 169, using % operator
hackmemloop		HACKMEM 169, using loop to implement %
hackmemunrolled		HACKMEM 169, with 5-step % (loop unrolled)
rmlsbsub		remove lsb with `n -= (n & -n)'
rmlsbmask		remove lsb with `n &= n - 1'
testlsb			test n&1, then n>>=1
testmsb			test n&MSB, then n+=n (rather than n<<=1)
testsignandshift	test n<0, then n<<=1
testeachbit		test n&mask, then mask+=mask
testeachbit1shl		test n&(1<<bit) for bit in [0..32)
tableshift		nbits[n>>24] + nbits[(n>>16)&255] ...
tableuchar		set p=&n, nbits[p[0]] + nbits[p[1]] ...
tableshiftcast		nbits[n>>24] + nbits[(u_char)(n>>16)] ...
itableshift		same as tableshift, but table datatype is int
itableuchar		ditto
itableshiftcast		ditto (note, nbits table is `char')
sumbits			Gene Olson's function (see code for comments)

------------------------------------------------------------
Results:

::::::::::::::
time.ds3100
::::::::::::::
function                   time          ratio
hackmemmod               3.0992432e-06   3.250
hackmemloop              2.5330353e-06   2.656
hackmemunrolled          1.7619495e-06   1.848
rmlsbsub                 4.9207935e-06   5.160
rmlsbmask                3.9522800e-06   4.145
testlsb                  1.0545622e-05  11.059
testmsb                  1.0608948e-05  11.125
testsignandshift         1.0497196e-05  11.008
testeachbit              1.0839901e-05  11.367
testeachbit1shl          1.6718033e-05  17.531
tableshift               1.0243893e-06   1.074
tableuchar               9.5361328e-07   1.000
tableshiftcast           1.1063404e-06   1.160
itableshift              1.2814178e-06   1.344
itableuchar              1.2031918e-06   1.262
itableshiftcast          1.3223934e-06   1.387
sumbits                  1.2106419e-06   1.270
::::::::::::::
time.encore
::::::::::::::
function                   time          ratio
hackmemmod               8.0387573e-06   4.522
hackmemloop              7.2727394e-06   4.091
hackmemunrolled          4.3494453e-06   2.447
rmlsbsub                 1.0656597e-05   5.994
rmlsbmask                1.1298252e-05   6.355
testlsb                  3.1122246e-05  17.506
testmsb                  2.2745319e-05  12.794
testsignandshift         1.9783443e-05  11.128
testeachbit              2.3917282e-05  13.454
testeachbit1shl          2.9880619e-05  16.808
tableshift               3.9419632e-06   2.217
tableuchar               2.6934662e-06   1.515
tableshiftcast           2.3953590e-06   1.347
itableshift              3.0450325e-06   1.713
itableuchar              1.7777634e-06   1.000
itableshiftcast          2.1755104e-06   1.224
sumbits                  1.9636650e-06   1.105
::::::::::::::
time.sun3
::::::::::::::
function                   time          ratio
hackmemmod               1.0604858e-05   2.106
hackmemloop              1.2664795e-05   2.515
hackmemunrolled          1.0833740e-05   2.152
rmlsbsub                 2.2430420e-05   4.455
rmlsbmask                1.7318726e-05   3.439
testlsb                  4.3182373e-05   8.576
testmsb                  3.8909912e-05   7.727
testsignandshift         4.0664673e-05   8.076
testeachbit              4.4479370e-05   8.833
testeachbit1shl          5.9432983e-05  11.803
tableshift               9.9945068e-06   1.985
tableuchar               1.0910034e-05   2.167
tableshiftcast           1.4877319e-05   2.955
itableshift              7.9345703e-06   1.576
itableuchar              1.1672974e-05   2.318
itableshiftcast          1.1520386e-05   2.288
sumbits                  5.0354004e-06   1.000
::::::::::::::
time.sun4
::::::::::::::
function                   time          ratio
hackmemmod               1.3427734e-05  19.288
hackmemloop              1.6689301e-06   2.397
hackmemunrolled          1.0585785e-06   1.521
rmlsbsub                 3.9768219e-06   5.712
rmlsbmask                3.4236908e-06   4.918
testlsb                  8.4209442e-06  12.096
testmsb                  8.4686279e-06  12.164
testsignandshift         8.4018707e-06  12.068
testeachbit              8.5353851e-06  12.260
testeachbit1shl          1.1539459e-05  16.575
tableshift               6.9618225e-07   1.000
tableuchar               1.1348724e-06   1.630
tableshiftcast           7.8201294e-07   1.123
itableshift              9.7274780e-07   1.397
itableuchar              1.2016296e-06   1.726
itableshiftcast          8.8691711e-07   1.274
sumbits                  8.3923340e-07   1.205
::::::::::::::
time.vax.gcc
::::::::::::::
function                   time          ratio
hackmemmod               1.5449524e-05   7.364
hackmemloop              1.3847351e-05   6.600
hackmemunrolled          8.6975098e-06   4.145
rmlsbsub                 1.8386841e-05   8.764
rmlsbmask                1.4266968e-05   6.800
testlsb                  5.3215027e-05  25.364
testmsb                  3.4332275e-05  16.364
testsignandshift         2.6550293e-05  12.655
testeachbit              2.9983521e-05  14.291
testeachbit1shl          4.7454834e-05  22.618
tableshift               5.3405762e-06   2.545
tableuchar               2.4414062e-06   1.164
tableshiftcast           6.0272217e-06   2.873
itableshift              4.3106079e-06   2.055
itableuchar              2.0980835e-06   1.000
itableshiftcast          5.6076050e-06   2.673
sumbits                  5.7983398e-06   2.764
::::::::::::::
time.vax.ucbcc
::::::::::::::
function                   time          ratio
hackmemmod               7.6293945e-06   3.774
hackmemloop              1.6746521e-05   8.283
hackmemunrolled          1.3504028e-05   6.679
rmlsbsub                 2.0332336e-05  10.057
rmlsbmask                1.8882751e-05   9.340
testlsb                  5.6457520e-05  27.925
testmsb                  3.7384033e-05  18.491
testsignandshift         2.9602051e-05  14.642
testeachbit              3.8681030e-05  19.132
testeachbit1shl          5.8860779e-05  29.113
tableshift               5.6838989e-06   2.811
tableuchar               2.2888184e-06   1.132
tableshiftcast           5.1879883e-06   2.566
itableshift              4.3487549e-06   2.151
itableuchar              2.0217896e-06   1.000
itableshiftcast          4.5394897e-06   2.245
sumbits                  6.1798096e-06   3.057

------------------------------------------------------------
The program (and I used lint this time...):

#ifndef lint
static char rcsid[] = "$Id: bct.c,v 1.5 90/10/13 08:44:12 chris Exp $";
#endif

/*
 * bct - bitcount timing
 *
 * Runs a bunch of different functions-to-count-bits-in-a-longword
 * (many assume 32 bits per long) with a timer around each loop, and
 * tries to calcuate the time used.
 */
#include <stdio.h>
#include <sys/types.h>

#ifdef FD_SETSIZE
# define USE_GETRUSAGE
# include <sys/time.h>
# include <sys/resource.h>
#else
# include <sys/times.h>
#endif

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))

/*
 * This function is used to calibrate the timing mechanism.
 * This way we can subtract the loop and call overheads.
 */
int
calibrate(n)
	register unsigned long n;
{
	return 0;
}

/*
 * This function counts the number of bits in a long.
 * It is limited to 63 bit longs, but a minor mod can cope with 511 bits.
 *
 * It is so magic, an explanation is required:
 * Consider a 3 bit number as being
 *	4a+2b+c
 * if we shift it right 1 bit, we have
 *	2a+b
 * subtracting this from the original gives
 *	2a+b+c
 * if we shift the original 2 bits right we get
 *	a
 * and so with another subtraction we have
 *	a+b+c
 * which is the number of bits in the original number.
 * Suitable masking allows the sums of the octal digits in a
 * 32 bit number to appear in each octal digit.  This isn't much help
 * unless we can get all of them summed together.
 * This can be done by modulo arithmetic (sum the digits in a number by molulo
 * the base of the number minus one) the old "casting out nines" trick they
 * taught in school before calculators were invented.
 * Now, using mod 7 wont help us, because our number will very likely have
 * more than 7 bits set.  So add the octal digits together to get base64
 * digits, and use modulo 63.
 * (Those of you with 64 bit machines need to add 3 octal digits together to
 * get base512 digits, and use mod 511.)
 *
 * This is HACKMEM 169, as used in X11 sources.
 */
int
t0_hackmemmod(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}

/*
 * This is the same as the above, but does not use the % operator.
 * Most modern machines have clockless division, and so the modulo is as
 * expensive as, say, an addition.
 */
int
t1_hackmemloop(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	while (tmp > 63)
		tmp = (tmp & 63) + (tmp >> 6);
	return tmp;
}

/*
 * Above, without using while loop.  It takes at most 5 iterations of the
 * loop, so we just do all 5 in-line.  The final result is never 63
 * (this is assumed above as well).
 */
int
t2_hackmemunrolled(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	return (tmp);
}

/*
 * This function counts the bits in a long.
 *
 * It removes the lsb and counting the number of times round the loop.
 * The expression (n & -n) yields the lsb of a number,
 * but it only works on 2's compliment machines.
 */
int
t3_rmlsbsub(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n -= (n & -n))
		count++;
	return count;
}

int
t4_rmlsbmask(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; count++)
		n &= n - 1;	/* take away lsb */
	return (count);
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number down and testing the bottom bit.
 */
int
t5_testlsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n >>= 1)
		if (n & 1)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number left and testing the top bit.
 * On many machines shift is expensive, so it uses a cheap addition instead.
 */
int
t6_testmsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n += n)
		if (n & ~(~(unsigned long)0 >> 1))
			count++;
	return count;
}

int
t7_testsignandshift(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n <<= 1)
		if ((long)n < 0)
			count++;
	return (count);
}

/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the second most intuitively obvious method,
 * and is independent of the number of bits in the long.
 */
int
t8_testeachbit(n)
	register unsigned long n;
{
	register int count;
	register unsigned long mask;

	count = 0;
	for (mask = 1; mask; mask += mask)
		if (n & mask)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the most intuitively obvious method,
 * but how do you a priori know how many bits in the long?
 * (except for ''sizeof(long) * CHAR_BITS'' expression)
 */
int
t9_testeachbit1shl(n)
	register unsigned long n;
{
	register int count;
	register int bit;

	count = 0;
	for (bit = 0; bit < 32; ++bit)
		if (n & ((unsigned long)1 << bit))
			count++;
	return count;
}

static char nbits[256] = {
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

static int inbits[256] = {
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

int
tA_tableshift(n)
	register unsigned long n;
{

	return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] +
		nbits[(n >> 16) & 0xff] + nbits[n >> 24]);
}

int
tB_tableuchar(n)
	unsigned long n;
{
	register unsigned char *p = (unsigned char *)&n;

	return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]);
}

int
tC_tableshiftcast(n)
	register unsigned long n;
{

	return nbits[(unsigned char) n] +
		nbits[(unsigned char) (n >> 8)] +
		nbits[(unsigned char) (n >> 16)] +
		nbits[(unsigned char) (n >> 24)];
}

int
tD_itableshift(n)
	register unsigned long n;
{

	return (inbits[n & 0xff] + inbits[(n >> 8) & 0xff] +
		inbits[(n >> 16) & 0xff] + inbits[n >> 24]);
}

int
tE_itableuchar(n)
	unsigned long n;
{
	register unsigned char *p = (unsigned char *)&n;

	return (inbits[p[0]] + inbits[p[1]] + inbits[p[2]] + inbits[p[3]]);
}

int
tF_itableshiftcast(n)
	register unsigned long n;
{

	return inbits[(unsigned char) n] +
		inbits[(unsigned char) (n >> 8)] +
		inbits[(unsigned char) (n >> 16)] +
		inbits[(unsigned char) (n >> 24)];
}

/*
 * Explanation:
 * First we add 32 1-bit fields to get 16 2-bit fields.
 * Each 2-bit field is one of 00, 01, or 10 (binary).
 * We then add all the two-bit fields to get 8 4-bit fields.
 * These are all one of 0000, 0001, 0010, 0011, or 0100.
 *
 * Now we can do something different, becuase for the first
 * time the value in each k-bit field (k now being 4) is small
 * enough that adding two k-bit fields results in a value that
 * still fits in the k-bit field.  The result is four 4-bit
 * fields containing one of {0000,0001,...,0111,1000} and four
 * more 4-bit fields containing junk (sums that are uninteresting).
 * Pictorially:
 *	    n = 0aaa0bbb0ccc0ddd0eee0fff0ggg0hhh
 *	 n>>4 = 00000aaa0bbb0ccc0ddd0eee0fff0ggg
 *	  sum = 0aaaWWWWiiiiXXXXjjjjYYYYkkkkZZZZ
 * where W, X, Y, and Z are the interesting sums (each at most 1000,
 * or 8 decimal).  Masking with 0x0f0f0f0f extracts these.
 *
 * Now we can change tactics yet again, because now we have:
 *	    n = 0000WWWW0000XXXX0000YYYY0000ZZZZ
 *	 n>>8 = 000000000000WWWW0000XXXX0000YYYY
 * so	  sum = 0000WWWW000ppppp000qqqqq000rrrrr
 * where p and r are the interesting sums (and each is at most
 * 10000, or 16 decimal).  The sum `q' is junk, like i, j, and
 * k above; but it is not necessarry to discard it this time.
 * One more fold, this time by sixteen bits, gives
 *	    n = 0000WWWW000ppppp000qqqqq000rrrrr
 *	n>>16 = 00000000000000000000WWWW000ppppp
 * so	  sum = 0000WWWW000ppppp000sssss00tttttt
 * where s is at most 11000 and t is it most 100000 (32 decimal).
 *
 * Now we have t = r+p = (Z+Y)+(X+W) = ((h+g)+(f+e))+((d+c)+(b+a)),
 * or in other words, t is the number of bits set in the original
 * 32-bit longword.  So all we have to do is return the low byte
 * (or low 6 bits, but `low byte' is typically just as easy if not
 * easier).
 *
 * This technique is also applicable to 64 and 128 bit words, but
 * 256 bit or larger word sizes require at least one more masking
 * step.
 */
int
tG_sumbits(n)
	register unsigned long n;
{

	n = (n & 0x55555555) + ((n >> 1) & 0x55555555);
	n = (n & 0x33333333) + ((n >> 2) & 0x33333333);
	n = (n + (n >> 4)) & 0x0f0f0f0f;
	n += n >> 8;
	n += n >> 16;
	return (n & 0xff);
}

/*
 * build a long random number.
 * The standard rand() returns at least a 15 bit number.
 * We use the top 9 of 15 (since the lower N bits of the usual rand()
 * repeat with a period of 2^N).
 */
unsigned long
bigrand()
{
#define randbits() ((unsigned long)((rand() >> 6) & 0777))
	register int r;

	r = randbits() << 27;
	r |= randbits() << 18;
	r |= randbits() << 9;
	r |= randbits();
	return (r);
}

/*
 * Run the test many times.
 * You will need a running time in the 10s of seconds
 * for an accurate result.
 *
 * To give a fair comparison, the random number generator
 * is seeded the same for each measurement.
 *
 * Return value is seconds per iteration.
 */
#ifndef REPEAT
#if defined(mips) || defined(sparc)
#define REPEAT (1L<<20)
#else
#define REPEAT (1L<<18)
#endif
#endif

double
measure(func)
	register int	(*func)();
{
#ifdef USE_GETRUSAGE
	struct rusage ru0, ru1;
	register long j;

	srand(1);
	(void) getrusage(RUSAGE_SELF, &ru0);
	for (j = 0; j < REPEAT; ++j)
		func(bigrand());
	(void) getrusage(RUSAGE_SELF, &ru1);
	ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec;
	if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) {
		ru1.ru_utime.tv_usec += 1000000;
		ru1.ru_utime.tv_sec--;
	}
	return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) /
		(double)REPEAT);
#else
	register long	j;
	struct tms	start;
	struct tms	finish;

	srand(1);
	times(&start);
	for (j = 0; j < REPEAT; ++j)
		func(bigrand());
	times(&finish);
	return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);
#endif
}

struct table {
	char	*name;
	int	(*func)();
	double	measurement;
	int	trial;
};

struct table	table[] =
{
	{ "hackmemmod",		t0_hackmemmod },
	{ "hackmemloop", 	t1_hackmemloop },
	{ "hackmemunrolled",	t2_hackmemunrolled },
	{ "rmlsbsub", 		t3_rmlsbsub },
	{ "rmlsbmask",	 	t4_rmlsbmask },
	{ "testlsb",		t5_testlsb },
	{ "testmsb", 		t6_testmsb },
	{ "testsignandshift", 	t7_testsignandshift },
	{ "testeachbit", 	t8_testeachbit },
	{ "testeachbit1shl", 	t9_testeachbit1shl },
	{ "tableshift", 	tA_tableshift },
	{ "tableuchar", 	tB_tableuchar },
	{ "tableshiftcast",	tC_tableshiftcast },
	{ "itableshift", 	tD_itableshift },
	{ "itableuchar", 	tE_itableuchar },
	{ "itableshiftcast",	tF_itableshiftcast },
	{ "sumbits",		tG_sumbits },
};

main(argc, argv)
	int argc;
	char **argv;
{
	double calibration, v, best;
	int j, k, m, verbose;

	verbose = argc > 1;
	/*
	 * run a few tests to make sure they all agree
	 */
	srand(getpid());
	for (j = 0; j < 10; ++j) {
		unsigned long n;
		int bad;

		n = bigrand();
		for (k = 0; k < SIZEOF(table); ++k)
			table[k].trial = table[k].func(n);
		bad = 0;
		for (k = 0; k < SIZEOF(table); ++k)
			for (m = 0; m < SIZEOF(table); ++m)
				if (table[k].trial != table[m].trial)
					bad = 1;
		if (bad) {
			printf("wrong: %08lX", n);
			for (k = 0; k < SIZEOF(table); ++k)
				printf(" %3d", table[k].trial);
			printf("\n");
		}
	}

	/*
	 * calibrate the timing mechanism
	 */
	calibration = measure(calibrate);
	if (verbose)
		printf("calibration: %g\n", calibration);

	/*
	 * time them all, keeping track of the best (smallest)
	 */
	for (j = 0; j < SIZEOF(table); ++j) {
		v = measure(table[j].func) - calibration;
		if (verbose)
			printf("%s: %g\n", table[j].name, v);
		table[j].measurement = v;
		if (!j || v < best)
			best = v;
	}
	(void) printf("%-24s   %-14sratio\n", "function", "time");
	for (j = 0; j < SIZEOF(table); ++j) {
		(void) printf("%-24s %#10.8g  %6.3f\n",
		    table[j].name,
		    table[j].measurement,
		    table[j].measurement / best);
	}
	exit(0);
}
-- 
In-Real-Life: Chris Torek, Univ of MD Comp Sci Dept (+1 301 405 2750)
Domain:	chris at cs.umd.edu	Path:	uunet!mimsy!chris