count of bits in a long

Wed Oct 17 17:45:48 AEST 1990

I took Gene Olson's test program and stuck Torek's routines into it, as well 
as adding routines to count bits by using a table with a 16-bit index, and 
a slightly sped-up version of HAKMEM 169 (`hackmemallfold').  The 16-bit 
table lookup appears to be fastest, at least on a Sun-3/50 and a 
Sparcstation 1.  `hackmemallfold' and `sumbits' both use a total of 16 
arithmetic, logical, and shift operations.  However, if gcc is used 
`hackmemallfold' is apparently faster (because gcc refuses to put constants 
in registers?)

  name			  technique used
  ----			  --------------
hackmemmod		HACKMEM 169, using % operator
hackmemloop		HACKMEM 169, using loop to implement %
hackmemunrolled		HACKMEM 169, with 5-step % (loop unrolled)
hackmemallfold		HACKMEM 169, no %, fold to the bitter end
rmlsbsub		remove lsb with `n -= (n & -n)'
rmlsbmask		remove lsb with `n &= n - 1'
testlsb			test n&1, then n>>=1
testmsb			test n&MSB, then n+=n (rather than n<<=1)
testsignandshift	test n<0, then n<<=1
testeachbit		test n&mask, then mask+=mask
testeachbit1shl		test n&(1<<bit) for bit in [0..32)
tableshift		nbits[n>>24] + nbits[(n>>16)&255] ...
tableuchar		set p=&n, nbits[p[0]] + nbits[p[1]] ...
tableshiftcast		nbits[n>>24] + nbits[(u_char)(n>>16)] ...
itableshift		same as tableshift, but table datatype is int
itableuchar		ditto
itableshiftcast		ditto (note, nbits table is `char')
16tableshift		nbits16[n>>16] + nbits16[n&65535]
16tableushort		set p=&n, nbits16[p[0]] + nbits16[p[1]]
16tableshiftcast	nbits16[n>>16] + nbits16[(u_short)(n)]
16itableshift		same as 16tableshift, but table datatype is int
16itableushort		ditto
16itableshiftcast	ditto
sumbits			Gene Olson's function (see code for comments)

Sun-3/50, SunOS 4.1, cc -O

function                   time          ratio
hackmemmod               1.0166168e-05   1.931
hackmemloop              1.2187958e-05   2.315
hackmemunrolled          1.3732910e-05   2.609
hackmemallfold           8.2588196e-06   1.569
rmlsbsub                 1.9512177e-05   3.707
rmlsbmask                1.9512177e-05   3.707
testlsb                  4.6901703e-05   8.909
testmsb                  4.8561096e-05   9.225
testsignandshift         4.0779114e-05   7.746
testeachbit              4.6749115e-05   8.880
testeachbit1shl          6.6032410e-05  12.543
tableshift               1.6460419e-05   3.127
tableuchar               1.3256073e-05   2.518
tableshiftcast           1.1730194e-05   2.228
itableshift              1.6918182e-05   3.214
itableuchar              9.5176697e-06   1.808
itableshiftcast          1.2359619e-05   2.348
16tableshift             1.0337830e-05   1.964
16tableushort            6.0081482e-06   1.141
16tableshiftcast         6.1416626e-06   1.167
16itableshift            5.2642822e-06   1.000
16itableushort           6.6184998e-06   1.257
16itableshiftcast        1.1196136e-05   2.127
sumbits                  7.1525574e-06   1.359

Sun-3/50, SunOS 4.1, gcc -O, ver. 1.37.1

function                   time          ratio
hackmemmod               1.0070801e-05   3.259
hackmemloop              1.1653900e-05   3.772
hackmemunrolled          1.0719299e-05   3.469
hackmemallfold           7.9727173e-06   2.580
rmlsbsub                 1.9207001e-05   6.216
rmlsbmask                1.9226074e-05   6.222
testlsb                  4.4364929e-05  14.358
testmsb                  3.7155151e-05  12.025
testsignandshift         4.1427612e-05  13.407
testeachbit              4.8522949e-05  15.704
testeachbit1shl          5.4588318e-05  17.667
tableshift               1.1863708e-05   3.840
tableuchar               8.7928772e-06   2.846
tableshiftcast           1.6403198e-05   5.309
itableshift              9.8037720e-06   3.173
itableuchar              6.3896179e-06   2.068
itableshiftcast          1.3904572e-05   4.500
16tableshift             5.7983398e-06   1.877
16tableushort            4.4250488e-06   1.432
16tableshiftcast         5.3215027e-06   1.722
16itableshift            4.9591064e-06   1.605
16itableushort           3.0899048e-06   1.000
16itableshiftcast        9.1934204e-06   2.975
sumbits                  9.6511841e-06   3.123

SparcStation 1, SunOS 4.0.3c, cc -O

function                   time          ratio
hackmemmod               1.4197826e-05  18.551
hackmemloop              2.2125244e-06   2.891
hackmemunrolled          1.4662743e-06   1.916
hackmemallfold           1.0657310e-06   1.393
rmlsbsub                 5.1355362e-06   6.710
rmlsbmask                4.2915344e-06   5.607
testlsb                  1.0557175e-05  13.794
testmsb                  1.0647774e-05  13.913
testsignandshift         1.0533333e-05  13.763
testeachbit              1.0833740e-05  14.156
testeachbit1shl          1.3933182e-05  18.206
tableshift               9.0837479e-07   1.187
tableuchar               1.3136864e-06   1.717
tableshiftcast           9.1314316e-07   1.193
itableshift              1.1110306e-06   1.452
itableuchar              1.5211105e-06   1.988
itableshiftcast          1.1157990e-06   1.458
16tableshift             8.2015991e-07   1.072
16tableushort            1.1849403e-06   1.548
16tableshiftcast         7.6532364e-07   1.000
16itableshift            1.6880035e-06   2.206
16itableushort           2.0956993e-06   2.738
16itableshiftcast        1.6403198e-06   2.143
sumbits                  1.0561943e-06   1.380

SparcStation 1, SunOS 4.0.3c, gcc -O, ver. 1.37

function                   time          ratio
hackmemmod               1.1713505e-05  14.241
hackmemloop              2.4437904e-06   2.971
hackmemunrolled          1.4662743e-06   1.783
hackmemallfold           1.0633469e-06   1.293
rmlsbsub                 5.1999092e-06   6.322
rmlsbmask                4.3869019e-06   5.333
testlsb                  1.2927055e-05  15.716
testmsb                  1.6040802e-05  19.501
testsignandshift         1.2907982e-05  15.693
testeachbit              1.1508465e-05  13.991
testeachbit1shl          1.4796257e-05  17.988
tableshift               1.0609627e-06   1.290
tableuchar               1.5187263e-06   1.846
tableshiftcast           1.0633469e-06   1.293
itableshift              1.2660027e-06   1.539
itableuchar              1.7237663e-06   2.096
itableshiftcast          1.2612343e-06   1.533
16tableshift             8.7499619e-07   1.064
16tableushort            1.1301041e-06   1.374
16tableshiftcast         8.2254410e-07   1.000
16itableshift            1.7356873e-06   2.110
16itableushort           1.9979477e-06   2.429
16itableshiftcast        1.6951561e-06   2.061
sumbits                  1.3136864e-06   1.597
--
David Moews                      moews at math.berkeley.edu
-----------------------------cut here for program-------------------------------
#include <stdio.h>
#include <sys/types.h>

#ifdef FD_SETSIZE
# define USE_GETRUSAGE
# include <sys/time.h>
# include <sys/resource.h>
#else
# include <sys/times.h>
#endif

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))

#define NUMRAND	16384
unsigned long rr[NUMRAND];

/*
 * This function is used to calibrate the timing mechanism.
 * This way we can subtract the loop and call overheads.
 */
int
calibrate(n)
	register unsigned long n;
{
	return 0;
}

/*
 * This function counts the number of bits in a long.
 * It is limited to 63 bit longs, but a minor mod can cope with 511 bits.
 *
 * It is so magic, an explanation is required:
 * Consider a 3 bit number as being
 *	4a+2b+c
 * if we shift it right 1 bit, we have
 *	2a+b
 * subtracting this from the original gives
 *	2a+b+c
 * if we shift the original 2 bits right we get
 *	a
 * and so with another subtraction we have
 *	a+b+c
 * which is the number of bits in the original number.
 * Suitable masking allows the sums of the octal digits in a
 * 32 bit number to appear in each octal digit.  This isn't much help
 * unless we can get all of them summed together.
 * This can be done by modulo arithmetic (sum the digits in a number by molulo
 * the base of the number minus one) the old "casting out nines" trick they
 * taught in school before calculators were invented.
 * Now, using mod 7 wont help us, because our number will very likely have
 * more than 7 bits set.  So add the octal digits together to get base64
 * digits, and use modulo 63.
 * (Those of you with 64 bit machines need to add 3 octal digits together to
 * get base512 digits, and use mod 511.)
 *
 * This is HACKMEM 169, as used in X11 sources.
 */
int
t0_hackmemmod(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}

/*
 * This is the same as the above, but does not use the % operator.
 * Most modern machines have clockless division, and so the modulo is as
 * expensive as, say, [sic] an addition.
 */
int
t1_hackmemloop(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	while (tmp > 63)
		tmp = (tmp & 63) + (tmp >> 6);
	return tmp;
}

/*
 * Above, without using while loop.  It takes at most 5 iterations of the
 * loop, so we just do all 5 in-line.  The final result is never 63
 * (this is assumed above as well).
 */
int
t2_hackmemunrolled(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	return (tmp);
}

/*
 * As above, but no modulus nor simulation thereof.  Instead,
 * we continue shifting and masking to the bitter end.
 */
int
t3_hackmemallfold(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	tmp += tmp >> 6;
	return ((tmp + (tmp >> 12) + (tmp >> 24)) & 077);
}

/*
 * This function counts the bits in a long.
 *
 * It removes the lsb and counting the number of times round the loop.
 * The expression (n & -n) yields the lsb of a number,
 * but it only works on 2's compliment machines.
 */
int
t4_rmlsbsub(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n -= (n & -n))
		count++;
	return count;
}

int
t5_rmlsbmask(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; count++)
		n &= n - 1;	/* take away lsb */
	return (count);
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number down and testing the bottom bit.
 */
int
t6_testlsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n >>= 1)
		if (n & 1)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number left and testing the top bit.
 * On many machines shift is expensive, so it uses a cheap addition instead.
 */
int
t7_testmsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n += n)
		if (n & ~(~(unsigned long)0 >> 1))
			count++;
	return count;
}

int
t8_testsignandshift(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n <<= 1)
		if ((long)n < 0)
			count++;
	return (count);
}

/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the second most intuitively obvious method,
 * and is independent of the number of bits in the long.
 */
int
t9_testeachbit(n)
	register unsigned long n;
{
	register int count;
	register unsigned long mask;

	count = 0;
	for (mask = 1; mask; mask += mask)
		if (n & mask)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the most intuitively obvious method,
 * but how do you a priori know how many bits in the long?
 * (except for ''sizeof(long) * CHAR_BITS'' expression)
 */
int
tA_testeachbit1shl(n)
	register unsigned long n;
{
	register int count;
	register int bit;

	count = 0;
	for (bit = 0; bit < 32; ++bit)
		if (n & ((unsigned long)1 << bit))
			count++;
	return count;
}

static char nbits[256] = {
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

static int inbits[256] = {
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

int
tB_tableshift(n)
	register unsigned long n;
{

	return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] +
		nbits[(n >> 16) & 0xff] + nbits[n >> 24]);
}

int
tC_tableuchar(n)
	unsigned long n;
{
	register unsigned char *p = (unsigned char *)&n;

	return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]);
}

int
tD_tableshiftcast(n)
	register unsigned long n;
{

	return nbits[(unsigned char) n] +
		nbits[(unsigned char) (n >> 8)] +
		nbits[(unsigned char) (n >> 16)] +
		nbits[n >> 24];
}

int
tE_itableshift(n)
	register unsigned long n;
{

	return (inbits[n & 0xff] + inbits[(n >> 8) & 0xff] +
		inbits[(n >> 16) & 0xff] + inbits[n >> 24]);
}

int
tF_itableuchar(n)
	unsigned long n;
{
	register unsigned char *p = (unsigned char *)&n;

	return (inbits[p[0]] + inbits[p[1]] + inbits[p[2]] + inbits[p[3]]);
}

int
tG_itableshiftcast(n)
	register unsigned long n;
{

	return inbits[(unsigned char) n] +
		inbits[(unsigned char) (n >> 8)] +
		inbits[(unsigned char) (n >> 16)] +
		inbits[n >> 24];
}

static char nbits16[65536];
static int inbits16[65536];

int
tH_16tableshift(n)
	register unsigned long n;
{

	return (nbits16[n & 0xffff] + nbits16[n >> 16]);
}

int
tI_16tableushort(n)
	unsigned long n;
{
	register unsigned short *p = (unsigned short *)&n;

	return (nbits16[p[0]] + nbits16[p[1]]);
}

int
tJ_16tableshiftcast(n)
	register unsigned long n;
{

	return nbits16[(unsigned short) n] +
		nbits16[n >> 16];
}

int
tK_16itableshift(n)
	register unsigned long n;
{

	return (inbits16[n & 0xffff] + inbits16[n >> 16]);
}

int
tL_16itableushort(n)
	unsigned long n;
{
	register unsigned short *p = (unsigned short *)&n;

	return (inbits16[p[0]] + inbits16[p[1]]);
}

int
tM_16itableshiftcast(n)
	register unsigned long n;
{

	return inbits16[(unsigned short) n] +
		inbits16[n >> 16];
}

/*
 * Explanation:
 * First we add 32 1-bit fields to get 16 2-bit fields.
 * Each 2-bit field is one of 00, 01, or 10 (binary).
 * We then add all the two-bit fields to get 8 4-bit fields.
 * These are all one of 0000, 0001, 0010, 0011, or 0100.
 *
 * Now we can do something different, becuase for the first
 * time the value in each k-bit field (k now being 4) is small
 * enough that adding two k-bit fields results in a value that
 * still fits in the k-bit field.  The result is four 4-bit
 * fields containing one of {0000,0001,...,0111,1000} and four
 * more 4-bit fields containing junk (sums that are uninteresting).
 * Pictorially:
 *	    n = 0aaa0bbb0ccc0ddd0eee0fff0ggg0hhh
 *	 n>>4 = 00000aaa0bbb0ccc0ddd0eee0fff0ggg
 *	  sum = 0aaaWWWWiiiiXXXXjjjjYYYYkkkkZZZZ
 * where W, X, Y, and Z are the interesting sums (each at most 1000,
 * or 8 decimal).  Masking with 0x0f0f0f0f extracts these.
 *
 * Now we can change tactics yet again, because now we have:
 *	    n = 0000WWWW0000XXXX0000YYYY0000ZZZZ
 *	 n>>8 = 000000000000WWWW0000XXXX0000YYYY
 * so	  sum = 0000WWWW000ppppp000qqqqq000rrrrr
 * where p and r are the interesting sums (and each is at most
 * 10000, or 16 decimal).  The sum `q' is junk, like i, j, and
 * k above; but it is not necessarry to discard it this time.
 * One more fold, this time by sixteen bits, gives
 *	    n = 0000WWWW000ppppp000qqqqq000rrrrr
 *	n>>16 = 00000000000000000000WWWW000ppppp
 * so	  sum = 0000WWWW000ppppp000sssss00tttttt
 * where s is at most 11000 and t is it most 100000 (32 decimal).
 *
 * Now we have t = r+p = (Z+Y)+(X+W) = ((h+g)+(f+e))+((d+c)+(b+a)),
 * or in other words, t is the number of bits set in the original
 * 32-bit longword.  So all we have to do is return the low byte
 * (or low 6 bits, but `low byte' is typically just as easy if not
 * easier).
 *
 * This technique is also applicable to 64 and 128 bit words, but
 * 256 bit or larger word sizes require at least one more masking
 * step.
 */
int
tN_sumbits(n)
	register unsigned long n;
{

	n = (n & 0x55555555) + ((n >> 1) & 0x55555555);
	n = (n & 0x33333333) + ((n >> 2) & 0x33333333);
	n = (n + (n >> 4)) & 0x0f0f0f0f;
	n += n >> 8;
	n += n >> 16;
	return (n & 0xff);
}

/*
 * build a long random number.
 * The standard rand() returns at least a 15 bit number.
 * We use the top 9 of 15 (since the lower N bits of the usual rand()
 * repeat with a period of 2^N).
 */
unsigned long
bigrand()
{
#define randbits() ((unsigned long)((rand() >> 6) & 0777))
	register int r;

	r = randbits() << 27;
	r |= randbits() << 18;
	r |= randbits() << 9;
	r |= randbits();

	return(r) ;
}

/*
 * Run the test many times.
 * You will need a running time in the 10s of seconds
 * for an accurate result.
 *
 * To give a fair comparison, the random number generator
 * is seeded the same for each measurement.
 *
 * Return value is seconds per iteration.
 */
#if defined(mips) || defined(sparc)
#define REPEAT (1L<<22)
#else
#define REPEAT (1L<<20)
#endif

double
measure(func)
	register int	(*func)();
{
#ifdef USE_GETRUSAGE
	struct rusage ru0, ru1;
	register long j;

	(void) getrusage(RUSAGE_SELF, &ru0);
	for (j = 0; j < REPEAT; ++j)
		func(rr[j & (NUMRAND-1)]);
	(void) getrusage(RUSAGE_SELF, &ru1);
	ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec;
	if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) {
		ru1.ru_utime.tv_usec += 1000000;
		ru1.ru_utime.tv_sec--;
	}
	return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) /
		(double)REPEAT);
#else
	register long	j;
	struct tms	start;
	struct tms	finish;

	times(&start);
	for (j = 0; j < REPEAT; ++j)
		func(rr[j & (NUMRAND-1)]);
	times(&finish);
	return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);
#endif
}

struct table {
	char	*name;
	int	(*func)();
	double	measurement;
	int	trial;
};

struct table	table[] =
{
	{ "hackmemmod",		t0_hackmemmod },
	{ "hackmemloop", 	t1_hackmemloop },
	{ "hackmemunrolled",	t2_hackmemunrolled },
	{ "hackmemallfold",	t3_hackmemallfold },
	{ "rmlsbsub", 		t4_rmlsbsub },
	{ "rmlsbmask",	 	t5_rmlsbmask },
	{ "testlsb",		t6_testlsb },
	{ "testmsb", 		t7_testmsb },
	{ "testsignandshift", 	t8_testsignandshift },
	{ "testeachbit", 	t9_testeachbit },
	{ "testeachbit1shl", 	tA_testeachbit1shl },
	{ "tableshift", 	tB_tableshift },
	{ "tableuchar", 	tC_tableuchar },
	{ "tableshiftcast",	tD_tableshiftcast },
	{ "itableshift", 	tE_itableshift },
	{ "itableuchar", 	tF_itableuchar },
	{ "itableshiftcast",	tG_itableshiftcast },
	{ "16tableshift", 	tH_16tableshift },
	{ "16tableushort", 	tI_16tableushort },
	{ "16tableshiftcast",	tJ_16tableshiftcast },
	{ "16itableshift", 	tK_16itableshift },
	{ "16itableushort", 	tL_16itableushort },
	{ "16itableshiftcast",	tM_16itableshiftcast },
	{ "sumbits",		tN_sumbits }
};

main(argc, argv)
	int argc;
	char **argv;
{
	double calibration, v, best;
	int j, k, m, verbose;

	/* Initialize the 16-bit lookup table. */
	for (j = 0; j < 65536; ++j) {
		nbits16[j] = inbits16[j] = tN_sumbits((unsigned long)j);
	}
	verbose = argc > 1;
	/*
	 * run a few tests to make sure they all agree
	 */
	srand(getpid());
	for (j = 0; j < 10; ++j) {
		unsigned long n;
		int bad;

		n = bigrand();
		for (k = 0; k < SIZEOF(table); ++k)
			table[k].trial = table[k].func(n);
		bad = 0;
		for (k = 0; k < SIZEOF(table); ++k)
			for (m = 0; m < SIZEOF(table); ++m)
				if (table[k].trial != table[m].trial)
					bad = 1;
		if (bad) {
			printf("wrong: %08lX", n);
			for (k = 0; k < SIZEOF(table); ++k)
				printf(" %3d", table[k].trial);
			printf("\n");
		}
	}

	for (k = 0 ; k < NUMRAND ; k++) rr[k] = bigrand() ;

	/*
	 * calibrate the timing mechanism
	 */
	calibration = measure(calibrate);
	if (verbose)
		printf("calibration: %g\n", calibration);

	/*
	 * time them all, keeping track of the best (smallest)
	 */
	for (j = 0; j < SIZEOF(table); ++j) {
		v = measure(table[j].func) - calibration;
		if (verbose)
			printf("%s: %g\n", table[j].name, v);
		table[j].measurement = v;
		if (!j || v < best)
			best = v;
	}
	(void) printf("%-24s   %-14sratio\n", "function", "time");
	for (j = 0; j < SIZEOF(table); ++j) {
		(void) printf("%-24s %#10.8g  %6.3f\n",
		    table[j].name,
		    table[j].measurement,
		    table[j].measurement / best);
	}
	exit(0);
}