count of bits in a long

Sun Oct 7 10:12:52 AEST 1990

In article <826 at neccan.oz> peter at neccan.oz (Peter Miller) sends out a
program to time various bit-counting functions.  I decided to `improve'
the program a bit :-) and add a number of additional functions.  I have
run the resulting program (which I will append below) on a number of
architectures.  The `mod' version is never the fastest of the hackmem
variants.  The fastest function is always table lookup, except on the
Sun-4.

Here are his results, for an i386 box:
> *	Func	sec/iter	scaled
> *	nbits	0.00018310547	 1.000	hackmem 169
> *	nbits1	0.0006980896	 3.812	hackmem 169 no % operator
> *	nbits2	0.0016822815	 9.188	count lsb's loop
> *	nbits3	0.0037384033	20.417
> *	nbits4	0.0034370422	18.771
> *	nbits5	0.0037765503	20.625
> *	nbits6	0.0035247803	19.250

(I find the last two results particularly interesting since his code
as posted uses functions nbits3 and nbits4 to calculate the times
printed for nbits5 and nbits6, never evaluating 5 and 6 at all.)

Here is a short key to names:
  my name (Peter's name)	  technique used
  ----------------------	  --------------
hackmemmod (nbits)		HACKMEM 169, using % operator
hackmemloop (nbits1)		HACKMEM 169, using loop to implement %
hackmemunrolled (-)		HACKMEM 169, with 5-step % (loop unrolled)
rmlsbsub (nbits2)		remove lsb with `n -= (n & -n)'
rmlsbmask (-)			remove lsb with `n &= n - 1'
testlsb (nbits4)		test n&1, then n>>=1
testmsb (nbits3)		test n&MSB, then n+=n (rather than n<<=1)
testsignandshift (-)		test n<0, then n<<=1
testeachbit (nbits5, untimed)	test n&mask, then mask+=mask
testeachbit1shl (nbits6, ")	test n&(1<<bit) for bit in [0..32)
tableshift			nbits[n>>24] + nbits[(n>>16)&255] ...
tableuchar			set p=&n, nbits[p[0]] + nbits[p[1]] ...

The results:
--------------------------------------------------
  DECstation 3100 (MIPS R2000, slow memory system)
function                   time          ratio
hackmemmod               3.1029682e-06   3.179
hackmemloop              2.4697094e-06   2.531
hackmemunrolled          1.7693996e-06   1.813
rmlsbsub                 4.8127670e-06   4.931
rmlsbmask                3.8405285e-06   3.935
testlsb                  1.0221542e-05  10.473
testmsb                  1.0899502e-05  11.168
testsignandshift         1.0780300e-05  11.046
testeachbit              1.0843626e-05  11.111
testeachbit1shl          1.6695683e-05  17.107
tableshift               1.0467396e-06   1.073
tableuchar               9.7596359e-07   1.000

--------------------------------------------------
  Encore Multimax (NS32532, proprietary bus)
function                   time          ratio
hackmemmod               6.3739853e-06   3.618
hackmemloop              3.8458633e-06   2.183
hackmemunrolled          6.4167595e-06   3.642
rmlsbsub                 9.3918610e-06   5.330
rmlsbmask                9.4905777e-06   5.386
testlsb                  2.4036179e-05  13.642
testmsb                  2.3262550e-05  13.203
testsignandshift         2.1452625e-05  12.176
testeachbit              2.5112068e-05  14.253
testeachbit1shl          2.8207516e-05  16.010
tableshift               2.2915077e-06   1.301
tableuchar               1.7619209e-06   1.000

--------------------------------------------------
  Sun-3/something (68020, ?vmebus memory?)

function                   time          ratio
hackmemmod               0.00077819824   1.821
hackmemloop              0.00059890747   1.402
hackmemunrolled          0.00050354004   1.179
rmlsbsub                 0.00069808960   1.634
rmlsbmask                0.00055313110   1.295
testlsb                  0.00107955930   2.527
testmsb                  0.00251007080   5.875
testsignandshift         0.00242614750   5.679
testeachbit              0.00254821780   5.964
testeachbit1shl          0.00362014770   8.473
tableshift               0.00083160400   1.946
tableuchar               0.00042724609   1.000

--------------------------------------------------
  SparcStation 1+ (SPARC, private memory bus)
	These times look a bit odd, and suggest that the compiler
	could use some work.
function                   time          ratio
hackmemmod               1.4972687e-06  12.077
hackmemloop              7.3432922e-07   5.923
hackmemunrolled          1.1825562e-06   9.538
rmlsbsub                 1.3351440e-07   1.077
rmlsbmask                1.4305115e-07   1.154
testlsb                  1.4305115e-07   1.154
testmsb                  1.3351440e-07   1.077
testsignandshift         1.2397766e-07   1.000
testeachbit              6.6280365e-06  53.462
testeachbit1shl          9.2029572e-06  74.231
tableshift               7.2479248e-07   5.846
tableuchar               1.1539459e-06   9.308

--------------------------------------------------
  VAX 8600 with gcc (ECL gate array, private memory bus)
function                   time          ratio
hackmemmod               1.1291504e-05   4.290
hackmemloop              1.0757446e-05   4.087
hackmemunrolled          8.8500977e-06   3.362
rmlsbsub                 1.7890930e-05   6.797
rmlsbmask                1.4801025e-05   5.623
testlsb                  5.2032471e-05  19.768
testmsb                  3.5400391e-05  13.449
testsignandshift         2.7008057e-05  10.261
testeachbit              3.0670166e-05  11.652
testeachbit1shl          4.7760010e-05  18.145
tableshift               5.3405762e-06   2.029
tableuchar               2.6321411e-06   1.000

--------------------------------------------------
  VAX 8600 with Berkeley cc
function                   time          ratio
hackmemmod               8.1634521e-06   3.292
hackmemloop              6.9808960e-06   2.815
hackmemunrolled          1.3885498e-05   5.600
rmlsbsub                 6.2942505e-06   2.538
rmlsbmask                5.9890747e-06   2.415
testlsb                  1.5754700e-05   6.354
testmsb                  3.9138794e-05  15.785
testsignandshift         3.1127930e-05  12.554
testeachbit              4.1275024e-05  16.646
testeachbit1shl          6.0615540e-05  24.446
tableshift               5.9890747e-06   2.415
tableuchar               2.4795532e-06   1.000

--------------------------------------------------

There are some noteworthy points above, particularly the example
with the 8600, where the choice of compiler makes a great deal of
difference.  The `testlsb' loop runs much faster under gcc, which
eliminated one `tstl' instruction, yet some of the other loops
run more slowly.

So that others can run their own tests, here is the program.  I
added several functions, renamed them all, changed the timing
mechanism on BSD machines, and reformatted things a bit.  I changed
the random number selection mechanism to be relatively machine
independent (assuming they all have the same `rand' function) and
to use values that are `more random'.  I also changed the output
format (among other things, it uses `%#g'; the Sun SparcStation
printf does not handle this correctly, and I added trailing zeroes
manually above).  Oh yes, I also had to make it run longer on the
MIPS and Sparc machines to get reliable times.

(You can tell which functions I added, as they are generally
uncommented. :-) )

--------------------------------------------------
#include <stdio.h>
#include <sys/types.h>

#ifdef FD_SETSIZE
# define USE_GETRUSAGE
# include <sys/time.h>
# include <sys/resource.h>
#else
# include <sys/times.h>
#endif

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))

/*
 * This function is used to calibrate the timing mechanism.
 * This way we can subtract the loop and call overheads.
 */
int
calibrate(n)
	register unsigned long n;
{
	return 0;
}

/*
 * This function counts the number of bits in a long.
 * It is limited to 63 bit longs, but a minor mod can cope with 511 bits.
 *
 * It is so magic, an explanation is required:
 * Consider a 3 bit number as being
 *	4a+2b+c
 * if we shift it right 1 bit, we have
 *	2a+b
 * subtracting this from the original gives
 *	2a+b+c
 * if we shift the original 2 bits right we get
 *	a
 * and so with another subtraction we have
 *	a+b+c
 * which is the number of bits in the original number.
 * Suitable masking allows the sums of the octal digits in a
 * 32 bit bumber to appear in each octal digit.  This isn't much help
 * unless we can get all of them summed together.
 * This can be done by modulo arithmetic (sum the digits in a number by molulo
 * the base of the number minus one) the old "cating out nines" trick they
 * taught in school before calculators were invented.
 * Now, using mod 7 wont help us, because our number will very likely have
 * more than 7 bits set.  So add the octal digits together to get base64
 * digits, and use modulo 63.
 * (Those of you with 64 bit machines need to add 3 octal digits together to
 * get base512 digits, and use mod 511.)
 *
 * This is HACKMEM 169, as used in X11 sources.
 */
int
t0_hackmemmod(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}

/*
 * This is the same as the above, but does not use the % operator.
 * Most modern machines have clockless division, and so the modulo is as
 * expensive as, say, an addition.
 */
int
t1_hackmemloop(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	while (tmp > 63)
		tmp = (tmp & 63) + (tmp >> 6);
	return tmp;
}

/*
 * Above, without using while loop.  It takes at most 5 iterations of the
 * loop, so we just do all 5 in-line.  The final result is never 63
 * (this is assumed above as well).
 */
int
t2_hackmemunrolled(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	return (tmp);
}

/*
 * This function counts the bits in a long.
 *
 * It removes the lsb and counting the number of times round the loop.
 * The expression (n & -n) yields the lsb of a number,
 * but it only works on 2's compliment machines.
 */
int
t3_rmlsbsub(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n -= (n & -n))
		count++;
	return count;
}

int
t4_rmlsbmask(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; count++)
		n &= n - 1;	/* take away lsb */
	return (count);
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number down and testing the bottom bit.
 */
int
t5_testlsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n >>= 1)
		if (n & 1)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number left and testing the top bit.
 * On many machines shift is expensive, so it uses a cheap addition instead.
 */
int
t6_testmsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n += n)
		if (n & ~(~(unsigned long)0 >> 1))
			count++;
	return count;
}

int
t7_testsignandshift(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n <<= 1)
		if ((long)n < 0)
			count++;
	return (count);
}

/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the second most intuitively obvious method,
 * and is independent of the number of bits in the long.
 */
int
t8_testeachbit(n)
	register unsigned long n;
{
	register int count;
	register unsigned long mask;

	count = 0;
	for (mask = 1; mask; mask += mask)
		if (n & mask)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the most intuitively obvious method,
 * but how do you a priori know how many bits in the long?
 * (except for ''sizeof(long) * CHAR_BITS'' expression)
 */
int
t9_testeachbit1shl(n)
	register unsigned long n;
{
	register int count;
	register int bit;

	count = 0;
	for (bit = 0; bit < 32; ++bit)
		if (n & ((unsigned long)1 << bit))
			count++;
	return count;
}

static char nbits[256] = {
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

int
tA_tableshift(n)
	register unsigned long n;
{

	return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] +
		nbits[(n >> 16) & 0xff] + nbits[n >> 24]);
}

int
tB_tableuchar(n)
	unsigned long n;
{
	register unsigned char *p = (unsigned char *)&n;

	return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]);
}

/*
 * build a long random number.
 * The standard rand() returns at least a 15 bit number.
 * We use the top 9 of 15 (since the lower N bits of the usual rand()
 * repeat with a period of 2^N).
 */
unsigned long
bigrand()
{
#define randbits() ((unsigned long)((rand() >> 6) & 0777))
	register int r;

	r = randbits() << 27;
	r |= randbits() << 18;
	r |= randbits() << 9;
	r |= randbits();
}

/*
 * Run the test many times.
 * You will need a running time in the 10s of seconds
 * for an accurate result.
 *
 * To give a fair comparison, the random number generator
 * is seeded the same for each measurement.
 *
 * Return value is seconds per iteration.
 */
#if defined(mips) || defined(sparc)
#define REPEAT (1L<<20)
#else
#define REPEAT (1L<<18)
#endif

double
measure(func)
	register int	(*func)();
{
#ifdef USE_GETRUSAGE
	struct rusage ru0, ru1;
	register long j;

	srand(1);
	(void) getrusage(RUSAGE_SELF, &ru0);
	for (j = 0; j < REPEAT; ++j)
		func(bigrand());
	(void) getrusage(RUSAGE_SELF, &ru1);
	ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec;
	if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) {
		ru1.ru_utime.tv_usec += 1000000;
		ru1.ru_utime.tv_sec--;
	}
	return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) /
		(double)REPEAT);
#else
	register long	j;
	struct tms	start;
	struct tms	finish;

	srand(1);
	times(&start);
	for (j = 0; j < REPEAT; ++j)
		func(bigrand());
	times(&finish);
	return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);
#endif
}

struct table {
	char	*name;
	int	(*func)();
	double	measurement;
	int	trial;
};

struct table	table[] =
{
	{ "hackmemmod",		t0_hackmemmod },
	{ "hackmemloop", 	t1_hackmemloop },
	{ "hackmemunrolled",	t2_hackmemunrolled },
	{ "rmlsbsub", 		t3_rmlsbsub },
	{ "rmlsbmask",	 	t4_rmlsbmask },
	{ "testlsb",		t5_testlsb },
	{ "testmsb", 		t6_testmsb },
	{ "testsignandshift", 	t7_testsignandshift },
	{ "testeachbit", 	t8_testeachbit },
	{ "testeachbit1shl", 	t9_testeachbit1shl },
	{ "tableshift", 	tA_tableshift },
	{ "tableuchar", 	tB_tableuchar },
};

main(argc, argv)
	int argc;
	char **argv;
{
	double calibration, v, best;
	int j, k, m, verbose;

	verbose = argc > 1;
	/*
	 * run a few tests to make sure they all agree
	 */
	srand(getpid());
	for (j = 0; j < 10; ++j) {
		unsigned long n;
		int bad;

		n = bigrand();
		for (k = 0; k < SIZEOF(table); ++k)
			table[k].trial = table[k].func(n);
		bad = 0;
		for (k = 0; k < SIZEOF(table); ++k)
			for (m = 0; m < SIZEOF(table); ++m)
				if (table[k].trial != table[m].trial)
					bad = 1;
		if (bad) {
			printf("wrong: %08lX", n);
			for (k = 0; k < SIZEOF(table); ++k)
				printf(" %3d", table[k].trial);
			printf("\n");
		}
	}

	/*
	 * calibrate the timing mechanism
	 */
	calibration = measure(calibrate);
	if (verbose)
		printf("calibration: %g\n", calibration);

	/*
	 * time them all, keeping track of the best (smallest)
	 */
	for (j = 0; j < SIZEOF(table); ++j) {
		v = measure(table[j].func) - calibration;
		if (verbose)
			printf("%s: %g\n", table[j].name, v);
		table[j].measurement = v;
		if (!j || v < best)
			best = v;
	}
	(void) printf("%-24s   %-14sratio\n", "function", "time");
	for (j = 0; j < SIZEOF(table); ++j) {
		(void) printf("%-24s %#10.8g  %6.3f\n",
		    table[j].name,
		    table[j].measurement,
		    table[j].measurement / best);
	}
	exit(0);
}
-- 
In-Real-Life: Chris Torek, Univ of MD Comp Sci Dept (+1 301 405 2750)
Domain:	chris at cs.umd.edu	Path:	uunet!mimsy!chris