count of bits in a long

Fri Oct 5 12:17:33 AEST 1990

/*
 * A while back I posted a solution to the "count of bits set in a
 * long" question, suggesting that HACKMEM 169 was a very fast way to do it.
 * This program demonstrates the technique.
 * 
 * The HACKMEM 169 method uses a modulo to sum the digits.
 * A number of people have asked me "how can a division be faster than a loop?"
 * This program demonstrates how.  The modulo is a special case,
 * and can be unwound into a tight loop if your machine does not
 * have a fast divide opcode (although many modern machines have a 1 cycle
 * divide opcode, except for we-really-believe-our-press-hype risc
 * manufacturers).
 * 
 * This program was run on a 386/20 under unix with the following results:
 *	Func	sec/iter	scaled
 *	nbits	0.00018310547	 1.000	hackmem 169
 *	nbits1	0.0006980896	 3.812	hackmem 169 no % operator
 *	nbits2	0.0016822815	 9.188	count lsb's loop
 *	nbits3	0.0037384033	20.417
 *	nbits4	0.0034370422	18.771
 *	nbits5	0.0037765503	20.625
 *	nbits6	0.0035247803	19.250
 */

#include <stdio.h>
#include <sys/types.h>
#include <sys/times.h>

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))

/*
 * This function is used to calibrate the timing mechanism.
 * This way we can subtract the loop and call overheads.
 */
int
calibrate(n)
	register unsigned long n;
{
	return 0;
}

/*
 * This function counts the number of bits in a long.
 * It is limited to 63 bit longs, but a minor mod can cope with 511 bits.
 *
 * It is so magic, an explanation is required:
 * Consider a 3 bit number as being
 *	4a+2b+c
 * if we shift it right 1 bit, we have
 *	2a+b
 * subtracting this from the original gives
 *	2a+b+c
 * if we shift the original 2 bits right we get
 *	a
 * and so with another subtraction we have
 *	a+b+c
 * which is the number of bits in the original number.
 * Suitable masking allows the sums of the octal digits in a
 * 32 bit bumber to appear in each octal digit.  This isn't much help
 * unless we can get all of them summed together.
 * This can be done by modulo arithmetic (sum the digits in a number by molulo
 * the base of the number minus one) the old "cating out nines" trick they
 * taught in school before calculators were invented.
 * Now, using mod 7 wont help us, because our number will very likely have
 * more than 7 bits set.  So add the octal digits together to get base64
 * digits, and use modulo 63.
 * (Those of you with 64 bit machines need to add 3 octal digits together to
 * get base512 digits, and use mod 511.)
 *
 * This is HACKMEM 169, as used in X11 sources.
 */

int
nbits(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}

/*
 * This is the same as the above, but does not use the % operator.
 * Most modern machines have clockless division, and so the modulo is as
 * expensive as, say, an addition.
 */
int
nbits1(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	while (tmp > 63)
		tmp = (tmp & 63) + (tmp >> 6);
	return tmp;
}

/*
 * This function counts the bits in a long.
 *
 * It removes the lsb and counting the number of times round the loop.
 * The expression (n & -n) yields the lsb of a number,
 * but it only works on 2's compliment machines.
 */
int
nbits2(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n -= (n & -n))
		count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number left and testing the top bit.
 * On many machines shift is expensive, so it uses a cheap addition instead.
 */
int
nbits3(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n += n)
		if (n & ~(~(unsigned long)0 >> 1))
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number down and testing the bottom bit.
 */
int
nbits4(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n >>= 1)
		if (n & 1)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the second most intuitively obvious method,
 * and is independent of the number of bits in the long.
 */
int
nbits5(n)
	register unsigned long n;
{
	register int count;
	register unsigned long mask;

	count = 0;
	for (mask = 1; mask; mask += mask)
		if (n & mask)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the most intuitively obvious method,
 * but how do you a priori know how many bits in the long?
 * (except for ''sizeof(long) * CHAR_BITS'' expression)
 */
int
nbits6(n)
	register unsigned long n;
{
	register int count;
	register int bit;

	count = 0;
	for (bit = 0; bit < 32; ++bit)
		if (n & ((unsigned long)1 << bit))
			count++;
	return count;
}

/*
 * build a long random number.
 * The standard rand() returns a 15bit number.
 */
unsigned long
bigrand()
{
	return
	(
		((unsigned long)(rand() & 0xFFF) << 24)
	|
		((unsigned long)(rand() & 0xFFF) << 12)
	|
		(unsigned long)(rand() & 0xFFF)
	);
}

/*
 * Run the test many times.
 * You will need a running time in the 10s of seconds
 * for an accurate result.
 *
 * To give a fair comparison, the random number generator
 * is seeded the same for each measurement.
 *
 * Return value is seconds per iteration.
 */
#define REPEAT (1L<<18)

double
measure(func)
	int	(*func)();
{
	long	j;
	struct tms	start;
	struct tms	finish;

	srand(1);
	times(&start);
	for (j = 0; j < REPEAT; ++j)
		func(bigrand());
	times(&finish);
	return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);
}

struct table
{
	char	*name;
	int	(*func)();
	double	measurement;
	int	trial;
};

struct table	table[] =
{
	{ "nbits", 	nbits },
	{ "nbits1", 	nbits1 },
	{ "nbits2", 	nbits2 },
	{ "nbits3", 	nbits3 },
	{ "nbits4", 	nbits4 },
	{ "nbits5", 	nbits3 },
	{ "nbits6", 	nbits4 },
};

main()
{
	double	calibration;
	double	best;
	int	j, k, m;

	/*
	 * run a few tests to make sure they all agree
	 */
	srand(getpid());
	for (j = 0; j < 10; ++j)
	{
		unsigned long n;
		int	bad;

		n = bigrand();
		for (k = 0; k < SIZEOF(table); ++k)
			table[k].trial = table[k].func(n);
		bad = 0;
		for (k = 0; k < SIZEOF(table); ++k)
			for (m = 0; m < SIZEOF(table); ++m)
				if (table[k].trial != table[m].trial)
					bad = 1;
		if (bad)
		{
			printf("wrong:\t%08lX", n);
			for (k = 0; k < SIZEOF(table); ++k)
				printf("\t%d", table[k].trial);
			printf("\n");
		}
	}

	/*
	 * calibrate the timing mechanism
	 */
	calibration = measure(calibrate);

	/*
	 * time them all, keeping track of the best (smallest)
	 */
	for (j = 0; j < SIZEOF(table); ++j)
	{
		table[j].measurement = measure(table[j].func) - calibration;
		if (!j || table[j].measurement < best)
			best = table[j].measurement;
	}
	for (j = 0; j < SIZEOF(table); ++j)
	{
		printf
		(
			"%s\t%10.8g\t%6.3f\n",
			table[j].name,
			table[j].measurement,
			table[j].measurement / best
		);
	}
	exit(0);
}

Regards
Peter Miller	UUCP	uunet!munnari!pdact.pd.necisa.oz!pmiller
		ARPA	pmiller%pdact.pd.necisa.oz at australia
/\/\*		CSNET	pmiller%pdact.pd.necisa.oz at au
		ACSnet	pmiller at pdact.pd.necisa.oz
Disclaimer:  The views expressed here are personal and do not necessarily
	reflect the view of my employer or the views or my colleagues.
D