Bit counting

Bennett Todd bet at dukeac.UUCP
Wed Jan 18 17:59:51 AEST 1989


: This is a sharchive -- extract the files by running through sh.
echo x - README
sed 's/^X//' <<\Shar_Eof >README
XI wanted to see for myself whether the byte-at-a-time table could perform
Xcomparably to the "parallel addition" algorithm presented; so I wrote a
Xlittle benchmark code to compare. I feel (and folks are welcome to disagree
Xwith me here, obviously) that taking advantage of pointer type punning to
Xconvert from shift-and-mask extractions to bump-a-pointer-and-dereference is
Xlegitimate and (as far as I know!) portable, and not unreasonably ugly when
Xyou are already wading down in the bit-bashing.
X
XI did this using GCC 1.31 on a Sun 3/50 with 4M under SunOS 3.5 (no suntools
Xrunning). The machine wasn't quite idle; I had an rlogin to another machine
Xwhere I was reading netnews. However, it was close enough to idle that I
Xthink the user times are reasonable for comparison. I ran it several times
Xand the runs didn't vary by as much as a second in the user time. 1000000
Xiterations didn't have quite enough separation to convince me for sure this
Xwas a solid result, so I bumped it to 5000000.
X
XThe timing results came out:
X
XByte at a time table lookup:   75.0 user
X          Parallel addition:   82.7 user
X
XSo, given this coding style, and this compiler technology, on this sort of
Xarchitecture, it appears that the byte-wise table lookup isn't particularly
Xslower. It also isn't importantly faster. I personally find it more
Xunderstandable.
X
XI compared the output for the first 1000 results (on the output-generating
Xversion) and they were identical, so I hope this code is correctly computing
Xthe desired function. It compiled with the options given in the Makefile
X(optization cranked way up, fairly strict diagnostics) without error or
Xwarning messages.
X
XNote that the generator program in the comment is really a one-shot;
XI built it with a library I run against that performs system and library
Xcall error checking for me, because I am extremely lazy. It should be clear
Xhow to extract that into a stand-alone.
X
XFinally, this is my first try ever at coding a timing benchmark, so it is
Xquite possible I have committed some classic benchmark blunder. I hope not.
XThe separately-compiled "consume" procedure might not have been absolutely
Xnecessary, but it kept me from having to examine assembler or figure out
Xwhat a heavy-duty optimizing compiler might be capable of omitting.
Shar_Eof
echo x - Makefile
sed 's/^X//' <<\Shar_Eof >Makefile
X#
X# Choose one of the following four OPTIONS definitions, which
X# cover generating 1000-point validation versions on each algorithm,
X# and generating 5000000-point silent versions for timing.
X#
X
X# OPTIONS=-DLIM=5000000 -DBENCHMARK -DBYTE_ORIENTED_TABLE
XOPTIONS=-DLIM=5000000 -DBENCHMARK -DPARALLEL_ADDITION
X# OPTIONS=-DLIM=1000 -DVERIFY -DBYTE_ORIENTED_TABLE
X# OPTIONS=-DLIM=1000 -DVERIFY -DPARALLEL_ADDITION
X
XOBJ=bench.o consume.o
X
XCC=gcc
XCFLAGS=-O -g -Wall -Wwrite-strings -msoft-float -fstrength-reduce \
X       -finline-functions -I/usr/local/include $(OPTIONS)
X
Xbench : $(OBJ)
X	$(CC) $(CFLAGS) -o $@ $(OBJ)
X
X$(OBJ) : Makefile
Shar_Eof
echo x - bench.c
sed 's/^X//' <<\Shar_Eof >bench.c
X#include <stdio.h>
X
X#ifndef LIM
X#define LIM 1000000
X#endif
X
Xvoid consume(int);
Xint bitcount(int);
X
Xint main() {
X    int i;
X
X    for (i=0; i<LIM; i++)
X        consume(bitcount(i));
X
X    return(0);
X}
X
X#ifdef BYTE_ORIENTED_TABLE
Xint bitcount(int num) {
X    register unsigned char *ptr = (unsigned char *) #
X    register int count = 0;
X    /*
X     * The following table was generated, of course. Here's the
X     * (one-shot, quick-and-dirty) generating program (derived
X     * from a program in
X     *   Harbison and Steele, "C: A Reference Manual", Second Edition
X     * on page 175):
X     *
X     *  #include <bent.h>
X     *
X     *  char syntax_args[] = "";
X     *
X     *  int main(int argc, char **argv) {
X     *      register unsigned i;
X     *
X     *      progname = argv[0];
X     *      if (argc != 1)
X     *          syntax();
X     *
X     *      for (i = 0; i<256; i++) {
X     *          register unsigned tmp = i;
X     *          register int count = 0;
X     *          while (tmp != 0) {
X     *              tmp ^= (tmp & -tmp);
X     *              count++;
X     *          }
X     *          eprintf("%d%s", count, (i%32 == 31) ? ",\n" : ",");
X     *      }
X     *      return(0);
X     *  }
X     */
X    static char bits_per_byte[] = {
X        0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
X        1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
X        1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
X        2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
X        1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
X        2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
X        2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
X        3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8,
X    };
X
X    /*
X     * The following lines are the loop:
X     *
X     *  for (counter = 0; counter < sizeof(int); counter++)
X     *      count += bits_per_byte[*ptr++];
X     *
X     * unrolled.
X     */
X    count += bits_per_byte[*ptr++];
X    count += bits_per_byte[*ptr++];
X    count += bits_per_byte[*ptr++];
X    count += bits_per_byte[*ptr++];
X    return(count);
X}
X#endif /* BYTE_ORIENTED_TABLE */
X
X#ifdef PARALLEL_ADDITION
X/*
X  Module: l_bitcnt.c
X  Author: Richard E. K. Neitzel
X
Xrevision history
X----------------
X1.0,9jan89,rekn            written
X
X
X  Explanation:
X      This routine takes an integer and returns the number of bits it contains
X      which are set. It does this via 'parallel' addition, turning the integer
X      into sets of bit pairs of increasing size. WARNING! This assumes that
X      integers are 32 bits! 
X*/
X
X/*
X * Excised from netnews posting, and entry point renamed "bitcount";
X * changes made for ANSI C.
X *
X * In other words, this is Richard E. K. Neitzel's work, not mine, but
X * I have monkeyed with it, so don't blame him.
X * Bennett Todd
X */
X
Xint bitcount(int word) {
X    register int j, k;      /* Our work area */
X
X    if (!word)          /* Why bother? */
X      return(0);
X
X    j = word;
X
X    k = j & 0x55555555;     /* Clear every other bit */
X    j >>= 1;            /* Do again for cleared bits */
X    j &= 0x55555555;
X    j += k;         /* 1st pairs done */
X
X    k = j & 0x33333333;     /* Clear every two bits */
X    j >>= 2;
X    j &= 0x33333333;
X    j += k;         /* 2nd pairs done */
X
X    k = j & 0xffff;     /* Only need last 4 sets */
X    j &= 0xffff0000;        /* and top 4 here */
X    j = j >> 16;        /* ready - */
X    j += k;         /* add 3rd pairs */
X
X    k = j & 0xf0f;      /* Clear bits */
X    j >>= 4;            /* Repeat */
X    j &= 0xf0f;
X    j += k;         /* add 4th pairs */
X
X    k = j & 0xff;       /* Now add the 8 bit pairs */
X    j >>= 8;
X    j += k;
X
X    return(j);          /* bye */
X}
X#endif /* PARALLEL_ADDITION */
Shar_Eof
echo x - consume.c
sed 's/^X//' <<\Shar_Eof >consume.c
X#include <stdio.h>
X
Xint printf(const char *, ...);
X
Xvoid consume(int val) {
X
X#ifdef VERIFY
X    (void) printf("%d\n", val);
X#endif /* VERIFY */
X
X#ifdef BENCHMARK
X    /* Then we simply return.... */
X#endif /* BENCHMARK */
X
X}
Shar_Eof
exit 0

-Bennett
bet at orion.mc.duke.edu



More information about the Comp.lang.c mailing list