hash function for text in C

Silver gaynor at paul.rutgers.edu
Thu Oct 18 15:13:09 AEST 1990


You must supply definitions of uint8 (an 8-bit unsigned int), uint16
(similarly), and uint32 (likewise).

----------------------------------- hash.h ------------------------------------
/* This is an implementation of Peter K. Pearson's hash algorithm as         */
/* presented in "Fast Hashing of Variable-Length Text Strings", ACM 6/90.    */
/* The conclusion of his article is cited below.                             */
/*                                                                           */
/*   The main advantages of the hashing function presented here are:         */
/*   1. No restriction is placed on the length of the text string.           */
/*   2. The length of the text string does not need to be known beforehand.  */
/*   3. Very little arithmetic is performed on each character being hashed.  */
/*   4. Similar strings are not likely to collide.                           */
/*   5. Minimal, perfect hashing functions can be built in this form.        */
/*  [6. It's average distribution is quite random.]                          */
/*                                                                           */
/*   Its principal disadvantages are:                                        */
/*   1. Output value ranges that are not powers of 2 are somwehat more       */
/*      complicated to provide.                                              */
/*   2. More auxiliary memory (the 256-byte table T) is required by this     */
/*      hashing function than by many traditional functions.                 */
/*  [3. I blew 50 cents copying the article and then had to type it in.]     */
/*                                                                           */
/* Elaborating on advantage 5 above, by fatootsing with the permutation      */
/* table T, one can make hash8 perform perfect minimal hashes.  The author   */
/* provides the following guidelines for construction of such a table:       */
/*                                                                           */
/*   [C[N] are the characters of the string being hashed and h[N] is the Nth */
/*    iteration of the loop.]                                                */
/*                                                                           */
/*   1. A table T was consructed by pseudorandom permutation of the integers */
/*      (0 ... 255).                                                         */
/*   2. One by one, the desired values were assigned to the words in the     */
/*      list.  Each assignment was effected by exchanging two elemnts in the */
/*      table.                                                               */
/*   3. For each word, the first candidate considered for exchange was       */
/*      T[h[n-1] ^ C[n]], the last table element referenced in the           */
/*      computation of the hash function for that word.                      */
/*   4. A table element could not be exchanged if it was referenced during   */
/*      the hashing of a previously assgined word or if it was referenced    */
/*      earlier in the hashing of the same word.                             */
/*   5. If the necessary exchange was forbidden by Rule 4, attention was     */
/*      shifted to the previously referenced table element,                  */
/*      T[[h[n-2]] ^ C[n-1]].                                                */
/*                                                                           */
/*   The procedure is not always successful.  For example, using the ascii   */
/*   character codes, if the word "a" hashes to 0 and the word "i" hashes to */
/*   15, it turns out that the word "in" must hash to 0.  Initial attempts   */
/*   to map Knuth's 31 words onto the integers (0 ... 30) failed for exactly */
/*   this reason.  The shift to the range (1 ... 31) was an ad hoc tactic to */
/*   circumvent this problem.                                                */

/* Hash s into an 8, 16, or 32 bit unsigned integer.  The larger-sized       */
/* values are generated by applying hash8 in parallel.                       */
extern uint8  hash8(uint8 * s);
extern uint16 hash16(uint8 * s);
extern uint32 hash32(uint8 * s);

/* The hash permutation table.                                               */
extern uint32 * hashpermutation;
----------------------------------- hash.c ------------------------------------
#include "hash.h"

static uint8 permutation[256] =
{
 51, 140, 233,  27, 118, 125, 170, 138, 119, 132, 174,  97,  25, 110,   1,  14,
 65,  36,  40, 188,  73, 173,   7,  30,  68,  56, 169, 234, 107, 177, 197,  87,
 28, 210, 186,  67,   2,  15, 115,  48, 223, 148, 211,  57, 190, 104, 213,  49,
144, 172, 147, 124, 157, 238, 167, 183,  78,  75,  58,  22,  70, 103, 181,  12,
254,  41, 198, 168,  46,  79, 241, 156,  83, 128,  66,  60,  86, 141, 161, 176,
221,  54, 192, 252, 116,  95, 206,  35,  88, 133, 154, 250, 237, 253,  85, 178,
 93, 159, 155,  42,   9,  89,   3,  61, 201, 158, 106,  82, 240, 255, 218, 102,
189,   8,  33,   4, 145,  16, 150,  26,  99, 100, 195, 175,  34,  50,  80, 166,
194, 195, 164,  29, 134, 105,  55, 143, 122, 130, 245, 208,  72,  77,  64, 121,
139, 232, 191, 108, 228, 137,  59,  74,  11, 126, 171,   5, 242, 101, 239, 193,
112, 113,  98,  21, 207, 225, 151, 251,  92,  91,  17, 127,  20,  81,  24,   6,
 43, 196, 204, 247, 212, 224, 220,  94,  32,  13, 187, 199, 214,  18, 226,  84,
 71, 231, 165,  19, 202, 217,  90, 129, 136, 153, 182, 111, 244,  45, 236, 249,
109,  47, 180, 205, 215, 160,  53, 162, 114, 246, 179,  62, 227,  96, 142, 230,
184, 146, 117,  39,  69,  37,  23,  63,  52, 216,   0, 135, 149,  31,  38,  44,
209, 120,  76, 203, 229, 123, 131, 152,  10, 219, 243, 248, 235, 222, 200, 163
};

uint8 hash8(register uint8 * s)
  {register static uint8 h;
   if (*s)
       {h = permutation[*s];
        while (*++s)
          h ^= permutation[h ^ *s];
        return(h);}
     else
       {return((uint8)0);}

uint16 hash16(register uint8 * s);
  {register uint8 h1;
   register uint8 h2;
   if (*s)
       {h1 = permutation[*s];
        h2 = permutation[*s + 1];
        while (*++s)
          {h1 ^= permutation[h1 ^ *s];
           h2 ^= permutation[h2 ^ *s];}
        return((uint16)((h1<<8) & h2));}
     else
       {return((uint16)0);}}

uint32 hash32(register uint8 * s);
  {register uint8 h1;
   register uint8 h2;
   register uint8 h3;
   register uint8 h4;
   if (*s)
       {h1 = permutation[*s]
        h2 = permutation[*s + 1];
        h2 = permutation[*s + 2];
        h2 = permutation[*s + 3];
        while (*++s)
          {h1 ^= permutation[h1 ^ *s];
           h2 ^= permutation[h2 ^ *s];
           h3 ^= permutation[h3 ^ *s];
           h4 ^= permutation[h4 ^ *s];}
        return((uint32)((h1<<24) & (h2<<16) & (h3<<8) & h4));}
     else
       {return((uint32)0);}}



More information about the Comp.lang.c mailing list