This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
libiberty PATCH: add generic iterative hash to hashtable.c
- From: Jason Merrill <jason at redhat dot com>
- To: gcc-patches at gcc dot gnu dot org
- Date: Wed, 07 May 2003 13:55:54 -0400
- Subject: libiberty PATCH: add generic iterative hash to hashtable.c
The hashtable code currently has hash functions for single pointers and
strings, but doesn't provide a way to generate a hash value from multiple
bits of data. I want something like that for the tree-ssa gimplifier, for
hashing arbitrary tree expressions; I want to at least hash the tree code
and the operands. Looking around online, I found the following algorithm.
Zack mentions it in the comment for htab_hash_string as being inferior to
his algorithm for hashing identifiers, but my needs are more general.
I expect that this will be useful to other folks, too.
Applied to trunk and tree-ssa.
2003-05-07 Jason Merrill <jason@redhat.com>
libiberty/
* hashtab.c (burtle_hash): New fn.
* configure.in: Add AC_C_BIGENDIAN_CROSS.
* aclocal.m4: Include accross.m4.
* configure, config.in: Regenerate.
include/
* hashtab.h (burtle_hash): Prototype.
(burtle_hash_object): New macro.
*** libiberty/aclocal.m4.~1~ 2003-02-25 17:24:21.000000000 -0500
--- libiberty/aclocal.m4 2003-05-07 11:59:24.000000000 -0400
***************
*** 1,3 ****
--- 1,5 ----
+ sinclude(../config/accross.m4)
+
dnl See whether strncmp reads past the end of its string parameters.
dnl On some versions of SunOS4 at least, strncmp reads a word at a time
dnl but erroneously reads past the end of strings. This can cause
*** libiberty/configure.in.~1~ 2003-05-07 11:14:00.000000000 -0400
--- libiberty/configure.in 2003-05-07 11:48:50.000000000 -0400
*************** AC_SUBST(OUTPUT_OPTION)
*** 114,119 ****
--- 114,120 ----
AC_ISC_POSIX
AC_C_CONST
AC_C_INLINE
+ AC_C_BIGENDIAN_CROSS
dnl When we start using libtool:
dnl Default to a non shared library. This may be overridden by the
*** libiberty/hashtab.c.~1~ 2003-05-07 11:18:31.000000000 -0400
--- libiberty/hashtab.c 2003-05-07 13:35:31.000000000 -0400
*************** htab_hash_string (p)
*** 709,711 ****
--- 709,849 ----
return r;
}
+
+ /* DERIVED FROM:
+ --------------------------------------------------------------------
+ lookup2.c, by Bob Jenkins, December 1996, Public Domain.
+ hash(), hash2(), hash3, and mix() are externally useful functions.
+ Routines to test the hash are included if SELF_TEST is defined.
+ You can use this free for any purpose. It has no warranty.
+ --------------------------------------------------------------------
+ */
+
+ /*
+ --------------------------------------------------------------------
+ mix -- mix 3 32-bit values reversibly.
+ For every delta with one or two bit set, and the deltas of all three
+ high bits or all three low bits, whether the original value of a,b,c
+ is almost all zero or is uniformly distributed,
+ * If mix() is run forward or backward, at least 32 bits in a,b,c
+ have at least 1/4 probability of changing.
+ * If mix() is run forward, every bit of c will change between 1/3 and
+ 2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.)
+ mix() was built out of 36 single-cycle latency instructions in a
+ structure that could supported 2x parallelism, like so:
+ a -= b;
+ a -= c; x = (c>>13);
+ b -= c; a ^= x;
+ b -= a; x = (a<<8);
+ c -= a; b ^= x;
+ c -= b; x = (b>>13);
+ ...
+ Unfortunately, superscalar Pentiums and Sparcs can't take advantage
+ of that parallelism. They've also turned some of those single-cycle
+ latency instructions into multi-cycle latency instructions. Still,
+ this is the fastest good hash I could find. There were about 2^^68
+ to choose from. I only looked at a billion or so.
+ --------------------------------------------------------------------
+ */
+ /* same, but slower, works on systems that might have 8 byte hashval_t's */
+ #define mix(a,b,c) \
+ { \
+ a -= b; a -= c; a ^= (c>>13); \
+ b -= c; b -= a; b ^= (a<< 8); \
+ c -= a; c -= b; c ^= ((b&0xffffffff)>>13); \
+ a -= b; a -= c; a ^= ((c&0xffffffff)>>12); \
+ b -= c; b -= a; b = (b ^ (a<<16)) & 0xffffffff; \
+ c -= a; c -= b; c = (c ^ (b>> 5)) & 0xffffffff; \
+ a -= b; a -= c; a = (a ^ (c>> 3)) & 0xffffffff; \
+ b -= c; b -= a; b = (b ^ (a<<10)) & 0xffffffff; \
+ c -= a; c -= b; c = (c ^ (b>>15)) & 0xffffffff; \
+ }
+
+ /*
+ --------------------------------------------------------------------
+ hash() -- hash a variable-length key into a 32-bit value
+ k : the key (the unaligned variable-length array of bytes)
+ len : the length of the key, counting by bytes
+ level : can be any 4-byte value
+ Returns a 32-bit value. Every bit of the key affects every bit of
+ the return value. Every 1-bit and 2-bit delta achieves avalanche.
+ About 36+6len instructions.
+
+ The best hash table sizes are powers of 2. There is no need to do
+ mod a prime (mod is sooo slow!). If you need less than 32 bits,
+ use a bitmask. For example, if you need only 10 bits, do
+ h = (h & hashmask(10));
+ In which case, the hash table should have hashsize(10) elements.
+
+ If you are hashing n strings (ub1 **)k, do it like this:
+ for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
+
+ By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use this
+ code any way you wish, private, educational, or commercial. It's free.
+
+ See http://burtleburtle.net/bob/hash/evahash.html
+ Use for hash table lookup, or anything where one collision in 2^32 is
+ acceptable. Do NOT use for cryptographic purposes.
+ --------------------------------------------------------------------
+ */
+
+ hashval_t burtle_hash (k_in, length, initval)
+ const PTR k_in; /* the key */
+ register size_t length; /* the length of the key */
+ register hashval_t initval; /* the previous hash, or an arbitrary value */
+ {
+ register const unsigned char *k = (const unsigned char *)k_in;
+ register hashval_t a,b,c,len;
+
+ /* Set up the internal state */
+ len = length;
+ a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */
+ c = initval; /* the previous hash value */
+
+ /*---------------------------------------- handle most of the key */
+ #ifndef WORDS_BIGENDIAN
+ /* On a little-endian machine, if the data is 4-byte aligned we can hash
+ by word for better speed. This gives nondeterministic results on
+ big-endian machines. */
+ if (sizeof (hashval_t) == 4 && (((size_t)k)&3) == 0)
+ while (len >= 12) /* aligned */
+ {
+ a += *(hashval_t *)(k+0);
+ b += *(hashval_t *)(k+4);
+ c += *(hashval_t *)(k+8);
+ mix(a,b,c);
+ k += 12; len -= 12;
+ }
+ else /* unaligned */
+ #endif
+ while (len >= 12)
+ {
+ a += (k[0] +((hashval_t)k[1]<<8) +((hashval_t)k[2]<<16) +((hashval_t)k[3]<<24));
+ b += (k[4] +((hashval_t)k[5]<<8) +((hashval_t)k[6]<<16) +((hashval_t)k[7]<<24));
+ c += (k[8] +((hashval_t)k[9]<<8) +((hashval_t)k[10]<<16)+((hashval_t)k[11]<<24));
+ mix(a,b,c);
+ k += 12; len -= 12;
+ }
+
+ /*------------------------------------- handle the last 11 bytes */
+ c += length;
+ switch(len) /* all the case statements fall through */
+ {
+ case 11: c+=((hashval_t)k[10]<<24);
+ case 10: c+=((hashval_t)k[9]<<16);
+ case 9 : c+=((hashval_t)k[8]<<8);
+ /* the first byte of c is reserved for the length */
+ case 8 : b+=((hashval_t)k[7]<<24);
+ case 7 : b+=((hashval_t)k[6]<<16);
+ case 6 : b+=((hashval_t)k[5]<<8);
+ case 5 : b+=k[4];
+ case 4 : a+=((hashval_t)k[3]<<24);
+ case 3 : a+=((hashval_t)k[2]<<16);
+ case 2 : a+=((hashval_t)k[1]<<8);
+ case 1 : a+=k[0];
+ /* case 0: nothing left to add */
+ }
+ mix(a,b,c);
+ /*-------------------------------------------- report the result */
+ return c;
+ }
*** include/hashtab.h.~1~ 2003-05-07 11:18:31.000000000 -0400
--- include/hashtab.h 2003-05-07 13:35:05.000000000 -0400
*************** extern htab_eq htab_eq_pointer;
*** 183,188 ****
--- 183,193 ----
/* A hash function for null-terminated strings. */
extern hashval_t htab_hash_string PARAMS ((const PTR));
+ /* An iterative hash function for arbitrary data. */
+ extern hashval_t burtle_hash PARAMS ((const PTR, size_t, hashval_t));
+ /* Shorthand for hashing something with an intrinsic size. */
+ #define burtle_hash_object(OB,INIT) burtle_hash (&OB, sizeof (OB), INIT)
+
#ifdef __cplusplus
}
#endif /* __cplusplus */