Skip to content

Commit

Permalink
hash: switch to simpler "fast algorithm" (#173)
Browse files Browse the repository at this point in the history
Bob Jenkins's Lookup3 hash function[1] added
in commit 9ff3373 ("add odict") might
be faster than Bob Jenkins's One-At-A-Time
hash function[2] used previously, but its
default implementation can trigger crashes
because it reads data past the end of the
buffer being processed.

One has to define VALGRIND when building to
prevent the function from reading past a
buffer whose size is not a multiple of 4 bytes.

Instead of fixing the lookup3 implementation
by simply defaulting to the "VALGRIND" variant,
the 32-bit FNV1-a[3] hash was chosen to replace
the lookup3 hash, because
- it's simpler,
- it's smaller,
- it's fast enough for short keys,
- it's good enough collision-wise.

This conclusion is supported by running
the Perl testsuite built with various
hash algorithms: FNV1-a is faster than
One-At-A-Time[4] when used to implement
a hash table, and it could be faster than
other algorithms that seem to be faster
than lookup3[4][5].

Faster hash algorithms seem to rely on
hardware acceleration (SIMD, AES, etc.);
unfortunately that makes them more complex
and, more importantly, less portable.

Exceptions exist, such as xxHash[6], but
it's a large amount of code, and its gain
hardly justifies the cost of importing it.

SipHash[7] has gained many users over the years,
but would be slower than FNV1-a for small
strings.

So FNV1-a should be enough for re library
usage.

One-At-A-Time is kept as is because the public
API relies on HTTP and SIP header identifiers
being equal to their hashed string value.

https://www.burtleburtle.net/bob/hash/#lookup

[1] https://www.burtleburtle.net/bob/c/lookup3.c
[2] https://www.burtleburtle.net/bob/hash/doobs.html#one
[3] http://www.isthe.com/chongo/tech/comp/fnv/
[4] https://github.com/rurban/perl-hash-stats
[5] https://github.com/rurban/smhasher
[6] https://github.com/Cyan4973/xxHash#small-data
  • Loading branch information
ydroneaud authored Oct 26, 2021
1 parent 461b74c commit e923044
Showing 1 changed file with 18 additions and 206 deletions.
224 changes: 18 additions & 206 deletions src/hash/func.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
#include <re_hash.h>


/* 32-bit FNV-1a parameters: the initial hash value (offset basis) and the
 * prime the hash is multiplied by after each input byte */
#define FNV1_32A_INIT UINT32_C(0x811c9dc5)
#define FNV_32_PRIME UINT32_C(0x01000193)


/**
* Calculate hash-value using "Jenkins One-at-a-time" hash algorithm.
*
Expand Down Expand Up @@ -136,207 +140,6 @@ uint32_t hash_joaat_pl_ci(const struct pl *pl)
}


/*
 * Best-effort compile-time byte-order detection. This may need
 * adjustment for toolchains not covered below.
 *
 * HASH_LITTLE_ENDIAN or HASH_BIG_ENDIAN is defined to 1 when
 * __BYTE_ORDER, or one of the listed architecture macros, identifies
 * the byte order; the other one is defined to 0. Both are defined to 0
 * when the byte order cannot be determined.
 */
#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
__BYTE_ORDER == __LITTLE_ENDIAN) || \
(defined(i386) || defined(__i386__) || defined(__i486__) || \
defined(__i586__) || defined(__i686__) || \
defined(vax) || defined(MIPSEL))
# define HASH_LITTLE_ENDIAN 1
# define HASH_BIG_ENDIAN 0
#elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
__BYTE_ORDER == __BIG_ENDIAN) || \
(defined(sparc) || defined(POWERPC) || \
defined(mc68000) || defined(sel))
# define HASH_LITTLE_ENDIAN 0
# define HASH_BIG_ENDIAN 1
#else
/* Unknown byte order: neither flag is set */
# define HASH_LITTLE_ENDIAN 0
# define HASH_BIG_ENDIAN 0
#endif

/* 2^n as a uint32_t, and the matching low-bit mask (2^n - 1) */
#define hashsize(n) ((uint32_t)1<<(n))
#define hashmask(n) (hashsize(n)-1)

/*
 * Rotate x left by k bits within 32 bits. x is expected to be a
 * uint32_t and k to be in 1..31: a shift by 0 or 32 would make one of
 * the two shifts undefined.
 */
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))

/*
 * Mix the three 32-bit state words a, b and c in place.
 *
 * Each argument is evaluated several times and assigned to, so pass
 * plain uint32_t lvalues without side effects. The do/while (0) wrapper
 * makes the expansion a single statement, so "mix(a,b,c);" is safe as
 * the body of an unbraced if/else.
 */
#define mix(a,b,c) do { \
	(a) -= (c); (a) ^= rot((c), 4); (c) += (b); \
	(b) -= (a); (b) ^= rot((a), 6); (a) += (c); \
	(c) -= (b); (c) ^= rot((b), 8); (b) += (a); \
	(a) -= (c); (a) ^= rot((c),16); (c) += (b); \
	(b) -= (a); (b) ^= rot((a),19); (a) += (c); \
	(c) -= (b); (c) ^= rot((b), 4); (b) += (a); \
} while (0)


/*
 * Final mixing of the three 32-bit state words a, b and c in place.
 *
 * Same argument rules as mix(): plain uint32_t lvalues, no side
 * effects. Expands to a single statement.
 */
#define final(a,b,c) do { \
	(c) ^= (b); (c) -= rot((b),14); \
	(a) ^= (c); (a) -= rot((c),11); \
	(b) ^= (a); (b) -= rot((a),25); \
	(c) ^= (b); (c) -= rot((b),16); \
	(a) ^= (c); (a) -= rot((c),4); \
	(b) ^= (a); (b) -= rot((a),14); \
	(c) ^= (b); (c) -= rot((b),24); \
} while (0)


/*
 * Hash a byte buffer into a 32-bit value.
 *
 * key     - buffer to hash
 * length  - number of bytes in key
 * initval - seed added into the initial state
 *
 * Returns the 32-bit hash. For length 0 the initial state is returned
 * without the final mixing step.
 *
 * The input is consumed 12 bytes at a time (three 32-bit words mixed
 * into a, b, c); the last 1..12 bytes are added and finalized
 * separately. On targets flagged HASH_LITTLE_ENDIAN, 32-bit or 16-bit
 * aligned keys are read in wider units; otherwise the words are
 * assembled byte by byte, least significant byte first.
 *
 * No path reads beyond key + length: the tail of the 32-bit aligned
 * path picks up the trailing 1..3 bytes individually instead of loading
 * a whole word and masking it. The resulting values are the same as
 * with the masked loads.
 */
static uint32_t hashlittle(const void *key, size_t length, uint32_t initval)
{
	uint32_t a, b, c;
	union { const void *ptr; size_t i; } u;

	/* Set up the internal state */
	a = b = c = 0xdeadbeef + ((uint32_t)length) + initval;

	/* View the pointer as an integer to test its alignment */
	u.ptr = key;

	if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
		/* 32-bit aligned key: read whole words */
		const uint32_t *k = (const uint32_t *)key;
		const uint8_t *k8;

		while (length > 12) {
			a += k[0];
			b += k[1];
			c += k[2];
			mix(a, b, c);
			length -= 12;
			k += 3;
		}

		/* Complete words are read as words, leftover bytes one by one */
		k8 = (const uint8_t *)k;

		switch (length) {

		case 12: c += k[2]; b += k[1]; a += k[0]; break;
		case 11: c += ((uint32_t)k8[10])<<16;  /* fall through */
		case 10: c += ((uint32_t)k8[9])<<8;    /* fall through */
		case 9 : c += k8[8];                   /* fall through */
		case 8 : b += k[1]; a += k[0]; break;
		case 7 : b += ((uint32_t)k8[6])<<16;   /* fall through */
		case 6 : b += ((uint32_t)k8[5])<<8;    /* fall through */
		case 5 : b += k8[4];                   /* fall through */
		case 4 : a += k[0]; break;
		case 3 : a += ((uint32_t)k8[2])<<16;   /* fall through */
		case 2 : a += ((uint32_t)k8[1])<<8;    /* fall through */
		case 1 : a += k8[0]; break;
		case 0 : return c;
		}
	}
	else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
		/* 16-bit aligned key: read half-words */
		const uint16_t *k = (const uint16_t *)key;
		const uint8_t *k8;

		while (length > 12) {
			a += k[0] + (((uint32_t)k[1])<<16);
			b += k[2] + (((uint32_t)k[3])<<16);
			c += k[4] + (((uint32_t)k[5])<<16);
			mix(a, b, c);
			length -= 12;
			k += 6;
		}

		/* An odd trailing byte is read through k8 */
		k8 = (const uint8_t *)k;

		switch (length) {

		case 12: c += k[4]+(((uint32_t)k[5])<<16);
			 b += k[2]+(((uint32_t)k[3])<<16);
			 a += k[0]+(((uint32_t)k[1])<<16);
			 break;
		case 11: c += ((uint32_t)k8[10])<<16;  /* fall through */
		case 10: c += k[4];
			 b += k[2]+(((uint32_t)k[3])<<16);
			 a += k[0]+(((uint32_t)k[1])<<16);
			 break;
		case 9 : c += k8[8];                   /* fall through */
		case 8 : b += k[2]+(((uint32_t)k[3])<<16);
			 a += k[0]+(((uint32_t)k[1])<<16);
			 break;
		case 7 : b += ((uint32_t)k8[6])<<16;   /* fall through */
		case 6 : b += k[2];
			 a += k[0]+(((uint32_t)k[1])<<16);
			 break;
		case 5 : b += k8[4];                   /* fall through */
		case 4 : a += k[0]+(((uint32_t)k[1])<<16);
			 break;
		case 3 : a += ((uint32_t)k8[2])<<16;   /* fall through */
		case 2 : a += k[0];
			 break;
		case 1 : a += k8[0];
			 break;
		case 0 : return c;
		}
	}
	else {
		/* Unaligned key, or byte order not known to be little-endian:
		 * assemble each word from single bytes */
		const uint8_t *k = (const uint8_t *)key;

		while (length > 12) {
			a += k[0];
			a += ((uint32_t)k[1])<<8;
			a += ((uint32_t)k[2])<<16;
			a += ((uint32_t)k[3])<<24;
			b += k[4];
			b += ((uint32_t)k[5])<<8;
			b += ((uint32_t)k[6])<<16;
			b += ((uint32_t)k[7])<<24;
			c += k[8];
			c += ((uint32_t)k[9])<<8;
			c += ((uint32_t)k[10])<<16;
			c += ((uint32_t)k[11])<<24;
			mix(a, b, c);
			length -= 12;
			k += 12;
		}

		/* all the case statements fall through */
		switch (length) {

		case 12: c += ((uint32_t)k[11])<<24;   /* fall through */
		case 11: c += ((uint32_t)k[10])<<16;   /* fall through */
		case 10: c += ((uint32_t)k[9])<<8;     /* fall through */
		case 9 : c += k[8];                    /* fall through */
		case 8 : b += ((uint32_t)k[7])<<24;    /* fall through */
		case 7 : b += ((uint32_t)k[6])<<16;    /* fall through */
		case 6 : b += ((uint32_t)k[5])<<8;     /* fall through */
		case 5 : b += k[4];                    /* fall through */
		case 4 : a += ((uint32_t)k[3])<<24;    /* fall through */
		case 3 : a += ((uint32_t)k[2])<<16;    /* fall through */
		case 2 : a += ((uint32_t)k[1])<<8;     /* fall through */
		case 1 : a += k[0];
			 break;
		case 0 : return c;
		}
	}

	final(a, b, c);
	return c;
}


/**
* Calculate hash-value using fast hash algorithm.
*
Expand All @@ -347,12 +150,14 @@ static uint32_t hashlittle( const void *key, size_t length, uint32_t initval)
*/
uint32_t hash_fast(const char *k, size_t len)
{
	/*
	 * 32-bit FNV-1a over the len bytes at k: start from FNV1_32A_INIT,
	 * then for every byte XOR it into the hash and multiply by
	 * FNV_32_PRIME (the multiplication wraps modulo 2^32).
	 *
	 * Returns the hash: FNV1_32A_INIT when len is 0, and 0 when k is
	 * NULL.
	 */
	uint32_t h = FNV1_32A_INIT;

	if (!k)
		return 0;

	while (len--) {
		/*
		 * NOTE(review): where plain char is signed, bytes >= 0x80
		 * are sign-extended by this cast, so the result differs
		 * from reference FNV-1a for such input. hash_fast_str()
		 * hashes bytes the same way; change both together if this
		 * is ever tightened.
		 */
		h ^= (uint32_t)*k++;
		h *= FNV_32_PRIME;
	}

	return h;
}


Expand All @@ -365,5 +170,12 @@ uint32_t hash_fast(const char *k, size_t len)
*/
uint32_t hash_fast_str(const char *str)
{
	/*
	 * 32-bit FNV-1a over the bytes of a NUL-terminated string, the
	 * terminator excluded. Uses the same per-byte steps as hash_fast(),
	 * done in a single pass without measuring the string first.
	 *
	 * Returns the hash: FNV1_32A_INIT for an empty string, and 0 when
	 * str is NULL.
	 */
	uint32_t h = FNV1_32A_INIT;

	/* Same NULL convention as hash_fast(): a NULL string hashes to 0 */
	if (!str)
		return 0;

	while (*str) {
		h ^= (uint32_t)*str++;
		h *= FNV_32_PRIME;
	}

	return h;
}

0 comments on commit e923044

Please sign in to comment.