dict: Update hash function

Update hash function from classic Jenkins OAAT (dict.c) and a variant of
DJB2 (hash.c) to "GoodOAAT" taken from the SMHasher repo. This hash
function passes all SMHasher tests.
This commit is contained in:
Nick Wellnhofer 2023-09-04 16:07:23 +02:00
parent 93e8bb2a40
commit edc2dd48cb
4 changed files with 120 additions and 85 deletions

83
dict.c
View File

@ -92,7 +92,7 @@ struct _xmlDictEntry {
const xmlChar *name; const xmlChar *name;
unsigned int len; unsigned int len;
int valid; int valid;
unsigned long okey; unsigned okey;
}; };
typedef struct _xmlDictStrings xmlDictStrings; typedef struct _xmlDictStrings xmlDictStrings;
@ -374,34 +374,28 @@ found_pool:
* *
* Calculate a hash key using a good hash function that works well for * Calculate a hash key using a good hash function that works well for
* larger hash table sizes. * larger hash table sizes.
*
* Hash function by "One-at-a-Time Hash" see
* http://burtleburtle.net/bob/hash/doobs.html
*/ */
#ifdef __clang__ #ifdef __clang__
ATTRIBUTE_NO_SANITIZE("unsigned-integer-overflow") ATTRIBUTE_NO_SANITIZE("unsigned-integer-overflow")
ATTRIBUTE_NO_SANITIZE("unsigned-shift-base") ATTRIBUTE_NO_SANITIZE("unsigned-shift-base")
#endif #endif
static uint32_t static unsigned
xmlDictComputeBigKey(const xmlChar* data, int namelen, unsigned seed) { xmlDictComputeBigKey(const xmlChar* data, int namelen, unsigned seed) {
uint32_t hash; unsigned h1, h2;
int i; int i;
if (namelen <= 0 || data == NULL) return(0); if (namelen <= 0 || data == NULL) return(0);
hash = seed; HASH_INIT(h1, h2, seed);
for (i = 0;i < namelen; i++) { for (i = 0;i < namelen; i++) {
hash += data[i]; HASH_UPDATE(h1, h2, data[i]);
hash += (hash << 10);
hash ^= (hash >> 6);
} }
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash; HASH_FINISH(h1, h2);
return h2;
} }
/* /*
@ -419,34 +413,27 @@ xmlDictComputeBigKey(const xmlChar* data, int namelen, unsigned seed) {
ATTRIBUTE_NO_SANITIZE("unsigned-integer-overflow") ATTRIBUTE_NO_SANITIZE("unsigned-integer-overflow")
ATTRIBUTE_NO_SANITIZE("unsigned-shift-base") ATTRIBUTE_NO_SANITIZE("unsigned-shift-base")
#endif #endif
static unsigned long static unsigned
xmlDictComputeBigQKey(const xmlChar *prefix, int plen, xmlDictComputeBigQKey(const xmlChar *prefix, int plen,
const xmlChar *name, int len, unsigned seed) const xmlChar *name, int len, unsigned seed)
{ {
uint32_t hash; unsigned h1, h2;
int i; int i;
hash = seed; HASH_INIT(h1, h2, seed);
for (i = 0;i < plen; i++) { for (i = 0; i < plen; i++) {
hash += prefix[i]; HASH_UPDATE(h1, h2, prefix[i]);
hash += (hash << 10);
hash ^= (hash >> 6);
} }
hash += ':'; HASH_UPDATE(h1, h2, ':');
hash += (hash << 10);
hash ^= (hash >> 6);
for (i = 0;i < len; i++) { for (i = 0; i < len; i++) {
hash += name[i]; HASH_UPDATE(h1, h2, name[i]);
hash += (hash << 10);
hash ^= (hash >> 6);
} }
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash; HASH_FINISH(h1, h2);
return h2;
} }
#endif /* WITH_BIG_KEY */ #endif /* WITH_BIG_KEY */
@ -456,9 +443,13 @@ xmlDictComputeBigQKey(const xmlChar *prefix, int plen,
* Calculate a hash key using a fast hash function that works well * Calculate a hash key using a fast hash function that works well
* for low hash table fill. * for low hash table fill.
*/ */
static unsigned long #ifdef __clang__
ATTRIBUTE_NO_SANITIZE("unsigned-integer-overflow")
ATTRIBUTE_NO_SANITIZE("unsigned-shift-base")
#endif
static unsigned
xmlDictComputeFastKey(const xmlChar *name, int namelen, unsigned seed) { xmlDictComputeFastKey(const xmlChar *name, int namelen, unsigned seed) {
unsigned long value = seed; unsigned value = seed;
if ((name == NULL) || (namelen <= 0)) if ((name == NULL) || (namelen <= 0))
return(value); return(value);
@ -500,11 +491,15 @@ xmlDictComputeFastKey(const xmlChar *name, int namelen, unsigned seed) {
* *
* Neither of the two strings must be NULL. * Neither of the two strings must be NULL.
*/ */
static unsigned long #ifdef __clang__
ATTRIBUTE_NO_SANITIZE("unsigned-integer-overflow")
ATTRIBUTE_NO_SANITIZE("unsigned-shift-base")
#endif
static unsigned
xmlDictComputeFastQKey(const xmlChar *prefix, int plen, xmlDictComputeFastQKey(const xmlChar *prefix, int plen,
const xmlChar *name, int len, unsigned seed) const xmlChar *name, int len, unsigned seed)
{ {
unsigned long value = seed; unsigned value = seed;
if (plen == 0) if (plen == 0)
value += 30 * ':'; value += 30 * ':';
@ -669,12 +664,12 @@ xmlDictReference(xmlDictPtr dict) {
*/ */
static int static int
xmlDictGrow(xmlDictPtr dict, size_t size) { xmlDictGrow(xmlDictPtr dict, size_t size) {
unsigned long key, okey; unsigned key, okey;
size_t oldsize, i; size_t oldsize, i;
xmlDictEntryPtr iter, next; xmlDictEntryPtr iter, next;
struct _xmlDictEntry *olddict; struct _xmlDictEntry *olddict;
#ifdef DEBUG_GROW #ifdef DEBUG_GROW
unsigned long nbElem = 0; unsigned nbElem = 0;
#endif #endif
int ret = 0; int ret = 0;
int keep_keys = 1; int keep_keys = 1;
@ -861,7 +856,7 @@ xmlDictFree(xmlDictPtr dict) {
*/ */
const xmlChar * const xmlChar *
xmlDictLookup(xmlDictPtr dict, const xmlChar *name, int len) { xmlDictLookup(xmlDictPtr dict, const xmlChar *name, int len) {
unsigned long key, okey, nbi = 0; unsigned key, okey, nbi = 0;
xmlDictEntryPtr entry; xmlDictEntryPtr entry;
xmlDictEntryPtr insert; xmlDictEntryPtr insert;
const xmlChar *ret; const xmlChar *ret;
@ -914,7 +909,7 @@ xmlDictLookup(xmlDictPtr dict, const xmlChar *name, int len) {
} }
if (dict->subdict) { if (dict->subdict) {
unsigned long skey; unsigned skey;
/* we cannot always reuse the same okey for the subdict */ /* we cannot always reuse the same okey for the subdict */
if (((dict->size == MIN_DICT_SIZE) && if (((dict->size == MIN_DICT_SIZE) &&
@ -1004,7 +999,7 @@ xmlDictLookup(xmlDictPtr dict, const xmlChar *name, int len) {
*/ */
const xmlChar * const xmlChar *
xmlDictExists(xmlDictPtr dict, const xmlChar *name, int len) { xmlDictExists(xmlDictPtr dict, const xmlChar *name, int len) {
unsigned long key, okey; unsigned key, okey;
xmlDictEntryPtr insert; xmlDictEntryPtr insert;
unsigned int l; unsigned int l;
@ -1053,7 +1048,7 @@ xmlDictExists(xmlDictPtr dict, const xmlChar *name, int len) {
} }
if (dict->subdict) { if (dict->subdict) {
unsigned long skey; unsigned skey;
/* we cannot always reuse the same okey for the subdict */ /* we cannot always reuse the same okey for the subdict */
if (((dict->size == MIN_DICT_SIZE) && if (((dict->size == MIN_DICT_SIZE) &&
@ -1110,7 +1105,7 @@ xmlDictExists(xmlDictPtr dict, const xmlChar *name, int len) {
*/ */
const xmlChar * const xmlChar *
xmlDictQLookup(xmlDictPtr dict, const xmlChar *prefix, const xmlChar *name) { xmlDictQLookup(xmlDictPtr dict, const xmlChar *prefix, const xmlChar *name) {
unsigned long okey, key, nbi = 0; unsigned okey, key, nbi = 0;
xmlDictEntryPtr entry; xmlDictEntryPtr entry;
xmlDictEntryPtr insert; xmlDictEntryPtr insert;
const xmlChar *ret; const xmlChar *ret;
@ -1146,7 +1141,7 @@ xmlDictQLookup(xmlDictPtr dict, const xmlChar *prefix, const xmlChar *name) {
} }
if (dict->subdict) { if (dict->subdict) {
unsigned long skey; unsigned skey;
/* we cannot always reuse the same okey for the subdict */ /* we cannot always reuse the same okey for the subdict */
if (((dict->size == MIN_DICT_SIZE) && if (((dict->size == MIN_DICT_SIZE) &&

76
hash.c
View File

@ -81,88 +81,88 @@ struct _xmlHashTable {
ATTRIBUTE_NO_SANITIZE("unsigned-integer-overflow") ATTRIBUTE_NO_SANITIZE("unsigned-integer-overflow")
ATTRIBUTE_NO_SANITIZE("unsigned-shift-base") ATTRIBUTE_NO_SANITIZE("unsigned-shift-base")
#endif #endif
static unsigned long static unsigned
xmlHashComputeKey(xmlHashTablePtr table, const xmlChar *name, xmlHashComputeKey(xmlHashTablePtr table, const xmlChar *name,
const xmlChar *name2, const xmlChar *name3) { const xmlChar *name2, const xmlChar *name3) {
unsigned long value; unsigned h1, h2, ch;
unsigned long ch;
HASH_INIT(h1, h2, table->random_seed);
value = table->random_seed;
if (name != NULL) { if (name != NULL) {
value += 30 * (*name);
while ((ch = *name++) != 0) { while ((ch = *name++) != 0) {
value = value ^ ((value << 5) + (value >> 3) + ch); HASH_UPDATE(h1, h2, ch);
} }
} }
value = value ^ ((value << 5) + (value >> 3)); HASH_UPDATE(h1, h2, 0);
if (name2 != NULL) { if (name2 != NULL) {
while ((ch = *name2++) != 0) { while ((ch = *name2++) != 0) {
value = value ^ ((value << 5) + (value >> 3) + ch); HASH_UPDATE(h1, h2, ch);
} }
} }
value = value ^ ((value << 5) + (value >> 3)); HASH_UPDATE(h1, h2, 0);
if (name3 != NULL) { if (name3 != NULL) {
while ((ch = *name3++) != 0) { while ((ch = *name3++) != 0) {
value = value ^ ((value << 5) + (value >> 3) + ch); HASH_UPDATE(h1, h2, ch);
} }
} }
return (value % table->size);
HASH_FINISH(h1, h2);
return (h2 % table->size);
} }
#ifdef __clang__ #ifdef __clang__
ATTRIBUTE_NO_SANITIZE("unsigned-integer-overflow") ATTRIBUTE_NO_SANITIZE("unsigned-integer-overflow")
ATTRIBUTE_NO_SANITIZE("unsigned-shift-base") ATTRIBUTE_NO_SANITIZE("unsigned-shift-base")
#endif #endif
static unsigned long static unsigned
xmlHashComputeQKey(xmlHashTablePtr table, xmlHashComputeQKey(xmlHashTablePtr table,
const xmlChar *prefix, const xmlChar *name, const xmlChar *prefix, const xmlChar *name,
const xmlChar *prefix2, const xmlChar *name2, const xmlChar *prefix2, const xmlChar *name2,
const xmlChar *prefix3, const xmlChar *name3) { const xmlChar *prefix3, const xmlChar *name3) {
unsigned long value; unsigned h1, h2, ch;
unsigned long ch;
value = table->random_seed; HASH_INIT(h1, h2, table->random_seed);
if (prefix != NULL)
value += 30 * (*prefix);
else
value += 30 * (*name);
if (prefix != NULL) { if (prefix != NULL) {
while ((ch = *prefix++) != 0) { while ((ch = *prefix++) != 0) {
value = value ^ ((value << 5) + (value >> 3) + ch); HASH_UPDATE(h1, h2, ch);
} }
value = value ^ ((value << 5) + (value >> 3) + ':'); HASH_UPDATE(h1, h2, ':');
} }
if (name != NULL) { if (name != NULL) {
while ((ch = *name++) != 0) { while ((ch = *name++) != 0) {
value = value ^ ((value << 5) + (value >> 3) + ch); HASH_UPDATE(h1, h2, ch);
} }
} }
value = value ^ ((value << 5) + (value >> 3)); HASH_UPDATE(h1, h2, 0);
if (prefix2 != NULL) { if (prefix2 != NULL) {
while ((ch = *prefix2++) != 0) { while ((ch = *prefix2++) != 0) {
value = value ^ ((value << 5) + (value >> 3) + ch); HASH_UPDATE(h1, h2, ch);
} }
value = value ^ ((value << 5) + (value >> 3) + ':'); HASH_UPDATE(h1, h2, ':');
} }
if (name2 != NULL) { if (name2 != NULL) {
while ((ch = *name2++) != 0) { while ((ch = *name2++) != 0) {
value = value ^ ((value << 5) + (value >> 3) + ch); HASH_UPDATE(h1, h2, ch);
} }
} }
value = value ^ ((value << 5) + (value >> 3)); HASH_UPDATE(h1, h2, 0);
if (prefix3 != NULL) { if (prefix3 != NULL) {
while ((ch = *prefix3++) != 0) { while ((ch = *prefix3++) != 0) {
value = value ^ ((value << 5) + (value >> 3) + ch); HASH_UPDATE(h1, h2, ch);
} }
value = value ^ ((value << 5) + (value >> 3) + ':'); HASH_UPDATE(h1, h2, ':');
} }
if (name3 != NULL) { if (name3 != NULL) {
while ((ch = *name3++) != 0) { while ((ch = *name3++) != 0) {
value = value ^ ((value << 5) + (value >> 3) + ch); HASH_UPDATE(h1, h2, ch);
} }
} }
return (value % table->size);
HASH_FINISH(h1, h2);
return (h2 % table->size);
} }
/** /**
@ -232,12 +232,12 @@ xmlHashCreateDict(int size, xmlDictPtr dict) {
*/ */
static int static int
xmlHashGrow(xmlHashTablePtr table, int size) { xmlHashGrow(xmlHashTablePtr table, int size) {
unsigned long key; unsigned key;
int oldsize, i; int oldsize, i;
xmlHashEntryPtr iter, next; xmlHashEntryPtr iter, next;
struct _xmlHashEntry *oldtable; struct _xmlHashEntry *oldtable;
#ifdef DEBUG_GROW #ifdef DEBUG_GROW
unsigned long nbElem = 0; unsigned nbElem = 0;
#endif #endif
if (table == NULL) if (table == NULL)
@ -532,7 +532,7 @@ int
xmlHashAddEntry3(xmlHashTablePtr table, const xmlChar *name, xmlHashAddEntry3(xmlHashTablePtr table, const xmlChar *name,
const xmlChar *name2, const xmlChar *name3, const xmlChar *name2, const xmlChar *name3,
void *userdata) { void *userdata) {
unsigned long key, len = 0; unsigned key, len = 0;
xmlHashEntryPtr entry; xmlHashEntryPtr entry;
xmlHashEntryPtr insert; xmlHashEntryPtr insert;
@ -676,7 +676,7 @@ int
xmlHashUpdateEntry3(xmlHashTablePtr table, const xmlChar *name, xmlHashUpdateEntry3(xmlHashTablePtr table, const xmlChar *name,
const xmlChar *name2, const xmlChar *name3, const xmlChar *name2, const xmlChar *name3,
void *userdata, xmlHashDeallocator f) { void *userdata, xmlHashDeallocator f) {
unsigned long key; unsigned key;
xmlHashEntryPtr entry; xmlHashEntryPtr entry;
xmlHashEntryPtr insert; xmlHashEntryPtr insert;
@ -820,7 +820,7 @@ error:
void * void *
xmlHashLookup3(xmlHashTablePtr table, const xmlChar *name, xmlHashLookup3(xmlHashTablePtr table, const xmlChar *name,
const xmlChar *name2, const xmlChar *name3) { const xmlChar *name2, const xmlChar *name3) {
unsigned long key; unsigned key;
xmlHashEntryPtr entry; xmlHashEntryPtr entry;
if (table == NULL) if (table == NULL)
@ -866,7 +866,7 @@ xmlHashQLookup3(xmlHashTablePtr table,
const xmlChar *prefix, const xmlChar *name, const xmlChar *prefix, const xmlChar *name,
const xmlChar *prefix2, const xmlChar *name2, const xmlChar *prefix2, const xmlChar *name2,
const xmlChar *prefix3, const xmlChar *name3) { const xmlChar *prefix3, const xmlChar *name3) {
unsigned long key; unsigned key;
xmlHashEntryPtr entry; xmlHashEntryPtr entry;
if (table == NULL) if (table == NULL)
@ -1142,7 +1142,7 @@ xmlHashRemoveEntry2(xmlHashTablePtr table, const xmlChar *name,
int int
xmlHashRemoveEntry3(xmlHashTablePtr table, const xmlChar *name, xmlHashRemoveEntry3(xmlHashTablePtr table, const xmlChar *name,
const xmlChar *name2, const xmlChar *name3, xmlHashDeallocator f) { const xmlChar *name2, const xmlChar *name3, xmlHashDeallocator f) {
unsigned long key; unsigned key;
xmlHashEntryPtr entry; xmlHashEntryPtr entry;
xmlHashEntryPtr prev = NULL; xmlHashEntryPtr prev = NULL;

View File

@ -1,7 +1,47 @@
#ifndef XML_DICT_H_PRIVATE__ #ifndef XML_DICT_H_PRIVATE__
#define XML_DICT_H_PRIVATE__ #define XML_DICT_H_PRIVATE__
/*
 * 32-bit rotate-left / rotate-right helpers for the hash macros.
 *
 * Values are ANDed with 0xFFFFFFFF before being shifted right to support
 * platforms where unsigned is larger than 32 bits. With 32-bit unsigned
 * values, modern compilers should optimize the operation away.
 *
 * The left-shifted half is not masked, so on such platforms the bits
 * above bit 31 of the result are not meaningful; the final hash value
 * has to be masked by the caller (HASH_FINISH does this for h2).
 *
 * x is evaluated twice and must not have side effects. n must be in
 * the range 1..31, otherwise one of the shifts is by the full width.
 */
#define HASH_ROL(x,n) (((x) << (n)) | (((x) & 0xFFFFFFFF) >> (32 - (n))))
#define HASH_ROR(x,n) ((((x) & 0xFFFFFFFF) >> (n)) | ((x) << (32 - (n))))
/*
* GoodOAAT: One of the smallest non-multiplicative One-At-a-Time functions
* that pass SMHasher.
*
* Author: Sokolov Yura aka funny-falcon
*/
/*
 * HASH_INIT: initialize the two hash state words from a seed.
 *
 * h1, h2: unsigned lvalues that receive the initial state
 * seed:   seed value; may be any expression, but it is evaluated
 *         twice and therefore must not have side effects
 *
 * Every argument is parenthesized so that an expression argument such
 * as "a | b" is not re-associated with the XOR by operator precedence.
 */
#define HASH_INIT(h1, h2, seed) \
    do { \
        (h1) = (seed) ^ 0x3b00; \
        (h2) = HASH_ROL((seed), 15); \
    } while (0)
/*
 * HASH_UPDATE: mix one input value into the hash state.
 *
 * h1, h2: unsigned lvalues holding the running state (read and written)
 * ch:     value to add, evaluated exactly once
 *
 * h1 accumulates the input and is multiplied by 9 via shift-and-add;
 * h2 absorbs h1, is rotated left by 7 and multiplied by 5 the same way.
 * Arithmetic is expected to wrap around (unsigned overflow is intended).
 */
#define HASH_UPDATE(h1, h2, ch) \
    do { \
        (h1) += (ch); \
        (h1) += (h1) << 3; \
        (h2) += (h1); \
        (h2) = HASH_ROL((h2), 7); \
        (h2) += (h2) << 2; \
    } while (0)
/*
 * HASH_FINISH: final mixing rounds of the hash.
 *
 * h1, h2: unsigned lvalues holding the state after the last HASH_UPDATE
 *
 * The result is left in h2, masked to 32 bits. h1 is overwritten with
 * intermediate state and is not masked.
 */
#define HASH_FINISH(h1, h2) \
    do { \
        (h1) ^= (h2); \
        (h1) += HASH_ROL((h2), 14); \
        (h2) ^= (h1); \
        (h2) += HASH_ROR((h1), 6); \
        (h1) ^= (h2); \
        (h1) += HASH_ROL((h2), 5); \
        (h2) ^= (h1); \
        (h2) += HASH_ROR((h1), 8); \
        (h2) &= 0xFFFFFFFF; \
    } while (0)
XML_HIDDEN void XML_HIDDEN void
xmlInitDictInternal(void); xmlInitDictInternal(void);

View File

@ -22,9 +22,9 @@ static const char *seeds2[] = {
NULL NULL
}; };
#define NB_STRINGS_MAX 10000 #define NB_STRINGS_MAX 100000
#define NB_STRINGS_NS 1000 #define NB_STRINGS_NS 10000
#define NB_STRINGS_PREFIX 50 #define NB_STRINGS_PREFIX (NB_STRINGS_NS / 20)
#define NB_STRINGS_MIN 10 #define NB_STRINGS_MIN 10
static xmlChar **strings1; static xmlChar **strings1;