Commit
Add self-organizing hash table to improve the performance of bucket aggregations

Signed-off-by: Ketan Verma <ketan9495@gmail.com>
Showing 3 changed files with 293 additions and 3 deletions.
242 changes: 242 additions & 0 deletions
server/src/main/java/org/opensearch/common/util/LongRHHash.java
@@ -0,0 +1,242 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

package org.opensearch.common.util;

import com.carrotsearch.hppc.BitMixer;
import org.opensearch.common.lease.Releasable;

import java.util.Arrays;

/**
 * Specialized hash table implementation that maps (primitive) long to long.
 *
 * It organizes itself by moving keys around dynamically in order to reduce the longest
 * probe sequence length (PSL). It also optimizes lookups for recently accessed keys, which
 * makes it very useful for aggregations where keys are correlated across consecutive hits.
 *
 * This class is not thread-safe.
 *
 * @opensearch.internal
 */
public class LongRHHash implements Releasable {
    private static final long DEFAULT_CAPACITY = 1;
    private static final float DEFAULT_LOAD_FACTOR = 0.6f;
    private static final int DEFAULT_HINTS = 256;

    /**
     * Utility class to allocate big arrays.
     */
    private final BigArrays bigArrays;

    /**
     * Maximum load factor after which the capacity is doubled.
     */
    private final float loadFactor;

    /**
     * Current capacity of the hash table.
     * This must always be a power of two since we use a bit-mask to calculate the slot.
     */
    private long capacity;

    /**
     * Bit-mask used to calculate the slot (i.e. the index) after hashing a key.
     */
    private long mask;

    /**
     * Current size of the hash table.
     */
    private long size;

    /**
     * Threshold at which the hash table needs to be resized.
     */
    private long grow;

    /**
     * Lookup table for ordinals.
     */
    private LongArray ords;

    /**
     * Lookup table for keys.
     */
    private LongArray keys;

    /**
     * Lookup table for "recent" key hints.
     */
    private final long[] hints;

    /**
     * Bit-mask used to calculate the slot in the hints' table.
     */
    private final int hintMask;

    public LongRHHash(BigArrays bigArrays) {
        this(DEFAULT_CAPACITY, DEFAULT_LOAD_FACTOR, DEFAULT_HINTS, bigArrays);
    }

    public LongRHHash(long capacity, float loadFactor, int numHints, BigArrays bigArrays) {
        capacity = (long) (capacity / loadFactor);
        capacity = Math.max(1, Long.highestOneBit(capacity - 1) << 1); // next power of two

        this.loadFactor = loadFactor;
        this.bigArrays = bigArrays;
        this.capacity = capacity;
        this.mask = capacity - 1;
        this.size = 0;
        this.grow = (long) (capacity * loadFactor);

        this.ords = bigArrays.newLongArray(capacity, false);
        this.ords.fill(0, capacity, -1);
        this.keys = bigArrays.newLongArray(capacity, false);

        numHints = Math.max(1, Integer.highestOneBit(numHints - 1) << 1);
        this.hintMask = numHints - 1;
        this.hints = new long[numHints];
        Arrays.fill(this.hints, -1);
    }

    /**
     * Adds the key to the hash table and returns its ordinal (i.e. the index) starting from zero.
     * If the key exists already, it returns (-1-ordinal).
     */
    public long add(final long key) {
        final long ordinal = find(key);
        if (ordinal != -1) {
            return -(1 + ordinal);
        }

        if (size >= grow) {
            grow();
        }
        return insert(key, size);
    }

    /**
     * Associates the given key with the given ordinal.
     *
     * This is inspired by Robin Hood hashing, but not implemented wholly.
     * Keys are moved around dynamically in order to reduce the longest probe sequence length (PSL),
     * which makes lookups faster as keys are likely to be found in the same CPU cache line.
     */
    private long insert(final long key, final long ordinal) {
        long idx = slot(key); // The ideal "home" location for the given key in the ordinals' table.
        long ord = ordinal; // The ordinal yet to find an empty slot in the ordinals' table (candidate).
        long psl = 0; // The displacement of "ord" from its home location.

        long curOrd; // The current ordinal stored at "idx".
        long curPsl; // The current ordinal's PSL.

        do {
            if ((curOrd = ords.get(idx)) == -1) {
                // Empty slot; insert the candidate ordinal here.
                ords.set(idx, ord);
                keys = bigArrays.grow(keys, size + 1);
                keys.set(ordinal, key);
                return size++;
            } else if ((curPsl = psl(keys.get(curOrd), idx)) < psl) {
                // Current key is "richer" than the candidate key at this slot; swap and find a new home for it.
                ord = ords.set(idx, ord);
                psl = curPsl;
            }
            idx = (idx + 1) & mask;
            psl++;
        } while (true);
    }

    /**
     * Returns the key associated with the given ordinal.
     * The result is undefined for an unused ordinal.
     */
    public long get(final long ordinal) {
        return keys.get(ordinal);
    }

    /**
     * Returns the ordinal associated with the given key, or -1 if the key doesn't exist.
     *
     * It is inspired by Cuckoo hashing but with a twist:
     * A fast hash function is used to look up "recent" keys, then falling back to a good hash function with
     * linear probing. Since two independent hash functions are used, it is unlikely that both collide.
     *
     * This is very effective for most bucket aggregations where keys are likely to be correlated across consecutive
     * hits (e.g. timestamps may belong to the same "hour" in the case of date histogram aggregations).
     */
    public long find(final long key) {
        int hintIdx = hintSlot(key);
        long hintOrd = hints[hintIdx];
        if (hintOrd != -1 && keys.get(hintOrd) == key) {
            return hintOrd;
        }

        for (long idx = slot(key);; idx = (idx + 1) & mask) {
            final long ord = ords.get(idx);
            if (ord == -1 || keys.get(ord) == key) {
                hints[hintIdx] = ord;
                return ord;
            }
        }
    }

    /**
     * Returns the current size of the hash table.
     */
    public long size() {
        return size;
    }

    /**
     * Returns the "home" location for the given key in the ordinals' table.
     */
    private long slot(final long key) {
        return BitMixer.mix64(key) & mask;
    }

    /**
     * Returns the "home" location for the given key in the hints' table.
     */
    private int hintSlot(long key) {
        return BitMixer.mixPhi(key) & hintMask;
    }

    /**
     * Returns the probe sequence length (PSL) for the given key at the given index.
     */
    private long psl(final long key, final long idx) {
        return (capacity + idx - slot(key)) & mask;
    }

    /**
     * Grows the size of the hash table by doubling its capacity.
     */
    private void grow() {
        final long oldSize = size;

        capacity <<= 1;
        mask = capacity - 1;
        size = 0;
        grow = (long) (capacity * loadFactor);

        ords = bigArrays.resize(ords, capacity);
        ords.fill(0, capacity, -1);

        for (long ordinal = 0; ordinal < oldSize; ordinal++) {
            insert(keys.get(ordinal), ordinal);
        }
    }

    @Override
    public void close() {
        ords.close();
        keys.close();
    }
}
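For context, the add/find/get contract described in the javadocs above can be exercised as in the following minimal sketch. It is not part of this commit; the literal keys are arbitrary, and BigArrays.NON_RECYCLING_INSTANCE is used only for illustration, as in the test below.

    // Illustrative usage sketch (not part of this commit).
    LongRHHash hash = new LongRHHash(BigArrays.NON_RECYCLING_INSTANCE);
    long first = hash.add(42L);   // new key: returns its ordinal, 0
    long again = hash.add(42L);   // existing key: returns -1 - ordinal, i.e. -1
    long found = hash.find(42L);  // returns the ordinal, 0
    long missing = hash.find(7L); // absent key: returns -1
    long key = hash.get(0L);      // maps ordinal 0 back to its key, 42
    hash.close();                 // releases the underlying big arrays

The negative return value from add lets a caller distinguish a newly created bucket from one it has already seen without issuing a separate lookup.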
48 changes: 48 additions & 0 deletions
server/src/test/java/org/opensearch/common/util/LongRHHashTests.java
@@ -0,0 +1,48 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

package org.opensearch.common.util;

import org.opensearch.test.OpenSearchTestCase;

import java.util.Map;
import java.util.TreeMap;

public class LongRHHashTests extends OpenSearchTestCase {

    public void testFuzzy() {
        Map<Long, Long> reference = new TreeMap<>();
        try (
            LongRHHash h = new LongRHHash(
                randomIntBetween(1, 100), // random capacity
                0.6f + randomFloat() * 0.39f, // random load factor to verify collision resolution
                randomIntBetween(1, 256), // random hints' size
                BigArrays.NON_RECYCLING_INSTANCE
            )
        ) {
            // Verify the behaviour of "add" and "find".
            for (int i = 0; i < (1 << 20); i++) {
                long key = randomLong() % (1 << 12); // roughly ~4% unique keys
                if (reference.containsKey(key)) {
                    long expectedOrdinal = reference.get(key);
                    assertEquals(-1 - expectedOrdinal, h.add(key));
                    assertEquals(expectedOrdinal, h.find(key));
                } else {
                    assertEquals(-1, h.find(key));
                    reference.put(key, (long) reference.size());
                    assertEquals((long) reference.get(key), h.add(key));
                }
            }

            // Verify the behaviour of "get".
            for (Map.Entry<Long, Long> entry : reference.entrySet()) {
                assertEquals((long) entry.getKey(), h.get(entry.getValue()));
            }
        }
    }
}