From 4e86707e30d33827868cb9caacf1fe94cca05f10 Mon Sep 17 00:00:00 2001 From: Ketan Verma Date: Mon, 5 Jun 2023 11:12:02 +0530 Subject: [PATCH] Updated tests and added microbenchmarks Signed-off-by: Ketan Verma --- .../common/util/LongHashBenchmark.java | 421 ++++++++++++++++++ .../opensearch/common/util/FastLongHash.java | 19 +- .../common/util/FastLongHashTests.java | 95 ++++ 3 files changed, 531 insertions(+), 4 deletions(-) create mode 100644 benchmarks/src/main/java/org/opensearch/common/util/LongHashBenchmark.java diff --git a/benchmarks/src/main/java/org/opensearch/common/util/LongHashBenchmark.java b/benchmarks/src/main/java/org/opensearch/common/util/LongHashBenchmark.java new file mode 100644 index 0000000000000..f5c56dd1e9806 --- /dev/null +++ b/benchmarks/src/main/java/org/opensearch/common/util/LongHashBenchmark.java @@ -0,0 +1,421 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.common.util; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; +import org.opensearch.common.lease.Releasable; + +import java.util.Random; +import java.util.concurrent.TimeUnit; +import java.util.function.Supplier; + +@Fork(value = 3) +@Warmup(iterations = 1, time = 4) +@Measurement(iterations = 3, time = 2) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +public class LongHashBenchmark { + + @Benchmark + public void add(Blackhole bh, HashTableOptions tableOpts, WorkloadOptions workloadOpts) { + try (HashTable table = tableOpts.get(); WorkloadIterator iter = workloadOpts.iter()) { + while (iter.hasNext()) { + bh.consume(table.add(iter.next())); + } + } + } + + /** + * Creates a hash table with varying parameters. 
+     */
+    @State(Scope.Benchmark)
+    public static class HashTableOptions {
+
+        @Param({ "LongHash", "FastLongHash" })
+        public String type;
+
+        @Param({ "1" })
+        public long initialCapacity;
+
+        @Param({ "0.6" })
+        public float loadFactor;
+
+        private Supplier<HashTable> supplier;
+
+        @Setup
+        public void setup() {
+            switch (type) {
+                case "LongHash":
+                    supplier = this::newLongHash;
+                    break;
+                case "FastLongHash":
+                    supplier = this::newFastLongHash;
+                    break;
+                default:
+                    throw new IllegalArgumentException("invalid hash table type: " + type);
+            }
+        }
+
+        public HashTable get() {
+            return supplier.get();
+        }
+
+        private HashTable newLongHash() {
+            return new HashTable() {
+                private final LongHash table = new LongHash(initialCapacity, loadFactor, BigArrays.NON_RECYCLING_INSTANCE);
+
+                @Override
+                public long add(long key) {
+                    return table.add(key);
+                }
+
+                @Override
+                public void close() {
+                    table.close();
+                }
+            };
+        }
+
+        private HashTable newFastLongHash() {
+            return new HashTable() {
+                private final FastLongHash table = new FastLongHash(initialCapacity, loadFactor, BigArrays.NON_RECYCLING_INSTANCE);
+
+                @Override
+                public long add(long key) {
+                    return table.add(key);
+                }
+
+                @Override
+                public void close() {
+                    table.close();
+                }
+            };
+        }
+    }
+
+    /**
+     * Creates a workload with varying parameters.
+     */
+    @State(Scope.Benchmark)
+    public static class WorkloadOptions {
+        public static final int NUM_HITS = 20_000_000;
+
+        /**
+         * Repeat the experiment with a growing number of keys.
+         * These values are generated with an exponential growth pattern such that:
+         * value = ceil(previous_value * random_float_between(1.0, 1.14))
+         */
+        @Param({
+            "1",
+            "2",
+            "3",
+            "4",
+            "5",
+            "6",
+            "7",
+            "8",
+            "9",
+            "10",
+            "11",
+            "13",
+            "15",
+            "17",
+            "18",
+            "19",
+            "20",
+            "21",
+            "23",
+            "26",
+            "27",
+            "30",
+            "32",
+            "35",
+            "41",
+            "45",
+            "50",
+            "53",
+            "54",
+            "55",
+            "57",
+            "63",
+            "64",
+            "69",
+            "74",
+            "80",
+            "84",
+            "91",
+            "98",
+            "101",
+            "111",
+            "114",
+            "124",
+            "128",
+            "139",
+            "148",
+            "161",
+            "162",
+            "176",
+            "190",
+            "204",
+            "216",
+            "240",
+            "257",
+            "269",
+            "291",
+            "302",
+            "308",
+            "327",
+            "341",
+            "374",
+            "402",
+            "412",
+            "438",
+            "443",
+            "488",
+            "505",
+            "558",
+            "612",
+            "621",
+            "623",
+            "627",
+            "642",
+            "717",
+            "765",
+            "787",
+            "817",
+            "915",
+            "962",
+            "1011",
+            "1083",
+            "1163",
+            "1237",
+            "1301",
+            "1424",
+            "1541",
+            "1716",
+            "1805",
+            "1817",
+            "1934",
+            "2024",
+            "2238",
+            "2281",
+            "2319",
+            "2527",
+            "2583",
+            "2639",
+            "2662",
+            "2692",
+            "2991",
+            "3201",
+            "3215",
+            "3517",
+            "3681",
+            "3710",
+            "4038",
+            "4060",
+            "4199",
+            "4509",
+            "4855",
+            "5204",
+            "5624",
+            "6217",
+            "6891",
+            "7569",
+            "8169",
+            "8929",
+            "9153",
+            "10005",
+            "10624",
+            "10931",
+            "12070",
+            "12370",
+            "13694",
+            "14227",
+            "15925",
+            "17295",
+            "17376",
+            "18522",
+            "19200",
+            "20108",
+            "21496",
+            "23427",
+            "24224",
+            "26759",
+            "29199",
+            "29897",
+            "32353",
+            "33104",
+            "36523",
+            "38480",
+            "38958",
+            "40020",
+            "44745",
+            "45396",
+            "47916",
+            "49745",
+            "49968",
+            "52231",
+            "53606" })
+        public int size;
+
+        @Param({ "correlated", "uncorrelated", "distinct" })
+        public String dataset;
+
+        private WorkloadIterator iterator;
+
+        @Setup
+        public void setup() {
+            switch (dataset) {
+                case "correlated":
+                    iterator = newCorrelatedWorkload();
+                    break;
+                case "uncorrelated":
+                    iterator = newUncorrelatedWorkload();
+                    break;
+                case "distinct":
+                    iterator = newDistinctWorkload();
+                    break;
+                default:
+                    throw new IllegalArgumentException("invalid dataset: " + dataset);
+            }
+        }
+
+        public WorkloadIterator iter() {
+            return iterator;
+        }
+
+        /**
+         * Simulates monotonically increasing timestamp data with multiple hits mapping to the same key.
+         */
+        private WorkloadIterator newCorrelatedWorkload() {
+            assert NUM_HITS >= size : "ensure hits >= size so that each key is used at least once";
+
+            final long[] data = new long[size];
+            for (int i = 0; i < data.length; i++) {
+                data[i] = 1420070400000L + 3600000L * i;
+            }
+
+            return new WorkloadIterator() {
+                private int count = 0;
+                private int index = 0;
+                private int remaining = NUM_HITS / data.length;
+
+                @Override
+                public boolean hasNext() {
+                    return count < NUM_HITS;
+                }
+
+                @Override
+                public long next() {
+                    if (--remaining <= 0) {
+                        index = (index + 1) % data.length;
+                        remaining = NUM_HITS / data.length;
+                    }
+                    count++;
+                    return data[index];
+                }
+
+                @Override
+                public void reset() {
+                    count = 0;
+                    index = 0;
+                    remaining = NUM_HITS / data.length;
+                }
+            };
+        }
+
+        /**
+         * Simulates uncorrelated data (such as travel distance / fare amount).
+         */
+        private WorkloadIterator newUncorrelatedWorkload() {
+            assert NUM_HITS >= size : "ensure hits >= size so that each key is used at least once";
+
+            final Random random = new Random(0); // fixed seed for reproducible results
+            final long[] data = new long[size];
+            for (int i = 0; i < data.length; i++) {
+                data[i] = Double.doubleToLongBits(20.0 + 80 * random.nextDouble());
+            }
+
+            return new WorkloadIterator() {
+                private int count = 0;
+                private int index = 0;
+
+                @Override
+                public boolean hasNext() {
+                    return count < NUM_HITS;
+                }
+
+                @Override
+                public long next() {
+                    count++;
+                    index = (index + 1) % data.length;
+                    return data[index];
+                }
+
+                @Override
+                public void reset() {
+                    count = 0;
+                    index = 0;
+                }
+            };
+        }
+
+        /**
+         * Simulates a workload with high cardinality, i.e., each hit mapping to a different key.
+         */
+        private WorkloadIterator newDistinctWorkload() {
+            return new WorkloadIterator() {
+                private int count = 0;
+
+                @Override
+                public boolean hasNext() {
+                    return count < size;
+                }
+
+                @Override
+                public long next() {
+                    return count++;
+                }
+
+                @Override
+                public void reset() {
+                    count = 0;
+                }
+            };
+        }
+    }
+
+    private interface HashTable extends Releasable {
+        long add(long key);
+    }
+
+    private interface WorkloadIterator extends Releasable {
+        boolean hasNext();
+
+        long next();
+
+        void reset();
+
+        @Override
+        default void close() {
+            reset();
+        }
+    }
+}
diff --git a/server/src/main/java/org/opensearch/common/util/FastLongHash.java b/server/src/main/java/org/opensearch/common/util/FastLongHash.java
index 00b2f60203b97..7cc2bb63f4aaf 100644
--- a/server/src/main/java/org/opensearch/common/util/FastLongHash.java
+++ b/server/src/main/java/org/opensearch/common/util/FastLongHash.java
@@ -229,7 +229,10 @@ private long insert(final long key) {
         }
 
         // Find an alternative slot for the displaced value such that the longest PSL is minimized.
-        for (idx = (idx + 1) & mask;; idx = (idx + 1) & mask) {
+        do {
+            idx = (idx + 1) & mask;
+            value += INCR_PSL;
+
             if ((existingValue = table.get(idx)) == -1) {
                 // Empty slot; insert the candidate value here.
                 table.set(idx, value);
@@ -242,8 +245,7 @@ private long insert(final long key) {
                 // correlated lookups.
                 value = table.set(idx, value);
             }
-            value += INCR_PSL;
-        }
+        } while (true);
     }
 
     /**
@@ -257,11 +259,20 @@ private long append(final long key) {
 
     /**
      * Returns the hash for the given key.
+     * Visible for unit-tests.
*/ - protected long hash(final long key) { + long hash(final long key) { return BitMixer.mix64(key); } + /** + * Returns the underlying hash table. + * Visible for unit-tests. + */ + LongArray getTable() { + return table; + } + /** * Grows the hash table by doubling its capacity and reinserting the keys. */ diff --git a/server/src/test/java/org/opensearch/common/util/FastLongHashTests.java b/server/src/test/java/org/opensearch/common/util/FastLongHashTests.java index 89ef866d001e5..461de4670ae51 100644 --- a/server/src/test/java/org/opensearch/common/util/FastLongHashTests.java +++ b/server/src/test/java/org/opensearch/common/util/FastLongHashTests.java @@ -46,6 +46,101 @@ public void testFuzzy() { // Verify the behaviour of "size". assertEquals(reference.size(), h.size()); + + // Verify the calculation of PSLs. + final long capacity = h.getTable().size(); + final long mask = capacity - 1; + for (long idx = 0; idx < h.getTable().size(); idx++) { + final long value = h.getTable().get(idx); + if (value != -1) { + final long homeIdx = h.hash(h.get((int) value)) & mask; + assertEquals((capacity + idx - homeIdx) & mask, value >>> 48); + } + } + } + } + + public void testRearrangement() { + try (FastLongHash h = new FastLongHash(4, 0.6f, BigArrays.NON_RECYCLING_INSTANCE) { + /** + * Overriding with an "identity" hash function to make it easier to reason about the placement + * of values in the hash table. The backing array of the hash table will have a size (8), + * i.e. nextPowerOfTwo(initialCapacity/loadFactor), so the bitmask will be (7). + * The ideal home slot of a key can then be defined as: (hash(key) & mask) = (key & 7). + */ + @Override + long hash(long key) { + return key; + } + }) { + /* + * Add key=0, hash=0, home_slot=0 + * + * Before: empty slot. + * ▼ + * [ _ _ _ _ _ _ _ _ ] + * + * After: inserted [ordinal=0, psl=0] at the empty slot. + * [ 0 _ _ _ _ _ _ _ ] + */ + h.add(0); + assertEquals(encodeValue(0, 0, 0), h.getTable().get(0)); + + /* + * Add key=8, hash=8, home_slot=0 + * + * Before: occupied slot. + * ▼ + * [ 0 _ _ _ _ _ _ _ ] + * + * After: inserted [ordinal=1, psl=0] at the existing slot, displaced [ordinal=0, psl=0], + * and re-inserted it at the next empty slot as [ordinal=0, psl=1]. + * [ 1 0 _ _ _ _ _ _ ] + */ + h.add(8); + assertEquals(encodeValue(0, 0, 1), h.getTable().get(0)); + assertEquals(encodeValue(1, 0, 0), h.getTable().get(1)); + + /* + * Add key=1, hash=1, home_slot=1 + * + * Before: occupied slot. + * ▼ + * [ 1 0 _ _ _ _ _ _ ] + * + * After: inserted [ordinal=2, psl=0] at the existing slot, displaced [ordinal=0, psl=1], + * and re-inserted it at the next empty slot as [ordinal=0, psl=2]. + * [ 1 2 0 _ _ _ _ _ ] + */ + h.add(1); + assertEquals(encodeValue(0, 0, 1), h.getTable().get(0)); + assertEquals(encodeValue(0, 0, 2), h.getTable().get(1)); + assertEquals(encodeValue(2, 0, 0), h.getTable().get(2)); + + /* + * Add key=16, hash=16, home_slot=0 + * + * Before: occupied slot. + * ▼ + * [ 1 2 0 _ _ _ _ _ ] + * + * After: inserted [ordinal=3, psl=0] at the existing slot, displaced [ordinal=1, psl=0] + * and re-inserted it at the next best slot. Repeated this for other displaced values + * until everything found an empty slot. 
+ * [ 3 1 0 2 _ _ _ _ ] + */ + h.add(16); + assertEquals(encodeValue(0, 0, 3), h.getTable().get(0)); + assertEquals(encodeValue(1, 0, 1), h.getTable().get(1)); + assertEquals(encodeValue(2, 0, 0), h.getTable().get(2)); + assertEquals(encodeValue(2, 0, 2), h.getTable().get(3)); } } + + private static long encodeValue(long psl, long fingerprint, long ordinal) { + assert psl < (1L << 15); + assert fingerprint < (1L << 16); + assert ordinal < (1L << 32); + return (psl << 48) | (fingerprint << 32) | ordinal; + } }