From 1a1da8d27910fab094060ef8558fc9681aaefa63 Mon Sep 17 00:00:00 2001 From: Ketan Verma Date: Wed, 19 Jul 2023 18:17:12 -0400 Subject: [PATCH 1/9] Performance improvements for BytesRefHash Signed-off-by: Ketan Verma --- .../common/util/BytesRefHashBenchmark.java | 260 +++++++++++++++ buildSrc/version.properties | 2 + server/build.gradle | 16 +- .../zero-allocation-hashing-0.16.jar.sha1 | 1 + .../zero-allocation-hashing-LICENSE.txt | 201 ++++++++++++ .../zero-allocation-hashing-NOTICE.txt | 0 .../common/util/CompactBytesRefHash.java | 286 +++++++++++++++++ .../common/util/ReorganizingBytesRefHash.java | 301 ++++++++++++++++++ .../bucket/terms/BytesKeyedBucketOrds.java | 5 +- .../org/opensearch/bootstrap/security.policy | 10 + .../bootstrap/test-framework.policy | 1 + .../common/util/CompactBytesRefHashTests.java | 58 ++++ .../util/ReorganizingBytesRefHashTests.java | 70 ++++ 13 files changed, 1208 insertions(+), 3 deletions(-) create mode 100644 benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java create mode 100644 server/licenses/zero-allocation-hashing-0.16.jar.sha1 create mode 100644 server/licenses/zero-allocation-hashing-LICENSE.txt create mode 100644 server/licenses/zero-allocation-hashing-NOTICE.txt create mode 100644 server/src/main/java/org/opensearch/common/util/CompactBytesRefHash.java create mode 100644 server/src/main/java/org/opensearch/common/util/ReorganizingBytesRefHash.java create mode 100644 server/src/test/java/org/opensearch/common/util/CompactBytesRefHashTests.java create mode 100644 server/src/test/java/org/opensearch/common/util/ReorganizingBytesRefHashTests.java diff --git a/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java b/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java new file mode 100644 index 0000000000000..8ad04a42b190f --- /dev/null +++ b/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java @@ -0,0 +1,260 @@ +/* + * 
SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.common.util;
+
+import net.openhft.hashing.LongHashFunction;
+import org.apache.lucene.util.BytesRef;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+import org.opensearch.common.lease.Releasable;
+import org.opensearch.common.lease.Releasables;
+
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Stream;
+
+/**
+ * JMH benchmark comparing the {@code add} path of three hash table implementations
+ * ("baseline" {@link BytesRefHash}, {@link CompactBytesRefHash} and {@link ReorganizingBytesRefHash})
+ * across a range of key-set sizes and key lengths.
+ */
+@Fork(value = 5)
+@Warmup(iterations = 1, time = 2)
+@Measurement(iterations = 3, time = 5)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+public class BytesRefHashBenchmark {
+    private static final int NUM_TABLES = 20;      // run across many tables so that caches aren't effective
+    private static final int NUM_HITS = 1_000_000; // num hits per table
+
+    /**
+     * Adds {@link #NUM_HITS} keys (cycling through the generated key set) to every table.
+     * Each returned ordinal is fed to the {@link Blackhole} so the call cannot be optimized away.
+     */
+    @Benchmark
+    public void add(Blackhole bh, Options opts) {
+        for (int hit = 0; hit < NUM_HITS; hit++) {
+            BytesRef key = opts.keys[hit % opts.keys.length];
+            for (HashTable table : opts.tables) {
+                bh.consume(table.add(key));
+            }
+        }
+    }
+
+    /**
+     * Benchmark parameters plus the tables and keys derived from them.
+     */
+    @State(Scope.Benchmark)
+    public static class Options {
+        /** Which hash table implementation to exercise. */
+        @Param({ "baseline", "compact", "reorganizing" })
+        public String type;
+
+        /** Number of distinct keys to generate. */
+        @Param({
+            "1",
+            "2",
+            "3",
+            "4",
+            "5",
+            "6",
+            "7",
+            "8",
+            "10",
+            "12",
+            "14",
+            "16",
+            "19",
+            "22",
+            "25",
+            "29",
+            "33",
+            "38",
+            "43",
+            "50",
+            "57",
+            "65",
+            "75",
+            "86",
+            "97",
+            "109",
+            "124",
+            "141",
+            "161",
+            "182",
+            "204",
+            "229",
+            "262",
+            "297",
+            "336",
+            "380",
+            "430",
+            "482",
+            "550",
+            "610",
+            "704",
+            "801",
+            "914",
+            "1042",
+            "1178",
+            "1343",
+            "1532",
+            "1716",
+            "1940",
+            "2173",
+            "2456",
+            "2751",
+            "3082",
+            "3514",
+            "4006",
+            "4487",
+            "5026",
+            "5730",
+            "6418",
+            "7317",
+            "8196",
+            "9180",
+            "10374",
+            "11723",
+            "13247",
+            "14837",
+            "16915",
+            "19114",
+            "21599",
+            "24623",
+            "28071",
+            "32001",
+            "36482",
+            "41590",
+            "46581",
+            "52637",
+            "58954",
+            "67208",
+            "76618",
+            "86579",
+            "97835",
+            "109576",
+            "122726",
+            "138681",
+            "156710",
+            "175516",
+            "198334",
+            "222135",
+            "248792",
+            "281135",
+            "320494",
+            "365364",
+            "409208",
+            "466498",
+            "527143",
+            "595672",
+            "667153",
+            "753883",
+            "851888",
+            "971153" })
+
+        public Integer size;
+
+        /** Length, in characters, of every generated key. */
+        @Param({ "8", "32", "128" })
+        public Integer length;
+
+        private HashTable[] tables;
+
+        private BytesRef[] keys;
+
+        /**
+         * Creates {@link #NUM_TABLES} empty tables and {@code size} distinct keys made of
+         * {@code length} random lowercase ASCII letters. The fixed seed keeps the key set
+         * identical across runs.
+         */
+        @Setup
+        public void setup() {
+            assert size <= Math.pow(26, length) : "key length too small to generate the required number of keys";
+            tables = Stream.generate(this::newHashTable).limit(NUM_TABLES).toArray(HashTable[]::new);
+            Random random = new Random(0);
+            // Typed set (was a raw Set) so that only BytesRef keys can be tracked here.
+            Set<BytesRef> seen = new HashSet<>();
+            keys = new BytesRef[size];
+            for (int i = 0; i < size; i++) {
+                BytesRef key;
+                do {
+                    key = new BytesRef(
+                        random.ints(97, 123) // code points 'a' (97) to 'z' (122)
+                            .limit(length)
+                            .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
+                            .toString()
+                    );
+                } while (seen.contains(key)); // regenerate on duplicates
+                keys[i] = key;
+                seen.add(key);
+            }
+        }
+
+        @TearDown
+        public void tearDown() {
+            Releasables.close(tables);
+        }
+
+        /**
+         * Builds a fresh table of the configured {@link #type}, wrapped in the common
+         * {@link HashTable} interface.
+         *
+         * @throws IllegalArgumentException if {@link #type} is not a known implementation
+         */
+        private HashTable newHashTable() {
+            switch (type) {
+                case "baseline":
+                    return new HashTable() {
+                        private final BytesRefHash table = new BytesRefHash(1, 0.6f, BigArrays.NON_RECYCLING_INSTANCE);
+
+                        @Override
+                        public long add(BytesRef key) {
+                            return table.add(key);
+                        }
+
+                        @Override
+                        public void close() {
+                            table.close();
+                        }
+                    };
+                case "compact":
+                    return new HashTable() {
+                        private final CompactBytesRefHash table = new CompactBytesRefHash(
+                            1,
+                            0.6f,
+                            key -> LongHashFunction.xx3().hashBytes(key.bytes, key.offset, key.length),
+                            BigArrays.NON_RECYCLING_INSTANCE
+                        );
+
+                        @Override
+                        public long add(BytesRef key) {
+                            return table.add(key);
+                        }
+
+                        @Override
+                        public void close() {
+                            table.close();
+                        }
+                    };
+                case "reorganizing":
+                    return new HashTable() {
+                        private final ReorganizingBytesRefHash table = new ReorganizingBytesRefHash(
+                            1,
+                            0.6f,
+                            key -> LongHashFunction.xx3().hashBytes(key.bytes, key.offset, key.length),
+                            BigArrays.NON_RECYCLING_INSTANCE
+                        );
+
+                        @Override
+                        public long add(BytesRef key) {
+                            return table.add(key);
+                        }
+
+                        @Override
+                        public void close() {
+                            table.close();
+                        }
+                    };
+                default:
+                    throw new IllegalArgumentException("invalid hash table type: " + type);
+            }
+        }
+    }
+
+    /**
+     * Minimal common view over the benchmarked implementations.
+     */
+    private interface HashTable extends Releasable {
+        long add(BytesRef key);
+    }
+}
diff --git a/buildSrc/version.properties b/buildSrc/version.properties index ff962309cf084..2bb21dfca4b14 100644 --- a/buildSrc/version.properties +++ b/buildSrc/version.properties @@ -69,3 +69,5 @@ resteasy = 6.2.4.Final # opentelemetry dependencies opentelemetry = 1.26.0 +# hashing dependencies +zero_allocation_hashing = 0.16 diff --git a/server/build.gradle b/server/build.gradle index f6db3d53a0dcc..3b1fe9554a309 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -158,6 +158,9 @@ dependencies { api "com.google.protobuf:protobuf-java:${versions.protobuf}" api "jakarta.annotation:jakarta.annotation-api:${versions.jakarta_annotation}" + // hashing + api "net.openhft:zero-allocation-hashing:${versions.zero_allocation_hashing}" + testImplementation(project(":test:framework")) { // tests use the locally compiled version of server exclude group: 'org.opensearch', module: 'server' @@ -364,7 +367,18 @@ 
tasks.named("thirdPartyAudit").configure { 'com.google.protobuf.UnsafeUtil$Android32MemoryAccessor', 'com.google.protobuf.UnsafeUtil$Android64MemoryAccessor', 'com.google.protobuf.UnsafeUtil$JvmMemoryAccessor', - 'com.google.protobuf.UnsafeUtil$MemoryAccessor' + 'com.google.protobuf.UnsafeUtil$MemoryAccessor', + + // from zero-allocation-hashing + 'net.openhft.hashing.HotSpotPrior7u6StringHash', + 'net.openhft.hashing.LongHashFunction', + 'net.openhft.hashing.LongTupleHashFunction', + 'net.openhft.hashing.ModernCompactStringHash', + 'net.openhft.hashing.ModernHotSpotStringHash', + 'net.openhft.hashing.UnsafeAccess', + 'net.openhft.hashing.UnsafeAccess$OldUnsafeAccessBigEndian', + 'net.openhft.hashing.UnsafeAccess$OldUnsafeAccessLittleEndian', + 'net.openhft.hashing.Util' ) } diff --git a/server/licenses/zero-allocation-hashing-0.16.jar.sha1 b/server/licenses/zero-allocation-hashing-0.16.jar.sha1 new file mode 100644 index 0000000000000..e82e885f269ce --- /dev/null +++ b/server/licenses/zero-allocation-hashing-0.16.jar.sha1 @@ -0,0 +1 @@ +0ca252f328160ed5d027f100a4fe525d6d21daaf \ No newline at end of file diff --git a/server/licenses/zero-allocation-hashing-LICENSE.txt b/server/licenses/zero-allocation-hashing-LICENSE.txt new file mode 100644 index 0000000000000..261eeb9e9f8b2 --- /dev/null +++ b/server/licenses/zero-allocation-hashing-LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/server/licenses/zero-allocation-hashing-NOTICE.txt b/server/licenses/zero-allocation-hashing-NOTICE.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/server/src/main/java/org/opensearch/common/util/CompactBytesRefHash.java b/server/src/main/java/org/opensearch/common/util/CompactBytesRefHash.java new file mode 100644 index 0000000000000..d8d4690d14e90 --- /dev/null +++ b/server/src/main/java/org/opensearch/common/util/CompactBytesRefHash.java @@ -0,0 +1,286 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.common.util; + +import net.openhft.hashing.LongHashFunction; +import org.apache.lucene.util.BytesRef; +import org.opensearch.common.lease.Releasable; +import org.opensearch.common.lease.Releasables; +import org.opensearch.core.common.util.ByteArray; + +import java.security.AccessController; +import java.security.PrivilegedAction; + +/** + * Specialized hash table implementation that maps a {@link BytesRef} key to a long ordinal. + * + *

+ * It uses a compact byte-packing strategy to encode the ordinal and fingerprint information + * in the hash table value. It makes lookups faster by short-circuiting expensive equality checks + * for keys that collide onto the same hash table slot. + * + *

+ * This class is not thread-safe.
+ *
+ * @opensearch.internal
+ */
+public class CompactBytesRefHash implements Releasable {
+    // Default hash function, seeded once per class load with System.nanoTime().
+    // The cast needs its type argument: with a raw PrivilegedAction, doPrivileged returns Object,
+    // which cannot be assigned to LongHashFunction.
+    // NOTE(review): presumably wrapped in doPrivileged because the hashing library needs
+    // permissions granted in security.policy - confirm.
+    private static final LongHashFunction XX3 = AccessController.doPrivileged(
+        (PrivilegedAction<LongHashFunction>) () -> LongHashFunction.xx3(System.nanoTime())
+    );
+
+    private static final long MAX_CAPACITY = 1L << 32;
+    private static final long DEFAULT_INITIAL_CAPACITY = 32;
+    private static final float DEFAULT_LOAD_FACTOR = 0.6f;
+    private static final Hasher DEFAULT_HASHER = key -> XX3.hashBytes(key.bytes, key.offset, key.length);
+
+    private static final long MASK_ORDINAL = 0x00000000FFFFFFFFL;     // extract ordinal
+    private static final long MASK_FINGERPRINT = 0xFFFFFFFF00000000L; // extract fingerprint
+
+    /**
+     * Maximum load factor after which the capacity is doubled.
+     */
+    private final float loadFactor;
+
+    /**
+     * Calculates the hash of a {@link BytesRef} key.
+     */
+    private final Hasher hasher;
+
+    /**
+     * Utility class to allocate recyclable arrays.
+     */
+    private final BigArrays bigArrays;
+
+    /**
+     * Reusable BytesRef to read keys.
+     */
+    private final BytesRef scratch = new BytesRef();
+
+    /**
+     * Current capacity of the hash table. This must be a power of two so that the hash table slot
+     * can be identified quickly using bitmasks, thus avoiding expensive modulo or integer division.
+     */
+    private long capacity;
+
+    /**
+     * Bitmask to identify the hash table slot from a key's hash.
+     */
+    private long mask;
+
+    /**
+     * Size threshold after which the hash table needs to be doubled in capacity.
+     */
+    private long grow;
+
+    /**
+     * Current size of the hash table.
+     */
+    private long size;
+
+    /**
+     * Underlying array to store the hash table values.
+     *
+     * <p>
+     * Each hash table value (64-bit) uses the following byte packing strategy:
+     * <pre>
+     * |================================|================================|
+     * | Fingerprint                    | Ordinal                        |
+     * |--------------------------------|--------------------------------|
+     * | 32 bits                        | 32 bits                        |
+     * |================================|================================|
+     * </pre>
+     *
+     * <p>
+     * This allows us to encode and manipulate additional information in the hash table
+     * itself without having to look elsewhere in the memory, which is much slower.
+     *
+     * <p>
+     * Terminology: table[index] = value = (fingerprint | ordinal)
+     */
+    private LongArray table;
+
+    /**
+     * Underlying array to store the starting offsets of keys.
+     *
+     * <p>
+     * Terminology:
+     * <pre>
+     *   offsets[ordinal] = starting offset (inclusive)
+     *   offsets[ordinal + 1] = ending offset (exclusive)
+     * </pre>
+     */
+    private LongArray offsets;
+
+    /**
+     * Underlying byte array to store the keys.
+     *
+     * <p>
+     * Terminology: keys[start...end] = key
+     */
+    private ByteArray keys;
+
+    /**
+     * Creates a hash table with the default initial capacity, load factor and hasher.
+     */
+    public CompactBytesRefHash(final BigArrays bigArrays) {
+        this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR, DEFAULT_HASHER, bigArrays);
+    }
+
+    /**
+     * Creates a hash table.
+     *
+     * @param initialCapacity number of keys to size the arrays for; must be greater than 0
+     * @param loadFactor      fill ratio that triggers doubling; must be strictly between 0 and 1
+     * @param hasher          hash function applied to every key
+     * @param bigArrays       allocator for the underlying arrays
+     */
+    public CompactBytesRefHash(final long initialCapacity, final float loadFactor, final Hasher hasher, final BigArrays bigArrays) {
+        assert initialCapacity > 0 : "initial capacity must be greater than 0";
+        assert loadFactor > 0 && loadFactor < 1 : "load factor must be between 0 and 1";
+
+        this.loadFactor = loadFactor;
+        this.hasher = hasher;
+        this.bigArrays = bigArrays;
+
+        // highestOneBit(x) << 1 is the smallest power of two strictly greater than x.
+        capacity = Math.max(1, Long.highestOneBit((long) (initialCapacity / loadFactor)) << 1);
+        mask = capacity - 1;
+        size = 0;
+        grow = (long) (capacity * loadFactor);
+
+        table = bigArrays.newLongArray(capacity, false);
+        table.fill(0, capacity, -1); // -1 marks an empty slot
+        offsets = bigArrays.newLongArray(initialCapacity + 1, false);
+        offsets.set(0, 0);
+        keys = bigArrays.newByteArray(initialCapacity * 3, false);
+    }
+
+    /**
+     * Adds the given key to the hash table and returns its ordinal.
+     * If the key exists already, it returns (-1 - ordinal).
+     */
+    public long add(final BytesRef key) {
+        final long hash = hasher.hash(key);
+        final long fingerprint = hash & MASK_FINGERPRINT;
+
+        for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) {
+            if ((value = table.get(idx)) == -1) {
+                // Empty slot: the key is new. Its ordinal is the current size, which append() returns
+                // and then increments, so the table value must be built before append() runs.
+                final long val = fingerprint | size;
+                if (size >= grow) {
+                    growAndInsert(hash, val);
+                } else {
+                    table.set(idx, val);
+                }
+                return append(key);
+            } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) {
+                // Fingerprint matched and the stored bytes are equal: existing key.
+                return -1 - ordinal;
+            }
+        }
+    }
+
+    /**
+     * Returns the ordinal associated with the given key, or -1 if the key doesn't exist.
+     *
+     * <p>

+ * Using the 64-bit hash value, up to 32 least significant bits (LSB) are used to identify the + * home slot in the hash table, and an additional 32 bits are used to identify the fingerprint. + * The fingerprint further increases the entropy and reduces the number of false lookups in the + * keys' table during equality checks, which is expensive. + * + *

+ * Total entropy bits = 32 + log2(capacity) + * + *

+     * Probing is linear: it begins at the key's home slot and advances one slot at a time
+     * (wrapping around) until either a matching entry or an empty slot is reached.
+     * An entry is compared by fingerprint first (to reduce false positives) and only then
+     * verified against the keys' table using an equality check.
+     */
+    public long find(final BytesRef key) {
+        final long keyHash = hasher.hash(key);
+        final long keyFingerprint = keyHash & MASK_FINGERPRINT;
+        long slot = keyHash & mask;
+
+        while (true) {
+            final long entry = table.get(slot);
+            if (entry == -1) {
+                // Reached an empty slot without a match: the key is absent.
+                return -1;
+            }
+            if ((entry & MASK_FINGERPRINT) == keyFingerprint) {
+                final long candidate = entry & MASK_ORDINAL;
+                if (key.bytesEquals(get(candidate, scratch))) {
+                    return candidate;
+                }
+            }
+            slot = (slot + 1) & mask;
+        }
+    }
+
+    /**
+     * Returns the key associated with the given ordinal.
+     * The result is undefined for an unused ordinal.
+     *
+     * <p>

+     * Beware that the content of the {@link BytesRef} may become invalid as soon as {@link #close()} is called
+     *
+     * @param ordinal ordinal whose key is read; its bounds are offsets[ordinal] and offsets[ordinal + 1]
+     * @param dest    BytesRef that is handed to the keys' array to receive the key, and is returned
+     */
+    public BytesRef get(final long ordinal, final BytesRef dest) {
+        final long start = offsets.get(ordinal);
+        final int length = (int) (offsets.get(ordinal + 1) - start);
+        keys.get(start, length, dest);
+        return dest;
+    }
+
+    /**
+     * Returns the number of mappings in this hash table.
+     */
+    public long size() {
+        return size;
+    }
+
+    /**
+     * Appends the key in the keys' and offsets' tables.
+     * Returns the ordinal assigned to the key, i.e. the size before this call; size is then incremented.
+     */
+    private long append(final BytesRef key) {
+        // The new key starts where the previous one ended.
+        final long start = offsets.get(size);
+        final long end = start + key.length;
+        // size + 2 entries are needed so that offsets[size + 1] (the end offset) can be written.
+        offsets = bigArrays.grow(offsets, size + 2);
+        offsets.set(size + 1, end);
+        keys = bigArrays.grow(keys, end);
+        keys.set(start, key.bytes, key.offset, key.length);
+        return size++;
+    }
+
+    /**
+     * Grows the hash table by doubling its capacity, inserting the provided value,
+     * and reinserting the previous values at their updated slots.
+     *
+     * @param hash  hash of the key being added; selects the slot for {@code value}
+     * @param value packed table value for the key being added
+     */
+    private void growAndInsert(final long hash, final long value) {
+        // Ensure that the hash table doesn't grow too large.
+        // This implicitly also ensures that the ordinals are no larger than 2^32, thus,
+        // preventing them from polluting the fingerprint bits in the hash table values.
+        assert capacity < MAX_CAPACITY : "hash table already at the max capacity";
+
+        capacity <<= 1;
+        mask = capacity - 1;
+        grow = (long) (capacity * loadFactor);
+        table = bigArrays.grow(table, capacity);
+        // Wipe every slot; all entries are rebuilt below from the keys' table.
+        table.fill(0, capacity, -1);
+        // The table is empty at this point, so the new value lands directly in its home slot.
+        table.set(hash & mask, value);
+
+        // Rehash every stored key (ordinals below size) into the enlarged table.
+        for (long ordinal = 0; ordinal < size; ordinal++) {
+            reinsert(ordinal, hasher.hash(get(ordinal, scratch)));
+        }
+    }
+
+    /**
+     * Reinserts the hash table value for an existing key stored at the given ordinal.
+     */
+    private void reinsert(final long ordinal, final long hash) {
+        // Walk forward from the home slot (wrapping around) to the first empty slot, then store there.
+        long slot = hash & mask;
+        while (table.get(slot) != -1) {
+            slot = (slot + 1) & mask;
+        }
+        table.set(slot, (hash & MASK_FINGERPRINT) | ordinal);
+    }
+
+    /**
+     * Releases the table, offsets and keys arrays.
+     */
+    @Override
+    public void close() {
+        Releasables.close(table, offsets, keys);
+    }
+
+    /**
+     * Strategy that computes the 64-bit hash of a {@link BytesRef} key.
+     */
+    @FunctionalInterface
+    public interface Hasher {
+        long hash(BytesRef key);
+    }
+}
diff --git a/server/src/main/java/org/opensearch/common/util/ReorganizingBytesRefHash.java b/server/src/main/java/org/opensearch/common/util/ReorganizingBytesRefHash.java new file mode 100644 index 0000000000000..1806733cc1567 --- /dev/null +++ b/server/src/main/java/org/opensearch/common/util/ReorganizingBytesRefHash.java @@ -0,0 +1,301 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.common.util; + +import net.openhft.hashing.LongHashFunction; +import org.apache.lucene.util.BytesRef; +import org.opensearch.common.lease.Releasable; +import org.opensearch.common.lease.Releasables; +import org.opensearch.core.common.util.ByteArray; + +import java.security.AccessController; +import java.security.PrivilegedAction; + +/** + * Specialized hash table implementation that maps a {@link BytesRef} key to a long ordinal. + * + * 

+ * It organizes itself by moving keys around dynamically in order to reduce the + * longest probe sequence length (PSL), which makes lookups faster as keys are likely to + * be found in the same CPU cache line. It also uses fingerprints to short-circuit expensive + * equality checks for keys that collide onto the same hash table slot. + * + *

+ * This class is not thread-safe. + * + * @opensearch.internal + */ +public class ReorganizingBytesRefHash implements Releasable { + private static final LongHashFunction XX3 = AccessController.doPrivileged( + (PrivilegedAction) () -> LongHashFunction.xx3(System.nanoTime()) + ); + + private static final long MAX_CAPACITY = 1L << 32; + private static final long DEFAULT_INITIAL_CAPACITY = 32; + private static final float DEFAULT_LOAD_FACTOR = 0.6f; + private static final Hasher DEFAULT_HASHER = key -> XX3.hashBytes(key.bytes, key.offset, key.length); + + private static final long MASK_ORDINAL = 0x00000000FFFFFFFFL; // extract ordinal + private static final long MASK_FINGERPRINT = 0x0000FFFF00000000L; // extract fingerprint + private static final long MASK_PSL = 0x7FFF000000000000L; // extract PSL + private static final long INCR_PSL = 0x0001000000000000L; // increment PSL by one + + /** + * Maximum load factor after which the capacity is doubled. + */ + private final float loadFactor; + + /** + * Calculates the hash of a {@link BytesRef} key. + */ + private final Hasher hasher; + + /** + * Utility class to allocate recyclable arrays. + */ + private final BigArrays bigArrays; + + /** + * Reusable BytesRef to read keys. + */ + private final BytesRef scratch = new BytesRef(); + + /** + * Current capacity of the hash table. This must be a power of two so that the hash table slot + * can be identified quickly using bitmasks, thus avoiding expensive modulo or integer division. + */ + private long capacity; + + /** + * Bitmask to identify the hash table slot from a key's hash. + */ + private long mask; + + /** + * Size threshold after which the hash table needs to be doubled in capacity. + */ + private long grow; + + /** + * Current size of the hash table. + */ + private long size; + + /** + * Underlying array to store the hash table values. + * + *

+ * Each hash table value (64-bit) uses the following bit-packing strategy: + *

+     * |=========|===============|================|================================|
+     * | Discard | PSL           | Fingerprint    | Ordinal                        |
+     * |    -    |---------------|----------------|--------------------------------|
+     * | 1 bit   | 15 bits       | 16 bits        | 32 bits                        |
+     * |=========|===============|================|================================|
+     * 
+ * + *

+ * This allows us to encode and manipulate additional information in the hash table + * itself without having to look elsewhere in the memory, which is much slower. + * + *

+ * Terminology: table[index] = value = (discard | psl | fingerprint | ordinal) + */ + private LongArray table; + + /** + * Underlying array to store the starting offsets of keys. + * + *

+ * Terminology: + *

+     *   offsets[ordinal] = starting offset (inclusive)
+     *   offsets[ordinal + 1] = ending offset (exclusive)
+     * 
+ */ + private LongArray offsets; + + /** + * Underlying byte array to store the keys. + * + *

+ * Terminology: keys[start...end] = key + */ + private ByteArray keys; + + public ReorganizingBytesRefHash(final BigArrays bigArrays) { + this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR, DEFAULT_HASHER, bigArrays); + } + + public ReorganizingBytesRefHash(final long initialCapacity, final float loadFactor, final Hasher hasher, final BigArrays bigArrays) { + assert initialCapacity > 0 : "initial capacity must be greater than 0"; + assert loadFactor > 0 && loadFactor < 1 : "load factor must be between 0 and 1"; + + this.loadFactor = loadFactor; + this.hasher = hasher; + this.bigArrays = bigArrays; + + capacity = Math.max(1, Long.highestOneBit((long) (initialCapacity / loadFactor)) << 1); + mask = capacity - 1; + size = 0; + grow = (long) (capacity * loadFactor); + + table = bigArrays.newLongArray(capacity, false); + table.fill(0, capacity, -1); + offsets = bigArrays.newLongArray(initialCapacity + 1, false); + offsets.set(0, 0); + keys = bigArrays.newByteArray(initialCapacity * 3, false); + } + + /** + * Adds the given key to the hash table and returns its ordinal. + * If the key exists already, it returns (-1 - ordinal). + */ + public long add(final BytesRef key) { + final long hash = hasher.hash(key); + final long fingerprint = hash & MASK_FINGERPRINT; + + for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { + if ((value = table.get(idx)) == -1) { + final long val = (fingerprint | size); + if (size >= grow) { + growAndInsert(hash, val); + } else { + insert(hash, val); + } + return append(key); + } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { + return -(1 + ordinal); + } + } + } + + /** + * Returns the ordinal associated with the given key, or -1 if the key doesn't exist. + * + *

+ * Using the 64-bit hash value, up to 32 least significant bits (LSB) are used to identify the + * home slot in the hash table, and an additional 16 bits are used to identify the fingerprint. + * The fingerprint further increases the entropy and reduces the number of false lookups in the + * keys' table during equality checks, which are expensive. + * + *

+ * Total entropy bits = 16 + log2(capacity) + * + *

+ * Linear probing starts from the home slot, until a match or an empty slot is found. + * Values are first checked using their fingerprint (to reduce false positives), then verified + * in the keys' table using an equality check. + */ + public long find(final BytesRef key) { + final long hash = hasher.hash(key); + final long fingerprint = hash & MASK_FINGERPRINT; + + for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { + if ((value = table.get(idx)) == -1) { + return -1; + } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { + return ordinal; + } + } + } + + /** + * Returns the key associated with the given ordinal. + * The result is undefined for an unused ordinal. + * + *

+ * Beware that the content of the {@link BytesRef} may become invalid as soon as {@link #close()} is called + */ + public BytesRef get(final long ordinal, final BytesRef dest) { + final long start = offsets.get(ordinal); + final int length = (int) (offsets.get(ordinal + 1) - start); + keys.get(start, length, dest); + return dest; + } + + /** + * Returns the number of mappings in this hash table. + */ + public long size() { + return size; + } + + /** + * Appends the key in the keys' and offsets' tables. + */ + private long append(final BytesRef key) { + final long start = offsets.get(size); + final long end = start + key.length; + offsets = bigArrays.grow(offsets, size + 2); + offsets.set(size + 1, end); + keys = bigArrays.grow(keys, end); + keys.set(start, key.bytes, key.offset, key.length); + return size++; + } + + /** + * Grows the hash table by doubling its capacity, inserting the provided value, + * and reinserting the previous values at their updated slots. + */ + private void growAndInsert(final long hash, final long value) { + // Ensure that the hash table doesn't grow too large. + // This implicitly also ensures that the ordinals are no larger than 2^32, thus, + // preventing them from polluting the fingerprint bits in the hash table values. + assert capacity < MAX_CAPACITY : "hash table already at the max capacity"; + + capacity <<= 1; + mask = capacity - 1; + grow = (long) (capacity * loadFactor); + table = bigArrays.grow(table, capacity); + table.fill(0, capacity, -1); + table.set(hash & mask, value); + + for (long ordinal = 0; ordinal < size; ordinal++) { + final long h = hasher.hash(get(ordinal, scratch)); + insert(h, (h & MASK_FINGERPRINT) | ordinal); + } + } + + /** + * Inserts the hash table value for a missing key. 
+ */ + private void insert(final long hash, final long value) { + for (long idx = hash & mask, current = value, existing;; idx = (idx + 1) & mask) { + if ((existing = table.get(idx)) == -1) { + table.set(idx, current); + return; + } else if ((existing & MASK_PSL) < (current & MASK_PSL)) { + current = table.set(idx, current); + } + current += INCR_PSL; + } + } + + @Override + public void close() { + Releasables.close(table, offsets, keys); + } + + /** + * Returns the underlying hash table. + * Visible for unit-tests. + */ + LongArray getTable() { + return table; + } + + /** + * Hasher calculates the hash of a {@link BytesRef} key. + */ + @FunctionalInterface + public interface Hasher { + long hash(BytesRef key); + } +} diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/BytesKeyedBucketOrds.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/BytesKeyedBucketOrds.java index 0eb23013d1e47..2c804166eed78 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/BytesKeyedBucketOrds.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/BytesKeyedBucketOrds.java @@ -37,6 +37,7 @@ import org.opensearch.common.lease.Releasables; import org.opensearch.common.util.BigArrays; import org.opensearch.common.util.BytesRefHash; +import org.opensearch.common.util.CompactBytesRefHash; import org.opensearch.search.aggregations.CardinalityUpperBound; /** @@ -128,10 +129,10 @@ public void readValue(BytesRef dest) {} * @opensearch.internal */ private static class FromSingle extends BytesKeyedBucketOrds { - private final BytesRefHash ords; + private final CompactBytesRefHash ords; private FromSingle(BigArrays bigArrays) { - ords = new BytesRefHash(1, bigArrays); + ords = new CompactBytesRefHash(bigArrays); } @Override diff --git a/server/src/main/resources/org/opensearch/bootstrap/security.policy b/server/src/main/resources/org/opensearch/bootstrap/security.policy index 
77cd0ab05278e..2fde31cb1d648 100644 --- a/server/src/main/resources/org/opensearch/bootstrap/security.policy +++ b/server/src/main/resources/org/opensearch/bootstrap/security.policy @@ -48,6 +48,10 @@ grant codeBase "${codebase.opensearch}" { permission java.lang.RuntimePermission "setContextClassLoader"; // needed for SPI class loading permission java.lang.RuntimePermission "accessDeclaredMembers"; + + // needed for zero-allocation-hashing + permission java.lang.RuntimePermission "accessClassInPackage.sun.misc"; + permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; }; //// Very special jar permissions: @@ -85,6 +89,12 @@ grant codeBase "${codebase.zstd-jni}" { permission java.lang.RuntimePermission "loadLibrary.*"; }; +grant codeBase "${codebase.zero-allocation-hashing}" { + permission java.lang.RuntimePermission "accessClassInPackage.sun.misc"; + permission java.lang.RuntimePermission "accessDeclaredMembers"; + permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; +}; + //// Everything else: grant { diff --git a/server/src/main/resources/org/opensearch/bootstrap/test-framework.policy b/server/src/main/resources/org/opensearch/bootstrap/test-framework.policy index 0abfd7ef22ae7..7d35d439bd373 100644 --- a/server/src/main/resources/org/opensearch/bootstrap/test-framework.policy +++ b/server/src/main/resources/org/opensearch/bootstrap/test-framework.policy @@ -156,5 +156,6 @@ grant { permission java.lang.RuntimePermission "accessDeclaredMembers"; permission java.lang.RuntimePermission "reflectionFactoryAccess"; permission java.lang.RuntimePermission "accessClassInPackage.sun.reflect"; + permission java.lang.RuntimePermission "accessClassInPackage.sun.misc"; permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; }; diff --git a/server/src/test/java/org/opensearch/common/util/CompactBytesRefHashTests.java b/server/src/test/java/org/opensearch/common/util/CompactBytesRefHashTests.java new file mode 100644 index 
0000000000000..297fe82a2a505 --- /dev/null +++ b/server/src/test/java/org/opensearch/common/util/CompactBytesRefHashTests.java @@ -0,0 +1,58 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.common.util; + +import net.openhft.hashing.LongHashFunction; +import org.apache.lucene.util.BytesRef; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Stream; + +public class CompactBytesRefHashTests extends OpenSearchTestCase { + + public void testFuzzy() { + LongHashFunction hasher = LongHashFunction.xx3(randomLong()); + Map reference = new HashMap<>(); + BytesRef[] keys = Stream.generate(() -> new BytesRef(randomAlphaOfLength(20))).limit(1000).toArray(BytesRef[]::new); + + try ( + CompactBytesRefHash h = new CompactBytesRefHash( + randomIntBetween(1, 100), // random capacity + 0.6f + randomFloat() * 0.39f, // random load factor to verify collision resolution + key -> hasher.hashBytes(key.bytes, key.offset, key.length), + BigArrays.NON_RECYCLING_INSTANCE + ) + ) { + // Verify the behaviour of "add" and "find". + for (int i = 0; i < keys.length * 10; i++) { + BytesRef key = keys[i % keys.length]; + if (reference.containsKey(key)) { + long expectedOrdinal = reference.get(key); + assertEquals(-1 - expectedOrdinal, h.add(key)); + assertEquals(expectedOrdinal, h.find(key)); + } else { + assertEquals(-1, h.find(key)); + reference.put(key, (long) reference.size()); + assertEquals((long) reference.get(key), h.add(key)); + } + } + + // Verify the behaviour of "get". + BytesRef scratch = new BytesRef(); + for (Map.Entry entry : reference.entrySet()) { + assertEquals(entry.getKey(), h.get(entry.getValue(), scratch)); + } + + // Verify the behaviour of "size". 
+ assertEquals(reference.size(), h.size()); + } + } +} diff --git a/server/src/test/java/org/opensearch/common/util/ReorganizingBytesRefHashTests.java b/server/src/test/java/org/opensearch/common/util/ReorganizingBytesRefHashTests.java new file mode 100644 index 0000000000000..e7ab6d1fa21a6 --- /dev/null +++ b/server/src/test/java/org/opensearch/common/util/ReorganizingBytesRefHashTests.java @@ -0,0 +1,70 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.common.util; + +import net.openhft.hashing.LongHashFunction; +import org.apache.lucene.util.BytesRef; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Stream; + +public class ReorganizingBytesRefHashTests extends OpenSearchTestCase { + + public void testFuzzy() { + LongHashFunction hasher = LongHashFunction.xx3(randomLong()); + Map reference = new HashMap<>(); + BytesRef[] keys = Stream.generate(() -> new BytesRef(randomAlphaOfLength(20))).limit(1000).toArray(BytesRef[]::new); + + try ( + ReorganizingBytesRefHash h = new ReorganizingBytesRefHash( + randomIntBetween(1, 100), // random capacity + 0.6f + randomFloat() * 0.39f, // random load factor to verify collision resolution + key -> hasher.hashBytes(key.bytes, key.offset, key.length), + BigArrays.NON_RECYCLING_INSTANCE + ) + ) { + // Verify the behaviour of "add" and "find". 
+ for (int i = 0; i < keys.length * 10; i++) { + BytesRef key = keys[i % keys.length]; + if (reference.containsKey(key)) { + long expectedOrdinal = reference.get(key); + assertEquals(-1 - expectedOrdinal, h.add(key)); + assertEquals(expectedOrdinal, h.find(key)); + } else { + assertEquals(-1, h.find(key)); + reference.put(key, (long) reference.size()); + assertEquals((long) reference.get(key), h.add(key)); + } + } + + // Verify the behaviour of "get". + BytesRef scratch = new BytesRef(); + for (Map.Entry entry : reference.entrySet()) { + assertEquals(entry.getKey(), h.get(entry.getValue(), scratch)); + } + + // Verify the behaviour of "size". + assertEquals(reference.size(), h.size()); + + // Verify the calculation of PSLs. + long capacity = h.getTable().size(); + long mask = capacity - 1; + for (long idx = 0; idx < h.getTable().size(); idx++) { + long value = h.getTable().get(idx); + if (value != -1) { + BytesRef key = h.get((int) value, scratch); + long homeIdx = hasher.hashBytes(key.bytes, key.offset, key.length) & mask; + assertEquals((capacity + idx - homeIdx) & mask, value >>> 48); + } + } + } + } +} From f8052d165227d1d50a3133055f742b78a1dbb4ad Mon Sep 17 00:00:00 2001 From: Ketan Verma Date: Sun, 30 Jul 2023 23:11:52 +0530 Subject: [PATCH 2/9] Replace BytesRefHash and clean up alternative implementations Signed-off-by: Ketan Verma --- CHANGELOG.md | 1 + .../common/util/BytesRefHashBenchmark.java | 78 +--- .../java/org/opensearch/common/Numbers.java | 8 + .../opensearch/common/util/BytesRefHash.java | 366 ++++++++++++------ .../common/util/CompactBytesRefHash.java | 286 -------------- .../common/util/ReorganizingBytesRefHash.java | 301 -------------- .../common/util/ReorganizingLongHash.java | 12 +- .../bucket/terms/BytesKeyedBucketOrds.java | 7 +- .../bucket/terms/SignificanceLookup.java | 2 +- .../terms/StringRareTermsAggregator.java | 2 +- .../org/opensearch/common/NumbersTests.java | 20 + .../common/util/BytesRefHashTests.java | 61 +-- 
.../common/util/CompactBytesRefHashTests.java | 58 --- .../util/ReorganizingBytesRefHashTests.java | 70 ---- 14 files changed, 315 insertions(+), 957 deletions(-) delete mode 100644 server/src/main/java/org/opensearch/common/util/CompactBytesRefHash.java delete mode 100644 server/src/main/java/org/opensearch/common/util/ReorganizingBytesRefHash.java delete mode 100644 server/src/test/java/org/opensearch/common/util/CompactBytesRefHashTests.java delete mode 100644 server/src/test/java/org/opensearch/common/util/ReorganizingBytesRefHashTests.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 7af3c171e8c6a..9d81d24e15b9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -130,6 +130,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Replace the deprecated IndexReader APIs with new storedFields() & termVectors() ([#7792](https://github.com/opensearch-project/OpenSearch/pull/7792)) - [Remote Store] Add support to restore only unassigned shards of an index ([#8792](https://github.com/opensearch-project/OpenSearch/pull/8792)) - Add safeguard limits for file cache during node level allocation ([#8208](https://github.com/opensearch-project/OpenSearch/pull/8208)) +- Performance improvements for BytesRefHash ([#8788](https://github.com/opensearch-project/OpenSearch/pull/8788)) - Add support for aggregation profiler with concurrent aggregation ([#8801](https://github.com/opensearch-project/OpenSearch/pull/8801)) - [Remove] Deprecated Fractional ByteSizeValue support #9005 ([#9005](https://github.com/opensearch-project/OpenSearch/pull/9005)) - Add support for aggregation profiler with concurrent aggregation ([#8801](https://github.com/opensearch-project/OpenSearch/pull/8801)) diff --git a/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java b/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java index 8ad04a42b190f..2e2a2399e9c0d 100644 --- 
a/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java +++ b/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java @@ -8,7 +8,6 @@ package org.opensearch.common.util; -import net.openhft.hashing.LongHashFunction; import org.apache.lucene.util.BytesRef; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -23,7 +22,6 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; import java.util.HashSet; @@ -32,7 +30,7 @@ import java.util.concurrent.TimeUnit; import java.util.stream.Stream; -@Fork(value = 5) +@Fork(value = 3) @Warmup(iterations = 1, time = 2) @Measurement(iterations = 3, time = 5) @BenchmarkMode(Mode.AverageTime) @@ -45,7 +43,7 @@ public class BytesRefHashBenchmark { public void add(Blackhole bh, Options opts) { for (int hit = 0; hit < NUM_HITS; hit++) { BytesRef key = opts.keys[hit % opts.keys.length]; - for (HashTable table : opts.tables) { + for (BytesRefHash table : opts.tables) { bh.consume(table.add(key)); } } @@ -53,9 +51,6 @@ public void add(Blackhole bh, Options opts) { @State(Scope.Benchmark) public static class Options { - @Param({ "baseline", "compact", "reorganizing" }) - public String type; - @Param({ "1", "2", @@ -163,14 +158,16 @@ public static class Options { @Param({ "8", "32", "128" }) public Integer length; - private HashTable[] tables; + private BytesRefHash[] tables; private BytesRef[] keys; @Setup public void setup() { assert size <= Math.pow(26, length) : "key length too small to generate the required number of keys"; - tables = Stream.generate(this::newHashTable).limit(NUM_TABLES).toArray(HashTable[]::new); + tables = Stream.generate(() -> new BytesRefHash(BigArrays.NON_RECYCLING_INSTANCE)) + .limit(NUM_TABLES) + .toArray(BytesRefHash[]::new); Random random = new 
Random(0); Set seen = new HashSet<>(); keys = new BytesRef[size]; @@ -193,68 +190,5 @@ public void setup() { public void tearDown() { Releasables.close(tables); } - - private HashTable newHashTable() { - switch (type) { - case "baseline": - return new HashTable() { - private final BytesRefHash table = new BytesRefHash(1, 0.6f, BigArrays.NON_RECYCLING_INSTANCE); - - @Override - public long add(BytesRef key) { - return table.add(key); - } - - @Override - public void close() { - table.close(); - } - }; - case "compact": - return new HashTable() { - private final CompactBytesRefHash table = new CompactBytesRefHash( - 1, - 0.6f, - key -> LongHashFunction.xx3().hashBytes(key.bytes, key.offset, key.length), - BigArrays.NON_RECYCLING_INSTANCE - ); - - @Override - public long add(BytesRef key) { - return table.add(key); - } - - @Override - public void close() { - table.close(); - } - }; - case "reorganizing": - return new HashTable() { - private final ReorganizingBytesRefHash table = new ReorganizingBytesRefHash( - 1, - 0.6f, - key -> LongHashFunction.xx3().hashBytes(key.bytes, key.offset, key.length), - BigArrays.NON_RECYCLING_INSTANCE - ); - - @Override - public long add(BytesRef key) { - return table.add(key); - } - - @Override - public void close() { - table.close(); - } - }; - default: - throw new IllegalArgumentException("invalid hash table type: " + type); - } - } - } - - private interface HashTable extends Releasable { - long add(BytesRef key); } } diff --git a/libs/common/src/main/java/org/opensearch/common/Numbers.java b/libs/common/src/main/java/org/opensearch/common/Numbers.java index 084e52a41f8b1..d5a364a4a934e 100644 --- a/libs/common/src/main/java/org/opensearch/common/Numbers.java +++ b/libs/common/src/main/java/org/opensearch/common/Numbers.java @@ -260,4 +260,12 @@ public static double unsignedLongToDouble(long value) { // want to replace that with 1 in the shifted value for correct rounding. 
return (double) ((value >>> 1) | (value & 1)) * 2.0; } + + /** + * Return the strictly greater next power of two for the given value. + * For zero and negative numbers, it returns 1. + */ + public static long nextPowerOfTwo(long value) { + return 1L << (Long.SIZE - Long.numberOfLeadingZeros(value)); + } } diff --git a/server/src/main/java/org/opensearch/common/util/BytesRefHash.java b/server/src/main/java/org/opensearch/common/util/BytesRefHash.java index ecc93d017beaf..c2f394c627806 100644 --- a/server/src/main/java/org/opensearch/common/util/BytesRefHash.java +++ b/server/src/main/java/org/opensearch/common/util/BytesRefHash.java @@ -6,179 +6,291 @@ * compatible open source license. */ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright OpenSearch Contributors. See - * GitHub history for details. 
- */ - package org.opensearch.common.util; +import net.openhft.hashing.LongHashFunction; import org.apache.lucene.util.BytesRef; +import org.opensearch.common.Numbers; import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; import org.opensearch.core.common.util.ByteArray; +import java.security.AccessController; +import java.security.PrivilegedAction; + /** - * Specialized hash table implementation similar to Lucene's BytesRefHash that maps - * BytesRef values to ids. Collisions are resolved with open addressing and linear - * probing, growth is smooth thanks to {@link BigArrays}, hashes are cached for faster - * re-hashing and capacity is always a multiple of 2 for faster identification of buckets. - * This class is not thread-safe. + * Specialized hash table implementation that maps a {@link BytesRef} key to a long ordinal. + * + *

+ * It uses a compact byte-packing strategy to encode the ordinal and fingerprint information + * in the hash table value. It makes lookups faster by short-circuiting expensive equality checks + * for keys that collide onto the same hash table slot. * - * @opensearch.internal + *

+ * This class is not thread-safe. + * + * @opensearch.internal */ -public final class BytesRefHash extends AbstractHash { +public class BytesRefHash implements Releasable { + private static final LongHashFunction XX3 = AccessController.doPrivileged( + (PrivilegedAction) () -> LongHashFunction.xx3(System.nanoTime()) + ); + + private static final long MAX_CAPACITY = 1L << 32; + private static final long DEFAULT_INITIAL_CAPACITY = 32; + private static final float DEFAULT_LOAD_FACTOR = 0.6f; + private static final Hasher DEFAULT_HASHER = key -> XX3.hashBytes(key.bytes, key.offset, key.length); + + private static final long MASK_ORDINAL = 0x00000000FFFFFFFFL; // extract ordinal + private static final long MASK_FINGERPRINT = 0xFFFFFFFF00000000L; // extract fingerprint + + /** + * Maximum load factor after which the capacity is doubled. + */ + private final float loadFactor; - private LongArray startOffsets; - private ByteArray bytes; - private IntArray hashes; // we cache hashes for faster re-hashing - private final BytesRef spare; + /** + * Calculates the hash of a {@link BytesRef} key. + */ + private final Hasher hasher; + + /** + * Utility class to allocate recyclable arrays. + */ + private final BigArrays bigArrays; + + /** + * Reusable BytesRef to read keys. + */ + private final BytesRef scratch = new BytesRef(); + + /** + * Current capacity of the hash table. This must be a power of two so that the hash table slot + * can be identified quickly using bitmasks, thus avoiding expensive modulo or integer division. + */ + private long capacity; + + /** + * Bitmask to identify the hash table slot from a key's hash. + */ + private long mask; + + /** + * Size threshold after which the hash table needs to be doubled in capacity. + */ + private long grow; + + /** + * Current size of the hash table. + */ + private long size; + + /** + * Underlying array to store the hash table values. + * + *

+ * Each hash table value (64-bit) uses the following byte packing strategy: + *

+     * |================================|================================|
+     * | Fingerprint                    | Ordinal                        |
+     * |--------------------------------|--------------------------------|
+     * | 32 bits                        | 32 bits                        |
+     * |================================|================================|
+     * 
+ * + *

+ * This allows us to encode and manipulate additional information in the hash table + * itself without having to look elsewhere in the memory, which is much slower. + * + *

+ * Terminology: table[index] = value = (fingerprint | ordinal) + */ + private LongArray table; - // Constructor with configurable capacity and default maximum load factor. - public BytesRefHash(long capacity, BigArrays bigArrays) { - this(capacity, DEFAULT_MAX_LOAD_FACTOR, bigArrays); + /** + * Underlying array to store the starting offsets of keys. + * + *

+ * Terminology: + *

+     *   offsets[ordinal] = starting offset (inclusive)
+     *   offsets[ordinal + 1] = ending offset (exclusive)
+     * 
+ */ + private LongArray offsets; + + /** + * Underlying byte array to store the keys. + * + *

+ * Terminology: keys[start...end] = key + */ + private ByteArray keys; + + public BytesRefHash(final BigArrays bigArrays) { + this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR, DEFAULT_HASHER, bigArrays); } - // Constructor with configurable capacity and load factor. - public BytesRefHash(long capacity, float maxLoadFactor, BigArrays bigArrays) { - super(capacity, maxLoadFactor, bigArrays); - startOffsets = bigArrays.newLongArray(capacity + 1, false); - startOffsets.set(0, 0); - bytes = bigArrays.newByteArray(capacity * 3, false); - hashes = bigArrays.newIntArray(capacity, false); - spare = new BytesRef(); + public BytesRefHash(final long initialCapacity, final BigArrays bigArrays) { + this(initialCapacity, DEFAULT_LOAD_FACTOR, DEFAULT_HASHER, bigArrays); } - // BytesRef has a weak hashCode function so we try to improve it by rehashing using Murmur3 - // Feel free to remove rehashing if BytesRef gets a better hash function - private static int rehash(int hash) { - return BitMixer.mix32(hash); + public BytesRefHash(final long initialCapacity, final float loadFactor, final BigArrays bigArrays) { + this(initialCapacity, loadFactor, DEFAULT_HASHER, bigArrays); } - /** - * Return the key at 0 <= index <= capacity(). The result is undefined if the slot is unused. - *

Beware that the content of the {@link BytesRef} may become invalid as soon as {@link #close()} is called

- */ - public BytesRef get(long id, BytesRef dest) { - final long startOffset = startOffsets.get(id); - final int length = (int) (startOffsets.get(id + 1) - startOffset); - bytes.get(startOffset, length, dest); - return dest; + public BytesRefHash(final long initialCapacity, final float loadFactor, final Hasher hasher, final BigArrays bigArrays) { + assert initialCapacity > 0 : "initial capacity must be greater than 0"; + assert loadFactor > 0 && loadFactor < 1 : "load factor must be between 0 and 1"; + + this.loadFactor = loadFactor; + this.hasher = hasher; + this.bigArrays = bigArrays; + + capacity = Numbers.nextPowerOfTwo((long) (initialCapacity / loadFactor)); + assert capacity <= MAX_CAPACITY : "required capacity too large"; + mask = capacity - 1; + size = 0; + grow = (long) (capacity * loadFactor); + + table = bigArrays.newLongArray(capacity, false); + table.fill(0, capacity, -1); + offsets = bigArrays.newLongArray(initialCapacity + 1, false); + offsets.set(0, 0); + keys = bigArrays.newByteArray(initialCapacity * 3, false); } /** - * Get the id associated with key + * Adds the given key to the hash table and returns its ordinal. + * If the key exists already, it returns (-1 - ordinal). 
*/ - public long find(BytesRef key, int code) { - final long slot = slot(rehash(code), mask); - for (long index = slot;; index = nextSlot(index, mask)) { - final long id = id(index); - if (id == -1L || key.bytesEquals(get(id, spare))) { - return id; + public long add(final BytesRef key) { + final long hash = hasher.hash(key); + final long fingerprint = hash & MASK_FINGERPRINT; + + for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { + if ((value = table.get(idx)) == -1) { + final long val = fingerprint | size; + if (size >= grow) { + growAndInsert(hash, val); + } else { + table.set(idx, val); + } + return append(key); + } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { + return -1 - ordinal; } } } - /** Sugar for {@link #find(BytesRef, int) find(key, key.hashCode()} */ - public long find(BytesRef key) { - return find(key, key.hashCode()); - } + /** + * Returns the ordinal associated with the given key, or -1 if the key doesn't exist. + * + *

+ * Using the 64-bit hash value, up to 32 least significant bits (LSB) are used to identify the + * home slot in the hash table, and an additional 32 bits are used to identify the fingerprint. + * The fingerprint further increases the entropy and reduces the number of false lookups in the + * keys' table during equality checks, which are expensive. + * + *

+ * Total entropy bits = 32 + log2(capacity) + * + *

+ * Linear probing starts from the home slot, until a match or an empty slot is found. + * Values are first checked using their fingerprint (to reduce false positives), then verified + * in the keys' table using an equality check. + */ + public long find(final BytesRef key) { + final long hash = hasher.hash(key); + final long fingerprint = hash & MASK_FINGERPRINT; - private long set(BytesRef key, int code, long id) { - assert rehash(key.hashCode()) == code; - assert size < maxSize; - final long slot = slot(code, mask); - for (long index = slot;; index = nextSlot(index, mask)) { - final long curId = id(index); - if (curId == -1) { // means unset - id(index, id); - append(id, key, code); - ++size; - return id; - } else if (key.bytesEquals(get(curId, spare))) { - return -1 - curId; + for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { + if ((value = table.get(idx)) == -1) { + return -1; + } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { + return ordinal; } } } - private void append(long id, BytesRef key, int code) { - assert size == id; - final long startOffset = startOffsets.get(size); - bytes = bigArrays.grow(bytes, startOffset + key.length); - bytes.set(startOffset, key.bytes, key.offset, key.length); - startOffsets = bigArrays.grow(startOffsets, size + 2); - startOffsets.set(size + 1, startOffset + key.length); - hashes = bigArrays.grow(hashes, id + 1); - hashes.set(id, code); + /** + * Returns the key associated with the given ordinal. + * The result is undefined for an unused ordinal. + * + *

+ * Beware that the content of the {@link BytesRef} may become invalid as soon as {@link #close()} is called + */ + public BytesRef get(final long ordinal, final BytesRef dest) { + final long start = offsets.get(ordinal); + final int length = (int) (offsets.get(ordinal + 1) - start); + keys.get(start, length, dest); + return dest; } - private boolean assertConsistent(long id, int code) { - get(id, spare); - return rehash(spare.hashCode()) == code; + /** + * Returns the number of mappings in this hash table. + */ + public long size() { + return size; } - private void reset(int code, long id) { - assert assertConsistent(id, code); - final long slot = slot(code, mask); - for (long index = slot;; index = nextSlot(index, mask)) { - final long curId = id(index); - if (curId == -1) { // means unset - id(index, id); - break; - } - } + /** + * Appends the key in the keys' and offsets' tables. + */ + private long append(final BytesRef key) { + final long start = offsets.get(size); + final long end = start + key.length; + offsets = bigArrays.grow(offsets, size + 2); + offsets.set(size + 1, end); + keys = bigArrays.grow(keys, end); + keys.set(start, key.bytes, key.offset, key.length); + return size++; } /** - * Try to add key. Return its newly allocated id if it wasn't in the hash table yet, or -1-id - * if it was already present in the hash table. + * Grows the hash table by doubling its capacity, inserting the provided value, + * and reinserting the previous values at their updated slots. */ - public long add(BytesRef key, int code) { - if (size >= maxSize) { - assert size == maxSize; - grow(); - } - assert size < maxSize; - return set(key, rehash(code), size); - } + private void growAndInsert(final long hash, final long value) { + // Ensure that the hash table doesn't grow too large. + // This implicitly also ensures that the ordinals are no larger than 2^32, thus, + // preventing them from polluting the fingerprint bits in the hash table values. 
+ assert capacity < MAX_CAPACITY : "hash table already at the max capacity"; - /** Sugar to {@link #add(BytesRef, int) add(key, key.hashCode()}. */ - public long add(BytesRef key) { - return add(key, key.hashCode()); + capacity <<= 1; + mask = capacity - 1; + grow = (long) (capacity * loadFactor); + table = bigArrays.grow(table, capacity); + table.fill(0, capacity, -1); + table.set(hash & mask, value); + + for (long ordinal = 0; ordinal < size; ordinal++) { + reinsert(ordinal, hasher.hash(get(ordinal, scratch))); + } } - @Override - protected void removeAndAdd(long index) { - final long id = id(index, -1); - assert id >= 0; - final int code = hashes.get(id); - reset(code, id); + /** + * Reinserts the hash table value for an existing key stored at the given ordinal. + */ + private void reinsert(final long ordinal, final long hash) { + for (long idx = hash & mask;; idx = (idx + 1) & mask) { + if (table.get(idx) == -1) { + table.set(idx, (hash & MASK_FINGERPRINT) | ordinal); + return; + } + } } @Override public void close() { - try (Releasable releasable = Releasables.wrap(bytes, hashes, startOffsets)) { - super.close(); - } + Releasables.close(table, offsets, keys); } + /** + * Hasher calculates the hash of a {@link BytesRef} key. + */ + @FunctionalInterface + public interface Hasher { + long hash(BytesRef key); + } } diff --git a/server/src/main/java/org/opensearch/common/util/CompactBytesRefHash.java b/server/src/main/java/org/opensearch/common/util/CompactBytesRefHash.java deleted file mode 100644 index d8d4690d14e90..0000000000000 --- a/server/src/main/java/org/opensearch/common/util/CompactBytesRefHash.java +++ /dev/null @@ -1,286 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. 
- */ - -package org.opensearch.common.util; - -import net.openhft.hashing.LongHashFunction; -import org.apache.lucene.util.BytesRef; -import org.opensearch.common.lease.Releasable; -import org.opensearch.common.lease.Releasables; -import org.opensearch.core.common.util.ByteArray; - -import java.security.AccessController; -import java.security.PrivilegedAction; - -/** - * Specialized hash table implementation that maps a {@link BytesRef} key to a long ordinal. - * - *

- * It uses a compact byte-packing strategy to encode the ordinal and fingerprint information - * in the hash table value. It makes lookups faster by short-circuiting expensive equality checks - * for keys that collide onto the same hash table slot. - * - *

- * This class is not thread-safe. - * - * @opensearch.internal - */ -public class CompactBytesRefHash implements Releasable { - private static final LongHashFunction XX3 = AccessController.doPrivileged( - (PrivilegedAction) () -> LongHashFunction.xx3(System.nanoTime()) - ); - - private static final long MAX_CAPACITY = 1L << 32; - private static final long DEFAULT_INITIAL_CAPACITY = 32; - private static final float DEFAULT_LOAD_FACTOR = 0.6f; - private static final Hasher DEFAULT_HASHER = key -> XX3.hashBytes(key.bytes, key.offset, key.length); - - private static final long MASK_ORDINAL = 0x00000000FFFFFFFFL; // extract ordinal - private static final long MASK_FINGERPRINT = 0xFFFFFFFF00000000L; // extract fingerprint - - /** - * Maximum load factor after which the capacity is doubled. - */ - private final float loadFactor; - - /** - * Calculates the hash of a {@link BytesRef} key. - */ - private final Hasher hasher; - - /** - * Utility class to allocate recyclable arrays. - */ - private final BigArrays bigArrays; - - /** - * Reusable BytesRef to read keys. - */ - private final BytesRef scratch = new BytesRef(); - - /** - * Current capacity of the hash table. This must be a power of two so that the hash table slot - * can be identified quickly using bitmasks, thus avoiding expensive modulo or integer division. - */ - private long capacity; - - /** - * Bitmask to identify the hash table slot from a key's hash. - */ - private long mask; - - /** - * Size threshold after which the hash table needs to be doubled in capacity. - */ - private long grow; - - /** - * Current size of the hash table. - */ - private long size; - - /** - * Underlying array to store the hash table values. - * - *

- * Each hash table value (64-bit) uses the following byte packing strategy: - *

-     * |================================|================================|
-     * | Fingerprint                    | Ordinal                        |
-     * |--------------------------------|--------------------------------|
-     * | 32 bits                        | 32 bits                        |
-     * |================================|================================|
-     * 
- * - *

- * This allows us to encode and manipulate additional information in the hash table - * itself without having to look elsewhere in the memory, which is much slower. - * - *

- * Terminology: table[index] = value = (fingerprint | ordinal) - */ - private LongArray table; - - /** - * Underlying array to store the starting offsets of keys. - * - *

- * Terminology: - *

-     *   offsets[ordinal] = starting offset (inclusive)
-     *   offsets[ordinal + 1] = ending offset (exclusive)
-     * 
- */ - private LongArray offsets; - - /** - * Underlying byte array to store the keys. - * - *

- * Terminology: keys[start...end] = key - */ - private ByteArray keys; - - public CompactBytesRefHash(final BigArrays bigArrays) { - this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR, DEFAULT_HASHER, bigArrays); - } - - public CompactBytesRefHash(final long initialCapacity, final float loadFactor, final Hasher hasher, final BigArrays bigArrays) { - assert initialCapacity > 0 : "initial capacity must be greater than 0"; - assert loadFactor > 0 && loadFactor < 1 : "load factor must be between 0 and 1"; - - this.loadFactor = loadFactor; - this.hasher = hasher; - this.bigArrays = bigArrays; - - capacity = Math.max(1, Long.highestOneBit((long) (initialCapacity / loadFactor)) << 1); - mask = capacity - 1; - size = 0; - grow = (long) (capacity * loadFactor); - - table = bigArrays.newLongArray(capacity, false); - table.fill(0, capacity, -1); - offsets = bigArrays.newLongArray(initialCapacity + 1, false); - offsets.set(0, 0); - keys = bigArrays.newByteArray(initialCapacity * 3, false); - } - - /** - * Adds the given key to the hash table and returns its ordinal. - * If the key exists already, it returns (-1 - ordinal). - */ - public long add(final BytesRef key) { - final long hash = hasher.hash(key); - final long fingerprint = hash & MASK_FINGERPRINT; - - for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { - if ((value = table.get(idx)) == -1) { - final long val = fingerprint | size; - if (size >= grow) { - growAndInsert(hash, val); - } else { - table.set(idx, val); - } - return append(key); - } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { - return -1 - ordinal; - } - } - } - - /** - * Returns the ordinal associated with the given key, or -1 if the key doesn't exist. - * - *

- * Using the 64-bit hash value, up to 32 least significant bits (LSB) are used to identify the - * home slot in the hash table, and an additional 32 bits are used to identify the fingerprint. - * The fingerprint further increases the entropy and reduces the number of false lookups in the - * keys' table during equality checks, which is expensive. - * - *

- * Total entropy bits = 32 + log2(capacity) - * - *

- * Linear probing starts from the home slot, until a match or an empty slot is found. - * Values are first checked using their fingerprint (to reduce false positives), then verified - * in the keys' table using an equality check. - */ - public long find(final BytesRef key) { - final long hash = hasher.hash(key); - final long fingerprint = hash & MASK_FINGERPRINT; - - for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { - if ((value = table.get(idx)) == -1) { - return -1; - } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { - return ordinal; - } - } - } - - /** - * Returns the key associated with the given ordinal. - * The result is undefined for an unused ordinal. - * - *

- * Beware that the content of the {@link BytesRef} may become invalid as soon as {@link #close()} is called - */ - public BytesRef get(final long ordinal, final BytesRef dest) { - final long start = offsets.get(ordinal); - final int length = (int) (offsets.get(ordinal + 1) - start); - keys.get(start, length, dest); - return dest; - } - - /** - * Returns the number of mappings in this hash table. - */ - public long size() { - return size; - } - - /** - * Appends the key in the keys' and offsets' tables. - */ - private long append(final BytesRef key) { - final long start = offsets.get(size); - final long end = start + key.length; - offsets = bigArrays.grow(offsets, size + 2); - offsets.set(size + 1, end); - keys = bigArrays.grow(keys, end); - keys.set(start, key.bytes, key.offset, key.length); - return size++; - } - - /** - * Grows the hash table by doubling its capacity, inserting the provided value, - * and reinserting the previous values at their updated slots. - */ - private void growAndInsert(final long hash, final long value) { - // Ensure that the hash table doesn't grow too large. - // This implicitly also ensures that the ordinals are no larger than 2^32, thus, - // preventing them from polluting the fingerprint bits in the hash table values. - assert capacity < MAX_CAPACITY : "hash table already at the max capacity"; - - capacity <<= 1; - mask = capacity - 1; - grow = (long) (capacity * loadFactor); - table = bigArrays.grow(table, capacity); - table.fill(0, capacity, -1); - table.set(hash & mask, value); - - for (long ordinal = 0; ordinal < size; ordinal++) { - reinsert(ordinal, hasher.hash(get(ordinal, scratch))); - } - } - - /** - * Reinserts the hash table value for an existing key stored at the given ordinal. 
- */ - private void reinsert(final long ordinal, final long hash) { - for (long idx = hash & mask;; idx = (idx + 1) & mask) { - if (table.get(idx) == -1) { - table.set(idx, (hash & MASK_FINGERPRINT) | ordinal); - return; - } - } - } - - @Override - public void close() { - Releasables.close(table, offsets, keys); - } - - /** - * Hasher calculates the hash of a {@link BytesRef} key. - */ - @FunctionalInterface - public interface Hasher { - long hash(BytesRef key); - } -} diff --git a/server/src/main/java/org/opensearch/common/util/ReorganizingBytesRefHash.java b/server/src/main/java/org/opensearch/common/util/ReorganizingBytesRefHash.java deleted file mode 100644 index 1806733cc1567..0000000000000 --- a/server/src/main/java/org/opensearch/common/util/ReorganizingBytesRefHash.java +++ /dev/null @@ -1,301 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.common.util; - -import net.openhft.hashing.LongHashFunction; -import org.apache.lucene.util.BytesRef; -import org.opensearch.common.lease.Releasable; -import org.opensearch.common.lease.Releasables; -import org.opensearch.core.common.util.ByteArray; - -import java.security.AccessController; -import java.security.PrivilegedAction; - -/** - * Specialized hash table implementation that maps a {@link BytesRef} key to a long ordinal. - * - *

- * It organizes itself by moving keys around dynamically in order to reduce the - * longest probe sequence length (PSL), which makes lookups faster as keys are likely to - * be found in the same CPU cache line. It also uses fingerprints to short-circuit expensive - * equality checks for keys that collide onto the same hash table slot. - * - *

- * This class is not thread-safe. - * - * @opensearch.internal - */ -public class ReorganizingBytesRefHash implements Releasable { - private static final LongHashFunction XX3 = AccessController.doPrivileged( - (PrivilegedAction) () -> LongHashFunction.xx3(System.nanoTime()) - ); - - private static final long MAX_CAPACITY = 1L << 32; - private static final long DEFAULT_INITIAL_CAPACITY = 32; - private static final float DEFAULT_LOAD_FACTOR = 0.6f; - private static final Hasher DEFAULT_HASHER = key -> XX3.hashBytes(key.bytes, key.offset, key.length); - - private static final long MASK_ORDINAL = 0x00000000FFFFFFFFL; // extract ordinal - private static final long MASK_FINGERPRINT = 0x0000FFFF00000000L; // extract fingerprint - private static final long MASK_PSL = 0x7FFF000000000000L; // extract PSL - private static final long INCR_PSL = 0x0001000000000000L; // increment PSL by one - - /** - * Maximum load factor after which the capacity is doubled. - */ - private final float loadFactor; - - /** - * Calculates the hash of a {@link BytesRef} key. - */ - private final Hasher hasher; - - /** - * Utility class to allocate recyclable arrays. - */ - private final BigArrays bigArrays; - - /** - * Reusable BytesRef to read keys. - */ - private final BytesRef scratch = new BytesRef(); - - /** - * Current capacity of the hash table. This must be a power of two so that the hash table slot - * can be identified quickly using bitmasks, thus avoiding expensive modulo or integer division. - */ - private long capacity; - - /** - * Bitmask to identify the hash table slot from a key's hash. - */ - private long mask; - - /** - * Size threshold after which the hash table needs to be doubled in capacity. - */ - private long grow; - - /** - * Current size of the hash table. - */ - private long size; - - /** - * Underlying array to store the hash table values. - * - *

- * Each hash table value (64-bit) uses the following byte packing strategy: - *

-     * |=========|===============|================|================================|
-     * | Discard | PSL           | Fingerprint    | Ordinal                        |
-     * |    -    |---------------|----------------|--------------------------------|
-     * | 1 bit   | 15 bits       | 16 bits        | 32 bits                        |
-     * |=========|===============|================|================================|
-     * 
- * - *

- * This allows us to encode and manipulate additional information in the hash table - * itself without having to look elsewhere in the memory, which is much slower. - * - *

- * Terminology: table[index] = value = (discard | psl | fingerprint | ordinal) - */ - private LongArray table; - - /** - * Underlying array to store the starting offsets of keys. - * - *

- * Terminology: - *

-     *   offsets[ordinal] = starting offset (inclusive)
-     *   offsets[ordinal + 1] = ending offset (exclusive)
-     * 
- */ - private LongArray offsets; - - /** - * Underlying byte array to store the keys. - * - *

- * Terminology: keys[start...end] = key - */ - private ByteArray keys; - - public ReorganizingBytesRefHash(final BigArrays bigArrays) { - this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR, DEFAULT_HASHER, bigArrays); - } - - public ReorganizingBytesRefHash(final long initialCapacity, final float loadFactor, final Hasher hasher, final BigArrays bigArrays) { - assert initialCapacity > 0 : "initial capacity must be greater than 0"; - assert loadFactor > 0 && loadFactor < 1 : "load factor must be between 0 and 1"; - - this.loadFactor = loadFactor; - this.hasher = hasher; - this.bigArrays = bigArrays; - - capacity = Math.max(1, Long.highestOneBit((long) (initialCapacity / loadFactor)) << 1); - mask = capacity - 1; - size = 0; - grow = (long) (capacity * loadFactor); - - table = bigArrays.newLongArray(capacity, false); - table.fill(0, capacity, -1); - offsets = bigArrays.newLongArray(initialCapacity + 1, false); - offsets.set(0, 0); - keys = bigArrays.newByteArray(initialCapacity * 3, false); - } - - /** - * Adds the given key to the hash table and returns its ordinal. - * If the key exists already, it returns (-1 - ordinal). - */ - public long add(final BytesRef key) { - final long hash = hasher.hash(key); - final long fingerprint = hash & MASK_FINGERPRINT; - - for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { - if ((value = table.get(idx)) == -1) { - final long val = (fingerprint | size); - if (size >= grow) { - growAndInsert(hash, val); - } else { - insert(hash, val); - } - return append(key); - } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { - return -(1 + ordinal); - } - } - } - - /** - * Returns the ordinal associated with the given key, or -1 if the key doesn't exist. - * - *

- * Using the 64-bit hash value, up to 32 least significant bits (LSB) are used to identify the - * home slot in the hash table, and an additional 16 bits are used to identify the fingerprint. - * The fingerprint further increases the entropy and reduces the number of false lookups in the - * keys' table during equality checks, which is expensive. - * - *

- * Total entropy bits = 16 + log2(capacity) - * - *

- * Linear probing starts from the home slot, until a match or an empty slot is found. - * Values are first checked using their fingerprint (to reduce false positives), then verified - * in the keys' table using an equality check. - */ - public long find(final BytesRef key) { - final long hash = hasher.hash(key); - final long fingerprint = hash & MASK_FINGERPRINT; - - for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { - if ((value = table.get(idx)) == -1) { - return -1; - } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { - return ordinal; - } - } - } - - /** - * Returns the key associated with the given ordinal. - * The result is undefined for an unused ordinal. - * - *

- * Beware that the content of the {@link BytesRef} may become invalid as soon as {@link #close()} is called - */ - public BytesRef get(final long ordinal, final BytesRef dest) { - final long start = offsets.get(ordinal); - final int length = (int) (offsets.get(ordinal + 1) - start); - keys.get(start, length, dest); - return dest; - } - - /** - * Returns the number of mappings in this hash table. - */ - public long size() { - return size; - } - - /** - * Appends the key in the keys' and offsets' tables. - */ - private long append(final BytesRef key) { - final long start = offsets.get(size); - final long end = start + key.length; - offsets = bigArrays.grow(offsets, size + 2); - offsets.set(size + 1, end); - keys = bigArrays.grow(keys, end); - keys.set(start, key.bytes, key.offset, key.length); - return size++; - } - - /** - * Grows the hash table by doubling its capacity, inserting the provided value, - * and reinserting the previous values at their updated slots. - */ - private void growAndInsert(final long hash, final long value) { - // Ensure that the hash table doesn't grow too large. - // This implicitly also ensures that the ordinals are no larger than 2^32, thus, - // preventing them from polluting the fingerprint bits in the hash table values. - assert capacity < MAX_CAPACITY : "hash table already at the max capacity"; - - capacity <<= 1; - mask = capacity - 1; - grow = (long) (capacity * loadFactor); - table = bigArrays.grow(table, capacity); - table.fill(0, capacity, -1); - table.set(hash & mask, value); - - for (long ordinal = 0; ordinal < size; ordinal++) { - final long h = hasher.hash(get(ordinal, scratch)); - insert(h, (h & MASK_FINGERPRINT) | ordinal); - } - } - - /** - * Inserts the hash table value for a missing key. 
- */ - private void insert(final long hash, final long value) { - for (long idx = hash & mask, current = value, existing;; idx = (idx + 1) & mask) { - if ((existing = table.get(idx)) == -1) { - table.set(idx, current); - return; - } else if ((existing & MASK_PSL) < (current & MASK_PSL)) { - current = table.set(idx, current); - } - current += INCR_PSL; - } - } - - @Override - public void close() { - Releasables.close(table, offsets, keys); - } - - /** - * Returns the underlying hash table. - * Visible for unit-tests. - */ - LongArray getTable() { - return table; - } - - /** - * Hasher calculates the hash of a {@link BytesRef} key. - */ - @FunctionalInterface - public interface Hasher { - long hash(BytesRef key); - } -} diff --git a/server/src/main/java/org/opensearch/common/util/ReorganizingLongHash.java b/server/src/main/java/org/opensearch/common/util/ReorganizingLongHash.java index 417eb6a316d86..d6c29fcae3a94 100644 --- a/server/src/main/java/org/opensearch/common/util/ReorganizingLongHash.java +++ b/server/src/main/java/org/opensearch/common/util/ReorganizingLongHash.java @@ -8,7 +8,9 @@ package org.opensearch.common.util; +import org.opensearch.common.Numbers; import org.opensearch.common.lease.Releasable; +import org.opensearch.common.lease.Releasables; /** * Specialized hash table implementation that maps a (primitive) long to long. 
@@ -109,7 +111,8 @@ public ReorganizingLongHash(final long initialCapacity, final float loadFactor, this.bigArrays = bigArrays; this.loadFactor = loadFactor; - capacity = nextPowerOfTwo((long) (initialCapacity / loadFactor)); + capacity = Numbers.nextPowerOfTwo((long) (initialCapacity / loadFactor)); + assert capacity <= MAX_CAPACITY : "required capacity too large"; mask = capacity - 1; grow = (long) (capacity * loadFactor); size = 0; @@ -296,11 +299,6 @@ private void grow() { @Override public void close() { - table.close(); - keys.close(); - } - - private static long nextPowerOfTwo(final long value) { - return Math.max(1, Long.highestOneBit(value - 1) << 1); + Releasables.close(table, keys); } } diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/BytesKeyedBucketOrds.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/BytesKeyedBucketOrds.java index 2c804166eed78..5d7c5c2976169 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/BytesKeyedBucketOrds.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/BytesKeyedBucketOrds.java @@ -37,7 +37,6 @@ import org.opensearch.common.lease.Releasables; import org.opensearch.common.util.BigArrays; import org.opensearch.common.util.BytesRefHash; -import org.opensearch.common.util.CompactBytesRefHash; import org.opensearch.search.aggregations.CardinalityUpperBound; /** @@ -129,10 +128,10 @@ public void readValue(BytesRef dest) {} * @opensearch.internal */ private static class FromSingle extends BytesKeyedBucketOrds { - private final CompactBytesRefHash ords; + private final BytesRefHash ords; private FromSingle(BigArrays bigArrays) { - ords = new CompactBytesRefHash(bigArrays); + ords = new BytesRefHash(bigArrays); } @Override @@ -191,7 +190,7 @@ private static class FromMany extends BytesKeyedBucketOrds { private final LongKeyedBucketOrds longToBucketOrds; private FromMany(BigArrays bigArrays) { - bytesToLong = new 
BytesRefHash(1, bigArrays); + bytesToLong = new BytesRefHash(bigArrays); longToBucketOrds = LongKeyedBucketOrds.build(bigArrays, CardinalityUpperBound.MANY); } diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/SignificanceLookup.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/SignificanceLookup.java index aee4caa67afa1..34bbac55900a8 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/SignificanceLookup.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/SignificanceLookup.java @@ -123,7 +123,7 @@ public void close() {} }; } return new BackgroundFrequencyForBytes() { - private final BytesRefHash termToPosition = new BytesRefHash(1, bigArrays); + private final BytesRefHash termToPosition = new BytesRefHash(bigArrays); private LongArray positionToFreq = bigArrays.newLongArray(1, false); @Override diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java index c796faa6a8b76..cc35fe75e5e92 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java @@ -135,7 +135,7 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I Arrays.fill(mergeMap, -1); long offset = 0; for (int owningOrdIdx = 0; owningOrdIdx < owningBucketOrds.length; owningOrdIdx++) { - try (BytesRefHash bucketsInThisOwningBucketToCollect = new BytesRefHash(1, context.bigArrays())) { + try (BytesRefHash bucketsInThisOwningBucketToCollect = new BytesRefHash(context.bigArrays())) { filters[owningOrdIdx] = newFilter(); List builtBuckets = new ArrayList<>(); BytesKeyedBucketOrds.BucketOrdsEnum collectedBuckets = 
bucketOrds.ordsEnum(owningBucketOrds[owningOrdIdx]); diff --git a/server/src/test/java/org/opensearch/common/NumbersTests.java b/server/src/test/java/org/opensearch/common/NumbersTests.java index 5fb85d815ded2..ff12b3bc4cc96 100644 --- a/server/src/test/java/org/opensearch/common/NumbersTests.java +++ b/server/src/test/java/org/opensearch/common/NumbersTests.java @@ -221,4 +221,24 @@ public void testToUnsignedBigInteger() { assertEquals(random, Numbers.toUnsignedBigInteger(random.longValue())); assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE, Numbers.toUnsignedBigInteger(Numbers.MAX_UNSIGNED_LONG_VALUE.longValue())); } + + public void testNextPowerOfTwo() { + // Negative values: + for (int i = 0; i < 1000; i++) { + long value = randomLongBetween(-500, -1); + assertEquals(1, Numbers.nextPowerOfTwo(value)); + } + + // Zero value: + assertEquals(1, Numbers.nextPowerOfTwo(0L)); + + // Positive values: + for (int i = 0; i < 1000; i++) { + long value = randomLongBetween(1, 500); + long nextPowerOfTwo = Numbers.nextPowerOfTwo(value); + + assertTrue(nextPowerOfTwo > value); // must be strictly greater + assertEquals(0, nextPowerOfTwo & (nextPowerOfTwo - 1)); // must be a power of two + } + } } diff --git a/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java b/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java index a78a35e5a2412..d40012accbb7a 100644 --- a/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java +++ b/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java @@ -33,6 +33,7 @@ package org.opensearch.common.util; import org.apache.lucene.tests.util.TestUtil; +import net.openhft.hashing.LongHashFunction; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.opensearch.common.settings.Settings; @@ -44,6 +45,7 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.stream.Stream; public class BytesRefHashTests extends 
OpenSearchTestCase { @@ -57,9 +59,13 @@ private void newHash() { if (hash != null) { hash.close(); } - // Test high load factors to make sure that collision resolution works fine - final float maxLoadFactor = 0.6f + randomFloat() * 0.39f; - hash = new BytesRefHash(randomIntBetween(0, 100), maxLoadFactor, randomBigArrays()); + LongHashFunction hasher = LongHashFunction.xx3(randomLong()); + hash = new BytesRefHash( + randomIntBetween(1, 100), // random capacity + 0.6f + randomFloat() * 0.39f, // random load factor to verify collision resolution + key -> hasher.hashBytes(key.bytes, key.offset, key.length), + randomBigArrays() + ); } @Override @@ -68,39 +74,34 @@ public void setUp() throws Exception { newHash(); } - public void testDuel() { - final int len = randomIntBetween(1, 100000); - final BytesRef[] values = new BytesRef[len]; - for (int i = 0; i < values.length; ++i) { - values[i] = new BytesRef(randomAlphaOfLength(5)); - } - final Map valueToId = new HashMap<>(); - final BytesRef[] idToValue = new BytesRef[values.length]; - final int iters = randomInt(1000000); - for (int i = 0; i < iters; ++i) { - final BytesRef value = randomFrom(values); - if (valueToId.containsKey(value)) { - assertEquals(-1 - valueToId.get(value), hash.add(value, value.hashCode())); + public void testFuzzy() { + Map reference = new HashMap<>(); + BytesRef[] keys = Stream.generate(() -> new BytesRef(randomAlphaOfLength(20))) + .limit(randomIntBetween(1000, 2000)) + .toArray(BytesRef[]::new); + + // Verify the behaviour of "add" and "find". 
+ for (int i = 0; i < keys.length * 10; i++) { + BytesRef key = keys[i % keys.length]; + if (reference.containsKey(key)) { + long expectedOrdinal = reference.get(key); + assertEquals(-1 - expectedOrdinal, hash.add(key)); + assertEquals(expectedOrdinal, hash.find(key)); } else { - assertEquals(valueToId.size(), hash.add(value, value.hashCode())); - idToValue[valueToId.size()] = value; - valueToId.put(value, valueToId.size()); + assertEquals(-1, hash.find(key)); + reference.put(key, (long) reference.size()); + assertEquals((long) reference.get(key), hash.add(key)); } } - assertEquals(valueToId.size(), hash.size()); - for (final var next : valueToId.entrySet()) { - assertEquals(next.getValue().longValue(), hash.find(next.getKey(), next.getKey().hashCode())); + // Verify the behaviour of "get". + BytesRef scratch = new BytesRef(); + for (Map.Entry entry : reference.entrySet()) { + assertEquals(entry.getKey(), hash.get(entry.getValue(), scratch)); } - for (long i = 0; i < hash.capacity(); ++i) { - final long id = hash.id(i); - BytesRef spare = new BytesRef(); - if (id >= 0) { - hash.get(id, spare); - assertEquals(idToValue[(int) id], spare); - } - } + // Verify the behaviour of "size". + assertEquals(reference.size(), hash.size()); hash.close(); } diff --git a/server/src/test/java/org/opensearch/common/util/CompactBytesRefHashTests.java b/server/src/test/java/org/opensearch/common/util/CompactBytesRefHashTests.java deleted file mode 100644 index 297fe82a2a505..0000000000000 --- a/server/src/test/java/org/opensearch/common/util/CompactBytesRefHashTests.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. 
- */ - -package org.opensearch.common.util; - -import net.openhft.hashing.LongHashFunction; -import org.apache.lucene.util.BytesRef; -import org.opensearch.test.OpenSearchTestCase; - -import java.util.HashMap; -import java.util.Map; -import java.util.stream.Stream; - -public class CompactBytesRefHashTests extends OpenSearchTestCase { - - public void testFuzzy() { - LongHashFunction hasher = LongHashFunction.xx3(randomLong()); - Map reference = new HashMap<>(); - BytesRef[] keys = Stream.generate(() -> new BytesRef(randomAlphaOfLength(20))).limit(1000).toArray(BytesRef[]::new); - - try ( - CompactBytesRefHash h = new CompactBytesRefHash( - randomIntBetween(1, 100), // random capacity - 0.6f + randomFloat() * 0.39f, // random load factor to verify collision resolution - key -> hasher.hashBytes(key.bytes, key.offset, key.length), - BigArrays.NON_RECYCLING_INSTANCE - ) - ) { - // Verify the behaviour of "add" and "find". - for (int i = 0; i < keys.length * 10; i++) { - BytesRef key = keys[i % keys.length]; - if (reference.containsKey(key)) { - long expectedOrdinal = reference.get(key); - assertEquals(-1 - expectedOrdinal, h.add(key)); - assertEquals(expectedOrdinal, h.find(key)); - } else { - assertEquals(-1, h.find(key)); - reference.put(key, (long) reference.size()); - assertEquals((long) reference.get(key), h.add(key)); - } - } - - // Verify the behaviour of "get". - BytesRef scratch = new BytesRef(); - for (Map.Entry entry : reference.entrySet()) { - assertEquals(entry.getKey(), h.get(entry.getValue(), scratch)); - } - - // Verify the behaviour of "size". 
- assertEquals(reference.size(), h.size()); - } - } -} diff --git a/server/src/test/java/org/opensearch/common/util/ReorganizingBytesRefHashTests.java b/server/src/test/java/org/opensearch/common/util/ReorganizingBytesRefHashTests.java deleted file mode 100644 index e7ab6d1fa21a6..0000000000000 --- a/server/src/test/java/org/opensearch/common/util/ReorganizingBytesRefHashTests.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.common.util; - -import net.openhft.hashing.LongHashFunction; -import org.apache.lucene.util.BytesRef; -import org.opensearch.test.OpenSearchTestCase; - -import java.util.HashMap; -import java.util.Map; -import java.util.stream.Stream; - -public class ReorganizingBytesRefHashTests extends OpenSearchTestCase { - - public void testFuzzy() { - LongHashFunction hasher = LongHashFunction.xx3(randomLong()); - Map reference = new HashMap<>(); - BytesRef[] keys = Stream.generate(() -> new BytesRef(randomAlphaOfLength(20))).limit(1000).toArray(BytesRef[]::new); - - try ( - ReorganizingBytesRefHash h = new ReorganizingBytesRefHash( - randomIntBetween(1, 100), // random capacity - 0.6f + randomFloat() * 0.39f, // random load factor to verify collision resolution - key -> hasher.hashBytes(key.bytes, key.offset, key.length), - BigArrays.NON_RECYCLING_INSTANCE - ) - ) { - // Verify the behaviour of "add" and "find". 
- for (int i = 0; i < keys.length * 10; i++) { - BytesRef key = keys[i % keys.length]; - if (reference.containsKey(key)) { - long expectedOrdinal = reference.get(key); - assertEquals(-1 - expectedOrdinal, h.add(key)); - assertEquals(expectedOrdinal, h.find(key)); - } else { - assertEquals(-1, h.find(key)); - reference.put(key, (long) reference.size()); - assertEquals((long) reference.get(key), h.add(key)); - } - } - - // Verify the behaviour of "get". - BytesRef scratch = new BytesRef(); - for (Map.Entry entry : reference.entrySet()) { - assertEquals(entry.getKey(), h.get(entry.getValue(), scratch)); - } - - // Verify the behaviour of "size". - assertEquals(reference.size(), h.size()); - - // Verify the calculation of PSLs. - long capacity = h.getTable().size(); - long mask = capacity - 1; - for (long idx = 0; idx < h.getTable().size(); idx++) { - long value = h.getTable().get(idx); - if (value != -1) { - BytesRef key = h.get((int) value, scratch); - long homeIdx = hasher.hashBytes(key.bytes, key.offset, key.length) & mask; - assertEquals((capacity + idx - homeIdx) & mask, value >>> 48); - } - } - } - } -} From b4e27a4efe2592320ce41689f553fcaf97d2fefe Mon Sep 17 00:00:00 2001 From: Ketan Verma Date: Mon, 14 Aug 2023 21:00:36 +0530 Subject: [PATCH 3/9] Added t1ha1 to replace xxh3 hash function Signed-off-by: Ketan Verma --- .../common/hash/HashFunctionBenchmark.java | 172 +++++++++++++ .../common/util/BytesRefHashBenchmark.java | 81 ++++++- buildSrc/version.properties | 2 - .../java/org/opensearch/common/hash/T1ha.java | 226 +++++++++++++++++ .../opensearch/common/hash/package-info.java | 12 + .../common/hash/HashFunctionTestCase.java | 137 +++++++++++ .../org/opensearch/common/hash/T1haTests.java | 229 ++++++++++++++++++ server/build.gradle | 16 +- .../zero-allocation-hashing-0.16.jar.sha1 | 1 - .../zero-allocation-hashing-LICENSE.txt | 201 --------------- .../zero-allocation-hashing-NOTICE.txt | 0 .../opensearch/common/util/BytesRefHash.java | 37 ++- 
.../org/opensearch/bootstrap/security.policy | 10 - .../bootstrap/test-framework.policy | 1 - .../org/opensearch/common/NumbersTests.java | 1 + .../common/util/BytesRefHashTests.java | 6 +- 16 files changed, 876 insertions(+), 256 deletions(-) create mode 100644 benchmarks/src/main/java/org/opensearch/common/hash/HashFunctionBenchmark.java create mode 100644 libs/common/src/main/java/org/opensearch/common/hash/T1ha.java create mode 100644 libs/common/src/main/java/org/opensearch/common/hash/package-info.java create mode 100644 libs/common/src/test/java/org/opensearch/common/hash/HashFunctionTestCase.java create mode 100644 libs/common/src/test/java/org/opensearch/common/hash/T1haTests.java delete mode 100644 server/licenses/zero-allocation-hashing-0.16.jar.sha1 delete mode 100644 server/licenses/zero-allocation-hashing-LICENSE.txt delete mode 100644 server/licenses/zero-allocation-hashing-NOTICE.txt diff --git a/benchmarks/src/main/java/org/opensearch/common/hash/HashFunctionBenchmark.java b/benchmarks/src/main/java/org/opensearch/common/hash/HashFunctionBenchmark.java new file mode 100644 index 0000000000000..bae6a813fc5a1 --- /dev/null +++ b/benchmarks/src/main/java/org/opensearch/common/hash/HashFunctionBenchmark.java @@ -0,0 +1,172 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.common.hash; + +import org.apache.lucene.util.StringHelper; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +import java.util.Random; + +@Fork(value = 3) +@Warmup(iterations = 1, time = 1) +@Measurement(iterations = 3, time = 3) +@BenchmarkMode(Mode.Throughput) +public class HashFunctionBenchmark { + + @Benchmark + public void hash(Blackhole bh, Options opts) { + bh.consume(opts.type.hash(opts.data)); + } + + @State(Scope.Benchmark) + public static class Options { + @Param({ "MURMUR3", "T1HA" }) + public Type type; + + @Param({ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "12", + "14", + "16", + "18", + "21", + "24", + "28", + "32", + "36", + "41", + "47", + "54", + "62", + "71", + "81", + "90", + "100", + "112", + "125", + "139", + "156", + "174", + "194", + "220", + "245", + "272", + "302", + "339", + "384", + "431", + "488", + "547", + "608", + "675", + "763", + "863", + "967", + "1084", + "1225", + "1372", + "1537", + "1737", + "1929", + "2142", + "2378", + "2664", + "3011", + "3343", + "3778", + "4232", + "4783", + "5310", + "5895", + "6662", + "7529", + "8508", + "9444", + "10483", + "11741", + "13150", + "14597", + "16495", + "18475", + "20877", + "23383", + "25956", + "29071", + "32560", + "36142", + "40841", + "46151", + "52151", + "57888", + "65414", + "72610", + "82050", + "91076", + "102006", + "114247", + "127957", + "143312", + "159077", + "176576", + "199531", + "223475", + "250292", + "277825", + "313943", + "351617", + "393812" }) + public Integer length; + 
public byte[] data; + + @Setup + public void setup() { + data = new byte[length]; + new Random(0).nextBytes(data); + } + } + + public enum Type { + MURMUR3((data, offset, length) -> StringHelper.murmurhash3_x86_32(data, offset, length, 0)), + T1HA((data, offset, length) -> T1ha.hash(data, offset, length, 0)); + + private final Hasher hasher; + + Type(Hasher hasher) { + this.hasher = hasher; + } + + public long hash(byte[] data) { + return hasher.hash(data, 0, data.length); + } + } + + @FunctionalInterface + interface Hasher { + long hash(byte[] data, int offset, int length); + } +} diff --git a/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java b/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java index 2e2a2399e9c0d..379653a53bfd2 100644 --- a/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java +++ b/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java @@ -9,6 +9,7 @@ package org.opensearch.common.util; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.StringHelper; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; @@ -19,15 +20,17 @@ import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; +import org.opensearch.common.hash.T1ha; +import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; import java.util.HashSet; import java.util.Random; import java.util.Set; import java.util.concurrent.TimeUnit; +import java.util.function.Supplier; import java.util.stream.Stream; @Fork(value = 3) @@ -41,16 +44,23 @@ public class BytesRefHashBenchmark { @Benchmark public void add(Blackhole bh, Options opts) { + HashTable[] tables = 
Stream.generate(opts.type::create).limit(NUM_TABLES).toArray(HashTable[]::new); + for (int hit = 0; hit < NUM_HITS; hit++) { BytesRef key = opts.keys[hit % opts.keys.length]; - for (BytesRefHash table : opts.tables) { + for (HashTable table : tables) { bh.consume(table.add(key)); } } + + Releasables.close(tables); } @State(Scope.Benchmark) public static class Options { + @Param({ "MURMUR3", "T1HA" }) + public Type type; + @Param({ "1", "2", @@ -152,23 +162,19 @@ public static class Options { "753883", "851888", "971153" }) - public Integer size; - @Param({ "8", "32", "128" }) + @Param({ "5", "28", "59", "105" }) public Integer length; - private BytesRefHash[] tables; - private BytesRef[] keys; @Setup public void setup() { assert size <= Math.pow(26, length) : "key length too small to generate the required number of keys"; - tables = Stream.generate(() -> new BytesRefHash(BigArrays.NON_RECYCLING_INSTANCE)) - .limit(NUM_TABLES) - .toArray(BytesRefHash[]::new); - Random random = new Random(0); + // Seeding with size will help produce deterministic results for the same size, and avoid similar + // looking clusters for different sizes, in case one hash function got unlucky. + Random random = new Random(size); Set seen = new HashSet<>(); keys = new BytesRef[size]; for (int i = 0; i < size; i++) { @@ -185,10 +191,59 @@ public void setup() { seen.add(key); } } + } + + public enum Type { + MURMUR3(() -> new HashTable() { + private final BytesRefHash table = new BytesRefHash(1, 0.6f, key -> { + // Repeating the lower bits into upper bits to make the fingerprint work. + // Alternatively, use a 64-bit murmur3 hash, but that won't represent the baseline. 
+ long h = StringHelper.murmurhash3_x86_32(key.bytes, key.offset, key.length, 0) & 0xFFFFFFFFL; + return h | (h << 32); + }, BigArrays.NON_RECYCLING_INSTANCE); + + @Override + public long add(BytesRef key) { + return table.add(key); + } + + @Override + public void close() { + table.close(); + } + }), - @TearDown - public void tearDown() { - Releasables.close(tables); + T1HA(() -> new HashTable() { + private final BytesRefHash table = new BytesRefHash( + 1, + 0.6f, + key -> T1ha.hash(key.bytes, key.offset, key.length, 0), + BigArrays.NON_RECYCLING_INSTANCE + ); + + @Override + public long add(BytesRef key) { + return table.add(key); + } + + @Override + public void close() { + table.close(); + } + }); + + private final Supplier supplier; + + Type(Supplier supplier) { + this.supplier = supplier; + } + + public HashTable create() { + return supplier.get(); } } + + interface HashTable extends Releasable { + long add(BytesRef key); + } } diff --git a/buildSrc/version.properties b/buildSrc/version.properties index 2bb21dfca4b14..ff962309cf084 100644 --- a/buildSrc/version.properties +++ b/buildSrc/version.properties @@ -69,5 +69,3 @@ resteasy = 6.2.4.Final # opentelemetry dependencies opentelemetry = 1.26.0 -# hashing dependencies -zero_allocation_hashing = 0.16 diff --git a/libs/common/src/main/java/org/opensearch/common/hash/T1ha.java b/libs/common/src/main/java/org/opensearch/common/hash/T1ha.java new file mode 100644 index 0000000000000..185875f4a2d68 --- /dev/null +++ b/libs/common/src/main/java/org/opensearch/common/hash/T1ha.java @@ -0,0 +1,226 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.common.hash; + +import java.lang.invoke.MethodHandles; +import java.lang.invoke.VarHandle; +import java.nio.ByteOrder; + +import static java.lang.Long.rotateRight; + +/** + * t1ha: Fast Positive Hash + * + *

+ * Implements t1ha1; + * a fast portable hash function with reasonable quality for checksums, hash tables, and thin fingerprinting. + * + *

+ * To overcome language and performance limitations, this implementation differs slightly from the reference + * implementation in C, so the returned hash values do not match those of the reference implementation. + * + *

+ * Intended for little-endian systems but returns the same result on big-endian, albeit marginally slower. + */ +public class T1ha { + private static final long SEED = System.nanoTime(); + + private static final VarHandle LONG_HANDLE = MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder.LITTLE_ENDIAN); + private static final VarHandle INT_HANDLE = MethodHandles.byteArrayViewVarHandle(int[].class, ByteOrder.LITTLE_ENDIAN); + private static final VarHandle SHORT_HANDLE = MethodHandles.byteArrayViewVarHandle(short[].class, ByteOrder.LITTLE_ENDIAN); + + // "Magic" primes: + private static final long p0 = 0xEC99BF0D8372CAABL; + private static final long p1 = 0x82434FE90EDCEF39L; + private static final long p2 = 0xD4F06DB99D67BE4BL; + private static final long p3 = 0xBD9CACC22C6E9571L; + private static final long p4 = 0x9C06FAF4D023E3ABL; + private static final long p5 = 0xC060724A8424F345L; + private static final long p6 = 0xCB5AF53AE3AAAC31L; + + // Rotations: + private static final int s0 = 41; + private static final int s1 = 17; + private static final int s2 = 31; + + /** + * Returns the hash code for the specified range of the given {@code byte} array. + * @param input the input byte array + * @param offset the starting offset + * @param length the length of the range + * @return hash code + */ + public static long hash(byte[] input, int offset, int length) { + return hash(input, offset, length, SEED); + } + + /** + * Returns the hash code for the specified range of the given {@code byte} array. 
+ * @param input the input byte array + * @param offset the starting offset + * @param length the length of the range + * @param seed customized seed + * @return hash code + */ + public static long hash(byte[] input, int offset, int length, long seed) { + long a = seed; + long b = length; + + if (length > 32) { + long c = rotateRight(length, s1) + seed; + long d = length ^ rotateRight(seed, s1); + + do { + long w0 = fetch64(input, offset); + long w1 = fetch64(input, offset + 8); + long w2 = fetch64(input, offset + 16); + long w3 = fetch64(input, offset + 24); + + long d02 = w0 ^ rotateRight(w2 + d, s1); + long c13 = w1 ^ rotateRight(w3 + c, s1); + c += a ^ rotateRight(w0, s0); + d -= b ^ rotateRight(w1, s2); + a ^= p1 * (d02 + w3); + b ^= p0 * (c13 + w2); + + offset += 32; + length -= 32; + } while (length >= 32); + + a ^= p6 * (rotateRight(c, s1) + d); + b ^= p5 * (rotateRight(d, s1) + c); + } + + return h32(input, offset, length, a, b); + } + + /** + * Computes the hash of up to 32 bytes. + * Constants in the switch expression are dense; JVM will use them as indices into a table of + * instruction pointers (tableswitch instruction), making lookups really fast. 
+ */ + @SuppressWarnings("fallthrough") + private static long h32(byte[] input, int offset, int length, long a, long b) { + switch (length) { + default: + b += mux64(fetch64(input, offset), p4); + offset += 8; + length -= 8; + case 24: + case 23: + case 22: + case 21: + case 20: + case 19: + case 18: + case 17: + a += mux64(fetch64(input, offset), p3); + offset += 8; + length -= 8; + case 16: + case 15: + case 14: + case 13: + case 12: + case 11: + case 10: + case 9: + b += mux64(fetch64(input, offset), p2); + offset += 8; + length -= 8; + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + a += mux64(tail64(input, offset, length), p1); + case 0: + // Final weak avalanche + return mux64(rotateRight(a + b, s1), p4) + mix64(a ^ b, p0); + } + } + + /** + * XOR the high and low parts of the full 128-bit product. + */ + private static long mux64(long a, long b) { + // Ideally, the following should be used to match the reference implementation: + // return Math.unsignedMultiplyHigh(a, b) ^ (a * b); + // Since unsignedMultiplyHigh isn't available before JDK 18, and calculating it without intrinsics is quite slow, + // the multiplyHigh method is used instead. Slight loss in quality is imperceptible for our use-case: a hash table. + return Math.multiplyHigh(a, b) ^ (a * b); + } + + /** + * XOR-MUL-XOR bit-mixer. + */ + private static long mix64(long a, long b) { + a *= b; + return a ^ rotateRight(a, s0); + } + + /** + * Reads "length" bytes starting at "offset" in little-endian order; returned as long. + * It is assumed that the length is between 1 and 8 (inclusive); but no defensive checks are made as such. 
+ */ + private static long tail64(byte[] input, int offset, int length) { + switch (length) { + case 1: + return fetch8(input, offset); + case 2: + return fetch16(input, offset); + case 3: + return fetch16(input, offset) | (fetch8(input, offset + 2) << 16); + case 4: + return fetch32(input, offset); + case 5: + return fetch32(input, offset) | (fetch8(input, offset + 4) << 32); + case 6: + return fetch32(input, offset) | (fetch16(input, offset + 4) << 32); + case 7: + // This is equivalent to: + // return fetch32(input, offset) | (fetch16(input, offset + 4) << 32) | (fetch8(input, offset + 6) << 48); + // But reading two ints overlapping by one byte is faster due to lesser instructions. + return fetch32(input, offset) | (fetch32(input, offset + 3) << 24); + default: + return fetch64(input, offset); + } + } + + /** + * Reads a 64-bit long. + */ + private static long fetch64(byte[] input, int offset) { + return (long) LONG_HANDLE.get(input, offset); + } + + /** + * Reads a 32-bit unsigned integer, returned as long. + */ + private static long fetch32(byte[] input, int offset) { + return (int) INT_HANDLE.get(input, offset) & 0xFFFFFFFFL; + } + + /** + * Reads a 16-bit unsigned short, returned as long. + */ + private static long fetch16(byte[] input, int offset) { + return (short) SHORT_HANDLE.get(input, offset) & 0xFFFFL; + } + + /** + * Reads an 8-bit unsigned byte, returned as long. 
+ */ + private static long fetch8(byte[] input, int offset) { + return input[offset] & 0xFFL; + } +} diff --git a/libs/common/src/main/java/org/opensearch/common/hash/package-info.java b/libs/common/src/main/java/org/opensearch/common/hash/package-info.java new file mode 100644 index 0000000000000..bd393b8b921ed --- /dev/null +++ b/libs/common/src/main/java/org/opensearch/common/hash/package-info.java @@ -0,0 +1,12 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/** + * Common hashing utilities. + */ +package org.opensearch.common.hash; diff --git a/libs/common/src/test/java/org/opensearch/common/hash/HashFunctionTestCase.java b/libs/common/src/test/java/org/opensearch/common/hash/HashFunctionTestCase.java new file mode 100644 index 0000000000000..6f7f813bffa24 --- /dev/null +++ b/libs/common/src/test/java/org/opensearch/common/hash/HashFunctionTestCase.java @@ -0,0 +1,137 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.common.hash; + +import org.opensearch.common.Randomness; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.Locale; +import java.util.Random; + +public abstract class HashFunctionTestCase extends OpenSearchTestCase { + private static final int[] INPUT_BITS = new int[] { 24, 32, 40, 48, 56, 64, 72, 80, 96, 112, 128, 160, 512, 1024 }; + private static final int OUTPUT_BITS = 64; + private static final int ITERATIONS = 1000; + private static final double BIAS_THRESHOLD = 0.01; // 1% + + public abstract long hash(byte[] input); + + /** + * Tests if the hash function shows an avalanche effect, i.e, flipping a single input bit + * should flip half the output bits. 
+ */ + public final void testAvalanche() { + for (int inputBits : INPUT_BITS) { + AvalancheStats stats = simulate(inputBits, OUTPUT_BITS, new RandomInputGenerator(inputBits)); + if (stats.bias() >= BIAS_THRESHOLD) { + fail("bias exceeds threshold: " + stats); + } + } + } + + private AvalancheStats simulate(int inputBits, int outputBits, InputGenerator inputGenerator) { + int[][] flips = new int[inputBits][outputBits]; + + for (int iter = 0; iter < ITERATIONS; iter++) { + byte[] input = inputGenerator.next(); + long hash = hash(input); + + for (int i = 0; i < inputBits; i++) { + flip(input, i); // flip one bit + long newHash = hash(input); // recompute the hash; half the bits should have flipped + flip(input, i); // return to original + + long diff = hash ^ newHash; + for (int o = 0; o < OUTPUT_BITS; o++) { + if ((diff & 1) == 1) { + flips[i][o] += 1; + } + diff >>>= 1; + } + } + } + + return new AvalancheStats(flips); + } + + private static void flip(byte[] input, int position) { + int offset = position / 8; + int bit = position & 7; + input[offset] ^= (1 << bit); + } + + @FunctionalInterface + interface InputGenerator { + byte[] next(); + } + + private static class RandomInputGenerator implements InputGenerator { + private final Random random = Randomness.get(); + private final byte[] input; + + public RandomInputGenerator(int size) { + input = new byte[size]; + } + + @Override + public byte[] next() { + random.nextBytes(input); + return input; + } + } + + private static class AvalancheStats { + private final int inputBits; + private final int outputBits; + private final double bias; + private final double sumOfSquaredErrors; + + public AvalancheStats(int[][] flips) { + this.inputBits = flips.length; + this.outputBits = flips[0].length; + double sumOfBiases = 0; + double sumOfSquaredErrors = 0; + + for (int i = 0; i < inputBits; i++) { + for (int o = 0; o < outputBits; o++) { + sumOfSquaredErrors += Math.pow(0.5 - ((double) flips[i][o] / ITERATIONS), 2); + 
sumOfBiases += 2 * ((double) flips[i][o] / ITERATIONS) - 1; + } + } + + this.bias = Math.abs(sumOfBiases / (inputBits * outputBits)); + this.sumOfSquaredErrors = sumOfSquaredErrors; + } + + public double bias() { + return bias; + } + + public double diffusion() { + return 1 - bias; + } + + public double sumOfSquaredErrors() { + return sumOfSquaredErrors; + } + + @Override + public String toString() { + return String.format( + Locale.ROOT, + "AvalancheStats{inputBits=%d, outputBits=%d, bias=%.4f%%, diffusion=%.4f%%, sumOfSquaredErrors=%.2f}", + inputBits, + outputBits, + bias() * 100, + diffusion() * 100, + sumOfSquaredErrors() + ); + } + } +} diff --git a/libs/common/src/test/java/org/opensearch/common/hash/T1haTests.java b/libs/common/src/test/java/org/opensearch/common/hash/T1haTests.java new file mode 100644 index 0000000000000..d6132e235d89b --- /dev/null +++ b/libs/common/src/test/java/org/opensearch/common/hash/T1haTests.java @@ -0,0 +1,229 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.common.hash; + +public class T1haTests extends HashFunctionTestCase { + + /** + * Inspired from the tests defined in the reference implementation: + * t1ha_selfcheck.c + */ + public void testSelfCheck() { + byte[] testPattern = { + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + (byte) 0xFF, + 0x7F, + 0x3F, + 0x1F, + 0xF, + 8, + 16, + 32, + 64, + (byte) 0x80, + (byte) 0xFE, + (byte) 0xFC, + (byte) 0xF8, + (byte) 0xF0, + (byte) 0xE0, + (byte) 0xC0, + (byte) 0xFD, + (byte) 0xFB, + (byte) 0xF7, + (byte) 0xEF, + (byte) 0xDF, + (byte) 0xBF, + 0x55, + (byte) 0xAA, + 11, + 17, + 19, + 23, + 29, + 37, + 42, + 43, + 'a', + 'b', + 'c', + 'd', + 'e', + 'f', + 'g', + 'h', + 'i', + 'j', + 'k', + 'l', + 'm', + 'n', + 'o', + 'p', + 'q', + 'r', + 's', + 't', + 'u', + 'v', + 'w', + 'x' }; + + /* Reference hashes when using {@link Math::unsignedMultiplyHigh} in the mux64 step. + These values match the ones defined in the reference implementation: + https://github.com/erthink/t1ha/blob/master/src/t1ha1_selfcheck.c#L51-L72 + + We don't expect our implementation to return these values as we use {@link Math::multiplyHigh} instead. + Keeping it here for context. 
+ + long[] reference = { + 0L, + 0x6A580668D6048674L, 0xA2FE904AFF0D0879L, 0xE3AB9C06FAF4D023L, 0x6AF1C60874C95442L, + 0xB3557E561A6C5D82L, 0x0AE73C696F3D37C0L, 0x5EF25F7062324941L, 0x9B784F3B4CE6AF33L, + 0x6993BB206A74F070L, 0xF1E95DF109076C4CL, 0x4E1EB70C58E48540L, 0x5FDD7649D8EC44E4L, + 0x559122C706343421L, 0x380133D58665E93DL, 0x9CE74296C8C55AE4L, 0x3556F9A5757AB6D0L, + 0xF62751F7F25C469EL, 0x851EEC67F6516D94L, 0xED463EE3848A8695L, 0xDC8791FEFF8ED3ACL, + 0x2569C744E1A282CFL, 0xF90EB7C1D70A80B9L, 0x68DFA6A1B8050A4CL, 0x94CCA5E8210D2134L, + 0xF5CC0BEABC259F52L, 0x40DBC1F51618FDA7L, 0x0807945BF0FB52C6L, 0xE5EF7E09DE70848DL, + 0x63E1DF35FEBE994AL, 0x2025E73769720D5AL, 0xAD6120B2B8A152E1L, 0x2A71D9F13959F2B7L, + 0x8A20849A27C32548L, 0x0BCBC9FE3B57884EL, 0x0E028D255667AEADL, 0xBE66DAD3043AB694L, + 0xB00E4C1238F9E2D4L, 0x5C54BDE5AE280E82L, 0x0E22B86754BC3BC4L, 0x016707EBF858B84DL, + 0x990015FBC9E095EEL, 0x8B9AF0A3E71F042FL, 0x6AA56E88BD380564L, 0xAACE57113E681A0FL, + 0x19F81514AFA9A22DL, 0x80DABA3D62BEAC79L, 0x715210412CABBF46L, 0xD8FA0B9E9D6AA93FL, + 0x6C2FC5A4109FD3A2L, 0x5B3E60EEB51DDCD8L, 0x0A7C717017756FE7L, 0xA73773805CA31934L, + 0x4DBD6BB7A31E85FDL, 0x24F619D3D5BC2DB4L, 0x3E4AF35A1678D636L, 0x84A1A8DF8D609239L, + 0x359C862CD3BE4FCDL, 0xCF3A39F5C27DC125L, 0xC0FF62F8FD5F4C77L, 0x5E9F2493DDAA166CL, + 0x17424152BE1CA266L, 0xA78AFA5AB4BBE0CDL, 0x7BFB2E2CEF118346L, 0x647C3E0FF3E3D241L, + 0x0352E4055C13242EL, 0x6F42FC70EB660E38L, 0x0BEBAD4FABF523BAL, 0x9269F4214414D61DL, + 0x1CA8760277E6006CL, 0x7BAD25A859D87B5DL, 0xAD645ADCF7414F1DL, 0xB07F517E88D7AFB3L, + 0xB321C06FB5FFAB5CL, 0xD50F162A1EFDD844L, 0x1DFD3D1924FBE319L, 0xDFAEAB2F09EF7E78L, + 0xA7603B5AF07A0B1EL, 0x41CD044C0E5A4EE3L, 0xF64D2F86E813BF33L, 0xFF9FDB99305EB06AL + }; */ + + // Reference hashes when using {@link Math::multiplyHigh} in the mux64 step. 
+ long[] reference = { + 0L, + 0xCE510B7405E0A2CAL, + 0xC0A2DA74A8271FCBL, + 0x1C549C06FAF4D023L, + 0x084CDA0ED41CD2D4L, + 0xD05BA7AA9FEECE5BL, + 0x7D6128AB2CCC4EB1L, + 0x62332FA6EC1B50AAL, + 0x1B66C81767870EF2L, + 0xEC6B92A37AED73B8L, + 0x1712987232EF4ED3L, + 0xAA503A04AE2450B5L, + 0x15D25DE445730A6CL, + 0xAB87E38AA8D21746L, + 0x18CAE735BBF62D15L, + 0x0D56DFF9914CA656L, + 0xCB4F5859A9AE5B52L, + 0xEE97003F7B1283E1L, + 0x50CFB2AF0F54BA6DL, + 0x570B4D6AE4C67814L, + 0x1ED59274A97497EBL, + 0x8608D03D165C59BFL, + 0x6CBE0E537BE04C02L, + 0xD4C8FCFD4179A874L, + 0xFB4E677D876118A1L, + 0x6B1A96F1B4765D79L, + 0x1075B9B89BDFE5F8L, + 0x02771D08F2891CB1L, + 0x4BB8E16FF410F19EL, + 0x3EB7849C0DFAF566L, + 0x173B09359DE422CFL, + 0xFE212C6DB7474306L, + 0xA74E7C2D632664EFL, + 0x56ECDED6546F0914L, + 0x08DEF866EF20A94BL, + 0x7D0BAC64606521F1L, + 0xCA6BA9817A357FA9L, + 0x0873B834A6E2AAE4L, + 0x45EE02D6DCF8992EL, + 0x3EA060225B3E1C1FL, + 0x24DBB6D02D5CC531L, + 0xE5E91A7340BF9382L, + 0x28975F86E2E2177FL, + 0x80E48374A6B42E85L, + 0xDF40392265BB4A66L, + 0x43750475A48C7023L, + 0x5648BD3E391C01D3L, + 0x9BE9E11AD1A6C369L, + 0x2E079CB8C1A11F50L, + 0xB2D538403F1020F1L, + 0x297518A4EF6AF5F1L, + 0xA8CE1B90167A6F8BL, + 0xB926B2FA50541BA9L, + 0xC46A2D3BD6925A35L, + 0x3071BC8E6C400487L, + 0x300D3885894BA47FL, + 0x840BFF3BEB7EEADDL, + 0xDC9E04DF744BDC0CL, + 0xBE01CF6841412C77L, + 0x6C55B2DC74B816A1L, + 0x4D4C63128A344F82L, + 0xC6227497E100B463L, + 0x53C9987705EA71C0L, + 0x3E355394668C3559L, + 0x05984B7D358B107AL, + 0x4D32FA1D79002A57L, + 0x910B0DAD1440EC24L, + 0x025BDE6A7BEBF320L, + 0x0D33817EF345D999L, + 0xBA0DE64B3F4DB34AL, + 0x54666461D0EB4FD7L, + 0x746ECFA92D1CAF81L, + 0x6E6A774ACD266DF2L, + 0x1A86161AE8E82A85L, + 0xFFF7C351A4CEC13DL, + 0xFFF05844F57498B8L, + 0x8DB71789127C6C13L, + 0x4A52ACF805F370ABL, + 0xFE13F90A1ACFBD58L, + 0x615730E301ED12E2L, + 0x1A2D4AA43B6C0103L }; + + int offset = 0; + assertEquals(reference[offset++], T1ha.hash(null, 0, 0, 0L)); // empty-zero + 
assertEquals(reference[offset++], T1ha.hash(null, 0, 0, ~0L)); // empty-all1 + assertEquals(reference[offset++], T1ha.hash(testPattern, 0, 64, 0L)); // bin64-zero + + long seed = 1; + for (int i = 1; i < 64; i++) { + assertEquals(reference[offset++], T1ha.hash(testPattern, 0, i, seed)); // bin%i-1p%i + seed <<= 1; + } + + seed = ~0L; + for (int i = 1; i <= 7; i++) { + seed <<= 1; + assertEquals(reference[offset++], T1ha.hash(testPattern, i, 64 - i, seed)); // align%i_F%i + } + + byte[] testPatternLong = new byte[512]; + for (int i = 0; i < testPatternLong.length; i++) { + testPatternLong[i] = (byte) i; + } + for (int i = 0; i <= 7; i++) { + assertEquals(reference[offset++], T1ha.hash(testPatternLong, i, 128 + i * 17, seed)); // long-%05i + } + } + + @Override + public long hash(byte[] input) { + return T1ha.hash(input, 0, input.length); + } +} diff --git a/server/build.gradle b/server/build.gradle index 3b1fe9554a309..f6db3d53a0dcc 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -158,9 +158,6 @@ dependencies { api "com.google.protobuf:protobuf-java:${versions.protobuf}" api "jakarta.annotation:jakarta.annotation-api:${versions.jakarta_annotation}" - // hashing - api "net.openhft:zero-allocation-hashing:${versions.zero_allocation_hashing}" - testImplementation(project(":test:framework")) { // tests use the locally compiled version of server exclude group: 'org.opensearch', module: 'server' @@ -367,18 +364,7 @@ tasks.named("thirdPartyAudit").configure { 'com.google.protobuf.UnsafeUtil$Android32MemoryAccessor', 'com.google.protobuf.UnsafeUtil$Android64MemoryAccessor', 'com.google.protobuf.UnsafeUtil$JvmMemoryAccessor', - 'com.google.protobuf.UnsafeUtil$MemoryAccessor', - - // from zero-allocation-hashing - 'net.openhft.hashing.HotSpotPrior7u6StringHash', - 'net.openhft.hashing.LongHashFunction', - 'net.openhft.hashing.LongTupleHashFunction', - 'net.openhft.hashing.ModernCompactStringHash', - 'net.openhft.hashing.ModernHotSpotStringHash', - 
'net.openhft.hashing.UnsafeAccess', - 'net.openhft.hashing.UnsafeAccess$OldUnsafeAccessBigEndian', - 'net.openhft.hashing.UnsafeAccess$OldUnsafeAccessLittleEndian', - 'net.openhft.hashing.Util' + 'com.google.protobuf.UnsafeUtil$MemoryAccessor' ) } diff --git a/server/licenses/zero-allocation-hashing-0.16.jar.sha1 b/server/licenses/zero-allocation-hashing-0.16.jar.sha1 deleted file mode 100644 index e82e885f269ce..0000000000000 --- a/server/licenses/zero-allocation-hashing-0.16.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -0ca252f328160ed5d027f100a4fe525d6d21daaf \ No newline at end of file diff --git a/server/licenses/zero-allocation-hashing-LICENSE.txt b/server/licenses/zero-allocation-hashing-LICENSE.txt deleted file mode 100644 index 261eeb9e9f8b2..0000000000000 --- a/server/licenses/zero-allocation-hashing-LICENSE.txt +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. 
- - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. 
You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. 
Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/server/licenses/zero-allocation-hashing-NOTICE.txt b/server/licenses/zero-allocation-hashing-NOTICE.txt deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/server/src/main/java/org/opensearch/common/util/BytesRefHash.java b/server/src/main/java/org/opensearch/common/util/BytesRefHash.java index c2f394c627806..5107feb6b3a26 100644 --- a/server/src/main/java/org/opensearch/common/util/BytesRefHash.java +++ b/server/src/main/java/org/opensearch/common/util/BytesRefHash.java @@ -6,18 +6,39 @@ * compatible open source license. */ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ + package org.opensearch.common.util; -import net.openhft.hashing.LongHashFunction; import org.apache.lucene.util.BytesRef; import org.opensearch.common.Numbers; +import org.opensearch.common.hash.T1ha; import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; import org.opensearch.core.common.util.ByteArray; -import java.security.AccessController; -import java.security.PrivilegedAction; - /** * Specialized hash table implementation that maps a {@link BytesRef} key to a long ordinal. * @@ -31,15 +52,11 @@ * * @opensearch.internal */ -public class BytesRefHash implements Releasable { - private static final LongHashFunction XX3 = AccessController.doPrivileged( - (PrivilegedAction) () -> LongHashFunction.xx3(System.nanoTime()) - ); - +public final class BytesRefHash implements Releasable { private static final long MAX_CAPACITY = 1L << 32; private static final long DEFAULT_INITIAL_CAPACITY = 32; private static final float DEFAULT_LOAD_FACTOR = 0.6f; - private static final Hasher DEFAULT_HASHER = key -> XX3.hashBytes(key.bytes, key.offset, key.length); + private static final Hasher DEFAULT_HASHER = key -> T1ha.hash(key.bytes, key.offset, key.length); private static final long MASK_ORDINAL = 0x00000000FFFFFFFFL; // extract ordinal private static final long MASK_FINGERPRINT = 0xFFFFFFFF00000000L; // extract fingerprint diff --git a/server/src/main/resources/org/opensearch/bootstrap/security.policy b/server/src/main/resources/org/opensearch/bootstrap/security.policy index 2fde31cb1d648..77cd0ab05278e 100644 --- a/server/src/main/resources/org/opensearch/bootstrap/security.policy +++ b/server/src/main/resources/org/opensearch/bootstrap/security.policy @@ -48,10 +48,6 @@ grant codeBase "${codebase.opensearch}" { permission java.lang.RuntimePermission "setContextClassLoader"; // needed for SPI class loading permission 
java.lang.RuntimePermission "accessDeclaredMembers"; - - // needed for zero-allocation-hashing - permission java.lang.RuntimePermission "accessClassInPackage.sun.misc"; - permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; }; //// Very special jar permissions: @@ -89,12 +85,6 @@ grant codeBase "${codebase.zstd-jni}" { permission java.lang.RuntimePermission "loadLibrary.*"; }; -grant codeBase "${codebase.zero-allocation-hashing}" { - permission java.lang.RuntimePermission "accessClassInPackage.sun.misc"; - permission java.lang.RuntimePermission "accessDeclaredMembers"; - permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; -}; - //// Everything else: grant { diff --git a/server/src/main/resources/org/opensearch/bootstrap/test-framework.policy b/server/src/main/resources/org/opensearch/bootstrap/test-framework.policy index 7d35d439bd373..0abfd7ef22ae7 100644 --- a/server/src/main/resources/org/opensearch/bootstrap/test-framework.policy +++ b/server/src/main/resources/org/opensearch/bootstrap/test-framework.policy @@ -156,6 +156,5 @@ grant { permission java.lang.RuntimePermission "accessDeclaredMembers"; permission java.lang.RuntimePermission "reflectionFactoryAccess"; permission java.lang.RuntimePermission "accessClassInPackage.sun.reflect"; - permission java.lang.RuntimePermission "accessClassInPackage.sun.misc"; permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; }; diff --git a/server/src/test/java/org/opensearch/common/NumbersTests.java b/server/src/test/java/org/opensearch/common/NumbersTests.java index ff12b3bc4cc96..d6745f53f3569 100644 --- a/server/src/test/java/org/opensearch/common/NumbersTests.java +++ b/server/src/test/java/org/opensearch/common/NumbersTests.java @@ -238,6 +238,7 @@ public void testNextPowerOfTwo() { long nextPowerOfTwo = Numbers.nextPowerOfTwo(value); assertTrue(nextPowerOfTwo > value); // must be strictly greater + assertTrue((nextPowerOfTwo >>> 1) <= value); // must be greater by 
no more than one power of two assertEquals(0, nextPowerOfTwo & (nextPowerOfTwo - 1)); // must be a power of two } } diff --git a/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java b/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java index d40012accbb7a..b182ba11198be 100644 --- a/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java +++ b/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java @@ -33,9 +33,9 @@ package org.opensearch.common.util; import org.apache.lucene.tests.util.TestUtil; -import net.openhft.hashing.LongHashFunction; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; +import org.opensearch.common.hash.T1ha; import org.opensearch.common.settings.Settings; import org.opensearch.core.indices.breaker.NoneCircuitBreakerService; import org.opensearch.test.OpenSearchTestCase; @@ -59,11 +59,11 @@ private void newHash() { if (hash != null) { hash.close(); } - LongHashFunction hasher = LongHashFunction.xx3(randomLong()); + long seed = randomLong(); hash = new BytesRefHash( randomIntBetween(1, 100), // random capacity 0.6f + randomFloat() * 0.39f, // random load factor to verify collision resolution - key -> hasher.hashBytes(key.bytes, key.offset, key.length), + key -> T1ha.hash(key.bytes, key.offset, key.length, seed), randomBigArrays() ); } From 1c76d9f420c665df08d024cad1bc5706ac4f9606 Mon Sep 17 00:00:00 2001 From: Ketan Verma Date: Wed, 16 Aug 2023 11:59:20 +0530 Subject: [PATCH 4/9] Update t1ha1 to use unsignedMultiplyHigh on JDK 18 and above Signed-off-by: Ketan Verma --- .../common/hash/HashFunctionBenchmark.java | 4 +- .../common/util/BytesRefHashBenchmark.java | 8 +- .../common/hash/{T1ha.java => T1ha1.java} | 54 ++++++- .../hash/{T1haTests.java => T1Ha1Tests.java} | 150 +++++++++++++----- .../opensearch/common/util/BytesRefHash.java | 4 +- .../org/opensearch/common/NumbersTests.java | 4 +- .../common/util/BytesRefHashTests.java | 4 +- 7 
files changed, 170 insertions(+), 58 deletions(-) rename libs/common/src/main/java/org/opensearch/common/hash/{T1ha.java => T1ha1.java} (80%) rename libs/common/src/test/java/org/opensearch/common/hash/{T1haTests.java => T1Ha1Tests.java} (53%) diff --git a/benchmarks/src/main/java/org/opensearch/common/hash/HashFunctionBenchmark.java b/benchmarks/src/main/java/org/opensearch/common/hash/HashFunctionBenchmark.java index bae6a813fc5a1..8842337a468a1 100644 --- a/benchmarks/src/main/java/org/opensearch/common/hash/HashFunctionBenchmark.java +++ b/benchmarks/src/main/java/org/opensearch/common/hash/HashFunctionBenchmark.java @@ -36,7 +36,7 @@ public void hash(Blackhole bh, Options opts) { @State(Scope.Benchmark) public static class Options { - @Param({ "MURMUR3", "T1HA" }) + @Param({ "MURMUR3", "T1HA1" }) public Type type; @Param({ @@ -152,7 +152,7 @@ public void setup() { public enum Type { MURMUR3((data, offset, length) -> StringHelper.murmurhash3_x86_32(data, offset, length, 0)), - T1HA((data, offset, length) -> T1ha.hash(data, offset, length, 0)); + T1HA1((data, offset, length) -> T1ha1.hash(data, offset, length, 0)); private final Hasher hasher; diff --git a/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java b/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java index 379653a53bfd2..3071d1f4c7d7e 100644 --- a/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java +++ b/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java @@ -22,7 +22,7 @@ import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import org.opensearch.common.hash.T1ha; +import org.opensearch.common.hash.T1ha1; import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; @@ -58,7 +58,7 @@ public void add(Blackhole bh, Options opts) { @State(Scope.Benchmark) public static class Options { - @Param({ 
"MURMUR3", "T1HA" }) + @Param({ "MURMUR3", "T1HA1" }) public Type type; @Param({ @@ -213,11 +213,11 @@ public void close() { } }), - T1HA(() -> new HashTable() { + T1HA1(() -> new HashTable() { private final BytesRefHash table = new BytesRefHash( 1, 0.6f, - key -> T1ha.hash(key.bytes, key.offset, key.length, 0), + key -> T1ha1.hash(key.bytes, key.offset, key.length, 0), BigArrays.NON_RECYCLING_INSTANCE ); diff --git a/libs/common/src/main/java/org/opensearch/common/hash/T1ha.java b/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java similarity index 80% rename from libs/common/src/main/java/org/opensearch/common/hash/T1ha.java rename to libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java index 185875f4a2d68..3a640257af32b 100644 --- a/libs/common/src/main/java/org/opensearch/common/hash/T1ha.java +++ b/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java @@ -8,7 +8,9 @@ package org.opensearch.common.hash; +import java.lang.invoke.MethodHandle; import java.lang.invoke.MethodHandles; +import java.lang.invoke.MethodType; import java.lang.invoke.VarHandle; import java.nio.ByteOrder; @@ -23,13 +25,14 @@ * *

* To overcome language and performance limitations, this implementation differs slightly from the reference - * implementation in C++, so the returned values will vary. + * implementation in C++, so the returned values may vary before JDK 18. * *

* Intended for little-endian systems but returns the same result on big-endian, albeit marginally slower. */ -public class T1ha { +public class T1ha1 { private static final long SEED = System.nanoTime(); + private static final Mux64 MUX_64_IMPL = fastestMux64Impl(); private static final VarHandle LONG_HANDLE = MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder.LITTLE_ENDIAN); private static final VarHandle INT_HANDLE = MethodHandles.byteArrayViewVarHandle(int[].class, ByteOrder.LITTLE_ENDIAN); @@ -153,11 +156,7 @@ private static long h32(byte[] input, int offset, int length, long a, long b) { * XOR the high and low parts of the full 128-bit product. */ private static long mux64(long a, long b) { - // Ideally, the following should be used to match the reference implementation: - // return Math.unsignedMultiplyHigh(a, b) ^ (a * b); - // Since unsignedMultiplyHigh isn't available before JDK 18, and calculating it without intrinsics is quite slow, - // the multiplyHigh method is used instead. Slight loss in quality is imperceptible for our use-case: a hash table. - return Math.multiplyHigh(a, b) ^ (a * b); + return MUX_64_IMPL.mux64(a, b); } /** @@ -223,4 +222,45 @@ private static long fetch16(byte[] input, int offset) { private static long fetch8(byte[] input, int offset) { return input[offset] & 0xFFL; } + + /** + * The implementation of mux64. + */ + @FunctionalInterface + private interface Mux64 { + long mux64(long a, long b); + } + + /** + * Provides the fastest available implementation of mux64 on this platform. + * + *

+ * Ideally, the following should be returned to match the reference implementation: + * {@code Math.unsignedMultiplyHigh(a, b) ^ (a * b)} + * + *

+ * Since unsignedMultiplyHigh isn't available before JDK 18, and calculating it without intrinsics is quite slow, + * the multiplyHigh method is used instead. Slight loss in quality is imperceptible for our use-case: a hash table. + * {@code Math.multiplyHigh(a, b) ^ (a * b)} + * + *

+ * This indirection can be removed once we stop supporting older JDKs. + */ + private static Mux64 fastestMux64Impl() { + try { + final MethodHandle unsignedMultiplyHigh = MethodHandles.publicLookup() + .findStatic(Math.class, "unsignedMultiplyHigh", MethodType.methodType(long.class, long.class, long.class)); + return (a, b) -> { + try { + return (long) unsignedMultiplyHigh.invokeExact(a, b) ^ (a * b); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }; + } catch (NoSuchMethodException e) { + return (a, b) -> Math.multiplyHigh(a, b) ^ (a * b); + } catch (IllegalAccessException e) { + throw new RuntimeException(e); + } + } } diff --git a/libs/common/src/test/java/org/opensearch/common/hash/T1haTests.java b/libs/common/src/test/java/org/opensearch/common/hash/T1Ha1Tests.java similarity index 53% rename from libs/common/src/test/java/org/opensearch/common/hash/T1haTests.java rename to libs/common/src/test/java/org/opensearch/common/hash/T1Ha1Tests.java index d6132e235d89b..bd72a5dcac703 100644 --- a/libs/common/src/test/java/org/opensearch/common/hash/T1haTests.java +++ b/libs/common/src/test/java/org/opensearch/common/hash/T1Ha1Tests.java @@ -8,7 +8,10 @@ package org.opensearch.common.hash; -public class T1haTests extends HashFunctionTestCase { +import java.lang.invoke.MethodHandles; +import java.lang.invoke.MethodType; + +public class T1Ha1Tests extends HashFunctionTestCase { /** * Inspired from the tests defined in the reference implementation: @@ -81,39 +84,94 @@ public void testSelfCheck() { 'w', 'x' }; - /* Reference hashes when using {@link Math::unsignedMultiplyHigh} in the mux64 step. - These values match the ones defined in the reference implementation: - https://github.com/erthink/t1ha/blob/master/src/t1ha1_selfcheck.c#L51-L72 - - We don't expect our implementation to return these values as we use {@link Math::multiplyHigh} instead. - Keeping it here for context. 
- - long[] reference = { - 0L, - 0x6A580668D6048674L, 0xA2FE904AFF0D0879L, 0xE3AB9C06FAF4D023L, 0x6AF1C60874C95442L, - 0xB3557E561A6C5D82L, 0x0AE73C696F3D37C0L, 0x5EF25F7062324941L, 0x9B784F3B4CE6AF33L, - 0x6993BB206A74F070L, 0xF1E95DF109076C4CL, 0x4E1EB70C58E48540L, 0x5FDD7649D8EC44E4L, - 0x559122C706343421L, 0x380133D58665E93DL, 0x9CE74296C8C55AE4L, 0x3556F9A5757AB6D0L, - 0xF62751F7F25C469EL, 0x851EEC67F6516D94L, 0xED463EE3848A8695L, 0xDC8791FEFF8ED3ACL, - 0x2569C744E1A282CFL, 0xF90EB7C1D70A80B9L, 0x68DFA6A1B8050A4CL, 0x94CCA5E8210D2134L, - 0xF5CC0BEABC259F52L, 0x40DBC1F51618FDA7L, 0x0807945BF0FB52C6L, 0xE5EF7E09DE70848DL, - 0x63E1DF35FEBE994AL, 0x2025E73769720D5AL, 0xAD6120B2B8A152E1L, 0x2A71D9F13959F2B7L, - 0x8A20849A27C32548L, 0x0BCBC9FE3B57884EL, 0x0E028D255667AEADL, 0xBE66DAD3043AB694L, - 0xB00E4C1238F9E2D4L, 0x5C54BDE5AE280E82L, 0x0E22B86754BC3BC4L, 0x016707EBF858B84DL, - 0x990015FBC9E095EEL, 0x8B9AF0A3E71F042FL, 0x6AA56E88BD380564L, 0xAACE57113E681A0FL, - 0x19F81514AFA9A22DL, 0x80DABA3D62BEAC79L, 0x715210412CABBF46L, 0xD8FA0B9E9D6AA93FL, - 0x6C2FC5A4109FD3A2L, 0x5B3E60EEB51DDCD8L, 0x0A7C717017756FE7L, 0xA73773805CA31934L, - 0x4DBD6BB7A31E85FDL, 0x24F619D3D5BC2DB4L, 0x3E4AF35A1678D636L, 0x84A1A8DF8D609239L, - 0x359C862CD3BE4FCDL, 0xCF3A39F5C27DC125L, 0xC0FF62F8FD5F4C77L, 0x5E9F2493DDAA166CL, - 0x17424152BE1CA266L, 0xA78AFA5AB4BBE0CDL, 0x7BFB2E2CEF118346L, 0x647C3E0FF3E3D241L, - 0x0352E4055C13242EL, 0x6F42FC70EB660E38L, 0x0BEBAD4FABF523BAL, 0x9269F4214414D61DL, - 0x1CA8760277E6006CL, 0x7BAD25A859D87B5DL, 0xAD645ADCF7414F1DL, 0xB07F517E88D7AFB3L, - 0xB321C06FB5FFAB5CL, 0xD50F162A1EFDD844L, 0x1DFD3D1924FBE319L, 0xDFAEAB2F09EF7E78L, - 0xA7603B5AF07A0B1EL, 0x41CD044C0E5A4EE3L, 0xF64D2F86E813BF33L, 0xFF9FDB99305EB06AL - }; */ + // Reference hashes when using {@link Math::unsignedMultiplyHigh} in the mux64 step. 
+ // These values match the ones defined in the reference implementation: + // https://github.com/erthink/t1ha/blob/master/src/t1ha1_selfcheck.c#L51-L72 + long[] referenceUnsignedMultiplyHigh = { + 0L, + 0x6A580668D6048674L, + 0xA2FE904AFF0D0879L, + 0xE3AB9C06FAF4D023L, + 0x6AF1C60874C95442L, + 0xB3557E561A6C5D82L, + 0x0AE73C696F3D37C0L, + 0x5EF25F7062324941L, + 0x9B784F3B4CE6AF33L, + 0x6993BB206A74F070L, + 0xF1E95DF109076C4CL, + 0x4E1EB70C58E48540L, + 0x5FDD7649D8EC44E4L, + 0x559122C706343421L, + 0x380133D58665E93DL, + 0x9CE74296C8C55AE4L, + 0x3556F9A5757AB6D0L, + 0xF62751F7F25C469EL, + 0x851EEC67F6516D94L, + 0xED463EE3848A8695L, + 0xDC8791FEFF8ED3ACL, + 0x2569C744E1A282CFL, + 0xF90EB7C1D70A80B9L, + 0x68DFA6A1B8050A4CL, + 0x94CCA5E8210D2134L, + 0xF5CC0BEABC259F52L, + 0x40DBC1F51618FDA7L, + 0x0807945BF0FB52C6L, + 0xE5EF7E09DE70848DL, + 0x63E1DF35FEBE994AL, + 0x2025E73769720D5AL, + 0xAD6120B2B8A152E1L, + 0x2A71D9F13959F2B7L, + 0x8A20849A27C32548L, + 0x0BCBC9FE3B57884EL, + 0x0E028D255667AEADL, + 0xBE66DAD3043AB694L, + 0xB00E4C1238F9E2D4L, + 0x5C54BDE5AE280E82L, + 0x0E22B86754BC3BC4L, + 0x016707EBF858B84DL, + 0x990015FBC9E095EEL, + 0x8B9AF0A3E71F042FL, + 0x6AA56E88BD380564L, + 0xAACE57113E681A0FL, + 0x19F81514AFA9A22DL, + 0x80DABA3D62BEAC79L, + 0x715210412CABBF46L, + 0xD8FA0B9E9D6AA93FL, + 0x6C2FC5A4109FD3A2L, + 0x5B3E60EEB51DDCD8L, + 0x0A7C717017756FE7L, + 0xA73773805CA31934L, + 0x4DBD6BB7A31E85FDL, + 0x24F619D3D5BC2DB4L, + 0x3E4AF35A1678D636L, + 0x84A1A8DF8D609239L, + 0x359C862CD3BE4FCDL, + 0xCF3A39F5C27DC125L, + 0xC0FF62F8FD5F4C77L, + 0x5E9F2493DDAA166CL, + 0x17424152BE1CA266L, + 0xA78AFA5AB4BBE0CDL, + 0x7BFB2E2CEF118346L, + 0x647C3E0FF3E3D241L, + 0x0352E4055C13242EL, + 0x6F42FC70EB660E38L, + 0x0BEBAD4FABF523BAL, + 0x9269F4214414D61DL, + 0x1CA8760277E6006CL, + 0x7BAD25A859D87B5DL, + 0xAD645ADCF7414F1DL, + 0xB07F517E88D7AFB3L, + 0xB321C06FB5FFAB5CL, + 0xD50F162A1EFDD844L, + 0x1DFD3D1924FBE319L, + 0xDFAEAB2F09EF7E78L, + 0xA7603B5AF07A0B1EL, + 0x41CD044C0E5A4EE3L, + 
0xF64D2F86E813BF33L, + 0xFF9FDB99305EB06AL }; // Reference hashes when using {@link Math::multiplyHigh} in the mux64 step. - long[] reference = { + long[] referenceMultiplyHigh = { 0L, 0xCE510B7405E0A2CAL, 0xC0A2DA74A8271FCBL, @@ -196,21 +254,23 @@ public void testSelfCheck() { 0x615730E301ED12E2L, 0x1A2D4AA43B6C0103L }; + long[] reference = hasUnsignedMultiplyHigh() ? referenceUnsignedMultiplyHigh : referenceMultiplyHigh; + int offset = 0; - assertEquals(reference[offset++], T1ha.hash(null, 0, 0, 0L)); // empty-zero - assertEquals(reference[offset++], T1ha.hash(null, 0, 0, ~0L)); // empty-all1 - assertEquals(reference[offset++], T1ha.hash(testPattern, 0, 64, 0L)); // bin64-zero + assertEquals(reference[offset++], T1ha1.hash(null, 0, 0, 0L)); // empty-zero + assertEquals(reference[offset++], T1ha1.hash(null, 0, 0, ~0L)); // empty-all1 + assertEquals(reference[offset++], T1ha1.hash(testPattern, 0, 64, 0L)); // bin64-zero long seed = 1; for (int i = 1; i < 64; i++) { - assertEquals(reference[offset++], T1ha.hash(testPattern, 0, i, seed)); // bin%i-1p%i + assertEquals(reference[offset++], T1ha1.hash(testPattern, 0, i, seed)); // bin%i-1p%i seed <<= 1; } seed = ~0L; for (int i = 1; i <= 7; i++) { seed <<= 1; - assertEquals(reference[offset++], T1ha.hash(testPattern, i, 64 - i, seed)); // align%i_F%i + assertEquals(reference[offset++], T1ha1.hash(testPattern, i, 64 - i, seed)); // align%i_F%i } byte[] testPatternLong = new byte[512]; @@ -218,12 +278,24 @@ public void testSelfCheck() { testPatternLong[i] = (byte) i; } for (int i = 0; i <= 7; i++) { - assertEquals(reference[offset++], T1ha.hash(testPatternLong, i, 128 + i * 17, seed)); // long-%05i + assertEquals(reference[offset++], T1ha1.hash(testPatternLong, i, 128 + i * 17, seed)); // long-%05i + } + } + + private static boolean hasUnsignedMultiplyHigh() { + try { + MethodHandles.publicLookup() + .findStatic(Math.class, "unsignedMultiplyHigh", MethodType.methodType(long.class, long.class, long.class)); + return true; 
+ } catch (NoSuchMethodException e) { + return false; + } catch (IllegalAccessException e) { + throw new RuntimeException(e); } } @Override public long hash(byte[] input) { - return T1ha.hash(input, 0, input.length); + return T1ha1.hash(input, 0, input.length); } } diff --git a/server/src/main/java/org/opensearch/common/util/BytesRefHash.java b/server/src/main/java/org/opensearch/common/util/BytesRefHash.java index 5107feb6b3a26..2985a21a5100e 100644 --- a/server/src/main/java/org/opensearch/common/util/BytesRefHash.java +++ b/server/src/main/java/org/opensearch/common/util/BytesRefHash.java @@ -34,7 +34,7 @@ import org.apache.lucene.util.BytesRef; import org.opensearch.common.Numbers; -import org.opensearch.common.hash.T1ha; +import org.opensearch.common.hash.T1ha1; import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; import org.opensearch.core.common.util.ByteArray; @@ -56,7 +56,7 @@ public final class BytesRefHash implements Releasable { private static final long MAX_CAPACITY = 1L << 32; private static final long DEFAULT_INITIAL_CAPACITY = 32; private static final float DEFAULT_LOAD_FACTOR = 0.6f; - private static final Hasher DEFAULT_HASHER = key -> T1ha.hash(key.bytes, key.offset, key.length); + private static final Hasher DEFAULT_HASHER = key -> T1ha1.hash(key.bytes, key.offset, key.length); private static final long MASK_ORDINAL = 0x00000000FFFFFFFFL; // extract ordinal private static final long MASK_FINGERPRINT = 0xFFFFFFFF00000000L; // extract fingerprint diff --git a/server/src/test/java/org/opensearch/common/NumbersTests.java b/server/src/test/java/org/opensearch/common/NumbersTests.java index d6745f53f3569..7990ba74f162a 100644 --- a/server/src/test/java/org/opensearch/common/NumbersTests.java +++ b/server/src/test/java/org/opensearch/common/NumbersTests.java @@ -225,7 +225,7 @@ public void testToUnsignedBigInteger() { public void testNextPowerOfTwo() { // Negative values: for (int i = 0; i < 1000; i++) { - long 
value = randomLongBetween(-500, -1); + long value = randomLongBetween(-500000, -1); assertEquals(1, Numbers.nextPowerOfTwo(value)); } @@ -234,7 +234,7 @@ public void testNextPowerOfTwo() { // Positive values: for (int i = 0; i < 1000; i++) { - long value = randomLongBetween(1, 500); + long value = randomLongBetween(1, 500000); long nextPowerOfTwo = Numbers.nextPowerOfTwo(value); assertTrue(nextPowerOfTwo > value); // must be strictly greater diff --git a/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java b/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java index b182ba11198be..adcec8f07f702 100644 --- a/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java +++ b/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java @@ -35,7 +35,7 @@ import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; -import org.opensearch.common.hash.T1ha; +import org.opensearch.common.hash.T1ha1; import org.opensearch.common.settings.Settings; import org.opensearch.core.indices.breaker.NoneCircuitBreakerService; import org.opensearch.test.OpenSearchTestCase; @@ -63,7 +63,7 @@ private void newHash() { hash = new BytesRefHash( randomIntBetween(1, 100), // random capacity 0.6f + randomFloat() * 0.39f, // random load factor to verify collision resolution - key -> T1ha.hash(key.bytes, key.offset, key.length, seed), + key -> T1ha1.hash(key.bytes, key.offset, key.length, seed), randomBigArrays() ); } From 374db9772460b96859ab46c4dfb53b3cc8e67f2f Mon Sep 17 00:00:00 2001 From: Ketan Verma Date: Wed, 16 Aug 2023 19:17:03 +0530 Subject: [PATCH 5/9] Add link to the reference implementation for t1ha1 Signed-off-by: Ketan Verma --- .../src/main/java/org/opensearch/common/hash/T1ha1.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java 
b/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java index 3a640257af32b..92be34a64fb13 100644 --- a/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java +++ b/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java @@ -24,8 +24,9 @@ * a fast portable hash function with reasonable quality for checksums, hash tables, and thin fingerprinting. * *

- * To overcome language and performance limitations, this implementation differs slightly from the reference - * implementation in C++, so the returned values may vary before JDK 18. + * To overcome language and performance limitations, this implementation differs slightly from the + * reference implementation in C++, + * so the returned values may vary before JDK 18. * *

* Intended for little-endian systems but returns the same result on big-endian, albeit marginally slower. From 832f61a24e61c7a12f3a7295c96f69c4616fa7c7 Mon Sep 17 00:00:00 2001 From: Ketan Verma Date: Thu, 17 Aug 2023 21:36:40 +0530 Subject: [PATCH 6/9] Annotate t1ha1 with @opensearch.internal Signed-off-by: Ketan Verma --- .../src/main/java/org/opensearch/common/hash/T1ha1.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java b/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java index 92be34a64fb13..b02e3d777364b 100644 --- a/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java +++ b/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java @@ -30,8 +30,10 @@ * *

* Intended for little-endian systems but returns the same result on big-endian, albeit marginally slower. + * + * @opensearch.internal */ -public class T1ha1 { +public final class T1ha1 { private static final long SEED = System.nanoTime(); private static final Mux64 MUX_64_IMPL = fastestMux64Impl(); @@ -53,6 +55,11 @@ public class T1ha1 { private static final int s1 = 17; private static final int s2 = 31; + /** + * No public constructor. + */ + private T1ha1() {} + /** * Returns the hash code for the specified range of the given {@code byte} array. * @param input the input byte array From 7b53727213e2b239645f9810f3597b54a94ffacf Mon Sep 17 00:00:00 2001 From: Ketan Verma Date: Thu, 17 Aug 2023 22:33:21 +0530 Subject: [PATCH 7/9] Run spotless Signed-off-by: Ketan Verma --- .../org/opensearch/common/util/BytesRefHashBenchmark.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java b/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java index 3071d1f4c7d7e..fef12b6d9f84a 100644 --- a/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java +++ b/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java @@ -10,6 +10,9 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.StringHelper; +import org.opensearch.common.hash.T1ha1; +import org.opensearch.common.lease.Releasable; +import org.opensearch.common.lease.Releasables; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; @@ -22,9 +25,6 @@ import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import org.opensearch.common.hash.T1ha1; -import org.opensearch.common.lease.Releasable; -import org.opensearch.common.lease.Releasables; import java.util.HashSet; import java.util.Random; 
From f9c874d3a27c24e38165c0907d82cf73e51682b6 Mon Sep 17 00:00:00 2001 From: Ketan Verma Date: Wed, 23 Aug 2023 17:39:39 +0530 Subject: [PATCH 8/9] Add pre-computed hashes to speed up reinserts Signed-off-by: Ketan Verma --- .../opensearch/common/util/BytesRefHash.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/opensearch/common/util/BytesRefHash.java b/server/src/main/java/org/opensearch/common/util/BytesRefHash.java index 2985a21a5100e..efa3c470265bb 100644 --- a/server/src/main/java/org/opensearch/common/util/BytesRefHash.java +++ b/server/src/main/java/org/opensearch/common/util/BytesRefHash.java @@ -144,6 +144,12 @@ public final class BytesRefHash implements Releasable { */ private ByteArray keys; + /** + * Pre-computed hashes of the stored keys. + * It is used to speed up reinserts when doubling the capacity. + */ + private LongArray hashes; + public BytesRefHash(final BigArrays bigArrays) { this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR, DEFAULT_HASHER, bigArrays); } @@ -175,6 +181,7 @@ public BytesRefHash(final long initialCapacity, final float loadFactor, final Ha offsets = bigArrays.newLongArray(initialCapacity + 1, false); offsets.set(0, 0); keys = bigArrays.newByteArray(initialCapacity * 3, false); + hashes = bigArrays.newLongArray(initialCapacity, false); } /** @@ -193,7 +200,7 @@ public long add(final BytesRef key) { } else { table.set(idx, val); } - return append(key); + return append(key, hash); } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { return -1 - ordinal; } @@ -254,13 +261,15 @@ public long size() { /** * Appends the key in the keys' and offsets' tables. 
*/ - private long append(final BytesRef key) { + private long append(final BytesRef key, final long hash) { final long start = offsets.get(size); final long end = start + key.length; offsets = bigArrays.grow(offsets, size + 2); offsets.set(size + 1, end); keys = bigArrays.grow(keys, end); keys.set(start, key.bytes, key.offset, key.length); + hashes = bigArrays.grow(hashes, size + 1); + hashes.set(size, hash); return size++; } @@ -282,7 +291,7 @@ private void growAndInsert(final long hash, final long value) { table.set(hash & mask, value); for (long ordinal = 0; ordinal < size; ordinal++) { - reinsert(ordinal, hasher.hash(get(ordinal, scratch))); + reinsert(ordinal, hashes.get(ordinal)); } } @@ -300,7 +309,7 @@ private void reinsert(final long ordinal, final long hash) { @Override public void close() { - Releasables.close(table, offsets, keys); + Releasables.close(table, offsets, keys, hashes); } /** From 58d3394a70622365902d55e500d64dbe6523fef0 Mon Sep 17 00:00:00 2001 From: Ketan Verma Date: Fri, 25 Aug 2023 09:11:57 +0530 Subject: [PATCH 9/9] Refactor HashFunctionTestCase Signed-off-by: Ketan Verma --- .../org/opensearch/common/hash/T1ha1.java | 3 + .../common/hash/HashFunctionTestCase.java | 137 ------------------ .../opensearch/common/hash/T1Ha1Tests.java | 21 ++- .../opensearch/common/util/BytesRefHash.java | 2 + .../common/util/ReorganizingLongHash.java | 2 + .../common/hash/AvalancheStats.java | 63 ++++++++ .../common/hash/HashFunctionTestCase.java | 79 ++++++++++ .../hash/HashFunctionTestCaseTests.java | 68 +++++++++ 8 files changed, 233 insertions(+), 142 deletions(-) delete mode 100644 libs/common/src/test/java/org/opensearch/common/hash/HashFunctionTestCase.java create mode 100644 test/framework/src/main/java/org/opensearch/common/hash/AvalancheStats.java create mode 100644 test/framework/src/main/java/org/opensearch/common/hash/HashFunctionTestCase.java create mode 100644 
test/framework/src/test/java/org/opensearch/common/hash/HashFunctionTestCaseTests.java diff --git a/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java b/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java index b02e3d777364b..07b2306eda4e5 100644 --- a/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java +++ b/libs/common/src/main/java/org/opensearch/common/hash/T1ha1.java @@ -8,6 +8,8 @@ package org.opensearch.common.hash; +import org.opensearch.common.annotation.InternalApi; + import java.lang.invoke.MethodHandle; import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodType; @@ -33,6 +35,7 @@ * * @opensearch.internal */ +@InternalApi public final class T1ha1 { private static final long SEED = System.nanoTime(); private static final Mux64 MUX_64_IMPL = fastestMux64Impl(); diff --git a/libs/common/src/test/java/org/opensearch/common/hash/HashFunctionTestCase.java b/libs/common/src/test/java/org/opensearch/common/hash/HashFunctionTestCase.java deleted file mode 100644 index 6f7f813bffa24..0000000000000 --- a/libs/common/src/test/java/org/opensearch/common/hash/HashFunctionTestCase.java +++ /dev/null @@ -1,137 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. 
- */ - -package org.opensearch.common.hash; - -import org.opensearch.common.Randomness; -import org.opensearch.test.OpenSearchTestCase; - -import java.util.Locale; -import java.util.Random; - -public abstract class HashFunctionTestCase extends OpenSearchTestCase { - private static final int[] INPUT_BITS = new int[] { 24, 32, 40, 48, 56, 64, 72, 80, 96, 112, 128, 160, 512, 1024 }; - private static final int OUTPUT_BITS = 64; - private static final int ITERATIONS = 1000; - private static final double BIAS_THRESHOLD = 0.01; // 1% - - public abstract long hash(byte[] input); - - /** - * Tests if the hash function shows an avalanche effect, i.e, flipping a single input bit - * should flip half the output bits. - */ - public final void testAvalanche() { - for (int inputBits : INPUT_BITS) { - AvalancheStats stats = simulate(inputBits, OUTPUT_BITS, new RandomInputGenerator(inputBits)); - if (stats.bias() >= BIAS_THRESHOLD) { - fail("bias exceeds threshold: " + stats); - } - } - } - - private AvalancheStats simulate(int inputBits, int outputBits, InputGenerator inputGenerator) { - int[][] flips = new int[inputBits][outputBits]; - - for (int iter = 0; iter < ITERATIONS; iter++) { - byte[] input = inputGenerator.next(); - long hash = hash(input); - - for (int i = 0; i < inputBits; i++) { - flip(input, i); // flip one bit - long newHash = hash(input); // recompute the hash; half the bits should have flipped - flip(input, i); // return to original - - long diff = hash ^ newHash; - for (int o = 0; o < OUTPUT_BITS; o++) { - if ((diff & 1) == 1) { - flips[i][o] += 1; - } - diff >>>= 1; - } - } - } - - return new AvalancheStats(flips); - } - - private static void flip(byte[] input, int position) { - int offset = position / 8; - int bit = position & 7; - input[offset] ^= (1 << bit); - } - - @FunctionalInterface - interface InputGenerator { - byte[] next(); - } - - private static class RandomInputGenerator implements InputGenerator { - private final Random random = Randomness.get(); 
- private final byte[] input; - - public RandomInputGenerator(int size) { - input = new byte[size]; - } - - @Override - public byte[] next() { - random.nextBytes(input); - return input; - } - } - - private static class AvalancheStats { - private final int inputBits; - private final int outputBits; - private final double bias; - private final double sumOfSquaredErrors; - - public AvalancheStats(int[][] flips) { - this.inputBits = flips.length; - this.outputBits = flips[0].length; - double sumOfBiases = 0; - double sumOfSquaredErrors = 0; - - for (int i = 0; i < inputBits; i++) { - for (int o = 0; o < outputBits; o++) { - sumOfSquaredErrors += Math.pow(0.5 - ((double) flips[i][o] / ITERATIONS), 2); - sumOfBiases += 2 * ((double) flips[i][o] / ITERATIONS) - 1; - } - } - - this.bias = Math.abs(sumOfBiases / (inputBits * outputBits)); - this.sumOfSquaredErrors = sumOfSquaredErrors; - } - - public double bias() { - return bias; - } - - public double diffusion() { - return 1 - bias; - } - - public double sumOfSquaredErrors() { - return sumOfSquaredErrors; - } - - @Override - public String toString() { - return String.format( - Locale.ROOT, - "AvalancheStats{inputBits=%d, outputBits=%d, bias=%.4f%%, diffusion=%.4f%%, sumOfSquaredErrors=%.2f}", - inputBits, - outputBits, - bias() * 100, - diffusion() * 100, - sumOfSquaredErrors() - ); - } - } -} diff --git a/libs/common/src/test/java/org/opensearch/common/hash/T1Ha1Tests.java b/libs/common/src/test/java/org/opensearch/common/hash/T1Ha1Tests.java index bd72a5dcac703..e348fbf759bdd 100644 --- a/libs/common/src/test/java/org/opensearch/common/hash/T1Ha1Tests.java +++ b/libs/common/src/test/java/org/opensearch/common/hash/T1Ha1Tests.java @@ -10,8 +10,12 @@ import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodType; +import java.lang.invoke.VarHandle; +import java.nio.ByteOrder; public class T1Ha1Tests extends HashFunctionTestCase { + private static final VarHandle LONG_HANDLE = 
MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder.LITTLE_ENDIAN); + private final byte[] scratch = new byte[8]; /** * Inspired from the tests defined in the reference implementation: @@ -282,6 +286,18 @@ public void testSelfCheck() { } } + @Override + public byte[] hash(byte[] input) { + long hash = T1ha1.hash(input, 0, input.length); + LONG_HANDLE.set(scratch, 0, hash); + return scratch; + } + + @Override + public int outputBits() { + return 64; + } + private static boolean hasUnsignedMultiplyHigh() { try { MethodHandles.publicLookup() @@ -293,9 +309,4 @@ private static boolean hasUnsignedMultiplyHigh() { throw new RuntimeException(e); } } - - @Override - public long hash(byte[] input) { - return T1ha1.hash(input, 0, input.length); - } } diff --git a/server/src/main/java/org/opensearch/common/util/BytesRefHash.java b/server/src/main/java/org/opensearch/common/util/BytesRefHash.java index efa3c470265bb..4afba2905019a 100644 --- a/server/src/main/java/org/opensearch/common/util/BytesRefHash.java +++ b/server/src/main/java/org/opensearch/common/util/BytesRefHash.java @@ -34,6 +34,7 @@ import org.apache.lucene.util.BytesRef; import org.opensearch.common.Numbers; +import org.opensearch.common.annotation.InternalApi; import org.opensearch.common.hash.T1ha1; import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; @@ -52,6 +53,7 @@ * * @opensearch.internal */ +@InternalApi public final class BytesRefHash implements Releasable { private static final long MAX_CAPACITY = 1L << 32; private static final long DEFAULT_INITIAL_CAPACITY = 32; diff --git a/server/src/main/java/org/opensearch/common/util/ReorganizingLongHash.java b/server/src/main/java/org/opensearch/common/util/ReorganizingLongHash.java index d6c29fcae3a94..86e7227cb6c85 100644 --- a/server/src/main/java/org/opensearch/common/util/ReorganizingLongHash.java +++ b/server/src/main/java/org/opensearch/common/util/ReorganizingLongHash.java @@ -9,6 +9,7 @@ package 
org.opensearch.common.util; import org.opensearch.common.Numbers; +import org.opensearch.common.annotation.InternalApi; import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; @@ -26,6 +27,7 @@ * * @opensearch.internal */ +@InternalApi public class ReorganizingLongHash implements Releasable { private static final long MAX_CAPACITY = 1L << 32; private static final long DEFAULT_INITIAL_CAPACITY = 32; diff --git a/test/framework/src/main/java/org/opensearch/common/hash/AvalancheStats.java b/test/framework/src/main/java/org/opensearch/common/hash/AvalancheStats.java new file mode 100644 index 0000000000000..c1600abcacd3e --- /dev/null +++ b/test/framework/src/main/java/org/opensearch/common/hash/AvalancheStats.java @@ -0,0 +1,63 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.common.hash; + +import java.util.Locale; + +/** + * Represents the avalanche statistics of a hash function. 
+ */ +public class AvalancheStats { + private final int inputBits; + private final int outputBits; + private final double bias; + private final double sumOfSquaredErrors; + + public AvalancheStats(int[][] flips, int iterations) { + this.inputBits = flips.length; + this.outputBits = flips[0].length; + double sumOfBiases = 0; + double sumOfSquaredErrors = 0; + + for (int i = 0; i < inputBits; i++) { + for (int o = 0; o < outputBits; o++) { + sumOfSquaredErrors += Math.pow(0.5 - ((double) flips[i][o] / iterations), 2); + sumOfBiases += 2 * ((double) flips[i][o] / iterations) - 1; + } + } + + this.bias = Math.abs(sumOfBiases / (inputBits * outputBits)); + this.sumOfSquaredErrors = sumOfSquaredErrors; + } + + public double bias() { + return bias; + } + + public double diffusion() { + return 1 - bias; + } + + public double sumOfSquaredErrors() { + return sumOfSquaredErrors; + } + + @Override + public String toString() { + return String.format( + Locale.ROOT, + "AvalancheStats{inputBits=%d, outputBits=%d, bias=%.4f%%, diffusion=%.4f%%, sumOfSquaredErrors=%.2f}", + inputBits, + outputBits, + bias() * 100, + diffusion() * 100, + sumOfSquaredErrors() + ); + } +} diff --git a/test/framework/src/main/java/org/opensearch/common/hash/HashFunctionTestCase.java b/test/framework/src/main/java/org/opensearch/common/hash/HashFunctionTestCase.java new file mode 100644 index 0000000000000..e272fe0962047 --- /dev/null +++ b/test/framework/src/main/java/org/opensearch/common/hash/HashFunctionTestCase.java @@ -0,0 +1,79 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.common.hash; + +import org.opensearch.common.Randomness; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.Arrays; +import java.util.Random; + +/** + * Base class for testing the quality of hash functions. 
+ */ +public abstract class HashFunctionTestCase extends OpenSearchTestCase { + private static final int[] INPUT_BITS = new int[] { 24, 32, 40, 48, 56, 64, 72, 80, 96, 112, 128, 160, 512, 1024 }; + private static final int ITERATIONS = 1000; + private static final double BIAS_THRESHOLD = 0.01; // 1% + + public abstract byte[] hash(byte[] input); + + public abstract int outputBits(); + + /** + * Tests if the hash function shows an avalanche effect, i.e, flipping a single input bit + * should flip half the output bits. + */ + public void testAvalanche() { + for (int inputBits : INPUT_BITS) { + AvalancheStats stats = simulate(inputBits); + if (stats.bias() >= BIAS_THRESHOLD) { + fail("bias exceeds threshold: " + stats); + } + } + } + + private AvalancheStats simulate(int inputBits) { + int outputBits = outputBits(); + assert inputBits % 8 == 0; // using full bytes for simplicity + assert outputBits % 8 == 0; // using full bytes for simplicity + byte[] input = new byte[inputBits >>> 3]; + Random random = Randomness.get(); + int[][] flips = new int[inputBits][outputBits]; + + for (int iter = 0; iter < ITERATIONS; iter++) { + random.nextBytes(input); + byte[] hash = Arrays.copyOf(hash(input), outputBits >>> 3); // copying since the underlying byte-array is reused + + for (int i = 0; i < inputBits; i++) { + flipBit(input, i); // flip one bit + byte[] newHash = hash(input); // recompute the hash; half the bits should have flipped + flipBit(input, i); // return to original + + for (int o = 0; o < outputBits; o++) { + flips[i][o] += getBit(hash, o) ^ getBit(newHash, o); + } + } + } + + return new AvalancheStats(flips, ITERATIONS); + } + + private static void flipBit(byte[] input, int position) { + int offset = position / 8; + int bit = position & 7; + input[offset] ^= (1 << bit); + } + + private static int getBit(byte[] input, int position) { + int offset = position / 8; + int bit = position & 7; + return (input[offset] >>> bit) & 1; + } +} diff --git 
a/test/framework/src/test/java/org/opensearch/common/hash/HashFunctionTestCaseTests.java b/test/framework/src/test/java/org/opensearch/common/hash/HashFunctionTestCaseTests.java new file mode 100644 index 0000000000000..d5fdaf10999fc --- /dev/null +++ b/test/framework/src/test/java/org/opensearch/common/hash/HashFunctionTestCaseTests.java @@ -0,0 +1,68 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.common.hash; + +import org.apache.lucene.util.StringHelper; +import org.opensearch.test.OpenSearchTestCase; + +import java.lang.invoke.MethodHandles; +import java.lang.invoke.VarHandle; +import java.nio.ByteOrder; +import java.util.Arrays; + +public class HashFunctionTestCaseTests extends OpenSearchTestCase { + private static final VarHandle INT_HANDLE = MethodHandles.byteArrayViewVarHandle(int[].class, ByteOrder.LITTLE_ENDIAN); + + /** + * Asserts the positive case where a hash function passes the avalanche test. + */ + public void testStrongHashFunction() { + HashFunctionTestCase murmur3 = new HashFunctionTestCase() { + private final byte[] scratch = new byte[4]; + + @Override + public byte[] hash(byte[] input) { + int hash = StringHelper.murmurhash3_x86_32(input, 0, input.length, StringHelper.GOOD_FAST_HASH_SEED); + INT_HANDLE.set(scratch, 0, hash); + return scratch; + } + + @Override + public int outputBits() { + return 32; + } + }; + + murmur3.testAvalanche(); + } + + /** + * Asserts the negative case where a hash function fails the avalanche test. 
+ */ + public void testWeakHashFunction() { + HashFunctionTestCase arraysHashCode = new HashFunctionTestCase() { + private final byte[] scratch = new byte[4]; + + @Override + public byte[] hash(byte[] input) { + int hash = Arrays.hashCode(input); + INT_HANDLE.set(scratch, 0, hash); + return scratch; + } + + @Override + public int outputBits() { + return 32; + } + }; + + AssertionError ex = expectThrows(AssertionError.class, arraysHashCode::testAvalanche); + assertTrue(ex.getMessage().contains("bias exceeds threshold")); + } +}