-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-47547][CORE] Add BloomFilter V2 and use it as default
#50933
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 37 commits
3c5a843
08cbfeb
e3cb08e
c4e3f58
1a0b66f
d912b66
f589e2c
f597c76
4ea633d
6696106
b75e187
2d8a9f1
4a30794
d9d6980
39a46c9
7f235e7
16be3a9
e91b5ca
897c1d4
013bfe4
6d44c1e
925bf12
6f28882
ed6caac
7d4ef74
c52ead3
0ab8276
d2477bf
413c4fe
4599fcb
1ee2e13
c501b2a
1f5cfb6
f60d55f
0314963
f2df338
4aaff83
e214bd7
99f7343
58e3066
c06cb38
b99ef3a
ce3ad76
626e459
b0f5b45
6849dbe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -17,9 +17,12 @@ | |||
|
|
||||
| package org.apache.spark.util.sketch; | ||||
|
|
||||
| import java.io.BufferedInputStream; | ||||
| import java.io.ByteArrayInputStream; | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.io.OutputStream; | ||||
| import java.nio.ByteBuffer; | ||||
|
|
||||
| /** | ||||
| * A Bloom filter is a space-efficient probabilistic data structure that offers an approximate | ||||
|
|
@@ -42,6 +45,7 @@ | |||
| public abstract class BloomFilter { | ||||
|
|
||||
| public enum Version { | ||||
|
|
||||
LuciferYang marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||
| /** | ||||
| * {@code BloomFilter} binary format version 1. All values written in big-endian order: | ||||
| * <ul> | ||||
|
|
@@ -51,7 +55,22 @@ public enum Version { | |||
| * <li>The words/longs (numWords * 64 bit)</li> | ||||
| * </ul> | ||||
| */ | ||||
| V1(1); | ||||
| V1(1), | ||||
|
|
||||
| /** | ||||
| * {@code BloomFilter} binary format version 2. | ||||
| * Fixes the int32 truncation issue with V1 indexes, but by changing the bit pattern, | ||||
| * it will become incompatible with V1 serializations. | ||||
| * All values written in big-endian order: | ||||
| * <ul> | ||||
| * <li>Version number, always 2 (32 bit)</li> | ||||
| * <li>Number of hash functions (32 bit)</li> | ||||
| * <li>Integer seed to initialize hash functions (32 bit) </li> | ||||
| * <li>Total number of words of the underlying bit array (32 bit)</li> | ||||
| * <li>The words/longs (numWords * 64 bit)</li> | ||||
| * </ul> | ||||
| */ | ||||
| V2(2); | ||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we want to add
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added some comments in d2477bf |
||||
|
|
||||
| private final int versionNumber; | ||||
|
|
||||
|
|
@@ -175,14 +194,26 @@ public long cardinality() { | |||
| * the stream. | ||||
| */ | ||||
| public static BloomFilter readFrom(InputStream in) throws IOException { | ||||
| return BloomFilterImpl.readFrom(in); | ||||
| // peek into the inputstream so we can determine the version | ||||
LuciferYang marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||
| BufferedInputStream bin = new BufferedInputStream(in); | ||||
LuciferYang marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
| bin.mark(4); | ||||
| int version = ByteBuffer.wrap(bin.readNBytes(4)).getInt(); | ||||
| bin.reset(); | ||||
|
|
||||
| return switch (version) { | ||||
| case 1 -> BloomFilterImpl.readFrom(bin); | ||||
| case 2 -> BloomFilterImplV2.readFrom(bin); | ||||
| default -> throw new IllegalArgumentException("Unknown BloomFilter version: " + version); | ||||
| }; | ||||
| } | ||||
|
|
||||
| /** | ||||
| * Reads in a {@link BloomFilter} from a byte array. | ||||
| */ | ||||
| public static BloomFilter readFrom(byte[] bytes) throws IOException { | ||||
| return BloomFilterImpl.readFrom(bytes); | ||||
| try (ByteArrayInputStream bis = new ByteArrayInputStream(bytes)) { | ||||
| return readFrom(bis); | ||||
| } | ||||
| } | ||||
|
|
||||
| /** | ||||
|
|
@@ -247,15 +278,28 @@ public static BloomFilter create(long expectedNumItems, double fpp) { | |||
| "False positive probability must be within range (0.0, 1.0)" | ||||
| ); | ||||
| } | ||||
|
|
||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please recover this irrelevant removal. We recommend not to fix a style change and the functional change. For example, even inside this PR, this removal is inconsistent from your code style because this PR added a new empty line at line 199 before
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed in 4599fcb
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This still doesn't look good and there is an extra blank line below. |
||||
| return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp)); | ||||
| } | ||||
|
|
||||
|
|
||||
| /** | ||||
| * Creates a {@link BloomFilter} with given {@code expectedNumItems} and {@code numBits}, it will | ||||
| * pick an optimal {@code numHashFunctions} which can minimize {@code fpp} for the bloom filter. | ||||
| */ | ||||
| public static BloomFilter create(long expectedNumItems, long numBits) { | ||||
| return create(Version.V2, expectedNumItems, numBits, BloomFilterImplV2.DEFAULT_SEED); | ||||
| } | ||||
|
|
||||
| public static BloomFilter create(long expectedNumItems, long numBits, int seed) { | ||||
| return create(Version.V2, expectedNumItems, numBits, seed); | ||||
| } | ||||
|
|
||||
| public static BloomFilter create( | ||||
| Version version, | ||||
LuciferYang marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||
| long expectedNumItems, | ||||
| long numBits, | ||||
| int seed | ||||
| ) { | ||||
| if (expectedNumItems <= 0) { | ||||
| throw new IllegalArgumentException("Expected insertions must be positive"); | ||||
| } | ||||
|
|
@@ -264,6 +308,12 @@ public static BloomFilter create(long expectedNumItems, long numBits) { | |||
| throw new IllegalArgumentException("Number of bits must be positive"); | ||||
| } | ||||
|
|
||||
| return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits); | ||||
| int numHashFunctions = optimalNumOfHashFunctions(expectedNumItems, numBits); | ||||
|
|
||||
| return switch (version) { | ||||
| case V1 -> new BloomFilterImpl(numHashFunctions, numBits); | ||||
| case V2 -> new BloomFilterImplV2(numHashFunctions, numBits, seed); | ||||
| default -> throw new IllegalArgumentException("Unknown BloomFilter version: " + version); | ||||
LuciferYang marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||
| }; | ||||
| } | ||||
| } | ||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,184 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.util.sketch; | ||
|
|
||
| import java.util.Objects; | ||
|
|
||
| abstract class BloomFilterBase extends BloomFilter { | ||
|
|
||
| public static final int DEFAULT_SEED = 0; | ||
|
|
||
| protected int seed; | ||
| protected int numHashFunctions; | ||
| protected BitArray bits; | ||
|
|
||
| protected BloomFilterBase(int numHashFunctions, long numBits) { | ||
| this(numHashFunctions, numBits, DEFAULT_SEED); | ||
| } | ||
|
|
||
| protected BloomFilterBase(int numHashFunctions, long numBits, int seed) { | ||
| this(new BitArray(numBits), numHashFunctions, seed); | ||
| } | ||
|
|
||
| protected BloomFilterBase(BitArray bits, int numHashFunctions, int seed) { | ||
| this.bits = bits; | ||
| this.numHashFunctions = numHashFunctions; | ||
| this.seed = seed; | ||
| } | ||
|
|
||
| protected BloomFilterBase() {} | ||
|
|
||
| @Override | ||
| public boolean equals(Object other) { | ||
| if (other == this) { | ||
| return true; | ||
| } | ||
|
|
||
| if (!(other instanceof BloomFilterBase that)) { | ||
| return false; | ||
| } | ||
|
|
||
| return | ||
| this.getClass() == that.getClass() | ||
| && this.numHashFunctions == that.numHashFunctions | ||
| && this.seed == that.seed | ||
| // TODO: this.bits can be null temporarily, during deserialization, | ||
| // should we worry about this? | ||
| && this.bits.equals(that.bits); | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return Objects.hash(numHashFunctions, seed, bits); | ||
| } | ||
|
|
||
| @Override | ||
| public double expectedFpp() { | ||
| return Math.pow((double) bits.cardinality() / bits.bitSize(), numHashFunctions); | ||
| } | ||
|
|
||
| @Override | ||
| public long bitSize() { | ||
| return bits.bitSize(); | ||
| } | ||
|
|
||
| @Override | ||
| public boolean put(Object item) { | ||
| if (item instanceof String str) { | ||
| return putString(str); | ||
| } else if (item instanceof byte[] bytes) { | ||
| return putBinary(bytes); | ||
| } else { | ||
| return putLong(Utils.integralToLong(item)); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public boolean putString(String item) { | ||
| return putBinary(Utils.getBytesFromUTF8String(item)); | ||
| } | ||
|
|
||
| @Override | ||
| public abstract boolean putBinary(byte[] item); | ||
|
|
||
| @Override | ||
| public boolean mightContainString(String item) { | ||
| return mightContainBinary(Utils.getBytesFromUTF8String(item)); | ||
| } | ||
|
|
||
| @Override | ||
| public abstract boolean mightContainBinary(byte[] item) ; | ||
|
|
||
| @Override | ||
| public abstract boolean putLong(long item); | ||
|
|
||
| @Override | ||
| public abstract boolean mightContainLong(long item); | ||
|
|
||
| @Override | ||
| public boolean mightContain(Object item) { | ||
| if (item instanceof String str) { | ||
| return mightContainString(str); | ||
| } else if (item instanceof byte[] bytes) { | ||
| return mightContainBinary(bytes); | ||
| } else { | ||
| return mightContainLong(Utils.integralToLong(item)); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public boolean isCompatible(BloomFilter other) { | ||
| if (other == null) { | ||
| return false; | ||
| } | ||
|
|
||
| if (!(other instanceof BloomFilterBase that)) { | ||
| return false; | ||
| } | ||
|
|
||
| return | ||
| this.getClass() == that.getClass() | ||
| && this.bitSize() == that.bitSize() | ||
| && this.numHashFunctions == that.numHashFunctions | ||
| && this.seed == that.seed; | ||
| } | ||
|
|
||
| @Override | ||
| public BloomFilter mergeInPlace(BloomFilter other) throws IncompatibleMergeException { | ||
| BloomFilterBase otherImplInstance = checkCompatibilityForMerge(other); | ||
|
|
||
| this.bits.putAll(otherImplInstance.bits); | ||
| return this; | ||
| } | ||
|
|
||
| @Override | ||
| public BloomFilter intersectInPlace(BloomFilter other) throws IncompatibleMergeException { | ||
| BloomFilterBase otherImplInstance = checkCompatibilityForMerge(other); | ||
|
|
||
| this.bits.and(otherImplInstance.bits); | ||
| return this; | ||
| } | ||
|
|
||
| @Override | ||
| public long cardinality() { | ||
| return this.bits.cardinality(); | ||
| } | ||
|
|
||
| protected abstract BloomFilterBase checkCompatibilityForMerge(BloomFilter other) | ||
| throws IncompatibleMergeException; | ||
|
|
||
| public record HiLoHash(int hi, int lo) {} | ||
|
|
||
| protected HiLoHash hashLongToIntPair(long item, int seed) { | ||
| // Here we first hash the input long element into 2 int hash values, h1 and h2, then produce n | ||
| // hash values by `h1 + i * h2` with 1 <= i <= numHashFunctions. | ||
| // Note that `CountMinSketch` use a different strategy, it hash the input long element with | ||
| // every i to produce n hash values. | ||
| // TODO: the strategy of `CountMinSketch` looks more advanced, should we follow it here? | ||
| int h1 = Murmur3_x86_32.hashLong(item, seed); | ||
| int h2 = Murmur3_x86_32.hashLong(item, h1); | ||
| return new HiLoHash(h1, h2); | ||
| } | ||
|
|
||
| protected HiLoHash hashBytesToIntPair(byte[] item, int seed) { | ||
| int h1 = Murmur3_x86_32.hashUnsafeBytes(item, Platform.BYTE_ARRAY_OFFSET, item.length, seed); | ||
| int h2 = Murmur3_x86_32.hashUnsafeBytes(item, Platform.BYTE_ARRAY_OFFSET, item.length, h1); | ||
| return new HiLoHash(h1, h2); | ||
| } | ||
|
|
||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For the management of dependency versions, they should be placed in the parent
pom.xml. However, ifTestSparkBloomFiltercan be removed from the current pr, then it seems that this dependency is no longer needed either.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll defer addressing this, until we decide what should happen with
TestSparkBloomFilter.(remove & move the versions under managed dependencies)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we keep the dependency then please move the version to the main pom.