Closed

Commits (46)
3c5a843
SPARK-47547 BloomFilter fpp degradation: addressing the int32 truncation
ishnagy May 12, 2025
08cbfeb
SPARK-47547 BloomFilter fpp degradation: fixing test data repetition …
ishnagy May 13, 2025
e3cb08e
SPARK-47547 BloomFilter fpp degradation: scrambling the high 32bytes …
ishnagy May 13, 2025
c4e3f58
SPARK-47547 BloomFilter fpp degradation: random distribution fpp test
ishnagy May 13, 2025
1a0b66f
SPARK-47547 BloomFilter fpp degradation: javadoc for test methods, ch…
ishnagy May 19, 2025
d912b66
SPARK-47547 BloomFilter fpp degradation: make seed serialization back…
ishnagy May 19, 2025
f589e2c
SPARK-47547 BloomFilter fpp degradation: counting discarded odd items…
ishnagy May 19, 2025
f597c76
SPARK-47547 BloomFilter fpp degradation: refactoring FPP counting log…
ishnagy May 19, 2025
4ea633d
SPARK-47547 BloomFilter fpp degradation: checkstyle fix
ishnagy May 19, 2025
6696106
SPARK-47547 BloomFilter fpp degradation: fix test bug
ishnagy May 21, 2025
b75e187
SPARK-47547 BloomFilter fpp degradation: parallelization friendly tes…
ishnagy May 26, 2025
2d8a9f1
SPARK-47547 BloomFilter fpp degradation: parallelization friendly tes…
ishnagy May 26, 2025
4a30794
SPARK-47547 BloomFilter fpp degradation: parallelization friendly tes…
ishnagy May 26, 2025
d9d6980
SPARK-47547 BloomFilter fpp degradation: addressing concerns around d…
ishnagy Jun 16, 2025
39a46c9
SPARK-47547 BloomFilter fpp degradation: cut down test cases to decre…
ishnagy Jun 17, 2025
7f235e7
Merge branch 'master' into SPARK-47547_bloomfilter_fpp_degradation
ishnagy Jun 17, 2025
16be3a9
SPARK-47547 BloomFilter fpp degradation: revert creating a new SlowTe…
ishnagy Jun 17, 2025
e91b5ca
SPARK-47547 BloomFilter fpp degradation: disable progress logging by …
ishnagy Jun 17, 2025
897c1d4
SPARK-47547 BloomFilter fpp degradation: adjust tolerance and fail on…
ishnagy Jun 18, 2025
013bfe4
SPARK-47547 BloomFilter fpp degradation: make V1/V2 distinction in Bl…
ishnagy Jul 6, 2025
6d44c1e
SPARK-47547 BloomFilter fpp degradation: scrambling test input withou…
ishnagy Jul 6, 2025
925bf12
SPARK-47547 BloomFilter fpp degradation: parallelizing BloomFilter re…
ishnagy Jul 6, 2025
6f28882
SPARK-47547 BloomFilter fpp degradation: add seed to equals/hashCode
ishnagy Jul 7, 2025
ed6caac
SPARK-47547 BloomFilter fpp degradation: checkstyle fix
ishnagy Jul 7, 2025
7d4ef74
SPARK-47547 BloomFilter fpp degradation: remove dependency between lo…
ishnagy Jul 7, 2025
c52ead3
Merge branch 'master' into SPARK-47547_bloomfilter_fpp_degradation
ishnagy Jul 7, 2025
0ab8276
SPARK-47547 BloomFilter fpp degradation: running /dev/scalafmt
ishnagy Jul 7, 2025
d2477bf
SPARK-47547 BloomFilter fpp degradation: javadoc comment for the V2 enum
ishnagy Jul 7, 2025
413c4fe
SPARK-47547 BloomFilter fpp degradation: reindent with 2 spaces
ishnagy Jul 7, 2025
4599fcb
SPARK-47547 BloomFilter fpp degradation: (recover empty line in Bloom…
ishnagy Jul 7, 2025
1ee2e13
SPARK-47547 BloomFilter fpp degradation: JEP-361 style switches
ishnagy Jul 7, 2025
c501b2a
SPARK-47547 BloomFilter fpp degradation: removing Objects::equals
ishnagy Jul 7, 2025
1f5cfb6
SPARK-47547 BloomFilter fpp degradation: add missing seed comparison …
ishnagy Jul 7, 2025
f60d55f
SPARK-47547 BloomFilter fpp degradation: checkstyle
ishnagy Jul 8, 2025
0314963
SPARK-47547 BloomFilter fpp degradation: BloomFilterBase abstract par…
ishnagy Jul 11, 2025
f2df338
SPARK-47547 BloomFilter fpp degradation: pull up long and byte hashin…
ishnagy Jul 11, 2025
4aaff83
SPARK-47547 BloomFilter fpp degradation: checkstyle
ishnagy Jul 13, 2025
e214bd7
SPARK-47547 BloomFilter fpp degradation: removing unnecessary line wr…
ishnagy Jul 15, 2025
99f7343
SPARK-47547 BloomFilter fpp degradation: moving junit-pioneer version…
ishnagy Jul 15, 2025
58e3066
SPARK-47547 BloomFilter fpp degradation: (empty line juggling)
ishnagy Jul 15, 2025
c06cb38
SPARK-47547 BloomFilter fpp degradation: pull up common hash scatteri…
ishnagy Jul 15, 2025
b99ef3a
SPARK-47547 BloomFilter fpp degradation: (empty line juggling)
ishnagy Jul 15, 2025
ce3ad76
SPARK-47547 BloomFilter fpp degradation: remove redundant default cas…
ishnagy Jul 15, 2025
626e459
SPARK-47547 BloomFilter fpp degradation: properly capitalize InputStr…
ishnagy Jul 15, 2025
b0f5b45
SPARK-47547 BloomFilter fpp degradation: indenting method parameters …
ishnagy Jul 15, 2025
6849dbe
SPARK-47547 BloomFilter fpp degradation: removing junit-pioneer from …
ishnagy Jul 22, 2025
7 changes: 7 additions & 0 deletions common/sketch/pom.xml
@@ -51,6 +51,13 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.junit-pioneer</groupId>
<artifactId>junit-pioneer</artifactId>
<version>2.3.0</version>
Contributor:
For the management of dependency versions, they should be placed in the parent pom.xml. However, if TestSparkBloomFilter can be removed from the current pr, then it seems that this dependency is no longer needed either.

Contributor Author:
I'll defer addressing this, until we decide what should happen with TestSparkBloomFilter.
(remove & move the versions under managed dependencies)

Contributor:
If we keep the dependency then please move the version to the main pom.

<scope>test</scope>
</dependency>

</dependencies>

<build>
common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java
@@ -17,9 +17,12 @@

package org.apache.spark.util.sketch;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;

/**
* A Bloom filter is a space-efficient probabilistic data structure that offers an approximate
@@ -42,6 +45,7 @@
public abstract class BloomFilter {

public enum Version {

/**
* {@code BloomFilter} binary format version 1. All values written in big-endian order:
* <ul>
Expand All @@ -51,7 +55,22 @@ public enum Version {
* <li>The words/longs (numWords * 64 bit)</li>
* </ul>
*/
V1(1);
V1(1),

/**
* {@code BloomFilter} binary format version 2.
* Fixes the int32 truncation issue with V1 indexes, but by changing the bit pattern,
* it will become incompatible with V1 serializations.
* All values written in big-endian order:
* <ul>
* <li>Version number, always 2 (32 bit)</li>
* <li>Number of hash functions (32 bit)</li>
* <li>Integer seed to initialize hash functions (32 bit) </li>
* <li>Total number of words of the underlying bit array (32 bit)</li>
* <li>The words/longs (numWords * 64 bit)</li>
* </ul>
*/
V2(2);
Member:
If we want to add V2, we need to add a new comment block for V2 because the above comment is for V1 only.

* <li>Version number, always 1 (32 bit)</li>

Contributor Author:
added some comments in d2477bf
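To illustrate the truncation that the V2 format addresses, here is a small, self-contained sketch. This is not Spark code: the hash values, the bit-array size, and the 64-bit combination at the end are illustrative assumptions. The point is only that an int-typed combined hash can never address a bit position above Integer.MAX_VALUE, while a 64-bit combination can.

// Illustrative sketch only; values and the widening strategy are assumptions, not Spark's code.
public class Int32TruncationDemo {
  public static void main(String[] args) {
    long bitSize = 3L * Integer.MAX_VALUE;   // a bit array larger than 2^31 bits
    int h1 = 0x12345678;                     // first 32-bit hash of some item
    int h2 = 0x9abcdef0;                     // second 32-bit hash of the same item

    for (int i = 1; i <= 3; i++) {
      // V1-style combination stays in 32-bit space and wraps around:
      int combined32 = h1 + i * h2;
      if (combined32 < 0) combined32 = ~combined32;
      long narrowIndex = combined32 % bitSize;   // never exceeds Integer.MAX_VALUE

      // Combining in 64-bit space is one way to make every bit position reachable:
      long combined64 = (long) h1 + (long) i * h2;
      long wideIndex = Long.remainderUnsigned(combined64, bitSize);

      System.out.printf("i=%d narrowIndex=%d wideIndex=%d%n", i, narrowIndex, wideIndex);
    }
  }
}

Once the optimal number of bits for the requested fpp exceeds 2^31, the narrow indexes leave most of the bit array permanently empty while the low region saturates, which is the fpp degradation reported in SPARK-47547.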


private final int versionNumber;

@@ -175,14 +194,26 @@ public long cardinality() {
* the stream.
*/
public static BloomFilter readFrom(InputStream in) throws IOException {
return BloomFilterImpl.readFrom(in);
// peek into the inputstream so we can determine the version
BufferedInputStream bin = new BufferedInputStream(in);
bin.mark(4);
int version = ByteBuffer.wrap(bin.readNBytes(4)).getInt();
bin.reset();

return switch (version) {
case 1 -> BloomFilterImpl.readFrom(bin);
case 2 -> BloomFilterImplV2.readFrom(bin);
default -> throw new IllegalArgumentException("Unknown BloomFilter version: " + version);
};
}

/**
* Reads in a {@link BloomFilter} from a byte array.
*/
public static BloomFilter readFrom(byte[] bytes) throws IOException {
return BloomFilterImpl.readFrom(bytes);
try (ByteArrayInputStream bis = new ByteArrayInputStream(bytes)) {
return readFrom(bis);
}
}

/**
@@ -247,15 +278,28 @@ public static BloomFilter create(long expectedNumItems, double fpp) {
"False positive probability must be within range (0.0, 1.0)"
);
}

Member (@dongjoon-hyun, Jul 7, 2025):
Please recover this irrelevant removal.

We recommend not to mix a style change with the functional change. For example, even inside this PR, this removal is inconsistent with your own code style, because this PR added a new empty line at line 199 before the return statement.

Contributor Author:
fixed in 4599fcb

Contributor:
This still doesn't look good and there is an extra blank line below.

return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp));
}


/**
* Creates a {@link BloomFilter} with given {@code expectedNumItems} and {@code numBits}, it will
* pick an optimal {@code numHashFunctions} which can minimize {@code fpp} for the bloom filter.
*/
public static BloomFilter create(long expectedNumItems, long numBits) {
return create(Version.V2, expectedNumItems, numBits, BloomFilterImplV2.DEFAULT_SEED);
}

public static BloomFilter create(long expectedNumItems, long numBits, int seed) {
return create(Version.V2, expectedNumItems, numBits, seed);
}

public static BloomFilter create(
Version version,
long expectedNumItems,
long numBits,
int seed
) {
if (expectedNumItems <= 0) {
throw new IllegalArgumentException("Expected insertions must be positive");
}
@@ -264,6 +308,12 @@ public static BloomFilter create(long expectedNumItems, long numBits) {
throw new IllegalArgumentException("Number of bits must be positive");
}

return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits);
int numHashFunctions = optimalNumOfHashFunctions(expectedNumItems, numBits);

return switch (version) {
case V1 -> new BloomFilterImpl(numHashFunctions, numBits);
case V2 -> new BloomFilterImplV2(numHashFunctions, numBits, seed);
default -> throw new IllegalArgumentException("Unknown BloomFilter version: " + version);
};
}
}
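Taken together, the new factory overloads and the version-detecting readFrom can be exercised roughly as follows. This is a hedged usage sketch based only on the signatures visible in this diff plus the pre-existing writeTo(OutputStream) method; the item counts, bit counts, and seed value are arbitrary, and for a V1 filter the seed argument is simply dropped by the factory.

// Usage sketch; counts, bits, and seed values are arbitrary.
import java.io.ByteArrayOutputStream;

import org.apache.spark.util.sketch.BloomFilter;

public class BloomFilterVersionRoundTrip {
  public static void main(String[] args) throws Exception {
    // The plain factory methods now default to the V2 format...
    BloomFilter v2 = BloomFilter.create(1_000_000, 0.03);
    // ...while V1 stays reachable through the explicit overload (its seed argument is ignored).
    BloomFilter v1 = BloomFilter.create(BloomFilter.Version.V1, 1_000_000, 8_000_000, 0);

    v2.putLong(42L);
    v1.putLong(42L);

    // The serialized form starts with the 32-bit version number (big-endian),
    // which is exactly what readFrom peeks at before choosing an implementation.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    v2.writeTo(out);
    BloomFilter restored = BloomFilter.readFrom(out.toByteArray());

    System.out.println(restored.mightContainLong(42L));   // true
    System.out.println(restored.mightContainLong(43L));   // false with high probability
  }
}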
common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterBase.java (new file)
@@ -0,0 +1,184 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.util.sketch;

import java.util.Objects;

abstract class BloomFilterBase extends BloomFilter {

public static final int DEFAULT_SEED = 0;

protected int seed;
protected int numHashFunctions;
protected BitArray bits;

protected BloomFilterBase(int numHashFunctions, long numBits) {
this(numHashFunctions, numBits, DEFAULT_SEED);
}

protected BloomFilterBase(int numHashFunctions, long numBits, int seed) {
this(new BitArray(numBits), numHashFunctions, seed);
}

protected BloomFilterBase(BitArray bits, int numHashFunctions, int seed) {
this.bits = bits;
this.numHashFunctions = numHashFunctions;
this.seed = seed;
}

protected BloomFilterBase() {}

@Override
public boolean equals(Object other) {
if (other == this) {
return true;
}

if (!(other instanceof BloomFilterBase that)) {
return false;
}

return
this.getClass() == that.getClass()
&& this.numHashFunctions == that.numHashFunctions
&& this.seed == that.seed
// TODO: this.bits can be null temporarily, during deserialization,
// should we worry about this?
&& this.bits.equals(that.bits);
}

@Override
public int hashCode() {
return Objects.hash(numHashFunctions, seed, bits);
}

@Override
public double expectedFpp() {
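// The set-bit fraction approximates the chance that a single probe hits a set bit;
// all numHashFunctions probes must hit, hence (cardinality / bitSize) ^ numHashFunctions.
// For example, with half the bits set and 5 hash functions: 0.5^5 ≈ 0.031 (about 3.1%).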
return Math.pow((double) bits.cardinality() / bits.bitSize(), numHashFunctions);
}

@Override
public long bitSize() {
return bits.bitSize();
}

@Override
public boolean put(Object item) {
if (item instanceof String str) {
return putString(str);
} else if (item instanceof byte[] bytes) {
return putBinary(bytes);
} else {
return putLong(Utils.integralToLong(item));
}
}

@Override
public boolean putString(String item) {
return putBinary(Utils.getBytesFromUTF8String(item));
}

@Override
public abstract boolean putBinary(byte[] item);

@Override
public boolean mightContainString(String item) {
return mightContainBinary(Utils.getBytesFromUTF8String(item));
}

@Override
public abstract boolean mightContainBinary(byte[] item) ;

@Override
public abstract boolean putLong(long item);

@Override
public abstract boolean mightContainLong(long item);

@Override
public boolean mightContain(Object item) {
if (item instanceof String str) {
return mightContainString(str);
} else if (item instanceof byte[] bytes) {
return mightContainBinary(bytes);
} else {
return mightContainLong(Utils.integralToLong(item));
}
}

@Override
public boolean isCompatible(BloomFilter other) {
if (other == null) {
return false;
}

if (!(other instanceof BloomFilterBase that)) {
return false;
}

return
this.getClass() == that.getClass()
&& this.bitSize() == that.bitSize()
&& this.numHashFunctions == that.numHashFunctions
&& this.seed == that.seed;
}

@Override
public BloomFilter mergeInPlace(BloomFilter other) throws IncompatibleMergeException {
BloomFilterBase otherImplInstance = checkCompatibilityForMerge(other);

this.bits.putAll(otherImplInstance.bits);
return this;
}

@Override
public BloomFilter intersectInPlace(BloomFilter other) throws IncompatibleMergeException {
BloomFilterBase otherImplInstance = checkCompatibilityForMerge(other);

this.bits.and(otherImplInstance.bits);
return this;
}

@Override
public long cardinality() {
return this.bits.cardinality();
}

protected abstract BloomFilterBase checkCompatibilityForMerge(BloomFilter other)
throws IncompatibleMergeException;

public record HiLoHash(int hi, int lo) {}

protected HiLoHash hashLongToIntPair(long item, int seed) {
// Here we first hash the input long element into 2 int hash values, h1 and h2, then produce n
// hash values by `h1 + i * h2` with 1 <= i <= numHashFunctions.
// Note that `CountMinSketch` use a different strategy, it hash the input long element with
// every i to produce n hash values.
// TODO: the strategy of `CountMinSketch` looks more advanced, should we follow it here?
int h1 = Murmur3_x86_32.hashLong(item, seed);
int h2 = Murmur3_x86_32.hashLong(item, h1);
return new HiLoHash(h1, h2);
}

protected HiLoHash hashBytesToIntPair(byte[] item, int seed) {
int h1 = Murmur3_x86_32.hashUnsafeBytes(item, Platform.BYTE_ARRAY_OFFSET, item.length, seed);
int h2 = Murmur3_x86_32.hashUnsafeBytes(item, Platform.BYTE_ARRAY_OFFSET, item.length, h1);
return new HiLoHash(h1, h2);
}

}
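The comment in hashLongToIntPair describes the h1 + i * h2 scheme but leaves the index derivation to subclasses. Below is a hypothetical sketch of how a subclass could turn a HiLoHash into bit positions. It is not the actual BloomFilterImpl or BloomFilterImplV2 code: the class name is invented, the 64-bit combination is only one plausible way to avoid the V1 int truncation, and it assumes BitArray exposes set(long) and get(long) as used by the existing implementations. Serialization and merge-compatibility checks are stubbed out.

// Hypothetical sketch, not Spark's implementation; class name and index derivation are assumptions.
package org.apache.spark.util.sketch;

import java.io.IOException;
import java.io.OutputStream;

class ExampleScatteredBloomFilter extends BloomFilterBase {

  ExampleScatteredBloomFilter(int numHashFunctions, long numBits, int seed) {
    super(numHashFunctions, numBits, seed);
  }

  // Derive one bit position per hash function, combining the two 32-bit hashes
  // in 64-bit space so indexes can reach past bit 2^31 of a large bit array.
  private long index(HiLoHash hash, int i) {
    long combined = (long) hash.hi() + (long) i * hash.lo();
    return Long.remainderUnsigned(combined, bits.bitSize());
  }

  @Override
  public boolean putLong(long item) {
    HiLoHash hash = hashLongToIntPair(item, seed);
    boolean changed = false;
    for (int i = 1; i <= numHashFunctions; i++) {
      changed |= bits.set(index(hash, i));
    }
    return changed;
  }

  @Override
  public boolean mightContainLong(long item) {
    HiLoHash hash = hashLongToIntPair(item, seed);
    for (int i = 1; i <= numHashFunctions; i++) {
      if (!bits.get(index(hash, i))) {
        return false;
      }
    }
    return true;
  }

  @Override
  public boolean putBinary(byte[] item) {
    HiLoHash hash = hashBytesToIntPair(item, seed);
    boolean changed = false;
    for (int i = 1; i <= numHashFunctions; i++) {
      changed |= bits.set(index(hash, i));
    }
    return changed;
  }

  @Override
  public boolean mightContainBinary(byte[] item) {
    HiLoHash hash = hashBytesToIntPair(item, seed);
    for (int i = 1; i <= numHashFunctions; i++) {
      if (!bits.get(index(hash, i))) {
        return false;
      }
    }
    return true;
  }

  // Serialization and merge-compatibility checks are out of scope for this sketch.
  @Override
  public void writeTo(OutputStream out) throws IOException {
    throw new UnsupportedOperationException("not part of this sketch");
  }

  @Override
  protected BloomFilterBase checkCompatibilityForMerge(BloomFilter other)
      throws IncompatibleMergeException {
    throw new UnsupportedOperationException("not part of this sketch");
  }
}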