@@ -18,11 +18,13 @@
  */
 
 package org.apache.parquet.column.values.bloomfilter;
 
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
 import org.apache.parquet.Preconditions;
 import org.apache.parquet.bytes.BytesUtils;
 import org.apache.parquet.io.api.Binary;
 
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
@@ -139,7 +141,7 @@ private BlockSplitBloomFilter(byte[] bitset, HashStrategy hashStrategy) {
         hashFunction = Hashing.murmur3_128(DEFAULT_SEED);
         break;
       default:
-        throw new RuntimeException("Not supported hash strategy");
+        throw new RuntimeException("Unsupported hash strategy");
     }
   }

@@ -255,40 +257,35 @@ public static int optimalNumOfBits(long n, double p) {
   }
 
   @Override
-  public long hash(int value) {
-    ByteBuffer plain = ByteBuffer.allocate(Integer.SIZE/Byte.SIZE);
-    plain.order(ByteOrder.LITTLE_ENDIAN).putInt(value);
-    return hashFunction.hashBytes(plain.array()).asLong();
+  public long getBitsetSize() {
+    return this.bitset.length;
   }
 
   @Override
-  public long hash(long value) {
-    ByteBuffer plain = ByteBuffer.allocate(Long.SIZE/Byte.SIZE);
-    plain.order(ByteOrder.LITTLE_ENDIAN).putLong(value);
-    return hashFunction.hashBytes(plain.array()).asLong();
-  }
-
-  @Override
-  public long hash(double value) {
-    ByteBuffer plain = ByteBuffer.allocate(Double.SIZE/Byte.SIZE);
-    plain.order(ByteOrder.LITTLE_ENDIAN).putDouble(value);
-    return hashFunction.hashBytes(plain.array()).asLong();
-  }
+  public long hash(Object value) {
+    ByteBuffer plain = null;
 
-  @Override
-  public long hash(float value) {
-    ByteBuffer plain = ByteBuffer.allocate(Float.SIZE/Byte.SIZE);
-    plain.order(ByteOrder.LITTLE_ENDIAN).putFloat(value);
-    return hashFunction.hashBytes(plain.array()).asLong();
-  }
+    if (value instanceof Binary) {
+      return hashFunction.hashBytes(((Binary) value).getBytes()).asLong();
+    }
 
-  @Override
-  public long hash(Binary value) {
-    return hashFunction.hashBytes(value.getBytes()).asLong();
-  }
+    if (value instanceof Integer) {
+      plain = ByteBuffer.allocate(Integer.SIZE/Byte.SIZE);
+      plain.order(ByteOrder.LITTLE_ENDIAN).putInt(((Integer)value).intValue());
+    } else if (value instanceof Long) {
+      plain = ByteBuffer.allocate(Long.SIZE/Byte.SIZE);
+      plain.order(ByteOrder.LITTLE_ENDIAN).putLong(((Long)value).longValue());
+    } else if (value instanceof Float) {
+      plain = ByteBuffer.allocate(Float.SIZE/Byte.SIZE);
+      plain.order(ByteOrder.LITTLE_ENDIAN).putFloat(((Float)value).floatValue());
+    } else if (value instanceof Double) {
+      plain = ByteBuffer.allocate(Double.SIZE/Byte.SIZE);
+      plain.order(ByteOrder.LITTLE_ENDIAN).putDouble(((Double)value).doubleValue());
+    } else {
+      throw new RuntimeException("Parquet Bloom filter: Not supported type");
+    }
 
-  @Override
-  public long getBitsetSize() {
-    return this.bitset.length;
+    return hashFunction.hashBytes(plain.array()).asLong();
   }
 }
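Note on the consolidated hash method: it preserves the existing contract that each primitive is hashed over its little-endian plain-encoded bytes, and Binary over its raw bytes. A minimal standalone sketch of that contract, assuming only Guava on the classpath; the SEED constant here is a placeholder for the class's DEFAULT_SEED, whose value this diff does not show:

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

public class PlainEncodingHashSketch {
  // Placeholder for BlockSplitBloomFilter's DEFAULT_SEED (not visible in this diff).
  private static final int SEED = 0;

  public static void main(String[] args) {
    HashFunction murmur = Hashing.murmur3_128(SEED);

    // An int is serialized as 4 little-endian bytes before hashing,
    // matching Parquet's plain encoding for INT32.
    ByteBuffer plain = ByteBuffer.allocate(Integer.SIZE / Byte.SIZE);
    plain.order(ByteOrder.LITTLE_ENDIAN).putInt(42);
    System.out.println("hash(42) = " + murmur.hashBytes(plain.array()).asLong());
  }
}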
The BloomFilter interface changes accordingly:
@@ -18,7 +18,6 @@
  */
 package org.apache.parquet.column.values.bloomfilter;
 
-import org.apache.parquet.io.api.Binary;
 import java.io.IOException;
 import java.io.OutputStream;
 
@@ -27,7 +26,7 @@
  * in a set. The Bloom filter usually consists of a bit set that represents an element set,
  * a hash strategy and a Bloom filter algorithm.
  */
-public interface BloomFilter {
+public interface BloomFilter<T> {
   // Bloom filter Hash strategy.
   enum HashStrategy {
     MURMUR3_X64_128(0);
@@ -71,49 +70,17 @@ enum Algorithm {
   boolean findHash(long hash);
 
   /**
-   * Compute hash for int value by using its plain encoding result.
-   *
-   * @param value the value to hash
-   * @return hash result
-   */
-  long hash(int value);
-
-  /**
-   * Compute hash for long value by using its plain encoding result.
-   *
-   * @param value the value to hash
-   * @return hash result
-   */
-  long hash(long value) ;
-
-  /**
-   * Compute hash for double value by using its plain encoding result.
-   *
-   * @param value the value to hash
-   * @return hash result
-   */
-  long hash(double value);
-
-  /**
-   * Compute hash for float value by using its plain encoding result.
+   * Get the number of bytes for bitset in this Bloom filter.
    *
-   * @param value the value to hash
-   * @return hash result
+   * @return The number of bytes for bitset in this Bloom filter.
    */
-  long hash(float value);
+  long getBitsetSize();
 
   /**
-   * Compute hash for Binary value by using its plain encoding result.
+   * Compute hash for value by using its plain encoding result.
    *
    * @param value the value to hash
    * @return hash result
    */
-  long hash(Binary value);
-
-  /**
-   * Get the number of bytes for bitset in this Bloom filter.
-   *
-   * @return The number of bytes for bitset in this Bloom filter.
-   */
-  long getBitsetSize();
+  long hash(T value);
 }
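With the interface reduced to findHash, getBitsetSize, and a single generic hash(T), a caller probes in two steps: hash the value, then test the bitset. A hedged caller sketch (the helper name definitelyAbsent is illustrative, not part of this PR):

import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.io.api.Binary;

public class BloomFilterProbeSketch {
  // True means the value was certainly never inserted. False only means
  // "might be present": Bloom filters can produce false positives, never
  // false negatives, so only the negative answer is definitive.
  static boolean definitelyAbsent(BloomFilter<Object> filter, Binary value) {
    return !filter.findHash(filter.hash(value));
  }
}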

This file was deleted.

This file was deleted.

The new bloom-filter-level predicate evaluator, BloomFilterImpl, is added as a new file:
@@ -0,0 +1,156 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.parquet.filter2.BloomFilterLevel;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators;
import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
import org.apache.parquet.hadoop.BloomFilterReader;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;

import static org.apache.parquet.Preconditions.checkNotNull;

public class BloomFilterImpl implements FilterPredicate.Visitor<Boolean> {
  private static final Logger LOG = LoggerFactory.getLogger(BloomFilterImpl.class);
  private static final boolean BLOCK_MIGHT_MATCH = false;
  private static final boolean BLOCK_CANNOT_MATCH = true;

  private final Map<ColumnPath, ColumnChunkMetaData> columns = new HashMap<ColumnPath, ColumnChunkMetaData>();

  public static boolean canDrop(FilterPredicate pred, List<ColumnChunkMetaData> columns, BloomFilterReader bloomFilterReader) {
    checkNotNull(pred, "pred");
    checkNotNull(columns, "columns");
    return pred.accept(new BloomFilterImpl(columns, bloomFilterReader));
  }

  private BloomFilterImpl(List<ColumnChunkMetaData> columnsList, BloomFilterReader bloomFilterReader) {
    for (ColumnChunkMetaData chunk : columnsList) {
      columns.put(chunk.getPath(), chunk);
    }

    this.bloomFilterReader = bloomFilterReader;
  }

  private BloomFilterReader bloomFilterReader;

  private ColumnChunkMetaData getColumnChunk(ColumnPath columnPath) {
    return columns.get(columnPath);
  }

  // is this column chunk composed entirely of nulls?
  // assumes the column chunk's statistics is not empty
  private boolean isAllNulls(ColumnChunkMetaData column) {
    return BLOCK_MIGHT_MATCH;
  }

  @Override
  public <T extends Comparable<T>> Boolean visit(Operators.Eq<T> eq) {
    T value = eq.getValue();

    if (value == null) {
      // the bloom filter bitset contains only non-null values so isn't helpful. this
      // could check the column stats, but the StatisticsFilter is responsible
      return BLOCK_MIGHT_MATCH;
    }

    Operators.Column<T> filterColumn = eq.getColumn();
    ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
    if (meta == null) {
      // the column isn't in this file so all values are null, but the value
      // must be non-null because of the above check.
      return BLOCK_CANNOT_MATCH;
    }

    try {
      BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(meta);
      if (bloomFilter != null && !bloomFilter.findHash(bloomFilter.hash(value))) {
        // a negative bloom filter probe is definitive: the value was never inserted
        return BLOCK_CANNOT_MATCH;
      }
    } catch (RuntimeException e) {
      LOG.warn(e.getMessage());
      return BLOCK_MIGHT_MATCH;
    }

    return BLOCK_MIGHT_MATCH;
  }

  // a bloom filter can only answer membership (equality) probes, so none of
  // the inequality or range predicates below can ever prove a block droppable
  @Override
  public <T extends Comparable<T>> Boolean visit(Operators.NotEq<T> notEq) {
    return BLOCK_MIGHT_MATCH;
  }

  @Override
  public <T extends Comparable<T>> Boolean visit(Operators.Lt<T> lt) {
    return BLOCK_MIGHT_MATCH;
  }

  @Override
  public <T extends Comparable<T>> Boolean visit(Operators.LtEq<T> ltEq) {
    return BLOCK_MIGHT_MATCH;
  }

  @Override
  public <T extends Comparable<T>> Boolean visit(Operators.Gt<T> gt) {
    return BLOCK_MIGHT_MATCH;
  }

  @Override
  public <T extends Comparable<T>> Boolean visit(Operators.GtEq<T> gtEq) {
    return BLOCK_MIGHT_MATCH;
  }

  @Override
  public Boolean visit(Operators.And and) {
    // the block can be dropped if either side of the conjunction cannot match
    return and.getLeft().accept(this) || and.getRight().accept(this);
  }

  @Override
  public Boolean visit(Operators.Or or) {
    // the block can only be dropped if neither side of the disjunction can match
    return or.getLeft().accept(this) && or.getRight().accept(this);
  }

  @Override
  public Boolean visit(Operators.Not not) {
    throw new IllegalArgumentException(
        "This predicate contains a not! Did you forget to run this predicate through LogicalInverseRewriter? " + not);
  }

  private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(Operators.UserDefined<T, U> ud, boolean inverted) {
    return BLOCK_MIGHT_MATCH;
  }

  @Override
  public <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(Operators.UserDefined<T, U> udp) {
    return visit(udp, false);
  }

  @Override
  public <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(Operators.LogicalNotUserDefined<T, U> udp) {
    return visit(udp.getUserDefined(), true);
  }
}
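For context, a read-path call site for canDrop would look roughly like the sketch below; the column name user_id, the literal, and the wrapper method are illustrative only, while FilterApi and Binary are existing parquet-mr classes. Predicates must first be run through LogicalInverseRewriter, since visit(Not) above throws by design.

import java.util.List;

import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;

import org.apache.parquet.filter2.BloomFilterLevel.BloomFilterImpl;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.BloomFilterReader;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.io.api.Binary;

public class RowGroupSkipSketch {
  // Returns true when the chunk-level bloom filters prove that no row in
  // this row group can satisfy the predicate, so the group may be skipped.
  static boolean canSkip(List<ColumnChunkMetaData> columns, BloomFilterReader reader) {
    FilterPredicate pred = eq(binaryColumn("user_id"), Binary.fromString("alice"));
    return BloomFilterImpl.canDrop(pred, columns, reader);
  }
}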