diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/compat/RowGroupFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/compat/RowGroupFilter.java index 73ff9aae67..44f0f7b03f 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/compat/RowGroupFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/compat/RowGroupFilter.java @@ -22,6 +22,7 @@ import java.util.Collections; import java.util.List; import java.util.Objects; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.parquet.filter2.bloomfilterlevel.BloomFilterImpl; import org.apache.parquet.filter2.compat.FilterCompat.Filter; @@ -34,8 +35,6 @@ import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.schema.MessageType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Given a {@link Filter} applies it to a list of BlockMetaData (row groups) @@ -103,11 +102,13 @@ public List visit(FilterCompat.FilterPredicateCompat filterPredic drop = StatisticsFilter.canDrop(filterPredicate, block.getColumns()); } - if(!drop && levels.contains(FilterLevel.DICTIONARY)) { - drop = DictionaryFilter.canDrop(filterPredicate, block.getColumns(), reader.getDictionaryReader(block)); + // Marks whether the column's values were fully dictionary-encoded; if so, the `BloomFilter` check is skipped to save time.
+ AtomicBoolean hasDictionaryUsed = new AtomicBoolean(false); + if (!drop && levels.contains(FilterLevel.DICTIONARY)) { + drop = DictionaryFilter.canDrop(filterPredicate, block.getColumns(), reader.getDictionaryReader(block), hasDictionaryUsed); } - if (!drop && levels.contains(FilterLevel.BLOOMFILTER)) { + if (!drop && !hasDictionaryUsed.get() && levels.contains(FilterLevel.BLOOMFILTER)) { drop = BloomFilterImpl.canDrop(filterPredicate, block.getColumns(), reader.getBloomFilterDataReader(block)); } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java index 992aaa8244..a98c020cf3 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java @@ -41,6 +41,7 @@ import java.util.Map; import java.util.Objects; import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.IntFunction; /** @@ -52,20 +53,27 @@ public class DictionaryFilter implements FilterPredicate.Visitor { private static final boolean BLOCK_MIGHT_MATCH = false; private static final boolean BLOCK_CANNOT_MATCH = true; - public static boolean canDrop(FilterPredicate pred, List columns, DictionaryPageReadStore dictionaries) { + public static boolean canDrop(FilterPredicate pred, List columns, DictionaryPageReadStore dictionaries, + AtomicBoolean hasDictionaryUsed) { Objects.requireNonNull(pred, "pred cannnot be null"); Objects.requireNonNull(columns, "columns cannnot be null"); - return pred.accept(new DictionaryFilter(columns, dictionaries)); + return pred.accept(new DictionaryFilter(columns, dictionaries, hasDictionaryUsed)); + } + + public static boolean canDrop(FilterPredicate pred, List columns, DictionaryPageReadStore dictionaries) { + return canDrop(pred, columns, 
dictionaries, new AtomicBoolean(false)); } private final Map columns = new HashMap(); private final DictionaryPageReadStore dictionaries; + private AtomicBoolean hasDictionaryUsed; - private DictionaryFilter(List columnsList, DictionaryPageReadStore dictionaries) { + private DictionaryFilter(List columnsList, DictionaryPageReadStore dictionaries, + AtomicBoolean hasDictionaryUsed) { for (ColumnChunkMetaData chunk : columnsList) { columns.put(chunk.getPath(), chunk); } - + this.hasDictionaryUsed = hasDictionaryUsed; this.dictionaries = dictionaries; } @@ -113,7 +121,9 @@ private > Set expandDictionary(ColumnChunkMetaData me for (int i = 0; i <= dict.getMaxId(); i++) { dictSet.add((T) dictValueProvider.apply(i)); } - + if (!hasNonDictionaryPages(meta)) { + hasDictionaryUsed.set(true); + } return dictSet; } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java index 4fa933e754..88ba4b1f63 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java @@ -68,6 +68,7 @@ import java.util.List; import java.util.Set; import java.util.UUID; +import java.util.concurrent.atomic.AtomicBoolean; import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0; import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0; @@ -333,8 +334,16 @@ public void testEqFixed() throws Exception { canDrop(eq(b, toBinary("-2", 17)), ccmd, dictionaries)); } + AtomicBoolean hasDictionaryUsed = new AtomicBoolean(false); assertFalse("Should not drop block for -1", - canDrop(eq(b, toBinary("-1", 17)), ccmd, dictionaries)); + canDrop(eq(b, toBinary("-1", 17)), ccmd, dictionaries, hasDictionaryUsed)); + + // Only V2 supports dictionary encoding for 
FIXED_LEN_BYTE_ARRAY values + if (version == PARQUET_2_0) { + assertTrue(hasDictionaryUsed.get()); + } else { + assertFalse(hasDictionaryUsed.get()); + } assertFalse("Should not drop block for null", canDrop(eq(b, null), ccmd, dictionaries));