Skip to content

Commit 50b1f47

Browse files
committed
PARQUET-1510: Fix notEq for optional columns with null values. (#603)
Dictionaries cannot contain null values, so notEq filters cannot conclude that a block cannot match using only the dictionary. Instead, it must also check whether the block may have at least one null value. If there are no null values, then the existing check is correct.
1 parent 4f945b9 commit 50b1f47

File tree

2 files changed

+18
-2
lines changed

2 files changed

+18
-2
lines changed

parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,10 @@ public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
178178

179179
try {
180180
Set<T> dictSet = expandDictionary(meta);
181-
if (dictSet != null && dictSet.size() == 1 && dictSet.contains(value)) {
181+
boolean mayContainNull = (meta.getStatistics() == null
182+
|| !meta.getStatistics().isNumNullsSet()
183+
|| meta.getStatistics().getNumNulls() > 0);
184+
if (dictSet != null && dictSet.size() == 1 && dictSet.contains(value) && !mayContainNull) {
182185
return BLOCK_CANNOT_MATCH;
183186
}
184187
} catch (IOException e) {

parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ public class DictionaryFilterTest {
7979
"message test { "
8080
+ "required binary binary_field; "
8181
+ "required binary single_value_field; "
82+
+ "optional binary optional_single_value_field; "
8283
+ "required int32 int32_field; "
8384
+ "required int64 int64_field; "
8485
+ "required double double_field; "
@@ -112,6 +113,11 @@ private static void writeData(SimpleGroupFactory f, ParquetWriter<Group> writer)
112113
.append("fallback_binary_field", i < (nElements / 2) ?
113114
ALPHABET.substring(index, index+1) : UUID.randomUUID().toString());
114115

116+
// 10% of the time, leave the field null
117+
if (index % 10 > 0) {
118+
group.append("optional_single_value_field", "sharp");
119+
}
120+
115121
writer.write(group);
116122
}
117123
writer.close();
@@ -165,7 +171,7 @@ public void tearDown() throws Exception {
165171
@SuppressWarnings("deprecation")
166172
public void testDictionaryEncodedColumns() throws Exception {
167173
Set<String> dictionaryEncodedColumns = new HashSet<String>(Arrays.asList(
168-
"binary_field", "single_value_field", "int32_field", "int64_field",
174+
"binary_field", "single_value_field", "optional_single_value_field", "int32_field", "int64_field",
169175
"double_field", "float_field"));
170176
for (ColumnChunkMetaData column : ccmd) {
171177
String name = column.getPath().toDotString();
@@ -208,6 +214,7 @@ public void testEqBinary() throws Exception {
208214
@Test
209215
public void testNotEqBinary() throws Exception {
210216
BinaryColumn sharp = binaryColumn("single_value_field");
217+
BinaryColumn sharpAndNull = binaryColumn("optional_single_value_field");
211218
BinaryColumn b = binaryColumn("binary_field");
212219

213220
assertTrue("Should drop block with only the excluded value",
@@ -216,6 +223,12 @@ public void testNotEqBinary() throws Exception {
216223
assertFalse("Should not drop block with any other value",
217224
canDrop(notEq(sharp, Binary.fromString("applause")), ccmd, dictionaries));
218225

226+
assertFalse("Should not drop block with only the excluded value and null",
227+
canDrop(notEq(sharpAndNull, Binary.fromString("sharp")), ccmd, dictionaries));
228+
229+
assertFalse("Should not drop block with any other value",
230+
canDrop(notEq(sharpAndNull, Binary.fromString("applause")), ccmd, dictionaries));
231+
219232
assertFalse("Should not drop block with a known value",
220233
canDrop(notEq(b, Binary.fromString("x")), ccmd, dictionaries));
221234

0 commit comments

Comments
 (0)