From f79a9e70f5168426edbb425e27816aeb7421edd1 Mon Sep 17 00:00:00 2001 From: Shardul Mahadik Date: Wed, 30 Sep 2020 09:27:27 -0700 Subject: [PATCH] ORC: Fix predicate pushdown for Not In and Not Equals --- .../data/TestMetricsRowGroupFilter.java | 14 ++++---- .../orc/ExpressionToSearchArgument.java | 35 ++++++++++++++----- .../orc/TestExpressionToSearchArgument.java | 9 ++--- 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java index a0e37e7f6637..7bcc4f783b22 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -738,12 +738,8 @@ public void testIntegerNotIn() { shouldRead = shouldRead(notIn("all_nulls", 1, 2)); Assert.assertTrue("Should read: notIn on all nulls column", shouldRead); - // ORC-623: ORC seems to incorrectly skip a row group for a notIn(column, {X, ...}) predicate on a column which - // has only 1 non-null value X but also has nulls - if (format != FileFormat.ORC) { - shouldRead = shouldRead(notIn("some_nulls", "aaa", "some")); - Assert.assertTrue("Should read: notIn on some nulls column", shouldRead); - } + shouldRead = shouldRead(notIn("some_nulls", "aaa", "some")); + Assert.assertTrue("Should read: notIn on some nulls column", shouldRead); shouldRead = shouldRead(notIn("no_nulls", "aaa", "")); if (format == FileFormat.PARQUET) { @@ -755,6 +751,12 @@ public void testIntegerNotIn() { } } + @Test + public void testSomeNullsNotEq() { + boolean shouldRead = shouldRead(notEqual("some_nulls", "some")); + Assert.assertTrue("Should read: notEqual on some nulls column", shouldRead); + } + private boolean shouldRead(Expression expression) { return shouldRead(expression, true); } diff --git a/orc/src/main/java/org/apache/iceberg/orc/ExpressionToSearchArgument.java b/orc/src/main/java/org/apache/iceberg/orc/ExpressionToSearchArgument.java index 57f7a0f8f85d..ed3e564aa780 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ExpressionToSearchArgument.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ExpressionToSearchArgument.java @@ -168,11 +168,19 @@ public Action eq(Bound expr, Literal lit) { @Override public Action notEq(Bound expr, Literal lit) { - return () -> this.builder.startNot() - .equals(idToColumnName.get(expr.ref().fieldId()), - type(expr.ref().type()), - literal(expr.ref().type(), lit.value())) - .end(); + // NOTE: ORC uses SQL semantics for Search Arguments, so an expression like + // `col != 1` will exclude rows where col is NULL along with rows where col = 1 + // In contrast, Iceberg's Expressions will keep rows with NULL values + // So the equivalent ORC Search Argument for an Iceberg Expression `col != x` + // is `col IS NULL OR col != x` + return () -> { + this.builder.startOr(); + isNull(expr).invoke(); + this.builder.startNot(); + eq(expr, lit).invoke(); + this.builder.end(); // end NOT + this.builder.end(); // end OR + }; } @Override @@ -185,10 +193,19 @@ public Action in(Bound expr, Set literalSet) { @Override public Action notIn(Bound expr, Set literalSet) { - return () -> this.builder.startNot() - .in(idToColumnName.get(expr.ref().fieldId()), type(expr.ref().type()), - literalSet.stream().map(lit -> literal(expr.ref().type(), lit)).toArray(Object[]::new)) - .end(); + // NOTE: ORC uses SQL semantics for Search Arguments, so an expression like + // `col NOT IN {1}` will exclude rows where col is NULL along with rows where col = 1 + // In contrast, Iceberg's Expressions will keep rows with NULL values + // So the equivalent ORC Search Argument for an Iceberg Expression `col NOT IN {x}` + // is `col IS NULL OR col NOT IN {x}` + return () -> { + this.builder.startOr(); + isNull(expr).invoke(); + this.builder.startNot(); + in(expr, literalSet).invoke(); + this.builder.end(); // end NOT + this.builder.end(); // end OR + }; } @Override diff --git a/orc/src/test/java/org/apache/iceberg/orc/TestExpressionToSearchArgument.java b/orc/src/test/java/org/apache/iceberg/orc/TestExpressionToSearchArgument.java index b56b65aa268b..eb4d800370ee 100644 --- a/orc/src/test/java/org/apache/iceberg/orc/TestExpressionToSearchArgument.java +++ b/orc/src/test/java/org/apache/iceberg/orc/TestExpressionToSearchArgument.java @@ -95,9 +95,9 @@ public void testPrimitiveTypes() { .startNot().lessThanEquals("`float`", Type.FLOAT, 5.0).end() .startNot().lessThan("`double`", Type.FLOAT, 500.0).end() .equals("`boolean`", Type.BOOLEAN, true) - .startNot().equals("`string`", Type.STRING, "test").end() + .startOr().isNull("`string`", Type.STRING).startNot().equals("`string`", Type.STRING, "test").end().end() .in("`decimal`", Type.DECIMAL, new HiveDecimalWritable("-123.45"), new HiveDecimalWritable("123.45")) - .startNot().in("`time`", Type.LONG, 100L, 200L).end() + .startOr().isNull("`time`", Type.LONG).startNot().in("`time`", Type.LONG, 100L, 200L).end().end() .end() .build(); @@ -382,8 +382,9 @@ public void testModifiedComplexSchemaNameMapping() { .equals("`newMap_r5`.`_key`", Type.STRING, "country") // Drops listOfStruct.long .equals("`listOfStruct`.`_elem`.`newLong_r10`", Type.LONG, 100L) - .startNot() - .equals("`listOfPeople`.`_elem`.`name`", Type.STRING, "Bob") + .startOr() + .isNull("`listOfPeople`.`_elem`.`name`", Type.STRING) + .startNot().equals("`listOfPeople`.`_elem`.`name`", Type.STRING, "Bob").end() .end() .lessThan("`listOfPeople`.`_elem`.`age_r14`", Type.LONG, 30L) .end()