diff --git a/api/src/main/java/org/apache/iceberg/expressions/BoundLiteralPredicate.java b/api/src/main/java/org/apache/iceberg/expressions/BoundLiteralPredicate.java index e0770ce231e7..17b043a3ba27 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/BoundLiteralPredicate.java +++ b/api/src/main/java/org/apache/iceberg/expressions/BoundLiteralPredicate.java @@ -69,6 +69,8 @@ public boolean test(T value) { return cmp.compare(value, literal.value()) != 0; case STARTS_WITH: return String.valueOf(value).startsWith((String) literal.value()); + case NOT_STARTS_WITH: + return !String.valueOf(value).startsWith((String) literal.value()); default: throw new IllegalStateException("Invalid operation for BoundLiteralPredicate: " + op()); } @@ -91,6 +93,8 @@ public String toString() { return term() + " != " + literal; case STARTS_WITH: return term() + " startsWith \"" + literal + "\""; + case NOT_STARTS_WITH: + return term() + " notStartsWith \"" + literal + "\""; case IN: return term() + " in { " + literal + " }"; case NOT_IN: diff --git a/api/src/main/java/org/apache/iceberg/expressions/Evaluator.java b/api/src/main/java/org/apache/iceberg/expressions/Evaluator.java index a4d7b67b8bde..3b7ad54167d2 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Evaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Evaluator.java @@ -151,5 +151,10 @@ public Boolean notIn(Bound valueExpr, Set literalSet) { public Boolean startsWith(Bound valueExpr, Literal lit) { return ((String) valueExpr.eval(struct)).startsWith((String) lit.value()); } + + @Override + public Boolean notStartsWith(Bound valueExpr, Literal lit) { + return !startsWith(valueExpr, lit); + } } } diff --git a/api/src/main/java/org/apache/iceberg/expressions/Expression.java b/api/src/main/java/org/apache/iceberg/expressions/Expression.java index b49f0070ec8a..4ff852dd545d 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Expression.java +++ 
b/api/src/main/java/org/apache/iceberg/expressions/Expression.java @@ -43,7 +43,8 @@ enum Operation { NOT, AND, OR, - STARTS_WITH; + STARTS_WITH, + NOT_STARTS_WITH; /** * Returns the operation used when this is negated. @@ -74,6 +75,10 @@ public Operation negate() { return Operation.NOT_IN; case NOT_IN: return Operation.IN; + case STARTS_WITH: + return Operation.NOT_STARTS_WITH; + case NOT_STARTS_WITH: + return Operation.STARTS_WITH; default: throw new IllegalArgumentException("No negation for operation: " + this); } diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java index 318b2dc06b1d..389e25ba5d1b 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java @@ -102,15 +102,19 @@ public R notEq(BoundReference ref, Literal lit) { } public R in(BoundReference ref, Set literalSet) { - throw new UnsupportedOperationException("In operation is not supported by the visitor"); + throw new UnsupportedOperationException("In expression is not supported by the visitor"); } public R notIn(BoundReference ref, Set literalSet) { - throw new UnsupportedOperationException("notIn operation is not supported by the visitor"); + throw new UnsupportedOperationException("notIn expression is not supported by the visitor"); } public R startsWith(BoundReference ref, Literal lit) { - throw new UnsupportedOperationException("Unsupported operation."); + throw new UnsupportedOperationException("startsWith expression is not supported by the visitor"); + } + + public R notStartsWith(BoundReference ref, Literal lit) { + throw new UnsupportedOperationException("notStartsWith expression is not supported by the visitor"); } /** @@ -151,6 +155,8 @@ public R predicate(BoundPredicate pred) { return notEq((BoundReference) pred.term(), literalPred.literal()); case STARTS_WITH: return 
startsWith((BoundReference) pred.term(), literalPred.literal()); + case NOT_STARTS_WITH: + return notStartsWith((BoundReference) pred.term(), literalPred.literal()); default: throw new IllegalStateException("Invalid operation for BoundLiteralPredicate: " + pred.op()); } @@ -242,6 +248,10 @@ public R startsWith(Bound expr, Literal lit) { throw new UnsupportedOperationException("Unsupported operation."); } + public R notStartsWith(Bound expr, Literal lit) { + throw new UnsupportedOperationException("Unsupported operation."); + } + @Override public R predicate(BoundPredicate pred) { if (pred.isLiteralPredicate()) { @@ -261,6 +271,8 @@ public R predicate(BoundPredicate pred) { return notEq(pred.term(), literalPred.literal()); case STARTS_WITH: return startsWith(pred.term(), literalPred.literal()); + case NOT_STARTS_WITH: + return notStartsWith(pred.term(), literalPred.literal()); default: throw new IllegalStateException("Invalid operation for BoundLiteralPredicate: " + pred.op()); } diff --git a/api/src/main/java/org/apache/iceberg/expressions/Expressions.java b/api/src/main/java/org/apache/iceberg/expressions/Expressions.java index 8a5fc8c0cefa..d06d7c51693d 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Expressions.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Expressions.java @@ -195,6 +195,14 @@ public static UnboundPredicate startsWith(UnboundTerm expr, Stri return new UnboundPredicate<>(Expression.Operation.STARTS_WITH, expr, value); } + public static UnboundPredicate notStartsWith(String name, String value) { + return new UnboundPredicate<>(Expression.Operation.NOT_STARTS_WITH, ref(name), value); + } + + public static UnboundPredicate notStartsWith(UnboundTerm expr, String value) { + return new UnboundPredicate<>(Expression.Operation.NOT_STARTS_WITH, expr, value); + } + public static UnboundPredicate in(String name, T... 
values) { return predicate(Operation.IN, name, Lists.newArrayList(values)); } diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index a87a38dd996c..7bfa91fcc4df 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -408,6 +408,52 @@ public Boolean startsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; } + @Override + public Boolean notStartsWith(BoundReference ref, Literal lit) { + Integer id = ref.fieldId(); + + if (mayContainNull(id)) { + return ROWS_MIGHT_MATCH; + } + + ByteBuffer prefixAsBytes = lit.toByteBuffer(); + + Comparator comparator = Comparators.unsignedBytes(); + + // notStartsWith will match unless all values must start with the prefix. This happens when the lower and upper + // bounds both start with the prefix. 
+ if (lowerBounds != null && upperBounds != null && + lowerBounds.containsKey(id) && upperBounds.containsKey(id)) { + ByteBuffer lower = lowerBounds.get(id); + // if lower is shorter than the prefix then lower doesn't start with the prefix + if (lower.remaining() < prefixAsBytes.remaining()) { + return ROWS_MIGHT_MATCH; + } + + int cmp = comparator.compare(BinaryUtil.truncateBinary(lower, prefixAsBytes.remaining()), prefixAsBytes); + if (cmp == 0) { + ByteBuffer upper = upperBounds.get(id); + // if upper is shorter than the prefix then upper can't start with the prefix + if (upper.remaining() < prefixAsBytes.remaining()) { + return ROWS_MIGHT_MATCH; + } + + cmp = comparator.compare(BinaryUtil.truncateBinary(upper, prefixAsBytes.remaining()), prefixAsBytes); + if (cmp == 0) { + // both bounds match the prefix, so all rows must match the prefix and therefore do not satisfy + // the predicate + return ROWS_CANNOT_MATCH; + } + } + } + + return ROWS_MIGHT_MATCH; + } + + private boolean mayContainNull(Integer id) { + return nullCounts == null || (nullCounts.containsKey(id) && nullCounts.get(id) != 0); + } + private boolean containsNullsOnly(Integer id) { return valueCounts != null && valueCounts.containsKey(id) && nullCounts != null && nullCounts.containsKey(id) && diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index 88c08d2e57df..721beab4df48 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -340,6 +340,50 @@ public Boolean startsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; } + @Override + public Boolean notStartsWith(BoundReference ref, Literal lit) { + int pos = Accessors.toPosition(ref.accessor()); + PartitionFieldSummary fieldStats = stats.get(pos); + + if (fieldStats.containsNull()) { + return ROWS_MIGHT_MATCH; + } + + 
ByteBuffer lower = fieldStats.lowerBound(); + ByteBuffer upper = fieldStats.upperBound(); + + // notStartsWith will match unless all values must start with the prefix. This happens when the lower and upper + // bounds both start with the prefix. + if (lower != null && upper != null) { + ByteBuffer prefixAsBytes = lit.toByteBuffer(); + Comparator comparator = Comparators.unsignedBytes(); + + // if lower is shorter than the prefix, it can't start with the prefix + if (lower.remaining() < prefixAsBytes.remaining()) { + return ROWS_MIGHT_MATCH; + } + + // truncate lower bound to the prefix and check for equality + int cmp = comparator.compare(BinaryUtil.truncateBinary(lower, prefixAsBytes.remaining()), prefixAsBytes); + if (cmp == 0) { + // the lower bound starts with the prefix; check the upper bound + // if upper is shorter than the prefix, it can't start with the prefix + if (upper.remaining() < prefixAsBytes.remaining()) { + return ROWS_MIGHT_MATCH; + } + + // truncate upper bound so that its length in bytes is not greater than the length of prefix + cmp = comparator.compare(BinaryUtil.truncateBinary(upper, prefixAsBytes.remaining()), prefixAsBytes); + if (cmp == 0) { + // both bounds match the prefix, so all rows must match the prefix and none do not match + return ROWS_CANNOT_MATCH; + } + } + } + + return ROWS_MIGHT_MATCH; + } + private boolean allValuesAreNull(PartitionFieldSummary summary, Type.TypeID typeId) { // containsNull encodes whether at least one partition value is null, // lowerBound is null if all partition values are null diff --git a/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java index 791d484ecb75..eb765900d4f7 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java @@ -214,6 +214,11 @@ public Expression startsWith(BoundReference ref, Literal 
lit) { return ((String) ref.eval(struct)).startsWith((String) lit.value()) ? alwaysTrue() : alwaysFalse(); } + @Override + public Expression notStartsWith(BoundReference ref, Literal lit) { + return ((String) ref.eval(struct)).startsWith((String) lit.value()) ? alwaysFalse() : alwaysTrue(); + } + @Override @SuppressWarnings("unchecked") public Expression predicate(BoundPredicate pred) { diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index 4c4fe7809401..3351669914b7 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -441,6 +441,12 @@ public Boolean startsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_NOT_MATCH; } + @Override + public Boolean notStartsWith(BoundReference ref, Literal lit) { + // TODO: Handle cases that definitely cannot match, such as notStartsWith("x") when the bounds are ["a", "b"]. 
+ return ROWS_MIGHT_NOT_MATCH; + } + private boolean canContainNulls(Integer id) { return nullCounts == null || (nullCounts.containsKey(id) && nullCounts.get(id) > 0); } diff --git a/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java b/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java index 1a53395995c5..8143f25c72e4 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java +++ b/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java @@ -245,6 +245,8 @@ public String toString() { return term() + " != " + literal(); case STARTS_WITH: return term() + " startsWith \"" + literal() + "\""; + case NOT_STARTS_WITH: + return term() + " notStartsWith \"" + literal() + "\""; case IN: return term() + " in (" + COMMA.join(literals()) + ")"; case NOT_IN: diff --git a/api/src/main/java/org/apache/iceberg/transforms/Truncate.java b/api/src/main/java/org/apache/iceberg/transforms/Truncate.java index b3802f32376f..b00ec6b1d80f 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/Truncate.java +++ b/api/src/main/java/org/apache/iceberg/transforms/Truncate.java @@ -285,7 +285,29 @@ public UnboundPredicate project(String name, if (predicate.isUnaryPredicate()) { return Expressions.predicate(predicate.op(), name); } else if (predicate.isLiteralPredicate()) { - return ProjectionUtil.truncateArray(name, predicate.asLiteralPredicate(), this); + BoundLiteralPredicate pred = predicate.asLiteralPredicate(); + switch (pred.op()) { + case STARTS_WITH: + if (pred.literal().value().length() < width()) { + return Expressions.predicate(pred.op(), name, pred.literal().value()); + } else if (pred.literal().value().length() == width()) { + return Expressions.equal(name, pred.literal().value()); + } + + return ProjectionUtil.truncateArray(name, pred, this); + + case NOT_STARTS_WITH: + if (pred.literal().value().length() < width()) { + return Expressions.predicate(pred.op(), name, pred.literal().value()); + } 
else if (pred.literal().value().length() == width()) { + return Expressions.notEqual(name, pred.literal().value()); + } + + return null; + + default: + return ProjectionUtil.truncateArray(name, pred, this); + } } else if (predicate.isSetPredicate() && predicate.op() == Expression.Operation.IN) { return ProjectionUtil.transformSet(name, predicate.asSetPredicate(), this); } @@ -303,14 +325,27 @@ public UnboundPredicate projectStrict(String name, return Expressions.predicate(predicate.op(), name); } else if (predicate instanceof BoundLiteralPredicate) { BoundLiteralPredicate pred = predicate.asLiteralPredicate(); - if (pred.op() == Expression.Operation.STARTS_WITH) { - if (pred.literal().value().length() < width()) { - return Expressions.predicate(pred.op(), name, pred.literal().value()); - } else if (pred.literal().value().length() == width()) { - return Expressions.equal(name, pred.literal().value()); - } - } else { - return ProjectionUtil.truncateArrayStrict(name, pred, this); + switch (pred.op()) { + case STARTS_WITH: + if (pred.literal().value().length() < width()) { + return Expressions.predicate(pred.op(), name, pred.literal().value()); + } else if (pred.literal().value().length() == width()) { + return Expressions.equal(name, pred.literal().value()); + } + + return null; + + case NOT_STARTS_WITH: + if (pred.literal().value().length() < width()) { + return Expressions.predicate(pred.op(), name, pred.literal().value()); + } else if (pred.literal().value().length() == width()) { + return Expressions.notEqual(name, pred.literal().value()); + } + + return Expressions.predicate(pred.op(), name, apply(pred.literal().value()).toString()); + + default: + return ProjectionUtil.truncateArrayStrict(name, pred, this); } } else if (predicate.isSetPredicate() && predicate.op() == Expression.Operation.NOT_IN) { return ProjectionUtil.transformSet(name, predicate.asSetPredicate(), this); diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java 
b/api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java index 911e3ff300c0..d17534ae833c 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java @@ -47,8 +47,10 @@ import static org.apache.iceberg.expressions.Expressions.notIn; import static org.apache.iceberg.expressions.Expressions.notNaN; import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.or; import static org.apache.iceberg.expressions.Expressions.predicate; +import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; @@ -220,6 +222,29 @@ public void testNotEqual() { } + @Test + public void testStartsWith() { + StructType struct = StructType.of(required(24, "s", Types.StringType.get())); + Evaluator evaluator = new Evaluator(struct, startsWith("s", "abc")); + Assert.assertTrue("abc startsWith abc should be true", evaluator.eval(TestHelpers.Row.of("abc"))); + Assert.assertFalse("xabc startsWith abc should be false", evaluator.eval(TestHelpers.Row.of("xabc"))); + Assert.assertFalse("Abc startsWith abc should be false", evaluator.eval(TestHelpers.Row.of("Abc"))); + Assert.assertFalse("a startsWith abc should be false", evaluator.eval(TestHelpers.Row.of("a"))); + Assert.assertTrue("abcd startsWith abc should be true", evaluator.eval(TestHelpers.Row.of("abcd"))); + } + + @Test + public void testNotStartsWith() { + StructType struct = StructType.of(required(24, "s", Types.StringType.get())); + Evaluator evaluator = new Evaluator(struct, notStartsWith("s", "abc")); + Assert.assertFalse("abc notStartsWith abc should be false", evaluator.eval(TestHelpers.Row.of("abc"))); + Assert.assertTrue("xabc notStartsWith abc should be 
true", evaluator.eval(TestHelpers.Row.of("xabc"))); + Assert.assertTrue("Abc notStartsWith abc should be true", evaluator.eval(TestHelpers.Row.of("Abc"))); + Assert.assertTrue("a notStartsWith abc should be true", evaluator.eval(TestHelpers.Row.of("a"))); + Assert.assertFalse("abcde notStartsWith abc should be false", evaluator.eval(TestHelpers.Row.of("abcde"))); + Assert.assertTrue("Abcde notStartsWith abc should be true", evaluator.eval(TestHelpers.Row.of("Abcde"))); + } + @Test public void testAlwaysTrue() { Evaluator evaluator = new Evaluator(STRUCT, alwaysTrue()); diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java index ce7abbf891b4..d55057e9be87 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java @@ -35,6 +35,7 @@ import static org.apache.iceberg.expressions.Expressions.greaterThan; import static org.apache.iceberg.expressions.Expressions.lessThan; import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.or; import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.types.Types.NestedField.required; @@ -43,7 +44,8 @@ public class TestExpressionBinding { private static final StructType STRUCT = StructType.of( required(0, "x", Types.IntegerType.get()), required(1, "y", Types.IntegerType.get()), - required(2, "z", Types.IntegerType.get()) + required(2, "z", Types.IntegerType.get()), + required(3, "data", Types.StringType.get()) ); @Test @@ -146,6 +148,18 @@ public void testStartsWith() { Assert.assertEquals("Should bind s correctly", 0, pred.term().ref().fieldId()); } + @Test + public void testNotStartsWith() { + StructType struct = 
StructType.of(required(21, "s", Types.StringType.get())); + Expression expr = notStartsWith("s", "abc"); + Expression boundExpr = Binder.bind(struct, expr); + TestHelpers.assertAllReferencesBound("NotStartsWith", boundExpr); + // Make sure the expression is a NotStartsWith + BoundPredicate pred = TestHelpers.assertAndUnwrap(boundExpr, BoundPredicate.class); + Assert.assertEquals("Should be right operation", Expression.Operation.NOT_STARTS_WITH, pred.op()); + Assert.assertEquals("Should bind term to correct field id", 21, pred.term().ref().fieldId()); + } + @Test public void testAlwaysTrue() { Assert.assertEquals("Should not change alwaysTrue", diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java index 3bb34072ffc6..4a67a5af11b6 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java @@ -46,10 +46,12 @@ import static org.apache.iceberg.expressions.Expressions.notEqual; import static org.apache.iceberg.expressions.Expressions.notIn; import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.or; import static org.apache.iceberg.expressions.Expressions.predicate; import static org.apache.iceberg.expressions.Expressions.ref; import static org.apache.iceberg.expressions.Expressions.rewriteNot; +import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.expressions.Expressions.truncate; import static org.apache.iceberg.expressions.Expressions.year; @@ -95,7 +97,9 @@ public void testSimplifyNot() { @Test public void testRewriteNot() { - StructType struct = StructType.of(NestedField.optional(1, "a", Types.IntegerType.get())); + StructType struct = StructType.of( + 
NestedField.optional(1, "a", Types.IntegerType.get()), + NestedField.optional(2, "s", Types.StringType.get())); Expression[][] expressions = new Expression[][] { // (rewritten pred, original pred) pairs { isNull("a"), isNull("a") }, @@ -123,6 +127,8 @@ public void testRewriteNot() { { and(notEqual("a", 5), notNull("a")), and(notEqual("a", 5), notNull("a")) }, { or(equal("a", 5), isNull("a")), not(and(notEqual("a", 5), notNull("a"))) }, { or(equal("a", 5), notNull("a")), or(equal("a", 5), not(isNull("a"))) }, + { startsWith("s", "hello"), not(notStartsWith("s", "hello")) }, + { notStartsWith("s", "world"), not(startsWith("s", "world")) } }; for (Expression[] pair : expressions) { diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionSerialization.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionSerialization.java index d57b7ea62aff..42c9a89299b0 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionSerialization.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionSerialization.java @@ -50,6 +50,8 @@ public void testExpressions() throws Exception { Expressions.isNull("maybeNull2"), Expressions.isNaN("maybeNaN"), Expressions.notNaN("maybeNaN2"), + Expressions.startsWith("col", "abc"), + Expressions.notStartsWith("col", "abc"), Expressions.not(Expressions.greaterThan("a", 10)), Expressions.and(Expressions.greaterThanOrEqual("a", 0), Expressions.lessThan("a", 3)), Expressions.or(Expressions.lessThan("a", 0), Expressions.greaterThan("a", 10)), @@ -57,6 +59,8 @@ public void testExpressions() throws Exception { Expressions.in("a", 5, 6, 7).bind(schema.asStruct()), Expressions.notIn("s", "abc", "xyz").bind(schema.asStruct()), Expressions.isNull("a").bind(schema.asStruct()), + Expressions.startsWith("s", "abc").bind(schema.asStruct()), + Expressions.notStartsWith("s", "xyz").bind(schema.asStruct()) }; for (Expression expression : expressions) { diff --git 
a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java index f41b8dcd7b76..de60e7853a0d 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java @@ -45,6 +45,7 @@ import static org.apache.iceberg.expressions.Expressions.notIn; import static org.apache.iceberg.expressions.Expressions.notNaN; import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.or; import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.types.Conversions.toByteBuffer; @@ -63,7 +64,9 @@ public class TestInclusiveManifestEvaluator { optional(10, "all_nans", Types.DoubleType.get()), optional(11, "both_nan_and_null", Types.FloatType.get()), optional(12, "no_nan_or_null", Types.DoubleType.get()), - optional(13, "all_nulls_missing_nan_float", Types.FloatType.get()) + optional(13, "all_nulls_missing_nan_float", Types.FloatType.get()), + optional(14, "all_same_value_or_null", Types.StringType.get()), + optional(15, "no_nulls_same_value_a", Types.StringType.get()) ); private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) @@ -79,6 +82,8 @@ public class TestInclusiveManifestEvaluator { .identity("both_nan_and_null") .identity("no_nan_or_null") .identity("all_nulls_missing_nan_float") + .identity("all_same_value_or_null") + .identity("no_nulls_same_value_a") .build(); private static final int INT_MIN_VALUE = 30; @@ -109,7 +114,9 @@ public class TestInclusiveManifestEvaluator { new TestHelpers.TestFieldSummary(false, false, toByteBuffer(Types.FloatType.get(), 0F), toByteBuffer(Types.FloatType.get(), 20F)), - new TestHelpers.TestFieldSummary(true, null, null) + 
new TestHelpers.TestFieldSummary(true, null, null), + new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN) ), null); @Test @@ -128,6 +135,9 @@ public void testAllNulls() { shouldRead = ManifestEvaluator.forRowFilter(startsWith("all_nulls_missing_nan", "asad"), SPEC, true).eval(FILE); Assert.assertFalse("Should skip: startsWith on all null column", shouldRead); + + shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("all_nulls_missing_nan", "asad"), SPEC, true).eval(FILE); + Assert.assertTrue("Should read: notStartsWith on all null column", shouldRead); } @Test @@ -203,7 +213,7 @@ public void testMissingStats() { lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78), greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("id"), notNull("id"), startsWith("all_nulls_missing_nan", "a"), - isNaN("float"), notNaN("float") + isNaN("float"), notNaN("float"), notStartsWith("all_nulls_missing_nan", "a") }; for (Expression expr : exprs) { @@ -439,6 +449,47 @@ public void testStringStartsWith() { Assert.assertFalse("Should skip: range doesn't match", shouldRead); } + @Test + public void testStringNotStartsWith() { + boolean shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "a"), SPEC, false).eval(FILE); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "aa"), SPEC, false).eval(FILE); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "dddd"), SPEC, false).eval(FILE); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "z"), SPEC, false).eval(FILE); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = 
ManifestEvaluator.forRowFilter(notStartsWith("no_nulls", "a"), SPEC, false).eval(FILE); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "zzzz"), SPEC, false).eval(FILE); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "1"), SPEC, false).eval(FILE); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("all_same_value_or_null", "a"), SPEC, false).eval(FILE); + Assert.assertTrue("Should read: range matches on null", shouldRead); + + shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("all_same_value_or_null", "aa"), SPEC, false).eval(FILE); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("all_same_value_or_null", "A"), SPEC, false).eval(FILE); + Assert.assertTrue("Should read: range matches", shouldRead); + + // Iceberg does not implement SQL's 3-way boolean logic, so the choice of an all null column matching is + // by definition in order to surface more values to the query engine to allow it to make its own decision. 
+ shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("all_nulls_missing_nan", "A"), SPEC, false).eval(FILE); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("no_nulls_same_value_a", "a"), SPEC, false).eval(FILE); + Assert.assertFalse("Should not read: all values start with the prefix", shouldRead); + } + @Test public void testIntegerIn() { boolean shouldRead = ManifestEvaluator.forRowFilter( diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java index 4cb6376def7d..fcaa352ef4aa 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java @@ -49,6 +49,7 @@ import static org.apache.iceberg.expressions.Expressions.notIn; import static org.apache.iceberg.expressions.Expressions.notNaN; import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.or; import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.types.Conversions.toByteBuffer; @@ -178,6 +179,9 @@ public void testAllNulls() { shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("all_nulls", "a")).eval(FILE); Assert.assertFalse("Should skip: startsWith on all null column", shouldRead); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("all_nulls", "a")).eval(FILE); + Assert.assertTrue("Should read: notStartsWith on all null column", shouldRead); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notNull("some_nulls")).eval(FILE); Assert.assertTrue("Should read: column with some nulls contains a non-null value", shouldRead); @@ -540,6 +544,46 @@ public void 
testStringStartsWith() { Assert.assertFalse("Should not read: range doesn't match", shouldRead); } + @Test + public void testStringNotStartsWith() { + boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "a"), true).eval(FILE); + Assert.assertTrue("Should read: no stats", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "a"), true).eval(FILE_2); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "aa"), true).eval(FILE_2); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "aaa"), true).eval(FILE_2); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "1s"), true).eval(FILE_3); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "1str1x"), true).eval(FILE_3); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "ff"), true).eval(FILE_4); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "aB"), true).eval(FILE_2); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "dWX"), true).eval(FILE_2); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "5"), true).eval(FILE_3); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "3str3x"), true).eval(FILE_3); + 
Assert.assertTrue("Should read: range matches", shouldRead); + + String aboveMax = UnicodeUtil.truncateStringMax(Literal.of("イロハニホヘト"), 4).value().toString(); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", aboveMax), true).eval(FILE_4); + Assert.assertTrue("Should read: range matches", shouldRead); + } + @Test public void testIntegerIn() { boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java b/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java index db2029d3044b..765d832efb41 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java @@ -44,6 +44,8 @@ import static org.apache.iceberg.expressions.Expression.Operation.NOT_IN; import static org.apache.iceberg.expressions.Expression.Operation.NOT_NAN; import static org.apache.iceberg.expressions.Expression.Operation.NOT_NULL; +import static org.apache.iceberg.expressions.Expression.Operation.NOT_STARTS_WITH; +import static org.apache.iceberg.expressions.Expression.Operation.STARTS_WITH; import static org.apache.iceberg.expressions.Expressions.ref; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; @@ -108,6 +110,25 @@ public void testComparisonPredicateBinding() { } } + @Test + @SuppressWarnings("unchecked") + public void testPredicateBindingForStringPrefixComparisons() { + StructType struct = StructType.of(required(17, "x", Types.StringType.get())); + + for (Expression.Operation op : Arrays.asList(STARTS_WITH, NOT_STARTS_WITH)) { + UnboundPredicate unbound = new UnboundPredicate<>(op, ref("x"), "s"); + + Expression expr = unbound.bind(struct); + BoundPredicate bound = assertAndUnwrap(expr); + + Assert.assertTrue("Should be a literal predicate", bound.isLiteralPredicate()); + 
Assert.assertEquals("Should not alter literal value", + "s", bound.asLiteralPredicate().literal().value()); + Assert.assertEquals("Should reference correct field ID", 17, bound.ref().fieldId()); + Assert.assertEquals("Should not change the comparison operation", op, bound.op()); + } + } + @Test @SuppressWarnings("unchecked") public void testLiteralConversion() { diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestNotStartsWith.java b/api/src/test/java/org/apache/iceberg/transforms/TestNotStartsWith.java new file mode 100644 index 000000000000..4e4b6da6dc90 --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/transforms/TestNotStartsWith.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.transforms; + +import org.apache.iceberg.DataFile; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.TestHelpers.Row; +import org.apache.iceberg.TestHelpers.TestDataFile; +import org.apache.iceberg.expressions.Binder; +import org.apache.iceberg.expressions.BoundPredicate; +import org.apache.iceberg.expressions.Evaluator; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.InclusiveMetricsEvaluator; +import org.apache.iceberg.expressions.Literal; +import org.apache.iceberg.expressions.Projections; +import org.apache.iceberg.expressions.StrictMetricsEvaluator; +import org.apache.iceberg.expressions.True; +import org.apache.iceberg.expressions.UnboundPredicate; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.StringType; +import org.junit.Assert; +import org.junit.Test; + +import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; +import static org.apache.iceberg.types.Conversions.toByteBuffer; +import static org.apache.iceberg.types.Types.NestedField.optional; + +public class TestNotStartsWith { + + private static final String COLUMN = "someStringCol"; + private static final Schema SCHEMA = new Schema(optional(1, COLUMN, Types.StringType.get())); + + // All 50 rows have someStringCol = 'bbb', none are null (despite being optional). 
+ private static final DataFile FILE_1 = new TestDataFile("file_1.avro", Row.of(), 50, + // any value counts, including nulls + ImmutableMap.of(1, 50L), + // null value counts + ImmutableMap.of(1, 0L), + // nan value counts + null, + // lower bounds + ImmutableMap.of(1, toByteBuffer(StringType.get(), "bbb")), + // upper bounds + ImmutableMap.of(1, toByteBuffer(StringType.get(), "bbb"))); + + @Test + public void testTruncateProjections() { + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).truncate(COLUMN, 4).build(); + + assertProjectionInclusive(spec, notStartsWith(COLUMN, "ab"), "ab", Expression.Operation.NOT_STARTS_WITH); + assertProjectionInclusive(spec, notStartsWith(COLUMN, "abab"), "abab", Expression.Operation.NOT_EQ); + // When literal is longer than partition spec's truncation width, we always read for an inclusive projection + // when using notStartsWith. + Expression projection = Projections.inclusive(spec).project(notStartsWith(COLUMN, "ababab")); + Assert.assertTrue(projection instanceof True); + + assertProjectionStrict(spec, notStartsWith(COLUMN, "ab"), "ab", Expression.Operation.NOT_STARTS_WITH); + assertProjectionStrict(spec, notStartsWith(COLUMN, "abab"), "abab", Expression.Operation.NOT_EQ); + assertProjectionStrict(spec, notStartsWith(COLUMN, "ababab"), "abab", Expression.Operation.NOT_STARTS_WITH); + assertProjectionStrict(spec, notStartsWith(COLUMN, "abcde"), "abcd", Expression.Operation.NOT_STARTS_WITH); + } + + @Test + public void testTruncateStringWhenProjectedPredicateTermIsLongerThanWidth() { + Truncate trunc = Truncate.get(Types.StringType.get(), 2); + UnboundPredicate expr = notStartsWith(COLUMN, "abcde"); + BoundPredicate boundExpr = (BoundPredicate) Binder.bind(SCHEMA.asStruct(), expr, false); + + UnboundPredicate projected = trunc.projectStrict(COLUMN, boundExpr); + Evaluator evaluator = new Evaluator(SCHEMA.asStruct(), projected); + + Assert.assertEquals("The projected literal should be truncated to the truncation width", + 
+ "ab", projected.literal().value()); + + Assert.assertFalse("notStartsWith(abcde, truncate(abcde,2)) => false", + evaluator.eval(TestHelpers.Row.of("abcde"))); + + Assert.assertFalse("notStartsWith(abcde, truncate(ab, 2)) => false", + evaluator.eval(TestHelpers.Row.of("ab"))); + + Assert.assertFalse("notStartsWith(abcde, truncate(abcdz, 2)) => false", + evaluator.eval(TestHelpers.Row.of("abcdz"))); + + Assert.assertTrue("notStartsWith(abcde, truncate(a, 2)) => true", + evaluator.eval(TestHelpers.Row.of("a"))); + + Assert.assertTrue("notStartsWith(abcde, truncate(aczcde, 2)) => true", + evaluator.eval(TestHelpers.Row.of("aczcde"))); + } + + @Test + public void testTruncateStringWhenProjectedPredicateTermIsShorterThanWidth() { + Truncate trunc = Truncate.get(Types.StringType.get(), 16); + UnboundPredicate expr = notStartsWith(COLUMN, "ab"); + BoundPredicate boundExpr = (BoundPredicate) Binder.bind(SCHEMA.asStruct(), expr, false); + + UnboundPredicate projected = trunc.projectStrict(COLUMN, boundExpr); + Evaluator evaluator = new Evaluator(SCHEMA.asStruct(), projected); + + Assert.assertEquals("The projected literal should not be truncated as its size is shorter than truncation width", + "ab", projected.literal().value()); + + Assert.assertFalse("notStartsWith(ab, truncate(abcde, 16)) => false", + evaluator.eval(TestHelpers.Row.of("abcde"))); + + Assert.assertFalse("notStartsWith(ab, truncate(ab, 16)) => false", + evaluator.eval(TestHelpers.Row.of("ab"))); + + Assert.assertTrue("notStartsWith(ab, truncate(a, 16)) => true", + evaluator.eval(TestHelpers.Row.of("a"))); + } + + @Test + public void testTruncateStringWhenProjectedPredicateTermIsEqualToWidth() { + Truncate trunc = Truncate.get(Types.StringType.get(), 7); + UnboundPredicate expr = notStartsWith(COLUMN, "abcdefg"); + BoundPredicate boundExpr = (BoundPredicate) Binder.bind(SCHEMA.asStruct(), expr, false); + + UnboundPredicate projected = trunc.projectStrict(COLUMN, boundExpr); + Evaluator evaluator = new
Evaluator(SCHEMA.asStruct(), projected); + + Assert.assertEquals("The projected literal should not be truncated as its size is equal to truncation width", + "abcdefg", projected.literal().value()); + + Assert.assertFalse("notStartsWith(abcdefg, truncate(abcdefg, 7)) => false", + evaluator.eval(TestHelpers.Row.of("abcdefg"))); + + Assert.assertTrue("notStartsWith(abcdefg, truncate(ab, 7)) => true", + evaluator.eval(TestHelpers.Row.of("ab"))); + + Assert.assertTrue("notStartsWith(abcdefg, truncate(a, 7)) => true", + evaluator.eval(TestHelpers.Row.of("a"))); + } + + @Test + public void testStrictMetricsEvaluatorForNotStartsWith() { + boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith(COLUMN, "bbb")).eval(FILE_1); + Assert.assertFalse("Should not match: strict metrics eval is always false for notStartsWith", shouldRead); + } + + @Test + public void testInclusiveMetricsEvaluatorForNotStartsWith() { + boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith(COLUMN, "aaa")).eval(FILE_1); + Assert.assertTrue("Should match: some columns meet the filter criteria", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith(COLUMN, "b")).eval(FILE_1); + Assert.assertFalse("Should not match: no columns match the filter criteria", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith(COLUMN, "bb")).eval(FILE_1); + Assert.assertFalse("Should not match: no columns match the filter criteria", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith(COLUMN, "bbb")).eval(FILE_1); + Assert.assertFalse("Should not match: no columns match the filter criteria", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith(COLUMN, "bbbb")).eval(FILE_1); + Assert.assertTrue("Should match: some columns match the filter criteria", shouldRead); + } + + private void assertProjectionInclusive(PartitionSpec spec, UnboundPredicate filter, + String expectedLiteral,
Expression.Operation expectedOp) { + Expression projection = Projections.inclusive(spec).project(filter); + assertProjection(spec, expectedLiteral, projection, expectedOp); + } + + private void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filter, + String expectedLiteral, Expression.Operation expectedOp) { + Expression projection = Projections.strict(spec).project(filter); + assertProjection(spec, expectedLiteral, projection, expectedOp); + } + + private void assertProjection(PartitionSpec spec, String expectedLiteral, Expression projection, + Expression.Operation expectedOp) { + UnboundPredicate predicate = assertAndUnwrapUnbound(projection); + Literal literal = predicate.literal(); + Truncate transform = (Truncate) spec.getFieldsBySourceId(1).get(0).transform(); + String output = transform.toHumanString((String) literal.value()); + + Assert.assertEquals(expectedOp, predicate.op()); + Assert.assertEquals(expectedLiteral, output); + } +} diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestResiduals.java b/api/src/test/java/org/apache/iceberg/transforms/TestResiduals.java index 92c0a1efe902..8257b9e50ed4 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestResiduals.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestResiduals.java @@ -158,6 +158,8 @@ public void testUnpartitionedResiduals() { Expressions.notIn("f", 1, 2, 3), Expressions.notNaN("g"), Expressions.isNaN("h"), + Expressions.startsWith("data", "abcd"), + Expressions.notStartsWith("data", "abcd") }; for (Expression expr : expressions) { diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestStartsWith.java b/api/src/test/java/org/apache/iceberg/transforms/TestStartsWith.java index 35db15dbb042..e694d279880b 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestStartsWith.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestStartsWith.java @@ -49,7 +49,7 @@ public void testTruncateProjections() { PartitionSpec spec = 
PartitionSpec.builderFor(SCHEMA).truncate(COLUMN, 4).build(); assertProjectionInclusive(spec, startsWith(COLUMN, "ab"), "ab", Expression.Operation.STARTS_WITH); - assertProjectionInclusive(spec, startsWith(COLUMN, "abab"), "abab", Expression.Operation.STARTS_WITH); + assertProjectionInclusive(spec, startsWith(COLUMN, "abab"), "abab", Expression.Operation.EQ); assertProjectionInclusive(spec, startsWith(COLUMN, "ababab"), "abab", Expression.Operation.STARTS_WITH); assertProjectionStrict(spec, startsWith(COLUMN, "ab"), "ab", Expression.Operation.STARTS_WITH); @@ -68,8 +68,8 @@ public void testTruncateString() { UnboundPredicate projected = trunc.project(COLUMN, boundExpr); Evaluator evaluator = new Evaluator(SCHEMA.asStruct(), projected); - Assert.assertTrue("startsWith(abcde, truncate(abcde,2)) => true", - evaluator.eval(TestHelpers.Row.of("abcde"))); + Assert.assertTrue("startsWith(abcde, truncate(abcdg,2)) => true", + evaluator.eval(TestHelpers.Row.of("abcdg"))); } private void assertProjectionInclusive(PartitionSpec spec, UnboundPredicate filter, diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestTruncate.java b/api/src/test/java/org/apache/iceberg/transforms/TestTruncate.java index 04df5d18ce94..03f5897349a8 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestTruncate.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestTruncate.java @@ -75,6 +75,8 @@ public void testTruncateString() { "abcde", trunc.apply("abcdefg")); Assert.assertEquals("Should not pad strings shorter than length", "abc", trunc.apply("abc")); + Assert.assertEquals("Should not alter strings equal to length", + "abcde", trunc.apply("abcde")); } @Test diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestTruncatesResiduals.java b/api/src/test/java/org/apache/iceberg/transforms/TestTruncatesResiduals.java index 79b3982c76c4..2bb6e2458d99 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestTruncatesResiduals.java +++ 
b/api/src/test/java/org/apache/iceberg/transforms/TestTruncatesResiduals.java @@ -36,6 +36,7 @@ import static org.apache.iceberg.expressions.Expressions.lessThan; import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.startsWith; public class TestTruncatesResiduals { @@ -179,5 +180,13 @@ public void testStringTruncateTransformResiduals() { assertResidualValue(spec, startsWith("value", "bcd"), "ab", Expression.Operation.FALSE); assertResidualPredicate(spec, startsWith("value", "bcd"), "bc"); assertResidualValue(spec, startsWith("value", "bcd"), "cd", Expression.Operation.FALSE); + assertResidualPredicate(spec, startsWith("value", "bcd"), "bcdd"); + + // not starts with + assertResidualValue(spec, notStartsWith("value", "bcd"), "ab", Expression.Operation.TRUE); + assertResidualPredicate(spec, notStartsWith("value", "bcd"), "bc"); + assertResidualValue(spec, notStartsWith("value", "bcd"), "cd", Expression.Operation.TRUE); + assertResidualPredicate(spec, notStartsWith("value", "bcd"), "bcd"); + assertResidualPredicate(spec, notStartsWith("value", "bcd"), "bcdd"); } } diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java index f03544dc0e3f..31c3ea87bff9 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -82,6 +82,7 @@ import static org.apache.iceberg.expressions.Expressions.notIn; import static org.apache.iceberg.expressions.Expressions.notNaN; import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static 
org.apache.iceberg.expressions.Expressions.or; import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.types.Types.NestedField.optional; @@ -379,7 +380,7 @@ public void testMissingStatsParquet() { lessThan("no_stats_parquet", "a"), lessThanOrEqual("no_stats_parquet", "b"), equal("no_stats_parquet", "c"), greaterThan("no_stats_parquet", "d"), greaterThanOrEqual("no_stats_parquet", "e"), notEqual("no_stats_parquet", "f"), isNull("no_stats_parquet"), notNull("no_stats_parquet"), - startsWith("no_stats_parquet", "a") + startsWith("no_stats_parquet", "a"), notStartsWith("no_stats_parquet", "a") }; for (Expression expr : exprs) { @@ -718,6 +719,40 @@ public void testStringStartsWith() { Assert.assertFalse("Should not read: range doesn't match", shouldRead); } + @Test + public void testStringNotStartsWith() { + Assume.assumeFalse("ORC row group filter does not support StringStartsWith", format == FileFormat.ORC); + boolean shouldRead = shouldRead(notStartsWith("str", "1")); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = shouldRead(notStartsWith("str", "0st")); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = shouldRead(notStartsWith("str", "1str1")); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = shouldRead(notStartsWith("str", "1str1_xgd")); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = shouldRead(notStartsWith("str", "2str")); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = shouldRead(notStartsWith("str", "9xstr")); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = shouldRead(notStartsWith("required", "r")); + Assert.assertFalse("Should not read: range doesn't match", shouldRead); + + shouldRead = shouldRead(notStartsWith("required", "requ")); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = 
shouldRead(notStartsWith("some_nulls", "ssome")); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = shouldRead(notStartsWith("some_nulls", "som")); + Assert.assertTrue("Should read: range matches", shouldRead); + } + @Test public void testIntegerIn() { boolean shouldRead = shouldRead(in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)); diff --git a/orc/src/main/java/org/apache/iceberg/orc/ExpressionToSearchArgument.java b/orc/src/main/java/org/apache/iceberg/orc/ExpressionToSearchArgument.java index 4211cf09ccd5..54ed24a04e5c 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ExpressionToSearchArgument.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ExpressionToSearchArgument.java @@ -245,6 +245,13 @@ public Action startsWith(Bound expr, Literal lit) { return () -> this.builder.literal(TruthValue.YES_NO_NULL); } + @Override + public Action notStartsWith(Bound expr, Literal lit) { + // Cannot push down NOT_STARTS_WITH operator to ORC, so return TruthValue.YES_NO_NULL which signifies + // that this predicate cannot help with filtering + return () -> this.builder.literal(TruthValue.YES_NO_NULL); + } + @Override public Action predicate(BoundPredicate pred) { if (UNSUPPORTED_TYPES.contains(pred.ref().type().typeId())) { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java index b970c6c476bb..5dc0c598f319 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java @@ -377,6 +377,25 @@ public Boolean startsWith(BoundReference ref, Literal lit) { return ROWS_CANNOT_MATCH; } + @Override + public Boolean notStartsWith(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + + Boolean hasNonDictPage = isFallback.get(id); + if (hasNonDictPage == null || hasNonDictPage) { + 
return ROWS_MIGHT_MATCH; + } + + Set dictionary = dict(id, lit.comparator()); + for (T item : dictionary) { + if (!item.toString().startsWith(lit.value().toString())) { + return ROWS_MIGHT_MATCH; + } + } + + return ROWS_CANNOT_MATCH; + } + @SuppressWarnings("unchecked") private Set dict(int id, Comparator comparator) { Preconditions.checkNotNull(dictionaries, "Dictionary is required"); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index f83d70100443..965959fd8072 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -473,6 +473,63 @@ public Boolean startsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; } + @Override + @SuppressWarnings("unchecked") + public Boolean notStartsWith(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + Long valueCount = valueCounts.get(id); + + if (valueCount == null) { + // the column is not present and is all nulls + return ROWS_MIGHT_MATCH; + } + + Statistics colStats = (Statistics) stats.get(id); + if (colStats != null && !colStats.isEmpty()) { + if (mayContainNull(colStats)) { + return ROWS_MIGHT_MATCH; + } + + if (hasNonNullButNoMinMax(colStats, valueCount)) { + return ROWS_MIGHT_MATCH; + } + + Binary lower = colStats.genericGetMin(); + Binary upper = colStats.genericGetMax(); + + // notStartsWith will match unless all values must start with the prefix. this happens when the lower and upper + // bounds both start with the prefix. 
+ if (lower != null && upper != null) { + ByteBuffer prefix = lit.toByteBuffer(); + Comparator comparator = Comparators.unsignedBytes(); + + // if lower is shorter than the prefix, it can't start with the prefix + if (lower.length() < prefix.remaining()) { + return ROWS_MIGHT_MATCH; + } + + // truncate lower bound to the prefix and check for equality + int cmp = comparator.compare(BinaryUtil.truncateBinary(lower.toByteBuffer(), prefix.remaining()), prefix); + if (cmp == 0) { + // the lower bound starts with the prefix; check the upper bound + // if upper is shorter than the prefix, it can't start with the prefix + if (upper.length() < prefix.remaining()) { + return ROWS_MIGHT_MATCH; + } + + // truncate upper bound so that its length in bytes is not greater than the length of prefix + cmp = comparator.compare(BinaryUtil.truncateBinary(upper.toByteBuffer(), prefix.remaining()), prefix); + if (cmp == 0) { + // both bounds match the prefix, so all rows must match the prefix and none do not match + return ROWS_CANNOT_MATCH; + } + } + } + } + + return ROWS_MIGHT_MATCH; + } + @SuppressWarnings("unchecked") private T min(Statistics statistics, int id) { return (T) conversions.get(id).apply(statistics.genericGetMin()); @@ -504,6 +561,10 @@ static boolean hasNonNullButNoMinMax(Statistics statistics, long valueCount) { (statistics.getMaxBytes() == null || statistics.getMinBytes() == null); } + private static boolean mayContainNull(Statistics statistics) { + return !statistics.isNumNullsSet() || statistics.getNumNulls() > 0; + } + private static Function converterFor(PrimitiveType parquetType, Type icebergType) { Function fromParquet = ParquetConversions.converterFromParquet(parquetType); if (icebergType != null) { diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java index a7685a643077..178a258e4ff6 100644 --- 
a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java @@ -79,6 +79,7 @@ import static org.apache.iceberg.expressions.Expressions.notIn; import static org.apache.iceberg.expressions.Expressions.notNaN; import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.or; import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.types.Types.NestedField.optional; @@ -241,6 +242,10 @@ public void testAssumptions() { NullPointerException.class, "Cannot create expression literal from null", () -> startsWith("col", null)); + TestHelpers.assertThrows("Should reject null literal in notStartsWith expression", + NullPointerException.class, + "Cannot create expression literal from null", + () -> notStartsWith("col", null)); } @Test @@ -357,6 +362,46 @@ public void testStartsWith() { Assert.assertFalse("Should skip: no match in dictionary", shouldRead); } + @Test + public void testNotStartsWith() { + boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("non_dict", "re")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue("Should read: no dictionary", shouldRead); + + shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("required", "re")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertFalse("Should skip: no match in dictionary", shouldRead); + + shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("required", "req")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertFalse("Should skip: no match in dictionary", shouldRead); + + shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("some_nulls", "s!")) + 
.shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); + + shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("no_stats", UUID.randomUUID().toString())) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue("Should read: no stats but dictionary is present", shouldRead); + + shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("required", "reqs")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); + + shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("some_nulls", "somex")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); + + shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("some_nulls", "some")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertFalse("Should skip: no match in dictionary", shouldRead); + + shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("no_nulls", "xxx")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); + } + + @Test public void testMissingColumn() { TestHelpers.assertThrows("Should complain about missing column in expression", diff --git a/site/docs/api.md b/site/docs/api.md index c8f387f94c1b..b7048a236e03 100644 --- a/site/docs/api.md +++ b/site/docs/api.md @@ -196,6 +196,7 @@ Supported predicate expressions are: * `in` * `notIn` * `startsWith` +* `notStartsWith` Supported expression operations are: diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java index 2ce75277034b..c7fd701fbfaf 
100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java @@ -579,6 +579,8 @@ public String predicate(UnboundPredicate pred) { return pred.ref().name() + " != " + sqlString(pred.literal()); case STARTS_WITH: return pred.ref().name() + " LIKE '" + pred.literal() + "%'"; + case NOT_STARTS_WITH: + return pred.ref().name() + " NOT LIKE '" + pred.literal() + "%'"; case IN: return pred.ref().name() + " IN (" + sqlString(pred.literals()) + ")"; case NOT_IN: diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index a74882d42033..d51fd3c4e8eb 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -26,6 +26,7 @@ import java.util.List; import java.util.Locale; import java.util.UUID; +import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; @@ -40,6 +41,7 @@ import org.apache.iceberg.io.FileAppender; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.data.GenericsHelpers; import org.apache.iceberg.transforms.Transform; @@ -60,6 +62,7 @@ import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.sources.GreaterThan; import org.apache.spark.sql.sources.LessThan; +import org.apache.spark.sql.sources.Not; import org.apache.spark.sql.sources.StringStartsWith; import org.apache.spark.sql.types.IntegerType$; import org.apache.spark.sql.types.LongType$; @@ -421,6 
+424,19 @@ public void testPartitionedByDataStartsWithFilter() { Assert.assertEquals(1, scan.planInputPartitions().length); } + @Test + public void testPartitionedByDataNotStartsWithFilter() { + Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); + CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + + SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + + pushFilters(builder, new Not(new StringStartsWith("data", "junc"))); + Batch scan = builder.build().toBatch(); + + Assert.assertEquals(9, scan.planInputPartitions().length); + } + @Test public void testPartitionedByIdStartsWith() { Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id"); @@ -437,6 +453,22 @@ public void testPartitionedByIdStartsWith() { Assert.assertEquals(1, scan.planInputPartitions().length); } + @Test + public void testPartitionedByIdNotStartsWith() { + Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id"); + + CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( + "path", table.location()) + ); + + SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + + pushFilters(builder, new Not(new StringStartsWith("data", "junc"))); + Batch scan = builder.build().toBatch(); + + Assert.assertEquals(9, scan.planInputPartitions().length); + } + @Test public void testUnpartitionedStartsWith() { Dataset df = spark.read() @@ -453,6 +485,27 @@ public void testUnpartitionedStartsWith() { Assert.assertEquals("junction", matchedData.get(0)); } + @Test + public void testUnpartitionedNotStartsWith() { + Dataset df = spark.read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); + + List matchedData = df.select("data") + .where("data NOT 
LIKE 'jun%'") + .as(Encoders.STRING()) + .collectAsList(); + + List expected = testRecords(SCHEMA).stream() + .map(r -> r.getField("data").toString()) + .filter(d -> !d.startsWith("jun")) + .collect(Collectors.toList()); + + Assert.assertEquals(9, matchedData.size()); + Assert.assertEquals(Sets.newHashSet(expected), Sets.newHashSet(matchedData)); + } + private static Record projectFlat(Schema projection, Record record) { Record result = GenericRecord.create(projection); List fields = projection.asStruct().fields();