diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
index c225f21da8a8..23be1ae17329 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
@@ -22,6 +22,7 @@
 
 import java.nio.ByteBuffer;
 import java.util.Collection;
+import java.util.Comparator;
 import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
@@ -29,6 +30,7 @@
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor;
+import org.apache.iceberg.types.Comparators;
 import org.apache.iceberg.types.Conversions;
 import org.apache.iceberg.types.Types.StructType;
 import org.apache.iceberg.util.NaNUtil;
@@ -467,8 +469,43 @@ public Boolean startsWith(BoundReference ref, Literal lit) {
 
     @Override
     public Boolean notStartsWith(BoundReference ref, Literal lit) {
-      // TODO: Handle cases that definitely cannot match, such as notStartsWith("x") when the bounds
-      // are ["a", "b"].
+      // Strict evaluation: ROWS_MUST_MATCH is returned only when the column bounds prove that
+      // no value starts with the prefix, i.e. when [lower, upper] lies entirely above or
+      // entirely below the range of strings that start with the prefix.
+      int id = ref.fieldId();
+      if (containsNullsOnly(id)) {
+        // NOTE(review): kept conservative; confirm how row-level evaluation treats nulls
+        return ROWS_MIGHT_NOT_MATCH;
+      }
+
+      String prefix = (String) lit.value();
+      Comparator<CharSequence> comparator = Comparators.charSequences();
+
+      if (lowerBounds != null && lowerBounds.containsKey(id)) {
+        CharSequence lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
+        // a bound shorter than the prefix is treated as inconclusive
+        if (lower != null && lower.length() >= prefix.length()) {
+          int cmp = comparator.compare(lower.subSequence(0, prefix.length()), prefix);
+          // every value is >= lower, and lower sorts after all strings with the prefix
+          if (cmp > 0) {
+            return ROWS_MUST_MATCH;
+          }
+        }
+      }
+
+      if (upperBounds != null && upperBounds.containsKey(id)) {
+        CharSequence upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id));
+        if (upper != null && upper.length() >= prefix.length()) {
+          int cmp = comparator.compare(upper.subSequence(0, prefix.length()), prefix);
+          // every value is <= upper, and upper sorts before all strings with the prefix
+          if (cmp < 0) {
+            return ROWS_MUST_MATCH;
+          }
+        }
+      }
+
+      // bounds are missing, inconclusive, or overlap the prefix range, so some row may start
+      // with the prefix
       return ROWS_MIGHT_NOT_MATCH;
     }
 
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
index f34cd730df77..a0925f970ed5 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
@@ -32,6 +32,7 @@
 import static org.apache.iceberg.expressions.Expressions.notIn;
 import static org.apache.iceberg.expressions.Expressions.notNaN;
 import static org.apache.iceberg.expressions.Expressions.notNull;
+import static org.apache.iceberg.expressions.Expressions.notStartsWith;
 import static org.apache.iceberg.expressions.Expressions.or;
 import static org.apache.iceberg.types.Conversions.toByteBuffer;
 import static org.apache.iceberg.types.Types.NestedField.optional;
@@ -146,9 +147,11 @@ public class TestStrictMetricsEvaluator {
           // nan value counts
           null,
           // lower bounds
-          ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")),
+          ImmutableMap.of(
+              5, toByteBuffer(StringType.get(), "bbb"), 3, toByteBuffer(StringType.get(), "aa")),
           // upper bounds
-          ImmutableMap.of(5, toByteBuffer(StringType.get(), "eee")));
+          ImmutableMap.of(
+              5, toByteBuffer(StringType.get(), "eee"), 3, toByteBuffer(StringType.get(), "dC")));
 
   private static final DataFile FILE_3 =
       new TestDataFile(
@@ -168,9 +171,30 @@ public class TestStrictMetricsEvaluator {
           // nan value counts
           null,
           // lower bounds
-          ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")),
+          ImmutableMap.of(
+              5, toByteBuffer(StringType.get(), "bbb"), 3, toByteBuffer(StringType.get(), "1str1")),
           // upper bounds
-          ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")));
+          ImmutableMap.of(
+              5,
+              toByteBuffer(StringType.get(), "bbb"),
+              3,
+              toByteBuffer(StringType.get(), "3str3")));
+
+  private static final DataFile FILE_5 =
+      new TestDataFile(
+          "file_5.avro",
+          Row.of(),
+          50,
+          // any value counts, including nulls
+          ImmutableMap.of(3, 50L),
+          // null value counts
+          ImmutableMap.of(3, 0L),
+          // nan value counts
+          null,
+          // lower bounds
+          ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")),
+          // upper bounds
+          ImmutableMap.of(3, toByteBuffer(StringType.get(), "abcdefghi")));
 
   @Test
   public void testAllNulls() {
@@ -684,4 +708,68 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE))
         new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE);
     assertThat(shouldRead).as("notNull nested column should not match").isFalse();
   }
+
+  @Test
+  public void testStringNotStartsWith() {
+    boolean shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a"), true).eval(FILE);
+    assertThat(shouldRead).as("Should not match: no bounds for the column").isFalse();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "x"), true).eval(FILE_2);
+    assertThat(shouldRead).as("Should match: upper bound sorts before the prefix").isTrue();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a"), true).eval(FILE_2);
+    assertThat(shouldRead).as("Should not match: lower bound starts with the prefix").isFalse();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aa"), true).eval(FILE_2);
+    assertThat(shouldRead).as("Should not match: lower bound equals the prefix").isFalse();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aaa"), true).eval(FILE_2);
+    assertThat(shouldRead).as("Should not match: bounds are shorter than the prefix").isFalse();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "1s"), true).eval(FILE_3);
+    assertThat(shouldRead).as("Should not match: lower bound starts with the prefix").isFalse();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "1str1x"), true).eval(FILE_3);
+    assertThat(shouldRead).as("Should not match: bounds are shorter than the prefix").isFalse();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aB"), true).eval(FILE_2);
+    assertThat(shouldRead).as("Should match: lower bound sorts after the prefix").isTrue();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "dWX"), true).eval(FILE_2);
+    assertThat(shouldRead).as("Should not match: bounds are shorter than the prefix").isFalse();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "5"), true).eval(FILE_3);
+    assertThat(shouldRead).as("Should match: upper bound sorts before the prefix").isTrue();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "3str3x"), true).eval(FILE_3);
+    assertThat(shouldRead).as("Should not match: bounds are shorter than the prefix").isFalse();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abc"), true).eval(FILE_5);
+    assertThat(shouldRead).as("Should not match: both bounds start with the prefix").isFalse();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abcd"), true).eval(FILE_5);
+    assertThat(shouldRead).as("Should not match: upper bound starts with the prefix").isFalse();
+
+    // a prefix that sorts strictly between the bounds can never be a strict match
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "b"), true).eval(FILE_2);
+    assertThat(shouldRead).as("Should not match: prefix sorts between the bounds").isFalse();
+
+    shouldRead =
+        new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "2"), true).eval(FILE_3);
+    assertThat(shouldRead).as("Should not match: prefix sorts between the bounds").isFalse();
+  }
 }