Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ public boolean test(T value) {
return cmp.compare(value, literal.value()) != 0;
case STARTS_WITH:
return String.valueOf(value).startsWith((String) literal.value());
case NOT_STARTS_WITH:
return !String.valueOf(value).startsWith((String) literal.value());
default:
throw new IllegalStateException("Invalid operation for BoundLiteralPredicate: " + op());
}
Expand All @@ -91,6 +93,8 @@ public String toString() {
return term() + " != " + literal;
case STARTS_WITH:
return term() + " startsWith \"" + literal + "\"";
case NOT_STARTS_WITH:
return term() + " notStartsWith \"" + literal + "\"";
case IN:
return term() + " in { " + literal + " }";
case NOT_IN:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,5 +151,10 @@ public <T> Boolean notIn(Bound<T> valueExpr, Set<T> literalSet) {
public <T> Boolean startsWith(Bound<T> valueExpr, Literal<T> lit) {
  // Evaluate the bound expression against the visitor's struct once and keep the raw result.
  Object value = valueExpr.eval(struct);
  // A null result cannot start with any prefix: report "no match" instead of letting the
  // cast-and-call below fail with a NullPointerException.
  return value != null && ((String) value).startsWith((String) lit.value());
}

/**
 * Evaluates {@code notStartsWith} as the logical negation of {@code startsWith} for the same
 * bound expression and literal.
 */
@Override
public <T> Boolean notStartsWith(Bound<T> valueExpr, Literal<T> lit) {
  boolean hasPrefix = startsWith(valueExpr, lit);
  return !hasPrefix;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ enum Operation {
NOT,
AND,
OR,
STARTS_WITH;
STARTS_WITH,
NOT_STARTS_WITH;

/**
* Returns the operation used when this is negated.
Expand Down Expand Up @@ -74,6 +75,10 @@ public Operation negate() {
return Operation.NOT_IN;
case NOT_IN:
return Operation.IN;
case STARTS_WITH:
return Operation.NOT_STARTS_WITH;
case NOT_STARTS_WITH:
return Operation.STARTS_WITH;
default:
throw new IllegalArgumentException("No negation for operation: " + this);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,15 +102,19 @@ public <T> R notEq(BoundReference<T> ref, Literal<T> lit) {
}

/**
 * Handles a bound {@code in} predicate.
 *
 * <p>This base implementation does not support the predicate and always throws.
 *
 * @param ref the bound reference of the predicate
 * @param literalSet the literal values of the predicate
 * @throws UnsupportedOperationException always
 */
public <T> R in(BoundReference<T> ref, Set<T> literalSet) {
  // Only the current message is kept; the superseded throw that preceded it made this
  // statement unreachable.
  throw new UnsupportedOperationException("In expression is not supported by the visitor");
}

/**
 * Handles a bound {@code notIn} predicate.
 *
 * <p>This base implementation does not support the predicate and always throws.
 *
 * @param ref the bound reference of the predicate
 * @param literalSet the literal values of the predicate
 * @throws UnsupportedOperationException always
 */
public <T> R notIn(BoundReference<T> ref, Set<T> literalSet) {
  // Only the current message is kept; the superseded throw that preceded it made this
  // statement unreachable.
  throw new UnsupportedOperationException("notIn expression is not supported by the visitor");
}

/**
 * Handles a bound {@code startsWith} predicate.
 *
 * <p>This base implementation does not support the predicate and always throws.
 *
 * @param ref the bound reference of the predicate
 * @param lit the literal prefix of the predicate
 * @throws UnsupportedOperationException always
 */
public <T> R startsWith(BoundReference<T> ref, Literal<T> lit) {
  // Only the current, operation-specific message is kept; the superseded generic throw that
  // preceded it made this statement unreachable.
  throw new UnsupportedOperationException("startsWith expression is not supported by the visitor");
}

/**
 * Handles a bound {@code notStartsWith} predicate.
 *
 * <p>This base implementation does not support the predicate and always throws.
 *
 * @param ref the bound reference of the predicate
 * @param lit the literal prefix of the predicate
 * @throws UnsupportedOperationException always
 */
public <T> R notStartsWith(BoundReference<T> ref, Literal<T> lit) {
  String message = "notStartsWith expression is not supported by the visitor";
  throw new UnsupportedOperationException(message);
}

/**
Expand Down Expand Up @@ -151,6 +155,8 @@ public <T> R predicate(BoundPredicate<T> pred) {
return notEq((BoundReference<T>) pred.term(), literalPred.literal());
case STARTS_WITH:
return startsWith((BoundReference<T>) pred.term(), literalPred.literal());
case NOT_STARTS_WITH:
return notStartsWith((BoundReference<T>) pred.term(), literalPred.literal());
default:
throw new IllegalStateException("Invalid operation for BoundLiteralPredicate: " + pred.op());
}
Expand Down Expand Up @@ -242,6 +248,10 @@ public <T> R startsWith(Bound<T> expr, Literal<T> lit) {
throw new UnsupportedOperationException("Unsupported operation.");
}

public <T> R notStartsWith(Bound<T> expr, Literal<T> lit) {
throw new UnsupportedOperationException("Unsupported operation.");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the exception message should specify which operation is not supported. startsWith should also be modified.

Copy link
Contributor Author

@kbendick kbendick Dec 6, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think generally we try to touch as few lines as needed in some cases (or avoid unnecessary changes), as that makes it easier on people who maintain forks.

I don't disagree with you that the error message should be more descriptive, but given what I know about the API and the implemented classes, I don't think these specific base-class methods of BoundVisitor actually get called anywhere. Notice how many of them return null as well.

As for the specific error message, it's just to be consistent with the existing one for startsWith.

Given the size of the PR already, I think it's best to keep unrelated changes out and in a separate PR (particularly updating startsWith - but until then, let's be consistent).

Large PRs like this already face a hurdle.

}

@Override
public <T> R predicate(BoundPredicate<T> pred) {
if (pred.isLiteralPredicate()) {
Expand All @@ -261,6 +271,8 @@ public <T> R predicate(BoundPredicate<T> pred) {
return notEq(pred.term(), literalPred.literal());
case STARTS_WITH:
return startsWith(pred.term(), literalPred.literal());
case NOT_STARTS_WITH:
return notStartsWith(pred.term(), literalPred.literal());
default:
throw new IllegalStateException("Invalid operation for BoundLiteralPredicate: " + pred.op());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,14 @@ public static UnboundPredicate<String> startsWith(UnboundTerm<String> expr, Stri
return new UnboundPredicate<>(Expression.Operation.STARTS_WITH, expr, value);
}

/**
 * Creates an unbound {@code NOT_STARTS_WITH} predicate on the reference with the given name.
 *
 * @param name the name passed to {@code ref} to build the term
 * @param value the prefix literal
 * @return a new unbound predicate
 */
public static UnboundPredicate<String> notStartsWith(String name, String value) {
  UnboundTerm<String> term = ref(name);
  return new UnboundPredicate<>(Expression.Operation.NOT_STARTS_WITH, term, value);
}

/**
 * Creates an unbound {@code NOT_STARTS_WITH} predicate on the given term.
 *
 * @param expr the unbound term the predicate applies to
 * @param value the prefix literal
 * @return a new unbound predicate
 */
public static UnboundPredicate<String> notStartsWith(UnboundTerm<String> expr, String value) {
  Expression.Operation operation = Expression.Operation.NOT_STARTS_WITH;
  return new UnboundPredicate<>(operation, expr, value);
}

/**
 * Creates an unbound {@code IN} predicate for the named reference from the given values.
 *
 * <p>The varargs array is only read: its elements are copied into a new list before being handed
 * to {@code predicate}. That is why the method is marked {@link SafeVarargs}, which keeps callers
 * with generic element types free of heap-pollution warnings.
 *
 * @param name the reference name
 * @param values the values to test membership against
 * @return the predicate produced by {@code predicate(Operation.IN, name, ...)}
 */
@SafeVarargs
public static <T> UnboundPredicate<T> in(String name, T... values) {
  return predicate(Operation.IN, name, Lists.newArrayList(values));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,52 @@ public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) {
return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();

if (mayContainNull(id)) {
return ROWS_MIGHT_MATCH;
}

ByteBuffer prefixAsBytes = lit.toByteBuffer();

Comparator<ByteBuffer> comparator = Comparators.unsignedBytes();

// notStartsWith will match unless all values must start with the prefix. This happens when the lower and upper
// bounds both start with the prefix.
if (lowerBounds != null && upperBounds != null &&
lowerBounds.containsKey(id) && upperBounds.containsKey(id)) {
ByteBuffer lower = lowerBounds.get(id);
// if lower is shorter than the prefix then lower doesn't start with the prefix
if (lower.remaining() < prefixAsBytes.remaining()) {
return ROWS_MIGHT_MATCH;
}

int cmp = comparator.compare(BinaryUtil.truncateBinary(lower, prefixAsBytes.remaining()), prefixAsBytes);
if (cmp == 0) {
ByteBuffer upper = upperBounds.get(id);
// if upper is shorter than the prefix then upper can't start with the prefix
if (upper.remaining() < prefixAsBytes.remaining()) {
return ROWS_MIGHT_MATCH;
}

cmp = comparator.compare(BinaryUtil.truncateBinary(upper, prefixAsBytes.remaining()), prefixAsBytes);
if (cmp == 0) {
// both bounds match the prefix, so all rows must match the prefix and therefore do not satisfy
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: both bounds start with the prefix (?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense as they've potentially been truncated.

// the predicate
return ROWS_CANNOT_MATCH;
}
}
}

return ROWS_MIGHT_MATCH;
}

/**
 * Reports whether the field may hold null values according to the recorded null counts.
 *
 * <p>When no null counts are available at all the answer is {@code true}; when counts exist but
 * none is recorded for this field the answer is {@code false}; otherwise the field may contain
 * nulls exactly when its recorded count is non-zero.
 */
private boolean mayContainNull(Integer id) {
  if (nullCounts == null) {
    return true;
  }

  if (!nullCounts.containsKey(id)) {
    return false;
  }

  return nullCounts.get(id) != 0;
}

private boolean containsNullsOnly(Integer id) {
return valueCounts != null && valueCounts.containsKey(id) &&
nullCounts != null && nullCounts.containsKey(id) &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,50 @@ public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) {
return ROWS_MIGHT_MATCH;
}

/**
 * Decides whether the summarized partition values can include one that does not start with the
 * prefix literal.
 *
 * <p>The predicate can only be ruled out when every value must start with the prefix, which is the
 * case when the lower and the upper bound both start with it. In every other situation (nulls
 * present, a missing bound, a bound shorter than the prefix, or a bound with a different prefix)
 * the result is {@code ROWS_MIGHT_MATCH}.
 */
@Override
public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) {
  PartitionFieldSummary summary = stats.get(Accessors.toPosition(ref.accessor()));

  if (summary.containsNull()) {
    return ROWS_MIGHT_MATCH;
  }

  ByteBuffer lowerBound = summary.lowerBound();
  ByteBuffer upperBound = summary.upperBound();
  if (lowerBound == null || upperBound == null) {
    // without both bounds nothing can be concluded about the values
    return ROWS_MIGHT_MATCH;
  }

  ByteBuffer prefix = lit.toByteBuffer();
  Comparator<ByteBuffer> byteOrder = Comparators.unsignedBytes();

  // a lower bound shorter than the prefix cannot start with the prefix
  if (lowerBound.remaining() < prefix.remaining()) {
    return ROWS_MIGHT_MATCH;
  }

  // cut the lower bound down to the prefix length and compare for equality
  boolean lowerStartsWithPrefix =
      byteOrder.compare(BinaryUtil.truncateBinary(lowerBound, prefix.remaining()), prefix) == 0;
  if (!lowerStartsWithPrefix) {
    return ROWS_MIGHT_MATCH;
  }

  // an upper bound shorter than the prefix cannot start with the prefix
  if (upperBound.remaining() < prefix.remaining()) {
    return ROWS_MIGHT_MATCH;
  }

  // cut the upper bound down to the prefix length and compare for equality
  boolean upperStartsWithPrefix =
      byteOrder.compare(BinaryUtil.truncateBinary(upperBound, prefix.remaining()), prefix) == 0;
  if (upperStartsWithPrefix) {
    // both bounds start with the prefix, so every value starts with it and none can satisfy
    // notStartsWith
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

private boolean allValuesAreNull(PartitionFieldSummary summary, Type.TypeID typeId) {
// containsNull encodes whether at least one partition value is null,
// lowerBound is null if all partition values are null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,11 @@ public <T> Expression startsWith(BoundReference<T> ref, Literal<T> lit) {
return ((String) ref.eval(struct)).startsWith((String) lit.value()) ? alwaysTrue() : alwaysFalse();
}

/**
 * Resolves {@code notStartsWith} against the current struct: yields {@code alwaysFalse()} when
 * the referenced string starts with the literal prefix and {@code alwaysTrue()} otherwise.
 */
@Override
public <T> Expression notStartsWith(BoundReference<T> ref, Literal<T> lit) {
  String value = (String) ref.eval(struct);
  String prefix = (String) lit.value();
  if (value.startsWith(prefix)) {
    return alwaysFalse();
  }

  return alwaysTrue();
}

@Override
@SuppressWarnings("unchecked")
public <T> Expression predicate(BoundPredicate<T> pred) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,12 @@ public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) {
return ROWS_MIGHT_NOT_MATCH;
}

/**
 * Evaluates {@code notStartsWith} for this evaluator.
 *
 * <p>No bounds analysis is implemented yet, so the result is always {@code ROWS_MIGHT_NOT_MATCH},
 * regardless of the reference or the literal.
 */
@Override
public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) {
// TODO: Detect cases where no value can start with the prefix, so that every row satisfies
// notStartsWith, such as notStartsWith("x") when the bounds are ["a", "b"].
return ROWS_MIGHT_NOT_MATCH;
}

/**
 * Reports whether the field can hold null values according to the recorded null counts.
 *
 * <p>When no null counts are available at all the answer is {@code true}; when counts exist but
 * none is recorded for this field the answer is {@code false}; otherwise the field can contain
 * nulls exactly when its recorded count is greater than zero.
 */
private boolean canContainNulls(Integer id) {
  if (nullCounts == null) {
    return true;
  }

  if (!nullCounts.containsKey(id)) {
    return false;
  }

  return nullCounts.get(id) > 0;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,8 @@ public String toString() {
return term() + " != " + literal();
case STARTS_WITH:
return term() + " startsWith \"" + literal() + "\"";
case NOT_STARTS_WITH:
return term() + " notStartsWith \"" + literal() + "\"";
case IN:
return term() + " in (" + COMMA.join(literals()) + ")";
case NOT_IN:
Expand Down
53 changes: 44 additions & 9 deletions api/src/main/java/org/apache/iceberg/transforms/Truncate.java
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,29 @@ public UnboundPredicate<CharSequence> project(String name,
if (predicate.isUnaryPredicate()) {
return Expressions.predicate(predicate.op(), name);
} else if (predicate.isLiteralPredicate()) {
return ProjectionUtil.truncateArray(name, predicate.asLiteralPredicate(), this);
BoundLiteralPredicate<CharSequence> pred = predicate.asLiteralPredicate();
Copy link
Member

@RussellSpitzer RussellSpitzer Dec 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if I understand this we have a set of optimizations here,

If we have a starts with:
  If our prefix is shorter than the truncate:
    Then behave as normal and compare with normal StartsWith
  If our prefix is the same length as the truncate:
    Then we can do an equal instead
  If our prefix is longer than the truncate:
     Truncate the literal and check that
If we have a notStartsWith:
  If our prefix is shorter than the truncate:
     Behave as normal with notStartsWith
  If our prefix is the same length as truncate:
     Compare with Equals
  If our prefix is longer than the truncate:
      No pruning possible because we don't have enough information

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think that's an accurate summary.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another way of thinking about this is that the truncate transform doesn't affect the result of startsWith if truncation is longer than the startsWith prefix. If I'm looking for strings that start with a 2-character pattern, like aa, then truncating strings to 4 characters first doesn't matter.

That's why both STARTS_WITH and NOT_STARTS_WITH have the same logic here and in the strict projection. As long as the truncation length is >= the prefix length, the original startsWith predicate is still valid. (Though we can optimize a little if we know that the string will be no longer than the prefix.)

So the only cases we need to worry about are when the truncated value is shorter than the prefix.

  • For startsWith and inclusive, using a shortened prefix will find values that might match
  • For notStartsWith and inclusive, using a shortened value will exclude values that do match, so there is no projection
  • For startsWith and strict, using a shortened value would include values that do not match, so there is no projection
  • For notStartsWith and strict, using a shortened value will exclude all values that must match

The tests have some good examples.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that is a correct summary.

switch (pred.op()) {
case STARTS_WITH:
if (pred.literal().value().length() < width()) {
return Expressions.predicate(pred.op(), name, pred.literal().value());
} else if (pred.literal().value().length() == width()) {
return Expressions.equal(name, pred.literal().value());
}

return ProjectionUtil.truncateArray(name, pred, this);

case NOT_STARTS_WITH:
if (pred.literal().value().length() < width()) {
return Expressions.predicate(pred.op(), name, pred.literal().value());
} else if (pred.literal().value().length() == width()) {
return Expressions.notEqual(name, pred.literal().value());
}

return null;

default:
return ProjectionUtil.truncateArray(name, pred, this);
}
} else if (predicate.isSetPredicate() && predicate.op() == Expression.Operation.IN) {
return ProjectionUtil.transformSet(name, predicate.asSetPredicate(), this);
}
Expand All @@ -303,14 +325,27 @@ public UnboundPredicate<CharSequence> projectStrict(String name,
return Expressions.predicate(predicate.op(), name);
} else if (predicate instanceof BoundLiteralPredicate) {
BoundLiteralPredicate<CharSequence> pred = predicate.asLiteralPredicate();
if (pred.op() == Expression.Operation.STARTS_WITH) {
if (pred.literal().value().length() < width()) {
return Expressions.predicate(pred.op(), name, pred.literal().value());
} else if (pred.literal().value().length() == width()) {
return Expressions.equal(name, pred.literal().value());
}
} else {
return ProjectionUtil.truncateArrayStrict(name, pred, this);
switch (pred.op()) {
case STARTS_WITH:
if (pred.literal().value().length() < width()) {
return Expressions.predicate(pred.op(), name, pred.literal().value());
} else if (pred.literal().value().length() == width()) {
return Expressions.equal(name, pred.literal().value());
}

return null;

case NOT_STARTS_WITH:
if (pred.literal().value().length() < width()) {
return Expressions.predicate(pred.op(), name, pred.literal().value());
} else if (pred.literal().value().length() == width()) {
return Expressions.notEqual(name, pred.literal().value());
}

return Expressions.predicate(pred.op(), name, apply(pred.literal().value()).toString());

default:
return ProjectionUtil.truncateArrayStrict(name, pred, this);
}
} else if (predicate.isSetPredicate() && predicate.op() == Expression.Operation.NOT_IN) {
return ProjectionUtil.transformSet(name, predicate.asSetPredicate(), this);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,10 @@
import static org.apache.iceberg.expressions.Expressions.notIn;
import static org.apache.iceberg.expressions.Expressions.notNaN;
import static org.apache.iceberg.expressions.Expressions.notNull;
import static org.apache.iceberg.expressions.Expressions.notStartsWith;
import static org.apache.iceberg.expressions.Expressions.or;
import static org.apache.iceberg.expressions.Expressions.predicate;
import static org.apache.iceberg.expressions.Expressions.startsWith;
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

Expand Down Expand Up @@ -220,6 +222,29 @@ public void testNotEqual() {

}

/** Checks row-level evaluation of {@code startsWith("s", "abc")} on a single string column. */
@Test
public void testStartsWith() {
  StructType schema = StructType.of(required(24, "s", Types.StringType.get()));
  Evaluator startsWithAbc = new Evaluator(schema, startsWith("s", "abc"));

  // the prefix itself matches
  Assert.assertTrue("abc startsWith abc should be true", startsWithAbc.eval(TestHelpers.Row.of("abc")));
  // the prefix appearing later in the value, in a different case, or only partially does not match
  Assert.assertFalse("xabc startsWith abc should be false", startsWithAbc.eval(TestHelpers.Row.of("xabc")));
  Assert.assertFalse("Abc startsWith abc should be false", startsWithAbc.eval(TestHelpers.Row.of("Abc")));
  Assert.assertFalse("a startsWith abc should be false", startsWithAbc.eval(TestHelpers.Row.of("a")));
  // a longer value that begins with the prefix matches
  Assert.assertTrue("abcd startsWith abc should be true", startsWithAbc.eval(TestHelpers.Row.of("abcd")));
}

/** Checks row-level evaluation of {@code notStartsWith("s", "abc")} on a single string column. */
@Test
public void testNotStartsWith() {
  StructType schema = StructType.of(required(24, "s", Types.StringType.get()));
  Evaluator notStartsWithAbc = new Evaluator(schema, notStartsWith("s", "abc"));

  // the prefix itself is rejected
  Assert.assertFalse("abc notStartsWith abc should be false", notStartsWithAbc.eval(TestHelpers.Row.of("abc")));
  // values without the exact, case-sensitive prefix are accepted
  Assert.assertTrue("xabc notStartsWith abc should be true", notStartsWithAbc.eval(TestHelpers.Row.of("xabc")));
  Assert.assertTrue("Abc notStartsWith abc should be true", notStartsWithAbc.eval(TestHelpers.Row.of("Abc")));
  Assert.assertTrue("a notStartsWith abc should be true", notStartsWithAbc.eval(TestHelpers.Row.of("a")));
  // longer values follow the same rule
  Assert.assertFalse("abcde notStartsWith abc should be false", notStartsWithAbc.eval(TestHelpers.Row.of("abcde")));
  Assert.assertTrue("Abcde notStartsWith abc should be true", notStartsWithAbc.eval(TestHelpers.Row.of("Abcde")));
}

@Test
public void testAlwaysTrue() {
Evaluator evaluator = new Evaluator(STRUCT, alwaysTrue());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import static org.apache.iceberg.expressions.Expressions.greaterThan;
import static org.apache.iceberg.expressions.Expressions.lessThan;
import static org.apache.iceberg.expressions.Expressions.not;
import static org.apache.iceberg.expressions.Expressions.notStartsWith;
import static org.apache.iceberg.expressions.Expressions.or;
import static org.apache.iceberg.expressions.Expressions.startsWith;
import static org.apache.iceberg.types.Types.NestedField.required;
Expand All @@ -43,7 +44,8 @@ public class TestExpressionBinding {
private static final StructType STRUCT = StructType.of(
required(0, "x", Types.IntegerType.get()),
required(1, "y", Types.IntegerType.get()),
required(2, "z", Types.IntegerType.get())
required(2, "z", Types.IntegerType.get()),
required(3, "data", Types.StringType.get())
);

@Test
Expand Down Expand Up @@ -146,6 +148,18 @@ public void testStartsWith() {
Assert.assertEquals("Should bind s correctly", 0, pred.term().ref().fieldId());
}

/** Binds {@code notStartsWith("s", "abc")} and checks the resulting operation and field id. */
@Test
public void testNotStartsWith() {
  StructType schema = StructType.of(required(21, "s", Types.StringType.get()));
  Expression unbound = notStartsWith("s", "abc");
  Expression bound = Binder.bind(schema, unbound);
  TestHelpers.assertAllReferencesBound("NotStartsWith", bound);

  // the bound expression must be a predicate that still carries NOT_STARTS_WITH
  BoundPredicate<?> boundPredicate = TestHelpers.assertAndUnwrap(bound, BoundPredicate.class);
  Assert.assertEquals(
      "Should be right operation", Expression.Operation.NOT_STARTS_WITH, boundPredicate.op());
  Assert.assertEquals(
      "Should bind term to correct field id", 21, boundPredicate.term().ref().fieldId());
}

@Test
public void testAlwaysTrue() {
Assert.assertEquals("Should not change alwaysTrue",
Expand Down
Loading