-
Notifications
You must be signed in to change notification settings - Fork 3k
[CORE] Add in a NOT_STARTS_WITH operator (including Spark 3.2) #2062
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f666de4
3d91eca
8e1505e
2e81452
9087b63
01e95d1
c3d8e7f
ecd9ed9
111253d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -102,15 +102,19 @@ public <T> R notEq(BoundReference<T> ref, Literal<T> lit) { | |
| } | ||
|
|
||
| public <T> R in(BoundReference<T> ref, Set<T> literalSet) { | ||
| throw new UnsupportedOperationException("In operation is not supported by the visitor"); | ||
| throw new UnsupportedOperationException("In expression is not supported by the visitor"); | ||
| } | ||
|
|
||
| public <T> R notIn(BoundReference<T> ref, Set<T> literalSet) { | ||
| throw new UnsupportedOperationException("notIn operation is not supported by the visitor"); | ||
| throw new UnsupportedOperationException("notIn expression is not supported by the visitor"); | ||
| } | ||
|
|
||
| public <T> R startsWith(BoundReference<T> ref, Literal<T> lit) { | ||
| throw new UnsupportedOperationException("Unsupported operation."); | ||
| throw new UnsupportedOperationException("startsWith expression is not supported by the visitor"); | ||
| } | ||
|
|
||
| public <T> R notStartsWith(BoundReference<T> ref, Literal<T> lit) { | ||
| throw new UnsupportedOperationException("notStartsWith expression is not supported by the visitor"); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -151,6 +155,8 @@ public <T> R predicate(BoundPredicate<T> pred) { | |
| return notEq((BoundReference<T>) pred.term(), literalPred.literal()); | ||
| case STARTS_WITH: | ||
| return startsWith((BoundReference<T>) pred.term(), literalPred.literal()); | ||
| case NOT_STARTS_WITH: | ||
| return notStartsWith((BoundReference<T>) pred.term(), literalPred.literal()); | ||
| default: | ||
| throw new IllegalStateException("Invalid operation for BoundLiteralPredicate: " + pred.op()); | ||
| } | ||
|
|
@@ -242,6 +248,10 @@ public <T> R startsWith(Bound<T> expr, Literal<T> lit) { | |
| throw new UnsupportedOperationException("Unsupported operation."); | ||
| } | ||
|
|
||
| public <T> R notStartsWith(Bound<T> expr, Literal<T> lit) { | ||
| throw new UnsupportedOperationException("Unsupported operation."); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the exception message should specify which operation is not supported.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think generally we try to touch as few lines as needed in some cases (or avoid unnecessary changes), as that makes it easier on people who maintain forks. I don't disagree with you that the error message should be more descriptive, but given what I know about the API and the implemented classes, I don't think these specific base class for As for the specific error message, it's just to be consistent with the existing one for Given the size of the PR already, I think it's best to keep unrelated changes out and in a separate PR (particularly updating Large PRs like this already face a hurdle. |
||
| } | ||
|
|
||
| @Override | ||
| public <T> R predicate(BoundPredicate<T> pred) { | ||
| if (pred.isLiteralPredicate()) { | ||
|
|
@@ -261,6 +271,8 @@ public <T> R predicate(BoundPredicate<T> pred) { | |
| return notEq(pred.term(), literalPred.literal()); | ||
| case STARTS_WITH: | ||
| return startsWith(pred.term(), literalPred.literal()); | ||
| case NOT_STARTS_WITH: | ||
| return notStartsWith(pred.term(), literalPred.literal()); | ||
| default: | ||
| throw new IllegalStateException("Invalid operation for BoundLiteralPredicate: " + pred.op()); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -408,6 +408,52 @@ public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) { | |
| return ROWS_MIGHT_MATCH; | ||
| } | ||
|
|
||
| @Override | ||
| public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) { | ||
kbendick marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| Integer id = ref.fieldId(); | ||
|
|
||
| if (mayContainNull(id)) { | ||
| return ROWS_MIGHT_MATCH; | ||
kbendick marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| ByteBuffer prefixAsBytes = lit.toByteBuffer(); | ||
|
|
||
| Comparator<ByteBuffer> comparator = Comparators.unsignedBytes(); | ||
|
|
||
| // notStartsWith will match unless all values must start with the prefix. This happens when the lower and upper | ||
| // bounds both start with the prefix. | ||
| if (lowerBounds != null && upperBounds != null && | ||
kbendick marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| lowerBounds.containsKey(id) && upperBounds.containsKey(id)) { | ||
| ByteBuffer lower = lowerBounds.get(id); | ||
kbendick marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // if lower is shorter than the prefix then lower doesn't start with the prefix | ||
| if (lower.remaining() < prefixAsBytes.remaining()) { | ||
| return ROWS_MIGHT_MATCH; | ||
| } | ||
|
|
||
| int cmp = comparator.compare(BinaryUtil.truncateBinary(lower, prefixAsBytes.remaining()), prefixAsBytes); | ||
| if (cmp == 0) { | ||
| ByteBuffer upper = upperBounds.get(id); | ||
| // if upper is shorter than the prefix then upper can't start with the prefix | ||
| if (upper.remaining() < prefixAsBytes.remaining()) { | ||
| return ROWS_MIGHT_MATCH; | ||
| } | ||
|
|
||
| cmp = comparator.compare(BinaryUtil.truncateBinary(upper, prefixAsBytes.remaining()), prefixAsBytes); | ||
| if (cmp == 0) { | ||
| // both bounds match the prefix, so all rows must match the prefix and therefore do not satisfy | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: both bounds start with the prefix (?)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That makes sense as they've potentially been truncated. |
||
| // the predicate | ||
| return ROWS_CANNOT_MATCH; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return ROWS_MIGHT_MATCH; | ||
| } | ||
|
|
||
| private boolean mayContainNull(Integer id) { | ||
rdblue marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return nullCounts == null || (nullCounts.containsKey(id) && nullCounts.get(id) != 0); | ||
| } | ||
|
|
||
| private boolean containsNullsOnly(Integer id) { | ||
| return valueCounts != null && valueCounts.containsKey(id) && | ||
| nullCounts != null && nullCounts.containsKey(id) && | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -285,7 +285,29 @@ public UnboundPredicate<CharSequence> project(String name, | |
| if (predicate.isUnaryPredicate()) { | ||
| return Expressions.predicate(predicate.op(), name); | ||
| } else if (predicate.isLiteralPredicate()) { | ||
| return ProjectionUtil.truncateArray(name, predicate.asLiteralPredicate(), this); | ||
| BoundLiteralPredicate<CharSequence> pred = predicate.asLiteralPredicate(); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So if I understand this we have a set of optimizations here,
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I think that's an accurate summary.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another way of thinking about this is that the truncate transform doesn't affect the result of That's why both So the only cases we need to worry about are when the truncated value is shorter than the prefix.
The tests have some good examples.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah that is a correct summary. |
||
| switch (pred.op()) { | ||
| case STARTS_WITH: | ||
| if (pred.literal().value().length() < width()) { | ||
| return Expressions.predicate(pred.op(), name, pred.literal().value()); | ||
| } else if (pred.literal().value().length() == width()) { | ||
| return Expressions.equal(name, pred.literal().value()); | ||
| } | ||
|
|
||
| return ProjectionUtil.truncateArray(name, pred, this); | ||
|
|
||
| case NOT_STARTS_WITH: | ||
| if (pred.literal().value().length() < width()) { | ||
| return Expressions.predicate(pred.op(), name, pred.literal().value()); | ||
| } else if (pred.literal().value().length() == width()) { | ||
| return Expressions.notEqual(name, pred.literal().value()); | ||
| } | ||
|
|
||
| return null; | ||
|
|
||
| default: | ||
| return ProjectionUtil.truncateArray(name, pred, this); | ||
| } | ||
| } else if (predicate.isSetPredicate() && predicate.op() == Expression.Operation.IN) { | ||
| return ProjectionUtil.transformSet(name, predicate.asSetPredicate(), this); | ||
| } | ||
|
|
@@ -303,14 +325,27 @@ public UnboundPredicate<CharSequence> projectStrict(String name, | |
| return Expressions.predicate(predicate.op(), name); | ||
| } else if (predicate instanceof BoundLiteralPredicate) { | ||
| BoundLiteralPredicate<CharSequence> pred = predicate.asLiteralPredicate(); | ||
| if (pred.op() == Expression.Operation.STARTS_WITH) { | ||
kbendick marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if (pred.literal().value().length() < width()) { | ||
| return Expressions.predicate(pred.op(), name, pred.literal().value()); | ||
| } else if (pred.literal().value().length() == width()) { | ||
| return Expressions.equal(name, pred.literal().value()); | ||
| } | ||
| } else { | ||
| return ProjectionUtil.truncateArrayStrict(name, pred, this); | ||
| switch (pred.op()) { | ||
| case STARTS_WITH: | ||
| if (pred.literal().value().length() < width()) { | ||
| return Expressions.predicate(pred.op(), name, pred.literal().value()); | ||
| } else if (pred.literal().value().length() == width()) { | ||
| return Expressions.equal(name, pred.literal().value()); | ||
| } | ||
|
|
||
| return null; | ||
|
|
||
| case NOT_STARTS_WITH: | ||
| if (pred.literal().value().length() < width()) { | ||
| return Expressions.predicate(pred.op(), name, pred.literal().value()); | ||
| } else if (pred.literal().value().length() == width()) { | ||
| return Expressions.notEqual(name, pred.literal().value()); | ||
| } | ||
|
|
||
| return Expressions.predicate(pred.op(), name, apply(pred.literal().value()).toString()); | ||
|
|
||
| default: | ||
| return ProjectionUtil.truncateArrayStrict(name, pred, this); | ||
| } | ||
| } else if (predicate.isSetPredicate() && predicate.op() == Expression.Operation.NOT_IN) { | ||
| return ProjectionUtil.transformSet(name, predicate.asSetPredicate(), this); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.