Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,29 @@ private static <C extends Comparable<C>> LogicalExpression createAndPredicate(
ExpressionPosition pos
) {
return new ParquetBooleanPredicate<C>(name, args, pos) {
/**
* Evaluates a compound "AND" filter on the statistics of a RowGroup (the filter reads "filterA and filterB").
* Return value :<ul>
* <li>ALL : only if all filters return ALL
* <li>NONE : if one filter at least returns NONE
* <li>SOME : all other cases
* </ul>
*/
@Override
public boolean canDrop(RangeExprEvaluator<C> evaluator) {
// "and" : as long as one branch is OK to drop, we can drop it.
public RowsMatch matches(RangeExprEvaluator<C> evaluator) {
RowsMatch resultMatch = RowsMatch.ALL;
for (LogicalExpression child : this) {
if (child instanceof ParquetFilterPredicate && ((ParquetFilterPredicate)child).canDrop(evaluator)) {
return true;
if (child instanceof ParquetFilterPredicate) {
switch (((ParquetFilterPredicate) child).matches(evaluator)) {
case NONE:
return RowsMatch.NONE; // No row comply to 1 filter part => can drop RG
case SOME:
resultMatch = RowsMatch.SOME;
default: // Do nothing
}
}
}
return false;
return resultMatch;
}
};
}
Expand All @@ -66,15 +80,29 @@ private static <C extends Comparable<C>> LogicalExpression createOrPredicate(
ExpressionPosition pos
) {
return new ParquetBooleanPredicate<C>(name, args, pos) {
/**
* Evaluates a compound "OR" filter on the statistics of a RowGroup (the filter reads "filterA or filterB").
* Return value :<ul>
* <li>NONE : only if all filters return NONE
* <li>ALL : if one filter at least returns ALL
* <li>SOME : all other cases
* </ul>
*/
@Override
public boolean canDrop(RangeExprEvaluator<C> evaluator) {
public RowsMatch matches(RangeExprEvaluator<C> evaluator) {
RowsMatch resultMatch = RowsMatch.NONE;
for (LogicalExpression child : this) {
// "or" : as long as one branch is NOT ok to drop, we can NOT drop it.
if (!(child instanceof ParquetFilterPredicate) || !((ParquetFilterPredicate)child).canDrop(evaluator)) {
return false;
if (child instanceof ParquetFilterPredicate) {
switch (((ParquetFilterPredicate) child).matches(evaluator)) {
case ALL:
return RowsMatch.ALL; // One at least is ALL => can drop filter but not RG
case SOME:
resultMatch = RowsMatch.SOME;
default: // Do nothing
}
}
}
return true;
return resultMatch;
}
};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.function.BiPredicate;
import java.util.function.BiFunction;

import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.hasNoNulls;
import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.isNullOrEmpty;
import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.isAllNulls;

Expand All @@ -38,12 +39,13 @@ public class ParquetComparisonPredicate<C extends Comparable<C>> extends Logical
implements ParquetFilterPredicate<C> {
private final LogicalExpression left;
private final LogicalExpression right;
private final BiPredicate<Statistics<C>, Statistics<C>> predicate;

private final BiFunction<Statistics<C>, Statistics<C>, RowsMatch> predicate;

private ParquetComparisonPredicate(
LogicalExpression left,
LogicalExpression right,
BiPredicate<Statistics<C>, Statistics<C>> predicate
BiFunction<Statistics<C>, Statistics<C>, RowsMatch> predicate
) {
super(left.getPosition());
this.left = left;
Expand All @@ -65,7 +67,7 @@ public <T, V, E extends Exception> T accept(ExprVisitor<T, V, E> visitor, V valu
}

/**
* Semantics of canDrop() is very similar to what is implemented in Parquet library's
* Semantics of matches() is very similar to what is implemented in Parquet library's
* {@link org.apache.parquet.filter2.statisticslevel.StatisticsFilter} and
* {@link org.apache.parquet.filter2.predicate.FilterPredicate}
*
Expand All @@ -83,23 +85,29 @@ public <T, V, E extends Exception> T accept(ExprVisitor<T, V, E> visitor, V valu
* where Column1 and Column2 are from same parquet table.
*/
@Override
public boolean canDrop(RangeExprEvaluator<C> evaluator) {
public RowsMatch matches(RangeExprEvaluator<C> evaluator) {
Statistics<C> leftStat = left.accept(evaluator, null);
if (isNullOrEmpty(leftStat)) {
return false;
return RowsMatch.SOME;
}

Statistics<C> rightStat = right.accept(evaluator, null);
if (isNullOrEmpty(rightStat)) {
return false;
return RowsMatch.SOME;
}

// if either side is ALL null, = is evaluated to UNKNOWN -> canDrop
if (isAllNulls(leftStat, evaluator.getRowCount()) || isAllNulls(rightStat, evaluator.getRowCount())) {
return true;
return RowsMatch.NONE;
}
if (!leftStat.hasNonNullValue() || !rightStat.hasNonNullValue()) {
return RowsMatch.SOME;
}
return predicate.apply(leftStat, rightStat);
}

return (leftStat.hasNonNullValue() && rightStat.hasNonNullValue()) && predicate.test(leftStat, rightStat);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why check for hasNonNullValue() does not apply anymore?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Condition tested inside checkNull when necessary

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it is the same check. Can you check if isAllNulls is equivalent to !hasNonNullValue() in which case consider replacing isAllNulls() on line 92.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Concerning the min/max not defined, here is an example I found in drill/exec/java-exec/src/test/resources/parquetFilterPush/blnTbl/0_0_3.parquet:

value 1: R:0 D:0 V:
value 2: R:0 D:0 V:
value 3: R:0 D:0 V:

row group 1: RC:3 TS:27 OFFSET:4
col_bln: BOOLEAN UNCOMPRESSED DO:0 FPO:4 SZ:27/27/1.00 VC:3 ENC:BIT_PACKED,PLAIN,RLE ST:[num_nulls: 3, min/max not defined]

In this particular case isAllNulls() is true (all 3 values are null) and !hasNonNullValue() should be also true as there are no non null values.

Copy link
Contributor Author

@jbimbert jbimbert Jun 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Debugger :
System.out.println(leftStat) => min: false, max: false, num_nulls: 3
System.out.println(leftStat.hasNonNullValue()) => true
As you see, the function just states that min and max are defined, even if there are only nulls...

I think, the name of the function is not judicious. But it is a parquet lib function, and we can not modify it :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When isAllNulls() is true, hasNonNullValue() must be false, but if isAllNulls() is false, hasNonNullValue() may be true or false depending on data type and whether or not min and max can be calculated (not all data type have a proper comparator). It means that check for hasNonNullValue() can't be dropped (it is invalid to access or compare generic Min and/or Max without first checking that they are set).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I agree, but isn't it already checked in the previous function, matches?
public RowsMatch matches(RangeExprEvaluator evaluator) {
if (isNullOrEmpty(leftStat)) {
return RowsMatch.SOME;
}
...
return predicate.apply(leftStat, rightStat);
}

where
static boolean isNullOrEmpty(Statistics stat) {
return stat == null || stat.isEmpty();
}

where
public boolean isEmpty() {
return !this.hasNonNullValue && !this.isNumNullsSet();
}

When I call predicate.apply, the check for hasNonNullValue is already done.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is possible that min and max are not set while the number of nulls is set, so isEmpty() is false.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok. hasNonNullValue() added in matches().

/**
* If one rowgroup contains some null values, change the RowsMatch.ALL into RowsMatch.SOME (null values should be discarded by filter)
*/
private static RowsMatch checkNull(Statistics leftStat, Statistics rightStat) {
return !hasNoNulls(leftStat) || !hasNoNulls(rightStat) ? RowsMatch.SOME : RowsMatch.ALL;
}

/**
Expand All @@ -109,12 +117,9 @@ private static <C extends Comparable<C>> LogicalExpression createEqualPredicate(
LogicalExpression left,
LogicalExpression right
) {
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
// can drop when left's max < right's min, or right's max < left's min
final C leftMin = leftStat.genericGetMin();
final C rightMin = rightStat.genericGetMin();
return (leftStat.compareMaxToValue(rightMin) < 0) || (rightStat.compareMaxToValue(leftMin) < 0);
}) {
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) ->
leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0 || rightStat.compareMaxToValue(leftStat.genericGetMin()) < 0 ? RowsMatch.NONE : RowsMatch.SOME
) {
@Override
public String toString() {
return left + " = " + right;
Expand All @@ -130,9 +135,10 @@ private static <C extends Comparable<C>> LogicalExpression createGTPredicate(
LogicalExpression right
) {
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
// can drop when left's max <= right's min.
final C rightMin = rightStat.genericGetMin();
return leftStat.compareMaxToValue(rightMin) <= 0;
if (leftStat.compareMaxToValue(rightStat.genericGetMin()) <= 0) {
return RowsMatch.NONE;
}
return leftStat.compareMinToValue(rightStat.genericGetMax()) > 0 ? checkNull(leftStat, rightStat) : RowsMatch.SOME;
});
}

Expand All @@ -144,9 +150,10 @@ private static <C extends Comparable<C>> LogicalExpression createGEPredicate(
LogicalExpression right
) {
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
// can drop when left's max < right's min.
final C rightMin = rightStat.genericGetMin();
return leftStat.compareMaxToValue(rightMin) < 0;
if (leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0) {
return RowsMatch.NONE;
}
return leftStat.compareMinToValue(rightStat.genericGetMax()) >= 0 ? checkNull(leftStat, rightStat) : RowsMatch.SOME;
});
}

Expand All @@ -158,9 +165,10 @@ private static <C extends Comparable<C>> LogicalExpression createLTPredicate(
LogicalExpression right
) {
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
// can drop when right's max <= left's min.
final C leftMin = leftStat.genericGetMin();
return rightStat.compareMaxToValue(leftMin) <= 0;
if (rightStat.compareMaxToValue(leftStat.genericGetMin()) <= 0) {
return RowsMatch.NONE;
}
return leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0 ? checkNull(leftStat, rightStat) : RowsMatch.SOME;
});
}

Expand All @@ -171,9 +179,10 @@ private static <C extends Comparable<C>> LogicalExpression createLEPredicate(
LogicalExpression left, LogicalExpression right
) {
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
// can drop when right's max < left's min.
final C leftMin = leftStat.genericGetMin();
return rightStat.compareMaxToValue(leftMin) < 0;
if (rightStat.compareMaxToValue(leftStat.genericGetMin()) < 0) {
return RowsMatch.NONE;
}
return leftStat.compareMaxToValue(rightStat.genericGetMin()) <= 0 ? checkNull(leftStat, rightStat) : RowsMatch.SOME;
});
}

Expand All @@ -185,11 +194,10 @@ private static <C extends Comparable<C>> LogicalExpression createNEPredicate(
LogicalExpression right
) {
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
// can drop when there is only one unique value.
final C leftMax = leftStat.genericGetMax();
final C rightMax = rightStat.genericGetMax();
return leftStat.compareMinToValue(leftMax) == 0 && rightStat.compareMinToValue(rightMax) == 0 &&
leftStat.compareMaxToValue(rightMax) == 0;
if (leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0 || rightStat.compareMaxToValue(leftStat.genericGetMin()) < 0) {
return checkNull(leftStat, rightStat);
}
return leftStat.compareMaxToValue(rightStat.genericGetMax()) == 0 && leftStat.compareMinToValue(rightStat.genericGetMin()) == 0 ? RowsMatch.NONE : RowsMatch.SOME;
});
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,16 @@
package org.apache.drill.exec.expr.stat;

public interface ParquetFilterPredicate<T extends Comparable<T>> {
boolean canDrop(RangeExprEvaluator<T> evaluator);

/**
* Define the validity of a row group against a filter
* <ul>
* <li>ALL : all rows match the filter (can not drop the row group and can prune the filter)
* <li>NONE : no row matches the filter (can drop the row group)
* <li>SOME : some rows only match the filter or the filter can not be applied (can not drop the row group nor the filter)
* </ul>
*/
enum RowsMatch {ALL, NONE, SOME}

RowsMatch matches(RangeExprEvaluator<T> evaluator);
}
Loading