This repository was archived by the owner on Mar 12, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 8
[WIP] StorageHandler and PredicatePushdown #11
Merged
Merged
Changes from all commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
6191059
Add IcebergStorageHandler and predicate pushdown initial attempt
cmathiesen 5d66e93
Add constant options
cmathiesen fb567a7
Apply suggestions from review
cmathiesen b683216
Add tests and fix AND to work for unlimited list of leaves
cmathiesen f0fb336
Push filter to the Parquet reader and some clean up
cmathiesen 9d11a1c
Revert version change
cmathiesen 84b231f
clean up
cmathiesen 1380857
fix BETWEEN operator conversion
cmathiesen 7ac9972
Address review comments
cmathiesen 1cf711e
Update to README
cmathiesen c784cf3
Fix javadoc and reference in license
cmathiesen b663318
Picking nits
cmathiesen File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
153 changes: 153 additions & 0 deletions
153
src/main/java/com/expediagroup/hiveberg/IcebergFilterFactory.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,153 @@ | ||
| /** | ||
| * Copyright (C) 2020 Expedia, Inc. and the Apache Software Foundation. | ||
| * | ||
| * This class was inspired by code written for converting to Parquet filters: | ||
| * | ||
| * https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/ | ||
| * hive/ql/io/parquet/read/ParquetFilterPredicateConverter.java#L46 | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
| package com.expediagroup.hiveberg; | ||
|
|
||
| import java.util.List; | ||
| import org.apache.hadoop.hive.ql.io.sarg.ExpressionTree; | ||
| import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; | ||
| import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; | ||
| import org.apache.iceberg.expressions.Expression; | ||
|
|
||
| import static org.apache.iceberg.expressions.Expressions.and; | ||
| import static org.apache.iceberg.expressions.Expressions.equal; | ||
| import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; | ||
| import static org.apache.iceberg.expressions.Expressions.in; | ||
| import static org.apache.iceberg.expressions.Expressions.isNull; | ||
| import static org.apache.iceberg.expressions.Expressions.lessThan; | ||
| import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; | ||
| import static org.apache.iceberg.expressions.Expressions.not; | ||
| import static org.apache.iceberg.expressions.Expressions.notNull; | ||
| import static org.apache.iceberg.expressions.Expressions.or; | ||
|
|
||
| public class IcebergFilterFactory { | ||
|
|
||
| private IcebergFilterFactory () {} | ||
|
|
||
| public static Expression generateFilterExpression(SearchArgument sarg) { | ||
| List<PredicateLeaf> leaves = sarg.getLeaves(); | ||
| List<ExpressionTree> childNodes = sarg.getExpression().getChildren(); | ||
|
|
||
| switch (sarg.getExpression().getOperator()) { | ||
| case OR: | ||
| ExpressionTree orLeft = childNodes.get(0); | ||
| ExpressionTree orRight = childNodes.get(1); | ||
| return or(translate(orLeft, leaves), translate(orRight, leaves)); | ||
| case AND: | ||
| ExpressionTree andLeft = childNodes.get(0); | ||
| ExpressionTree andRight = childNodes.get(1); | ||
| if(childNodes.size() > 2) { | ||
| Expression[] evaluatedChildren = getLeftoverLeaves(childNodes, leaves); | ||
| return and( | ||
| translate(andLeft, leaves), translate(andRight, leaves), evaluatedChildren); | ||
| } else { | ||
| return and(translate(andLeft, leaves), translate(andRight, leaves)); | ||
| } | ||
| case NOT: | ||
| return not(translateLeaf(sarg.getLeaves().get(0))); | ||
| case LEAF: | ||
| return translateLeaf(sarg.getLeaves().get(0)); | ||
| case CONSTANT: | ||
| return null; | ||
massdosage marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| default: | ||
| throw new IllegalStateException("Unknown operator: " + sarg.getExpression().getOperator()); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Remove first 2 nodes already evaluated and return an array of the evaluated leftover nodes. | ||
| * @param allChildNodes All child nodes to be evaluated for the AND expression. | ||
| * @param leaves All instances of the leaf nodes. | ||
| * @return Array of leftover evaluated nodes. | ||
| */ | ||
| private static Expression[] getLeftoverLeaves(List<ExpressionTree> allChildNodes, List<PredicateLeaf> leaves) { | ||
| allChildNodes.remove(0); | ||
| allChildNodes.remove(0); | ||
|
|
||
| Expression[] evaluatedLeaves = new Expression[allChildNodes.size()]; | ||
| for(int i = 0; i < allChildNodes.size(); i ++) { | ||
| Expression filter = translate(allChildNodes.get(i), leaves); | ||
| evaluatedLeaves[i] = filter; | ||
| } | ||
| return evaluatedLeaves; | ||
| } | ||
|
|
||
| /** | ||
| * Recursive method to traverse down the ExpressionTree to evaluate each expression and its leaf nodes. | ||
| * @param tree Current ExpressionTree where the 'top' node is being evaluated. | ||
| * @param leaves List of all leaf nodes within the tree. | ||
| * @return Expression that is translated from the Hive SearchArgument. | ||
| */ | ||
| private static Expression translate(ExpressionTree tree, List<PredicateLeaf> leaves) { | ||
| switch (tree.getOperator()) { | ||
| case OR: | ||
| return or(translate(tree.getChildren().get(0), leaves), | ||
| translate(tree.getChildren().get(1), leaves)); | ||
| case AND: | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Handle case where there is more than 2 children from the AND operator |
||
| if(tree.getChildren().size() > 2) { | ||
| Expression[] evaluatedChildren = getLeftoverLeaves(tree.getChildren(), leaves); | ||
| return and(translate(tree.getChildren().get(0), leaves), | ||
| translate(tree.getChildren().get(1), leaves), evaluatedChildren); | ||
| } else { | ||
| return and(translate(tree.getChildren().get(0), leaves), | ||
| translate(tree.getChildren().get(1), leaves)); | ||
| } | ||
| case NOT: | ||
| return not(translate(tree.getChildren().get(0), leaves)); | ||
| case LEAF: | ||
| return translateLeaf(leaves.get(tree.getLeaf())); | ||
| case CONSTANT: | ||
| //We are unsure of how the CONSTANT case works, so using the approach of: | ||
| //https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ | ||
| // ParquetFilterPredicateConverter.java#L116 | ||
| return null; | ||
| default: | ||
| throw new IllegalStateException("Unknown operator: " + tree.getOperator()); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Translate leaf nodes from Hive operator to Iceberg operator. | ||
| * @param leaf Leaf node | ||
| * @return Expression fully translated from Hive PredicateLeaf | ||
| */ | ||
| private static Expression translateLeaf(PredicateLeaf leaf) { | ||
| String column = leaf.getColumnName(); | ||
| switch (leaf.getOperator()){ | ||
| case EQUALS: | ||
| return equal(column, leaf.getLiteral()); | ||
| case NULL_SAFE_EQUALS: | ||
| return equal(notNull(column).ref().name(), leaf.getLiteral()); //TODO: Unsure.. | ||
| case LESS_THAN: | ||
| return lessThan(column, leaf.getLiteral()); | ||
| case LESS_THAN_EQUALS: | ||
| return lessThanOrEqual(column, leaf.getLiteral()); | ||
| case IN: | ||
| return in(column, leaf.getLiteralList()); | ||
| case BETWEEN: | ||
| return and((greaterThanOrEqual(column, leaf.getLiteralList().get(0))), | ||
| lessThanOrEqual(column, leaf.getLiteralList().get(1))); | ||
| case IS_NULL: | ||
| return isNull(column); | ||
| default: | ||
| throw new IllegalStateException("Unknown operator: " + leaf.getOperator()); | ||
| } | ||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,7 +32,8 @@ class IcebergReaderFactory { | |
| IcebergReaderFactory() { | ||
| } | ||
|
|
||
| public CloseableIterable<Record> createReader(DataFile file, FileScanTask currentTask, InputFile inputFile, Schema tableSchema, boolean reuseContainers) { | ||
| public CloseableIterable<Record> createReader(DataFile file, FileScanTask currentTask, InputFile inputFile, | ||
| Schema tableSchema, boolean reuseContainers) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For ORC reader, perhaps add a comment advising the reader to look here to determine when this can be wired up for ORC: apache/iceberg#787 |
||
| switch (file.format()) { | ||
| case AVRO: | ||
| return buildAvroReader(currentTask, inputFile, tableSchema, reuseContainers); | ||
|
|
@@ -42,11 +43,11 @@ public CloseableIterable<Record> createReader(DataFile file, FileScanTask curren | |
| return buildParquetReader(currentTask, inputFile, tableSchema, reuseContainers); | ||
|
|
||
| default: | ||
| throw new UnsupportedOperationException(String.format("Cannot read %s file: %s", file.format().name(), file.path())); | ||
| throw new UnsupportedOperationException( | ||
| String.format("Cannot read %s file: %s", file.format().name(), file.path())); | ||
| } | ||
| } | ||
|
|
||
| // FIXME: use generic reader function | ||
| private CloseableIterable buildAvroReader(FileScanTask task, InputFile file, Schema schema, boolean reuseContainers) { | ||
| Avro.ReadBuilder builder = Avro.read(file) | ||
| .createReaderFunc(DataReader::create) | ||
|
|
@@ -60,7 +61,7 @@ private CloseableIterable buildAvroReader(FileScanTask task, InputFile file, Sch | |
| return builder.build(); | ||
| } | ||
|
|
||
| // FIXME: use generic reader function | ||
| //Predicate pushdown support for ORC can be tracked here: https://github.com/apache/incubator-iceberg/issues/787 | ||
| private CloseableIterable buildOrcReader(FileScanTask task, InputFile file, Schema schema, boolean reuseContainers) { | ||
| ORC.ReadBuilder builder = ORC.read(file) | ||
| // .createReaderFunc() // FIXME: implement | ||
|
|
@@ -70,11 +71,12 @@ private CloseableIterable buildOrcReader(FileScanTask task, InputFile file, Sche | |
| return builder.build(); | ||
| } | ||
|
|
||
| // FIXME: use generic reader function | ||
| private CloseableIterable buildParquetReader(FileScanTask task, InputFile file, Schema schema, boolean reuseContainers) { | ||
| private CloseableIterable buildParquetReader(FileScanTask task, InputFile file, Schema schema, | ||
| boolean reuseContainers) { | ||
| Parquet.ReadBuilder builder = Parquet.read(file) | ||
| .createReaderFunc(messageType -> GenericParquetReaders.buildReader(schema, messageType)) | ||
| .project(schema) | ||
| .filter(task.residual()) | ||
| .split(task.start(), task.length()); | ||
|
|
||
| if (reuseContainers) { | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.