Push down partition filter to Spark when Importing File Based Tables #3745
Changes from all commits

    @@ -27,6 +27,7 @@
     import java.util.Objects;
     import java.util.Set;
     import java.util.stream.Collectors;
    +import org.apache.hadoop.fs.FileStatus;
     import org.apache.hadoop.fs.Path;
     import org.apache.iceberg.MetadataTableType;
     import org.apache.iceberg.MetadataTableUtils;

    @@ -48,6 +49,7 @@
     import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
     import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
     import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
    +import org.apache.iceberg.relocated.com.google.common.collect.Lists;
     import org.apache.iceberg.relocated.com.google.common.collect.Maps;
     import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;
     import org.apache.iceberg.spark.source.SparkTable;

    @@ -77,6 +79,7 @@
     import org.apache.spark.sql.connector.expressions.Transform;
     import org.apache.spark.sql.execution.datasources.FileStatusCache;
     import org.apache.spark.sql.execution.datasources.InMemoryFileIndex;
    +import org.apache.spark.sql.execution.datasources.PartitionDirectory;
     import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation;
     import org.apache.spark.sql.types.IntegerType;
     import org.apache.spark.sql.types.LongType;

    @@ -745,9 +748,11 @@ public static TableIdentifier identifierToTableIdentifier(Identifier identifier)
        * @param spark a Spark session
        * @param rootPath a table identifier
        * @param format format of the file
    +   * @param partitionFilter a partition filter used to select the partitions to import
        * @return all table's partitions
        */
    -  public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPath, String format) {
    +  public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPath, String format,
    +                                                   Map<String, String> partitionFilter) {
         FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark);
         Map<String, String> emptyMap = Collections.emptyMap();

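The new overload lets callers that import file-based tables pass a partition filter down to Spark's file listing instead of enumerating every partition. A minimal, hypothetical call site, assuming the method shown here lives in `Spark3Util` as the hunk header suggests (the table path and filter values are illustrative):

```java
import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;
import org.apache.spark.sql.SparkSession;

public class GetPartitionsWithFilterExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local").getOrCreate();

    // Illustrative table location and partition filter: only partitions whose
    // dt value is 2021-12-01 should be listed by the underlying file index.
    Path rootPath = new Path("file:/tmp/warehouse/db/events");
    Map<String, String> partitionFilter = ImmutableMap.of("dt", "2021-12-01");

    List<SparkPartition> partitions =
        Spark3Util.getPartitions(spark, rootPath, "parquet", partitionFilter);
    System.out.println(partitions.size() + " matching partitions");
  }
}
```

Passing an empty map should preserve the old behavior: no filter expressions are generated, so every partition under the root path is listed.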

    @@ -768,9 +773,23 @@ public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPa
         org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec();
         StructType schema = spec.partitionColumns();
    +    if (schema.isEmpty()) {
    +      return Lists.newArrayList();
    +    }
    +
    +    List<org.apache.spark.sql.catalyst.expressions.Expression> filterExpressions =
    +        SparkUtil.partitionMapToExpression(schema, partitionFilter);
    +    Seq<org.apache.spark.sql.catalyst.expressions.Expression> scalaPartitionFilters =
    +        JavaConverters.asScalaBufferConverter(filterExpressions).asScala().toSeq();
    +
    +    List<org.apache.spark.sql.catalyst.expressions.Expression> dataFilters = Lists.newArrayList();
    +    Seq<org.apache.spark.sql.catalyst.expressions.Expression> scalaDataFilters =
    +        JavaConverters.asScalaBufferConverter(dataFilters).asScala().toSeq();
    +
    +    Seq<PartitionDirectory> filteredPartitions = fileIndex.listFiles(scalaPartitionFilters, scalaDataFilters);
    +
         return JavaConverters
    -        .seqAsJavaListConverter(spec.partitions())
    +        .seqAsJavaListConverter(filteredPartitions)
             .asJava()
             .stream()
             .map(partition -> {

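The filter map is turned into Catalyst expressions by `SparkUtil.partitionMapToExpression` and handed to `InMemoryFileIndex.listFiles`, so Spark prunes partition directories before any listing results reach Iceberg. As a rough illustration of the kind of expressions involved (this is not the actual helper, which presumably also uses the partition schema to type the values), one equality predicate per map entry can be built through Spark's public `Column` API:

```java
import java.util.List;
import java.util.Map;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.functions;

public class PartitionFilterExpressionSketch {
  public static void main(String[] args) {
    // Illustrative filter: dt = '2021-12-01' AND hour = '00'.
    Map<String, String> partitionFilter = ImmutableMap.of("dt", "2021-12-01", "hour", "00");

    // One Catalyst equality expression per partition column in the filter map.
    List<Expression> filterExpressions = Lists.newArrayList();
    partitionFilter.forEach((column, value) ->
        filterExpressions.add(functions.col(column).equalTo(value).expr()));

    // Prints expressions such as ('dt = 2021-12-01).
    filterExpressions.forEach(System.out::println);
  }
}
```

The `dataFilters` list stays empty because only partition pruning is pushed down here.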

    @@ -781,7 +800,11 @@ public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPa
               Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType());
               values.put(field.name(), String.valueOf(value));
             });
    -        return new SparkPartition(values, partition.path().toString(), format);
    +
    +        FileStatus fileStatus =
    +            JavaConverters.seqAsJavaListConverter(partition.files()).asJava().get(0);

Contributor: Why does this use `fileStatus.getPath().getParent()` instead of `partition.path()`?

Contributor (Author): Because here `partition` is a `PartitionDirectory`: `listFiles` returns a `Seq` of `PartitionDirectory`. Before my change, `partition` was a `PartitionPath`.

Contributor: Great, thanks for the context! I assumed that it would use the same values.

    +
    +        return new SparkPartition(values, fileStatus.getPath().getParent().toString(), format);
             }).collect(Collectors.toList());
       }

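As the exchange above notes, `listFiles` returns `PartitionDirectory` objects, which expose the partition's data files rather than the partition directory itself, so the partition location is recovered from the first file's parent path. A small sketch of that derivation using Hadoop's `Path` API (the file path below is made up):

```java
import org.apache.hadoop.fs.Path;

public class PartitionPathFromFile {
  public static void main(String[] args) {
    // Hypothetical data file inside a dt=2021-12-01 partition directory.
    Path dataFile = new Path("file:/tmp/warehouse/db/events/dt=2021-12-01/part-00000.parquet");

    // The partition's location is the parent directory of any of its files.
    Path partitionDir = dataFile.getParent();
    System.out.println(partitionDir);  // file:/tmp/warehouse/db/events/dt=2021-12-01
  }
}
```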
Reviewer: Is there an easier way to construct an empty sequence? Also, since this is always empty, can you put the `dataFilters` definition and this line next to one another? The line to create `scalaPartitionFilters` can be next to the line above that creates `filterExpressions`.
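For reference, the usual way to obtain an empty Scala `Seq` from Java is the same converter pattern the diff already uses, just applied to an empty Java list; a minimal sketch:

```java
import java.util.Collections;
import scala.collection.JavaConverters;
import scala.collection.Seq;

public class EmptySeqSketch {
  public static void main(String[] args) {
    // Convert an empty Java list into an empty Scala Seq, mirroring the diff's pattern.
    Seq<String> empty =
        JavaConverters.asScalaBufferConverter(Collections.<String>emptyList()).asScala().toSeq();
    System.out.println(empty.isEmpty());  // true
  }
}
```

Scala's own `Seq.empty` lives on the companion object, so reaching it from Java means going through `Seq$.MODULE$` and casting the result, which is arguably no simpler than the conversion above.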