-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-4502][SQL]Support parquet nested struct pruning and add relevant test #14957
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
7eaa428
46153f1
46c2474
1c34877
23465ba
ab8f5ec
92ed369
5697911
d9aa397
d093c82
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,8 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation | |
| import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
| import org.apache.spark.sql.execution.FileSourceScanExec | ||
| import org.apache.spark.sql.execution.SparkPlan | ||
| import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat | ||
| import org.apache.spark.sql.types.{StructField, StructType} | ||
|
|
||
| /** | ||
| * A strategy for planning scans over collections of files that might be partitioned or bucketed | ||
|
|
@@ -97,7 +99,15 @@ object FileSourceStrategy extends Strategy with Logging { | |
| dataColumns | ||
| .filter(requiredAttributes.contains) | ||
| .filterNot(partitionColumns.contains) | ||
| val outputSchema = readDataColumns.toStructType | ||
| val outputSchema = if (fsRelation.sqlContext.conf.isParquetNestColumnPruning | ||
| && fsRelation.fileFormat.isInstanceOf[ParquetFileFormat]) { | ||
| val totalSchema = readDataColumns.toStructType | ||
| val prunedSchema = StructType( | ||
| generateStructFieldsContainsNesting(projects, totalSchema)) | ||
| // Merge schema in same StructType and merge with filterAttributes | ||
| prunedSchema.fields.map(f => StructType(Array(f))).reduceLeft(_ merge _) | ||
| .merge(filterAttributes.toSeq.toStructType) | ||
| } else readDataColumns.toStructType | ||
|
||
| logInfo(s"Output Data Schema: ${outputSchema.simpleString(5)}") | ||
|
|
||
| val pushedDownFilters = dataFilters.flatMap(DataSourceStrategy.translateFilter) | ||
|
|
@@ -126,4 +136,52 @@ object FileSourceStrategy extends Strategy with Logging { | |
|
|
||
| case _ => Nil | ||
| } | ||
|
|
||
| private def generateStructFieldsContainsNesting(projects: Seq[Expression], | ||
| totalSchema: StructType) : Seq[StructField] = { | ||
|
||
| def generateStructField(curField: List[String], | ||
| node: Expression) : Seq[StructField] = { | ||
|
||
| node match { | ||
| case ai: GetArrayItem => | ||
| // Here we drop the previous for simplify array and map support. | ||
| // Same strategy in GetArrayStructFields and GetMapValue | ||
| generateStructField(List.empty[String], ai.child) | ||
| case asf: GetArrayStructFields => | ||
| generateStructField(List.empty[String], asf.child) | ||
| case mv: GetMapValue => | ||
| generateStructField(List.empty[String], mv.child) | ||
| case attr: AttributeReference => | ||
| Seq(getFieldRecursively(totalSchema, attr.name :: curField)) | ||
| case sf: GetStructField => | ||
| generateStructField(sf.name.get :: curField, sf.child) | ||
|
||
| case _ => | ||
| if (node.children.nonEmpty) { | ||
| node.children.flatMap(child => generateStructField(curField, child)) | ||
| } else { | ||
| Seq.empty[StructField] | ||
| } | ||
| } | ||
| } | ||
|
|
||
| def getFieldRecursively(totalSchema: StructType, | ||
| name: List[String]): StructField = { | ||
|
||
| if (name.length > 1) { | ||
| val curField = name.head | ||
| val curFieldType = totalSchema(curField) | ||
| curFieldType.dataType match { | ||
| case st: StructType => | ||
| val newField = getFieldRecursively(StructType(st.fields), name.drop(1)) | ||
| StructField(curFieldType.name, StructType(Seq(newField)), | ||
| curFieldType.nullable, curFieldType.metadata) | ||
| case _ => | ||
| throw new IllegalArgumentException(s"""Field "$curField" is not struct field.""") | ||
| } | ||
| } else { | ||
| totalSchema(name.head) | ||
| } | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually this function can be simplified to: def getNestedField(schema: StructType, path: Seq[String]): StructField = {
require(path.nonEmpty, "<error message>")
path.tail.foldLeft(schema(path.head)) { (field, name) =>
field.dataType match {
case t: StructType => t(name)
case _ => ??? // Throw exception here
}
}
}
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The function getFieldRecursively here needs to return a StructField that contains the full nesting relation along the path. For example: when we want to get a nested field, the result must preserve every enclosing struct level. So I can't use the simplified function getNestedField, because it returns only the last StructField in the path.
||
|
|
||
| projects.flatMap(p => generateStructField(List.empty[String], p)) | ||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -212,6 +212,11 @@ object SQLConf { | |
| .booleanConf | ||
| .createWithDefault(true) | ||
|
|
||
| val PARQUET_NEST_COLUMN_PRUNING = SQLConfigBuilder("spark.sql.parquet.nestColumnPruning") | ||
|
||
| .doc("When set this to true, we will tell parquet only read the nest column`s leaf fields ") | ||
|
||
| .booleanConf | ||
| .createWithDefault(false) | ||
|
|
||
| val PARQUET_CACHE_METADATA = SQLConfigBuilder("spark.sql.parquet.cacheMetadata") | ||
| .doc("Turns on caching of Parquet schema metadata. Can speed up querying of static data.") | ||
| .booleanConf | ||
|
|
@@ -661,6 +666,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { | |
|
|
||
| def isParquetINT96AsTimestamp: Boolean = getConf(PARQUET_INT96_AS_TIMESTAMP) | ||
|
|
||
| def isParquetNestColumnPruning: Boolean = getConf(PARQUET_NEST_COLUMN_PRUNING) | ||
|
||
|
|
||
| def writeLegacyParquetFormat: Boolean = getConf(PARQUET_WRITE_LEGACY_FORMAT) | ||
|
|
||
| def inMemoryPartitionPruning: Boolean = getConf(IN_MEMORY_PARTITION_PRUNING) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -571,6 +571,37 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext | |
| } | ||
| } | ||
|
|
||
| test("SPARK-4502 parquet nested fields pruning") { | ||
| // Schema of "test-data/nested-array-struct.parquet": | ||
| // root | ||
| // |-- col: struct (nullable = true) | ||
| // | |-- s1: struct (nullable = true) | ||
| // | | |-- s1_1: long (nullable = true) | ||
| // | | |-- s1_2: long (nullable = true) | ||
| // | |-- str: string (nullable = true) | ||
| // |-- num: long (nullable = true) | ||
| // |-- str: string (nullable = true) | ||
| val df = readResourceParquetFile("test-data/nested-struct.snappy.parquet") | ||
| df.createOrReplaceTempView("tmp_table") | ||
|
||
| // normal test | ||
| val query1 = "select num,col.s1.s1_1 from tmp_table" | ||
| val result1 = sql(query1) | ||
| withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") { | ||
| checkAnswer(sql(query1), result1) | ||
|
||
| } | ||
| // test for same struct meta merge | ||
| // col.s1.s1_1 and col.str should merge | ||
| // like col.[s1.s1_1, str] before pass to parquet | ||
| val query2 = "select col.s1.s1_1,col.str from tmp_table" | ||
| val result2 = sql(query2) | ||
| withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") { | ||
| checkAnswer(sql(query2), result2) | ||
| } | ||
|
|
||
| spark.sessionState.catalog.dropTable( | ||
| TableIdentifier("tmp_table"), ignoreIfNotExists = true, purge = false) | ||
| } | ||
|
|
||
| test("expand UDT in StructType") { | ||
| val schema = new StructType().add("n", new NestedStructUDT, nullable = true) | ||
| val expected = new StructType().add("n", new NestedStructUDT().sqlType, nullable = true) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe
fullSchema? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fix done