[SPARK-4502][SQL] Support parquet nested struct pruning and add relevant test #14957
FileSourceStrategy.scala

@@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.FileSourceScanExec
 import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.types.StructType

 /**
  * A strategy for planning scans over collections of files that might be partitioned or bucketed
@@ -97,7 +98,15 @@ object FileSourceStrategy extends Strategy with Logging {
       dataColumns
         .filter(requiredAttributes.contains)
         .filterNot(partitionColumns.contains)
-      val outputSchema = readDataColumns.toStructType
+      val outputSchema = if (fsRelation.sqlContext.conf.isParquetNestColumnPruning) {
+        val requiredColumnsWithNesting = generateRequiredColumnsContainsNesting(
+          projects, readDataColumns.attrs.map(_.name).toArray)
+        val totalSchema = readDataColumns.toStructType
+        val prunedSchema = StructType(requiredColumnsWithNesting.map(totalSchema(_)))
+        // Merge fields that belong to the same StructType, then merge with filterAttributes
+        prunedSchema.fields.map(f => StructType(Array(f))).reduceLeft(_ merge _)
+          .merge(filterAttributes.toSeq.toStructType)
+      } else readDataColumns.toStructType
       logInfo(s"Output Data Schema: ${outputSchema.simpleString(5)}")

       val pushedDownFilters = dataFilters.flatMap(DataSourceStrategy.translateFilter)
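For intuition, here is what the per-field merge above computes for the queries exercised in the test suite below: each required dotted path (looked up via `totalSchema(_)`, which, per the review discussion at the bottom, relies on mixing a recursive lookup into `StructType`'s default `apply`) becomes a single-leaf schema, and merging keeps every requested leaf under the shared root. `StructType.merge` is package-private to Spark SQL, so this sketch spells out the inputs and the expected merged result by hand:

```scala
import org.apache.spark.sql.types._

// Pruned single-leaf schemas for myComplex.id and
// myComplex.repeatedMessage.someId (nullability flags left at defaults).
val s1 = StructType(Seq(StructField("myComplex", ArrayType(
  StructType(Seq(StructField("id", IntegerType)))))))
val s2 = StructType(Seq(StructField("myComplex", ArrayType(
  StructType(Seq(StructField("repeatedMessage", ArrayType(
    StructType(Seq(StructField("someId", IntegerType)))))))))))

// reduceLeft(_ merge _) in the snippet above would combine them into:
val merged = StructType(Seq(StructField("myComplex", ArrayType(
  StructType(Seq(
    StructField("id", IntegerType),
    StructField("repeatedMessage", ArrayType(
      StructType(Seq(StructField("someId", IntegerType)))))))))))

println(merged.simpleString)
// struct<myComplex:array<struct<id:int,repeatedMessage:array<struct<someId:int>>>>>
```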
@@ -126,4 +135,56 @@ object FileSourceStrategy extends Strategy with Logging {
     case _ => Nil
   }

+  private def generateRequiredColumnsContainsNesting(projects: Seq[Expression],
+      columns: Array[String]) : Array[String] = {
+    def generateAttributeMap(nestFieldMap: scala.collection.mutable.Map[String, Seq[String]],
+        isNestField: Boolean, curString: Option[String],
+        node: Expression): Unit = {
+      node match {
+        case ai: GetArrayItem =>
+          // Here we drop the curString to simplify array and map support.
+          // The same strategy applies to GetArrayStructFields and GetMapValue.
+          generateAttributeMap(nestFieldMap, isNestField = true, None, ai.child)
+        case asf: GetArrayStructFields =>
+          generateAttributeMap(nestFieldMap, isNestField = true, None, asf.child)
+        case mv: GetMapValue =>
+          generateAttributeMap(nestFieldMap, isNestField = true, None, mv.child)
+        case attr: AttributeReference =>
+          if (isNestField && curString.isDefined) {
+            val attrStr = attr.name
+            if (nestFieldMap.contains(attrStr)) {
+              nestFieldMap(attrStr) = nestFieldMap(attrStr) ++ Seq(attrStr + "." + curString.get)
+            } else {
+              nestFieldMap += (attrStr -> Seq(attrStr + "." + curString.get))
+            }
+          }
+        case sf: GetStructField =>
+          val str = if (curString.isDefined) {
+            sf.name.get + "." + curString.get
+          } else sf.name.get
+          generateAttributeMap(nestFieldMap, isNestField = true, Option(str), sf.child)
+        case _ =>
+          if (node.children.nonEmpty) {
+            node.children.foreach(child => generateAttributeMap(nestFieldMap,
+              isNestField, curString, child))
+          }
+      }
+    }
+
+    val nestFieldMap = scala.collection.mutable.Map.empty[String, Seq[String]]
+    projects.foreach(p => generateAttributeMap(nestFieldMap, isNestField = false, None, p))
+    val colList = columns.toList.flatMap { col =>
+      if (nestFieldMap.contains(col)) {
+        nestFieldMap(col).toList
+      } else {
+        List(col)
+      }
+    }
+    colList.toArray
+  }
+
 }
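To see what `generateRequiredColumnsContainsNesting` is meant to produce, here is a self-contained toy model of the same traversal. `Expr`, `Attr`, and `GetField` are hypothetical stand-ins for Catalyst's `Expression`, `AttributeReference`, and `GetStructField`; only the recursion shape mirrors the code above:

```scala
// Toy model of the traversal in generateRequiredColumnsContainsNesting.
sealed trait Expr
case class Attr(name: String) extends Expr
case class GetField(child: Expr, field: String) extends Expr

def collectPaths(e: Expr, suffix: Option[String],
    acc: scala.collection.mutable.Map[String, Seq[String]]): Unit = e match {
  case GetField(child, field) =>
    // Prepend this struct field to the nested path collected so far.
    val path = suffix.map(field + "." + _).getOrElse(field)
    collectPaths(child, Some(path), acc)
  case Attr(name) if suffix.isDefined =>
    // Reached the root column: record "root.nested.path" under the root name.
    acc(name) = acc.getOrElse(name, Seq.empty) :+ (name + "." + suffix.get)
  case _ => // bare column references need no pruning entry
}

val acc = scala.collection.mutable.Map.empty[String, Seq[String]]
// Models: SELECT myComplex.id, myComplex.repeatedMessage.someId ...
collectPaths(GetField(Attr("myComplex"), "id"), None, acc)
collectPaths(GetField(GetField(Attr("myComplex"), "repeatedMessage"), "someId"), None, acc)
println(acc("myComplex"))
// List(myComplex.id, myComplex.repeatedMessage.someId)
```

Feeding `columns = Array("primitive", "myComplex")` through the final flatMap step would then yield `Array("primitive", "myComplex.id", "myComplex.repeatedMessage.someId")`.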
SQLConf.scala
@@ -212,6 +212,11 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)

+  val PARQUET_NEST_COLUMN_PRUNING = SQLConfigBuilder("spark.sql.parquet.nestColumnPruning")
+    .doc("When set to true, Parquet readers will only read the leaf fields of nested " +
+      "columns that the query actually references.")
+    .booleanConf
+    .createWithDefault(false)

   val PARQUET_CACHE_METADATA = SQLConfigBuilder("spark.sql.parquet.cacheMetadata")
     .doc("Turns on caching of Parquet schema metadata. Can speed up querying of static data.")
     .booleanConf
@@ -661,6 +666,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {

   def isParquetINT96AsTimestamp: Boolean = getConf(PARQUET_INT96_AS_TIMESTAMP)

+  def isParquetNestColumnPruning: Boolean = getConf(PARQUET_NEST_COLUMN_PRUNING)
+
   def writeLegacyParquetFormat: Boolean = getConf(PARQUET_WRITE_LEGACY_FORMAT)

   def inMemoryPartitionPruning: Boolean = getConf(IN_MEMORY_PARTITION_PRUNING)
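Since the flag defaults to false, users must opt in explicitly. A minimal sketch of how that would look (the session boilerplate is illustrative; only the config key comes from this diff):

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical session setup for trying the feature out.
val spark = SparkSession.builder()
  .appName("nested-pruning-demo")
  .getOrCreate()

// Enable the nested-column pruning introduced by this patch
// (createWithDefault(false) above means it is off by default).
spark.conf.set("spark.sql.parquet.nestColumnPruning", "true")

// Equivalent SQL form:
spark.sql("SET spark.sql.parquet.nestColumnPruning=true")
```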
ParquetQuerySuite.scala
@@ -571,6 +571,44 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
     }
   }

+  test("SPARK-4502 parquet nested fields pruning") {
+    // Schema of "test-data/nested-array-struct.parquet":
+    // root
+    //  |-- primitive: integer (nullable = true)
+    //  |-- myComplex: array (nullable = true)
+    //  |    |-- element: struct (containsNull = true)
+    //  |    |    |-- id: integer (nullable = true)
+    //  |    |    |-- repeatedMessage: array (nullable = true)
+    //  |    |    |    |-- element: struct (containsNull = true)
+    //  |    |    |    |    |-- someId: integer (nullable = true)
+    val df = readResourceParquetFile("test-data/nested-array-struct.parquet")
+    df.createOrReplaceTempView("tmp_table")
+
+    // Plain nested-field access
+    val query1 = "select primitive, myComplex[0].id from tmp_table"
+    val result1 = sql(query1)
+    withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") {
+      checkAnswer(sql(query1), result1)
+    }
+
+    // Array nested inside a struct
+    val query2 = "select primitive, myComplex[0].repeatedMessage[0].someId from tmp_table"
+    val result2 = sql(query2)
+    withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") {
+      checkAnswer(sql(query2), result2)
+    }
+
+    // Metadata merge for fields of the same struct:
+    // myComplex.id and myComplex.repeatedMessage.someId should be merged
+    // into myComplex.[id, repeatedMessage.someId] before being passed to Parquet.
+    val query3 = "select myComplex[0].id, myComplex[0].repeatedMessage[0].someId" +
+      " from tmp_table"
+    val result3 = sql(query3)
+    withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") {
+      checkAnswer(sql(query3), result3)
+    }
+
+    spark.sessionState.catalog.dropTable(
+      TableIdentifier("tmp_table"), ignoreIfNotExists = true, purge = false)
+  }
+
   test("expand UDT in StructType") {
     val schema = new StructType().add("n", new NestedStructUDT, nullable = true)
     val expected = new StructType().add("n", new NestedStructUDT().sqlType, nullable = true)
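The test relies on a checked-in resource file. If you want to generate a file of a similar shape yourself, a sketch along these lines would work (the case-class names and output path are made up and not part of the PR):

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical case classes mirroring the schema documented in the test.
case class Inner(someId: Int)
case class Element(id: Int, repeatedMessage: Seq[Inner])
case class Record(primitive: Int, myComplex: Seq[Element])

val spark = SparkSession.builder().appName("make-nested-parquet").getOrCreate()
import spark.implicits._

Seq(Record(1, Seq(Element(10, Seq(Inner(100))))))
  .toDS()
  .write
  .parquet("/tmp/nested-array-struct.parquet") // hypothetical output path
```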
HyukjinKwon: IIUC, this will drop the support to access a field name containing `.` (e.g. "a.b"), which can currently be accessed via "a.b". Could you confirm this please?
Author: @HyukjinKwon Thanks for your review. Mixing the recursive field lookup with the default `apply` does have this problem. I fixed it in the next patch and switched to ',' as the internal separator, since it is an invalid character in a Parquet schema.
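To make the ambiguity concrete: with a dot separator, a top-level column literally named `a.b` and a struct column `a` with field `b` flatten to the same path string, which is what motivates switching to a separator Parquet field names cannot contain. A small illustration (column names are made up):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.struct

val spark = SparkSession.builder().appName("dot-ambiguity-demo").getOrCreate()
import spark.implicits._

// A top-level column literally named "a.b" ...
val flat = Seq(1).toDF("a.b")
flat.select("`a.b`").show() // backticks select the literal column name

// ... versus a struct column "a" with a nested field "b".
val nested = Seq(1).toDF("b").select(struct($"b").as("a"))
nested.select("a.b").show() // the dot path walks into the struct

// Under a dot-separated pruning scheme both flatten to the string "a.b",
// which is the collision the review comment points out; a comma cannot
// appear in a Parquet field name, so it avoids the ambiguity.
```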