-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-4502][SQL]Support parquet nested struct pruning and add relevant test #14957
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
7eaa428
46153f1
46c2474
1c34877
23465ba
ab8f5ec
92ed369
5697911
d9aa397
d093c82
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,8 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation | |
| import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
| import org.apache.spark.sql.execution.FileSourceScanExec | ||
| import org.apache.spark.sql.execution.SparkPlan | ||
| import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat | ||
| import org.apache.spark.sql.types.{StructField, StructType} | ||
|
|
||
| /** | ||
| * A strategy for planning scans over collections of files that might be partitioned or bucketed | ||
|
|
@@ -97,7 +99,15 @@ object FileSourceStrategy extends Strategy with Logging { | |
| dataColumns | ||
| .filter(requiredAttributes.contains) | ||
| .filterNot(partitionColumns.contains) | ||
| val outputSchema = readDataColumns.toStructType | ||
| val outputSchema = if (fsRelation.sqlContext.conf.isParquetNestColumnPruning | ||
| && fsRelation.fileFormat.isInstanceOf[ParquetFileFormat]) { | ||
| val totalSchema = readDataColumns.toStructType | ||
| val prunedSchema = StructType( | ||
| generateStructFieldsContainsNesting(projects, totalSchema)) | ||
| // Merge schema in same StructType and merge with filterAttributes | ||
| prunedSchema.fields.map(f => StructType(Array(f))).reduceLeft(_ merge _) | ||
| .merge(filterAttributes.toSeq.toStructType) | ||
| } else readDataColumns.toStructType | ||
|
||
| logInfo(s"Output Data Schema: ${outputSchema.simpleString(5)}") | ||
|
|
||
| val pushedDownFilters = dataFilters.flatMap(DataSourceStrategy.translateFilter) | ||
|
|
@@ -126,4 +136,52 @@ object FileSourceStrategy extends Strategy with Logging { | |
|
|
||
| case _ => Nil | ||
| } | ||
|
|
||
| private def generateStructFieldsContainsNesting(projects: Seq[Expression], | ||
| totalSchema: StructType) : Seq[StructField] = { | ||
|
||
| def generateStructField(curField: List[String], | ||
| node: Expression) : Seq[StructField] = { | ||
|
||
| node match { | ||
| case ai: GetArrayItem => | ||
| // Here we drop the previous for simplify array and map support. | ||
| // Same strategy in GetArrayStructFields and GetMapValue | ||
| generateStructField(List.empty[String], ai.child) | ||
| case asf: GetArrayStructFields => | ||
| generateStructField(List.empty[String], asf.child) | ||
| case mv: GetMapValue => | ||
| generateStructField(List.empty[String], mv.child) | ||
| case attr: AttributeReference => | ||
| Seq(getFieldRecursively(totalSchema, attr.name :: curField)) | ||
| case sf: GetStructField => | ||
| generateStructField(sf.name.get :: curField, sf.child) | ||
|
||
| case _ => | ||
| if (node.children.nonEmpty) { | ||
| node.children.flatMap(child => generateStructField(curField, child)) | ||
| } else { | ||
| Seq.empty[StructField] | ||
| } | ||
| } | ||
| } | ||
|
|
||
| def getFieldRecursively(totalSchema: StructType, | ||
| name: List[String]): StructField = { | ||
|
||
| if (name.length > 1) { | ||
| val curField = name.head | ||
| val curFieldType = totalSchema(curField) | ||
| curFieldType.dataType match { | ||
| case st: StructType => | ||
| val newField = getFieldRecursively(StructType(st.fields), name.drop(1)) | ||
| StructField(curFieldType.name, StructType(Seq(newField)), | ||
| curFieldType.nullable, curFieldType.metadata) | ||
| case _ => | ||
| throw new IllegalArgumentException(s"""Field "$curField" is not struct field.""") | ||
| } | ||
| } else { | ||
| totalSchema(name.head) | ||
| } | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually this function can be simplified to: def getNestedField(schema: StructType, path: Seq[String]): StructField = {
require(path.nonEmpty, "<error message>")
path.tail.foldLeft(schema(path.head)) { (field, name) =>
field.dataType match {
case t: StructType => t(name)
case _ => ??? // Throw exception here
}
}
}
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The function getFieldRecursively here needs to return a StructField that contains the full nesting relation along the path. For example: when we want to get a nested field, the result must preserve every enclosing struct level. So I can't use the simplified function getNestedField, because it returns only the last StructField in the path.
||
|
|
||
| projects.flatMap(p => generateStructField(List.empty[String], p)) | ||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -212,6 +212,11 @@ object SQLConf { | |
| .booleanConf | ||
| .createWithDefault(true) | ||
|
|
||
| val PARQUET_NEST_COLUMN_PRUNING = SQLConfigBuilder("spark.sql.parquet.nestColumnPruning") | ||
|
||
| .doc("When set this to true, we will tell parquet only read the nest column`s leaf fields ") | ||
|
||
| .booleanConf | ||
| .createWithDefault(false) | ||
|
|
||
| val PARQUET_CACHE_METADATA = SQLConfigBuilder("spark.sql.parquet.cacheMetadata") | ||
| .doc("Turns on caching of Parquet schema metadata. Can speed up querying of static data.") | ||
| .booleanConf | ||
|
|
@@ -661,6 +666,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { | |
|
|
||
| def isParquetINT96AsTimestamp: Boolean = getConf(PARQUET_INT96_AS_TIMESTAMP) | ||
|
|
||
| def isParquetNestColumnPruning: Boolean = getConf(PARQUET_NEST_COLUMN_PRUNING) | ||
|
||
|
|
||
| def writeLegacyParquetFormat: Boolean = getConf(PARQUET_WRITE_LEGACY_FORMAT) | ||
|
|
||
| def inMemoryPartitionPruning: Boolean = getConf(IN_MEMORY_PARTITION_PRUNING) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -571,6 +571,37 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext | |
| } | ||
| } | ||
|
|
||
| test("SPARK-4502 parquet nested fields pruning") { | ||
| // Schema of "test-data/nested-array-struct.parquet": | ||
| // root | ||
| // |-- col: struct (nullable = true) | ||
| // | |-- s1: struct (nullable = true) | ||
| // | | |-- s1_1: long (nullable = true) | ||
| // | | |-- s1_2: long (nullable = true) | ||
| // | |-- str: string (nullable = true) | ||
| // |-- num: long (nullable = true) | ||
| // |-- str: string (nullable = true) | ||
| val df = readResourceParquetFile("test-data/nested-struct.snappy.parquet") | ||
| df.createOrReplaceTempView("tmp_table") | ||
|
||
| // normal test | ||
| val query1 = "select num,col.s1.s1_1 from tmp_table" | ||
| val result1 = sql(query1) | ||
| withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") { | ||
| checkAnswer(sql(query1), result1) | ||
|
||
| } | ||
| // test for same struct meta merge | ||
| // col.s1.s1_1 and col.str should merge | ||
| // like col.[s1.s1_1, str] before pass to parquet | ||
| val query2 = "select col.s1.s1_1,col.str from tmp_table" | ||
| val result2 = sql(query2) | ||
| withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") { | ||
| checkAnswer(sql(query2), result2) | ||
| } | ||
|
|
||
| spark.sessionState.catalog.dropTable( | ||
| TableIdentifier("tmp_table"), ignoreIfNotExists = true, purge = false) | ||
| } | ||
|
|
||
| test("expand UDT in StructType") { | ||
| val schema = new StructType().add("n", new NestedStructUDT, nullable = true) | ||
| val expected = new StructType().add("n", new NestedStructUDT().sqlType, nullable = true) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe
fullSchema? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fix done