[SPARK-4502][SQL] Support parquet nested struct pruning and add relevant test #14957
FileSourceStrategy.scala

@@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.FileSourceScanExec
 import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.types.StructType

 /**
  * A strategy for planning scans over collections of files that might be partitioned or bucketed
@@ -97,7 +98,15 @@ object FileSourceStrategy extends Strategy with Logging {
       dataColumns
         .filter(requiredAttributes.contains)
         .filterNot(partitionColumns.contains)
-      val outputSchema = readDataColumns.toStructType
+      val outputSchema = if (fsRelation.sqlContext.conf.isParquetNestColumnPruning) {
+        val requiredColumnsWithNesting = generateRequiredColumnsContainsNesting(
+          projects, readDataColumns.attrs.map(_.name).toArray)
+        val totalSchema = readDataColumns.toStructType
+        val prunedSchema = StructType(requiredColumnsWithNesting.map(totalSchema(_)))
+        // Merge fields that belong to the same StructType, then merge with filterAttributes
+        prunedSchema.fields.map(f => StructType(Array(f))).reduceLeft(_ merge _)
+          .merge(filterAttributes.toSeq.toStructType)
+      } else readDataColumns.toStructType
       logInfo(s"Output Data Schema: ${outputSchema.simpleString(5)}")

       val pushedDownFilters = dataFilters.flatMap(DataSourceStrategy.translateFilter)
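For intuition, here is what the per-field merge above computes for the queries exercised in the test suite below: each required dotted path (looked up via `totalSchema(_)`, which, per the review discussion at the bottom, relies on mixing a recursive lookup into `StructType`'s default `apply`) becomes a single-leaf schema, and merging keeps every requested leaf under the shared root. `StructType.merge` is package-private to Spark SQL, so this sketch spells out the inputs and the expected merged result by hand:

```scala
import org.apache.spark.sql.types._

// Pruned single-leaf schemas for myComplex.id and
// myComplex.repeatedMessage.someId (nullability flags left at defaults).
val s1 = StructType(Seq(StructField("myComplex", ArrayType(
  StructType(Seq(StructField("id", IntegerType)))))))
val s2 = StructType(Seq(StructField("myComplex", ArrayType(
  StructType(Seq(StructField("repeatedMessage", ArrayType(
    StructType(Seq(StructField("someId", IntegerType)))))))))))

// reduceLeft(_ merge _) in the snippet above would combine them into:
val merged = StructType(Seq(StructField("myComplex", ArrayType(
  StructType(Seq(
    StructField("id", IntegerType),
    StructField("repeatedMessage", ArrayType(
      StructType(Seq(StructField("someId", IntegerType)))))))))))

println(merged.simpleString)
// struct<myComplex:array<struct<id:int,repeatedMessage:array<struct<someId:int>>>>>
```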
@@ -126,4 +135,56 @@ object FileSourceStrategy extends Strategy with Logging {
     case _ => Nil
   }

+  private def generateRequiredColumnsContainsNesting(projects: Seq[Expression],
+      columns: Array[String]) : Array[String] = {
+    def generateAttributeMap(nestFieldMap: scala.collection.mutable.Map[String, Seq[String]],
+        isNestField: Boolean, curString: Option[String],
+        node: Expression): Unit = {
+      node match {
+        case ai: GetArrayItem =>
+          // Here we drop the curString to simplify array and map support.
+          // The same strategy applies to GetArrayStructFields and GetMapValue.
+          generateAttributeMap(nestFieldMap, isNestField = true, None, ai.child)
+        case asf: GetArrayStructFields =>
+          generateAttributeMap(nestFieldMap, isNestField = true, None, asf.child)
+        case mv: GetMapValue =>
+          generateAttributeMap(nestFieldMap, isNestField = true, None, mv.child)
+        case attr: AttributeReference =>
+          if (isNestField && curString.isDefined) {
+            val attrStr = attr.name
+            if (nestFieldMap.contains(attrStr)) {
+              nestFieldMap(attrStr) = nestFieldMap(attrStr) ++ Seq(attrStr + "." + curString.get)
+            } else {
+              nestFieldMap += (attrStr -> Seq(attrStr + "." + curString.get))
+            }
+          }
+        case sf: GetStructField =>
+          val str = if (curString.isDefined) {
+            sf.name.get + "." + curString.get
+          } else sf.name.get
+          generateAttributeMap(nestFieldMap, isNestField = true, Option(str), sf.child)
+        case _ =>
+          if (node.children.nonEmpty) {
+            node.children.foreach(child => generateAttributeMap(nestFieldMap,
+              isNestField, curString, child))
+          }
+      }
+    }
+
+    val nestFieldMap = scala.collection.mutable.Map.empty[String, Seq[String]]
+    projects.foreach(p => generateAttributeMap(nestFieldMap, isNestField = false, None, p))
+    val colList = columns.toList.flatMap { col =>
+      if (nestFieldMap.contains(col)) {
+        nestFieldMap(col).toList
+      } else {
+        List(col)
+      }
+    }
+    colList.toArray
+  }
+
 }
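To see what `generateRequiredColumnsContainsNesting` is meant to produce, here is a self-contained toy model of the same traversal. `Expr`, `Attr`, and `GetField` are hypothetical stand-ins for Catalyst's `Expression`, `AttributeReference`, and `GetStructField`; only the recursion shape mirrors the code above:

```scala
// Toy model of the traversal in generateRequiredColumnsContainsNesting.
sealed trait Expr
case class Attr(name: String) extends Expr
case class GetField(child: Expr, field: String) extends Expr

def collectPaths(e: Expr, suffix: Option[String],
    acc: scala.collection.mutable.Map[String, Seq[String]]): Unit = e match {
  case GetField(child, field) =>
    // Prepend this struct field to the nested path collected so far.
    val path = suffix.map(field + "." + _).getOrElse(field)
    collectPaths(child, Some(path), acc)
  case Attr(name) if suffix.isDefined =>
    // Reached the root column: record "root.nested.path" under the root name.
    acc(name) = acc.getOrElse(name, Seq.empty) :+ (name + "." + suffix.get)
  case _ => // bare column references need no pruning entry
}

val acc = scala.collection.mutable.Map.empty[String, Seq[String]]
// Models: SELECT myComplex.id, myComplex.repeatedMessage.someId ...
collectPaths(GetField(Attr("myComplex"), "id"), None, acc)
collectPaths(GetField(GetField(Attr("myComplex"), "repeatedMessage"), "someId"), None, acc)
println(acc("myComplex"))
// List(myComplex.id, myComplex.repeatedMessage.someId)
```

Feeding `columns = Array("primitive", "myComplex")` through the final flatMap step would then yield `Array("primitive", "myComplex.id", "myComplex.repeatedMessage.someId")`.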
SQLConf.scala
@@ -212,6 +212,11 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)

+  val PARQUET_NEST_COLUMN_PRUNING = SQLConfigBuilder("spark.sql.parquet.nestColumnPruning")
+    .doc("When set to true, Parquet readers will only read the leaf fields of nested " +
+      "columns that the query actually references.")
+    .booleanConf
+    .createWithDefault(false)

   val PARQUET_CACHE_METADATA = SQLConfigBuilder("spark.sql.parquet.cacheMetadata")
     .doc("Turns on caching of Parquet schema metadata. Can speed up querying of static data.")
     .booleanConf
@@ -661,6 +666,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {

   def isParquetINT96AsTimestamp: Boolean = getConf(PARQUET_INT96_AS_TIMESTAMP)

+  def isParquetNestColumnPruning: Boolean = getConf(PARQUET_NEST_COLUMN_PRUNING)
+
   def writeLegacyParquetFormat: Boolean = getConf(PARQUET_WRITE_LEGACY_FORMAT)

   def inMemoryPartitionPruning: Boolean = getConf(IN_MEMORY_PARTITION_PRUNING)
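Since the flag defaults to false, users must opt in explicitly. A minimal sketch of how that would look (the session boilerplate is illustrative; only the config key comes from this diff):

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical session setup for trying the feature out.
val spark = SparkSession.builder()
  .appName("nested-pruning-demo")
  .getOrCreate()

// Enable the nested-column pruning introduced by this patch
// (createWithDefault(false) above means it is off by default).
spark.conf.set("spark.sql.parquet.nestColumnPruning", "true")

// Equivalent SQL form:
spark.sql("SET spark.sql.parquet.nestColumnPruning=true")
```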
ParquetQuerySuite.scala
@@ -571,6 +571,44 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
     }
   }

+  test("SPARK-4502 parquet nested fields pruning") {
+    // Schema of "test-data/nested-array-struct.parquet":
+    // root
+    //  |-- primitive: integer (nullable = true)
+    //  |-- myComplex: array (nullable = true)
+    //  |    |-- element: struct (containsNull = true)
+    //  |    |    |-- id: integer (nullable = true)
+    //  |    |    |-- repeatedMessage: array (nullable = true)
+    //  |    |    |    |-- element: struct (containsNull = true)
+    //  |    |    |    |    |-- someId: integer (nullable = true)
+    val df = readResourceParquetFile("test-data/nested-array-struct.parquet")
+    df.createOrReplaceTempView("tmp_table")
+
+    // Plain nested-field access
+    val query1 = "select primitive, myComplex[0].id from tmp_table"
+    val result1 = sql(query1)
+    withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") {
+      checkAnswer(sql(query1), result1)
+    }
+
+    // Array nested inside a struct
+    val query2 = "select primitive, myComplex[0].repeatedMessage[0].someId from tmp_table"
+    val result2 = sql(query2)
+    withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") {
+      checkAnswer(sql(query2), result2)
+    }
+
+    // Metadata merge for fields of the same struct:
+    // myComplex.id and myComplex.repeatedMessage.someId should be merged
+    // into myComplex.[id, repeatedMessage.someId] before being passed to Parquet.
+    val query3 = "select myComplex[0].id, myComplex[0].repeatedMessage[0].someId" +
+      " from tmp_table"
+    val result3 = sql(query3)
+    withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") {
+      checkAnswer(sql(query3), result3)
+    }
+
+    spark.sessionState.catalog.dropTable(
+      TableIdentifier("tmp_table"), ignoreIfNotExists = true, purge = false)
+  }
+
   test("expand UDT in StructType") {
     val schema = new StructType().add("n", new NestedStructUDT, nullable = true)
     val expected = new StructType().add("n", new NestedStructUDT().sqlType, nullable = true)
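The test relies on a checked-in resource file. If you want to generate a file of a similar shape yourself, a sketch along these lines would work (the case-class names and output path are made up and not part of the PR):

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical case classes mirroring the schema documented in the test.
case class Inner(someId: Int)
case class Element(id: Int, repeatedMessage: Seq[Inner])
case class Record(primitive: Int, myComplex: Seq[Element])

val spark = SparkSession.builder().appName("make-nested-parquet").getOrCreate()
import spark.implicits._

Seq(Record(1, Seq(Element(10, Seq(Inner(100))))))
  .toDS()
  .write
  .parquet("/tmp/nested-array-struct.parquet") // hypothetical output path
```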
HyukjinKwon: IIUC, this will drop the support to access a field name containing `.` (e.g. "a.b"), which can currently be accessed via "a.b". Could you confirm this please?
Author: @HyukjinKwon Thanks for your review. Mixing the recursive field lookup with the default `apply` does have this problem. I fixed it in the next patch and switched to ',' as the internal separator, since it is an invalid character in a Parquet schema.
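To make the ambiguity concrete: with a dot separator, a top-level column literally named `a.b` and a struct column `a` with field `b` flatten to the same path string, which is what motivates switching to a separator Parquet field names cannot contain. A small illustration (column names are made up):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.struct

val spark = SparkSession.builder().appName("dot-ambiguity-demo").getOrCreate()
import spark.implicits._

// A top-level column literally named "a.b" ...
val flat = Seq(1).toDF("a.b")
flat.select("`a.b`").show() // backticks select the literal column name

// ... versus a struct column "a" with a nested field "b".
val nested = Seq(1).toDF("b").select(struct($"b").as("a"))
nested.select("a.b").show() // the dot path walks into the struct

// Under a dot-separated pruning scheme both flatten to the string "a.b",
// which is the collision the review comment points out; a comma cannot
// appear in a Parquet field name, so it avoids the ambiguity.
```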