apache · viirya · Sep 7, 2018 · Sep 7, 2018 · Sep 10, 2018 · Sep 11, 2018
diff --git a/.../main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala b/.../main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.execution.datasources.parquet
 
-import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, Expression, NamedExpression}
+import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, Expression, IsNotNull, IsNull, NamedExpression}
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
@@ -196,6 +196,7 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
    */
   private def getRootFields(expr: Expression): Seq[RootField] = {
     expr match {
+      case IsNotNull(_: Attribute) | IsNull(_: Attribute) => Seq.empty
       case att: Attribute =>
         RootField(StructField(att.name, att.dataType, att.nullable), derivedFromAtt = true) :: Nil
       case SelectedField(field) => RootField(field, derivedFromAtt = false) :: Nil

diff --git a/.../scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala b/.../scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala
@@ -155,6 +155,30 @@ class ParquetSchemaPruningSuite
       Row(null) :: Row(null) :: Nil)
   }
 
+  testSchemaPruning("select a single complex field and in where clause") {
+    val query = sql("select name.first from contacts where name.first = 'Jane'")
+    checkScan(query, "struct<name:struct<first:string>>") // expected schema lists only name.first
+    checkAnswer(query, Row("Jane") :: Nil) // exactly one expected row: "Jane"
+  }
+
+  testSchemaPruning("select a single complex field array and in clause") {
+    val query = sql("select friends.middle from contacts where friends.first[0] = 'Susan'")
+    checkScan(query, // expected schema keeps friends.first (filter) and friends.middle (select)
+      "struct<friends:array<struct<first:string,middle:string>>>")
+    checkAnswer(query.orderBy("id"), // sorts on id, which the SELECT list itself omits
+      Row(Array("Z.")) :: Nil) // exactly one expected row holding a one-element array
+  }
+
+  testSchemaPruning("select a single complex field from a map entry and in clause") {
+    val query = // selects .middle and filters on .first of the relatives["brother"] entry
+      sql("select relatives[\"brother\"].middle from contacts " +
+        "where relatives[\"brother\"].first = 'John'")
+    checkScan(query, // expected schema keeps first and middle inside the map's value struct
+      "struct<relatives:map<string,struct<first:string,middle:string>>>")
+    checkAnswer(query.orderBy("id"), // sorts on id, which the SELECT list itself omits
+      Row("Y.") :: Nil) // exactly one expected row: "Y."
+  }
+
   private def testSchemaPruning(testName: String)(testThunk: => Unit) {
     withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") {
       test(s"Spark vectorized reader - without partition data column - $testName") {