Skip to content

Commit 08efa3c

Browse files
committed
Do not push down null values
1 parent 770a4fd commit 08efa3c

File tree

2 files changed

+40
-28
lines changed

2 files changed

+40
-28
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -748,7 +748,13 @@ class ParquetFilters(
748748
makeEq.lift(fieldType).map(_(fieldNames, v))
749749
}.reduceLeftOption(FilterApi.or)
750750
} else if (canPartialPushDownConjuncts) {
751-
makeInPredicate.lift(fieldType).map(_(fieldNames, values))
751+
if (values.contains(null)) {
752+
Seq(makeEq.lift(fieldType).map(_(fieldNames, null)),
753+
makeInPredicate.lift(fieldType).map(_(fieldNames, values.filter(_ != null)))
754+
).flatten.reduceLeftOption(FilterApi.or)
755+
} else {
756+
makeInPredicate.lift(fieldType).map(_(fieldNames, values))
757+
}
752758
} else {
753759
None
754760
}

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala

Lines changed: 33 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -2015,33 +2015,39 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared
20152015
test("SPARK-38825: in and notIn filters") {
20162016
import testImplicits._
20172017
withTempPath { file =>
2018-
Seq(1, 2, 0, -1, 99, Integer.MAX_VALUE, 1000, 3, 7, Integer.MIN_VALUE, 2)
2019-
.toDF("id").coalesce(1).write.mode("overwrite")
2020-
.parquet(file.getCanonicalPath)
2021-
var df = spark.read.parquet(file.getCanonicalPath)
2022-
var in = df.filter(col("id").isin(100, 3, 11, 12, 13, Integer.MAX_VALUE, Integer.MIN_VALUE))
2023-
var notIn =
2024-
df.filter(!col("id").isin(100, 3, 11, 12, 13, Integer.MAX_VALUE, Integer.MIN_VALUE))
2025-
checkAnswer(in, Seq(Row(3), Row(-2147483648), Row(2147483647)))
2026-
checkAnswer(notIn, Seq(Row(1), Row(2), Row(0), Row(-1), Row(99), Row(1000), Row(7), Row(2)))
2027-
2028-
Seq("mary", "martin", "lucy", "alex", null, "mary", "dan").toDF("name").coalesce(1)
2029-
.write.mode("overwrite").parquet(file.getCanonicalPath)
2030-
df = spark.read.parquet(file.getCanonicalPath)
2031-
in = df.filter(col("name").isin("mary", "victor", "leo", "alex"))
2032-
notIn = df.filter(!col("name").isin("mary", "victor", "leo", "alex"))
2033-
checkAnswer(in, Seq(Row("mary"), Row("alex"), Row("mary")))
2034-
checkAnswer(notIn, Seq(Row("martin"), Row("lucy"), Row("dan")))
2035-
2036-
in = df.filter(col("name").isin("mary", "victor", "leo", "alex", null))
2037-
notIn = df.filter(!col("name").isin("mary", "victor", "leo", "alex", null))
2038-
checkAnswer(in, Seq(Row("mary"), Row("alex"), Row("mary")))
2039-
checkAnswer(notIn, Seq())
2040-
2041-
in = df.filter(col("name").isin(null))
2042-
notIn = df.filter(!col("name").isin(null))
2043-
checkAnswer(in, Seq())
2044-
checkAnswer(notIn, Seq())
2018+
Seq(3, 20).foreach { threshold =>
2019+
withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD.key -> s"$threshold") {
2020+
Seq(1, 2, 0, -1, 99, Integer.MAX_VALUE, 1000, 3, 7, Integer.MIN_VALUE, 2)
2021+
.toDF("id").coalesce(1).write.mode("overwrite")
2022+
.parquet(file.getCanonicalPath)
2023+
var df = spark.read.parquet(file.getCanonicalPath)
2024+
var in = df.filter(col("id")
2025+
.isin(100, 3, 11, 12, 13, Integer.MAX_VALUE, Integer.MIN_VALUE))
2026+
var notIn = df.filter(!col("id")
2027+
.isin(100, 3, 11, 12, 13, Integer.MAX_VALUE, Integer.MIN_VALUE))
2028+
checkAnswer(in, Seq(Row(3), Row(-2147483648), Row(2147483647)))
2029+
checkAnswer(notIn,
2030+
Seq(Row(1), Row(2), Row(0), Row(-1), Row(99), Row(1000), Row(7), Row(2)))
2031+
2032+
Seq("mary", "martin", "lucy", "alex", null, "mary", "dan").toDF("name").coalesce(1)
2033+
.write.mode("overwrite").parquet(file.getCanonicalPath)
2034+
df = spark.read.parquet(file.getCanonicalPath)
2035+
in = df.filter(col("name").isin("mary", "victor", "leo", "alex"))
2036+
notIn = df.filter(!col("name").isin("mary", "victor", "leo", "alex"))
2037+
checkAnswer(in, Seq(Row("mary"), Row("alex"), Row("mary")))
2038+
checkAnswer(notIn, Seq(Row("martin"), Row("lucy"), Row("dan")))
2039+
2040+
in = df.filter(col("name").isin("mary", "victor", "leo", "alex", null))
2041+
notIn = df.filter(!col("name").isin("mary", "victor", "leo", "alex", null))
2042+
checkAnswer(in, Seq(Row("mary"), Row("alex"), Row("mary")))
2043+
checkAnswer(notIn, Seq())
2044+
2045+
in = df.filter(col("name").isin(null))
2046+
notIn = df.filter(!col("name").isin(null))
2047+
checkAnswer(in, Seq())
2048+
checkAnswer(notIn, Seq())
2049+
}
2050+
}
20452051
}
20462052
}
20472053
}

0 commit comments

Comments (0)