[SPARK-33593][SQL] Parquet vector reader incorrect with binary partition value

AngersZhuuuu · AngersZhuuuu · commit 4784edd3e893 · 2020-12-17T23:12:56.000+08:00
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java
@@ -54,6 +54,8 @@ public static void populate(WritableColumnVector col, InternalRow row, int field
     } else {
       if (t == DataTypes.BooleanType) {
         col.putBooleans(0, capacity, row.getBoolean(fieldIdx));
+      } else if (t == DataTypes.BinaryType) {
+        col.putByteArray(0, row.getBinary(fieldIdx));
       } else if (t == DataTypes.ByteType) {
         col.putBytes(0, capacity, row.getByte(fieldIdx));
       } else if (t == DataTypes.ShortType) {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -3745,6 +3745,21 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
       }
     }
   }
+
+  test("SPARK-33593: Parquet vector reader incorrect with binary partition value") {
+    Seq(true, false).foreach { vectorized => // both readers must return the same rows
+      withSQLConf("spark.sql.parquet.enableVectorizedReader" -> vectorized.toString) {
+        withTable("t1") {
+          sql(
+            """CREATE TABLE t1(name STRING, id BINARY, part BINARY)
+              | USING PARQUET PARTITIONED BY (part)""".stripMargin)
+          sql("INSERT INTO t1 PARTITION(part = 'Spark SQL') VALUES('a', X'537061726B2053514C')")
+          checkAnswer(sql("SELECT name, cast(id as string), cast(part as string) FROM t1"),
+            Row("a", "Spark SQL", "Spark SQL")) // hex literal decodes to "Spark SQL"
+        }
+      }
+    }
+  }
 }
 
 case class Foo(bar: Option[String])

Original file line number	Diff line number	Diff line change
`@@ -3745,6 +3745,21 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark`
`3745`	`3745`	`}`
`3746`	`3746`	`}`
`3747`	`3747`	`}`
	`3748`	`+`
	`3749`	`+ test("SPARK-33593: Parquet vector reader incorrect with binary partition value") {`
	`3750`	`+ Seq(true, false).foreach { vectorized => // both readers must return the same rows`
	`3751`	`+ withSQLConf("spark.sql.parquet.enableVectorizedReader" -> vectorized.toString) {`
	`3752`	`+ withTable("t1") {`
	`3753`	`+ sql(`
	`3754`	`+ """CREATE TABLE t1(name STRING, id BINARY, part BINARY)`
	`3755`	`+ \| USING PARQUET PARTITIONED BY (part)""".stripMargin)`
	`3756`	`+ sql("INSERT INTO t1 PARTITION(part = 'Spark SQL') VALUES('a', X'537061726B2053514C')")`
	`3757`	`+ checkAnswer(sql("SELECT name, cast(id as string), cast(part as string) FROM t1"),`
	`3758`	`+ Row("a", "Spark SQL", "Spark SQL")) // hex literal decodes to "Spark SQL"`
	`3759`	`+ }`
	`3760`	`+ }`
	`3761`	`+ }`
	`3762`	`+ }`
`3748`	`3763`	`}`
`3749`	`3764`
`3750`	`3765`	`case class Foo(bar: Option[String])`