From e3a583830182a0ba8c46c465f38685db70e8d630 Mon Sep 17 00:00:00 2001 From: jackierwzhang Date: Tue, 1 Mar 2022 10:36:41 -0800 Subject: [PATCH 1/4] fix --- .../parquet/ParquetReadSupport.scala | 2 +- .../parquet/ParquetFieldIdIOSuite.scala | 31 ++++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index 97e691ff7c66c..abb0d8155bd16 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -140,7 +140,7 @@ object ParquetReadSupport extends Logging { "Spark read schema expects field Ids, " + "but Parquet file schema doesn't contain any field Ids.\n" + "Please remove the field ids from Spark schema or ignore missing ids by " + - "setting `spark.sql.parquet.fieldId.ignoreMissing = true`\n" + + "setting `spark.sql.parquet.fieldId.read.ignoreMissing = true`\n" + s""" |Spark read schema: |${catalystRequestedSchema.prettyJson} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFieldIdIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFieldIdIOSuite.scala index ff0bb2f92d208..6c5204491645e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFieldIdIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFieldIdIOSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.SparkException import org.apache.spark.sql.{QueryTest, Row} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.{IntegerType, Metadata, MetadataBuilder, StringType, StructType} +import org.apache.spark.sql.types.{ArrayType, IntegerType, MapType, Metadata, MetadataBuilder, StringType, StructType} class ParquetFieldIdIOSuite extends QueryTest with ParquetTest with SharedSparkSession { @@ -107,6 +107,35 @@ class ParquetFieldIdIOSuite extends QueryTest with ParquetTest with SharedSparkS } } + test("absence of field ids: reading nested schema") { + withTempDir { dir => + // now with nested schema/complex type + val readSchema = + new StructType() + .add("a", IntegerType, true, withId(1)) + .add("b", ArrayType(StringType), true, withId(2)) + .add("c", new StructType().add("xx", IntegerType, true, withId(6)), true, withId(3)) + .add("d", MapType(StringType, StringType), true, withId(4)) + .add("e", IntegerType, true, withId(5)) + + val writeSchema = + new StructType() + .add("a", IntegerType, true, withId(5)) + .add("randomName", StringType, true) + + val writeData = Seq(Row(100, "text"), Row(200, "more")) + + spark.createDataFrame(writeData.asJava, writeSchema) + .write.mode("overwrite").parquet(dir.getCanonicalPath) + + withAllParquetReaders { + checkAnswer(spark.read.schema(readSchema).parquet(dir.getCanonicalPath), + // a, b, c, d all couldn't be found + Row(null, null, null, null, 100) :: Row(null, null, null, null, 200) :: Nil) + } + } + } + test("multiple id matches") { withTempDir { dir => val readSchema = From 25e2f4067b197839081984009ca704ccdc7b6b1c Mon Sep 17 00:00:00 2001 From: jackierwzhang Date: Tue, 1 Mar 2022 10:49:10 -0800 Subject: [PATCH 2/4] address comments --- .../execution/datasources/parquet/ParquetReadSupport.scala | 4 ++-- .../execution/datasources/parquet/ParquetFieldIdIOSuite.scala | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index abb0d8155bd16..86c8e0fc4bc07 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -139,8 +139,8 @@ object ParquetReadSupport extends Logging { throw new RuntimeException( "Spark read schema expects field Ids, " + "but Parquet file schema doesn't contain any field Ids.\n" + - "Please remove the field ids from Spark schema or ignore missing ids by " + - "setting `spark.sql.parquet.fieldId.read.ignoreMissing = true`\n" + + s"Please remove the field ids from Spark schema or ignore missing ids by " + + s"setting ${SQLConf.IGNORE_MISSING_PARQUET_FIELD_ID.key}` = true`\n" + s""" |Spark read schema: |${catalystRequestedSchema.prettyJson} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFieldIdIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFieldIdIOSuite.scala index 6c5204491645e..5e01d3f447c96 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFieldIdIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFieldIdIOSuite.scala @@ -107,14 +107,14 @@ class ParquetFieldIdIOSuite extends QueryTest with ParquetTest with SharedSparkS } } - test("absence of field ids: reading nested schema") { + test("SPARK-38094: absence of field ids: reading nested schema") { withTempDir { dir => // now with nested schema/complex type val readSchema = new StructType() .add("a", IntegerType, true, withId(1)) .add("b", ArrayType(StringType), true, withId(2)) - .add("c", new StructType().add("xx", IntegerType, true, withId(6)), true, withId(3)) + .add("c", new StructType().add("c1", IntegerType, true, withId(6)), true, withId(3)) .add("d", MapType(StringType, StringType), true, withId(4)) .add("e", IntegerType, true, withId(5)) From d45ee3f3ce67baff2dcf2795fdc4ae36777edfd1 Mon Sep 17 00:00:00 2001 From: jackierwzhang Date: Tue, 1 Mar 2022 13:50:20 -0800 Subject: [PATCH 3/4] minor --- .../sql/execution/datasources/parquet/ParquetReadSupport.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index 86c8e0fc4bc07..45297c1a846bf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -140,7 +140,7 @@ object ParquetReadSupport extends Logging { "Spark read schema expects field Ids, " + "but Parquet file schema doesn't contain any field Ids.\n" + s"Please remove the field ids from Spark schema or ignore missing ids by " + - s"setting ${SQLConf.IGNORE_MISSING_PARQUET_FIELD_ID.key}` = true`\n" + + s"setting `${SQLConf.IGNORE_MISSING_PARQUET_FIELD_ID.key} = true`\n" + s""" |Spark read schema: |${catalystRequestedSchema.prettyJson} From 822aac5f86bc6abf9bca76fad3537bbb636ee402 Mon Sep 17 00:00:00 2001 From: jackierwzhang Date: Tue, 1 Mar 2022 19:28:24 -0800 Subject: [PATCH 4/4] minor --- .../sql/execution/datasources/parquet/ParquetReadSupport.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index 45297c1a846bf..69684f9466f98 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -139,7 +139,7 @@ object ParquetReadSupport extends Logging { throw new RuntimeException( "Spark read schema expects field Ids, " + "but Parquet file schema doesn't contain any field Ids.\n" + - s"Please remove the field ids from Spark schema or ignore missing ids by " + + "Please remove the field ids from Spark schema or ignore missing ids by " + s"setting `${SQLConf.IGNORE_MISSING_PARQUET_FIELD_ID.key} = true`\n" + s""" |Spark read schema: