From f9a35f189c43e9962956ccb6cfae33c803e46b74 Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Sun, 14 Jan 2018 08:49:46 -0800
Subject: [PATCH 1/7] [SPARK-23072][SQL] Add a Unicode schema test for file-based data sources

---
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 96bf65fce9c4..a95d71f858c7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -2773,4 +2773,22 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       }
     }
   }
+
+  Seq("orc", "parquet", "csv", "json").foreach { format =>
+    test(s"Write and read back unicode schema - $format") {
+      withTempPath { path =>
+        val dir = path.getCanonicalPath
+
+        // scalastyle:off nonascii
+        val df = Seq("a").toDF("한글")
+        // scalastyle:on nonascii
+
+        df.write.format(format).option("header", "true").save(dir)
+        val answerDf = spark.read.format(format).option("header", "true").load(dir)
+
+        assert(df.schema === answerDf.schema)
+        checkAnswer(df, answerDf)
+      }
+    }
+  }
 }

From 60b8e43a6b60063690485a9061c7709b64adfa43 Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Mon, 15 Jan 2018 12:06:16 -0800
Subject: [PATCH 2/7] Create FileBasedDataSourceSuite.

---
 .../spark/FileBasedDataSourceSuite.scala      | 67 +++++++++++++++++++
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 34 ----------
 .../sql/hive/execution/SQLQuerySuite.scala    |  8 ---
 3 files changed, 67 insertions(+), 42 deletions(-)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala

diff --git a/sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala
new file mode 100644
index 000000000000..47269e340ff1
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.test.SharedSQLContext
+
+class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
+  import testImplicits._
+
+  Seq("orc", "parquet", "csv", "json", "text").foreach { format =>
+    test(s"Writing empty datasets should not fail - $format") {
+      withTempDir { dir =>
+        Seq("str").toDS.limit(0).write.format(format).save(dir.getCanonicalPath + "/tmp")
+      }
+    }
+  }
+
+  Seq("orc", "parquet", "csv", "json").foreach { format =>
+    test(s"Write and read back unicode schema - $format") {
+      withTempPath { path =>
+        val dir = path.getCanonicalPath
+
+        // scalastyle:off nonascii
+        val df = Seq("a").toDF("한글")
+        // scalastyle:on nonascii
+
+        df.write.format(format).option("header", "true").save(dir)
+        val answerDf = spark.read.format(format).option("header", "true").load(dir)
+
+        assert(df.schema === answerDf.schema)
+        checkAnswer(df, answerDf)
+      }
+    }
+  }
+
+  // Only New OrcFileFormat supports this
+  Seq(classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName,
+    "parquet").foreach { format =>
+    test(s"SPARK-15474 Write and read back non-empty schema with empty dataframe - $format") {
+      withTempPath { file =>
+        val path = file.getCanonicalPath
+        val emptyDf = Seq((true, 1, "str")).toDF.limit(0)
+        emptyDf.write.format(format).save(path)
+
+        val df = spark.read.format(format).load(path)
+        assert(df.schema.sameType(emptyDf.schema))
+        checkAnswer(df, emptyDf)
+      }
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index a95d71f858c7..7c9840a34eaa 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -2757,38 +2757,4 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       }
     }
   }
-
-  // Only New OrcFileFormat supports this
-  Seq(classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName,
-    "parquet").foreach { format =>
-    test(s"SPARK-15474 Write and read back non-empty schema with empty dataframe - $format") {
-      withTempPath { file =>
-        val path = file.getCanonicalPath
-        val emptyDf = Seq((true, 1, "str")).toDF.limit(0)
-        emptyDf.write.format(format).save(path)
-
-        val df = spark.read.format(format).load(path)
-        assert(df.schema.sameType(emptyDf.schema))
-        checkAnswer(df, emptyDf)
-      }
-    }
-  }
-
-  Seq("orc", "parquet", "csv", "json").foreach { format =>
-    test(s"Write and read back unicode schema - $format") {
-      withTempPath { path =>
-        val dir = path.getCanonicalPath
-
-        // scalastyle:off nonascii
-        val df = Seq("a").toDF("한글")
-        // scalastyle:on nonascii
-
-        df.write.format(format).option("header", "true").save(dir)
-        val answerDf = spark.read.format(format).option("header", "true").load(dir)
-
-        assert(df.schema === answerDf.schema)
-        checkAnswer(df, answerDf)
-      }
-    }
-  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 47adc77a52d5..33bcae91fdaf 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -2159,12 +2159,4 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       }
     }
   }
-
-  Seq("orc", "parquet", "csv", "json", "text").foreach { format =>
test(s"Writing empty datasets should not fail - $format") { - withTempDir { dir => - Seq("str").toDS.limit(0).write.format(format).save(dir.getCanonicalPath + "/tmp") - } - } - } } From 144c5965e20ae24d38deb033d7ce136a95212cac Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 15 Jan 2018 12:08:57 -0800 Subject: [PATCH 3/7] Move to under `sql` package. --- .../org/apache/spark/{ => sql}/FileBasedDataSourceSuite.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) rename sql/core/src/test/scala/org/apache/spark/{ => sql}/FileBasedDataSourceSuite.scala (97%) diff --git a/sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala similarity index 97% rename from sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index 47269e340ff1..9ca6a4bbadb8 100644 --- a/sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -15,9 +15,8 @@ * limitations under the License. */ -package org.apache.spark +package org.apache.spark.sql -import org.apache.spark.sql.QueryTest import org.apache.spark.sql.test.SharedSQLContext class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { From 5afaa2836133cfc18a52de38d666817991d62c5d Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 15 Jan 2018 22:23:53 -0800 Subject: [PATCH 4/7] Address comments. --- .../apache/spark/sql/FileBasedDataSourceSuite.scala | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index 9ca6a4bbadb8..3f079204dc5f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -24,8 +24,8 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { Seq("orc", "parquet", "csv", "json", "text").foreach { format => test(s"Writing empty datasets should not fail - $format") { - withTempDir { dir => - Seq("str").toDS.limit(0).write.format(format).save(dir.getCanonicalPath + "/tmp") + withTempPath { dir => + Seq("str").toDS().limit(0).write.format(format).save(dir.getCanonicalPath) } } } @@ -48,13 +48,11 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { } } - // Only New OrcFileFormat supports this - Seq(classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName, - "parquet").foreach { format => + Seq("orc", "parquet").foreach { format => test(s"SPARK-15474 Write and read back non-emtpy schema with empty dataframe - $format") { withTempPath { file => val path = file.getCanonicalPath - val emptyDf = Seq((true, 1, "str")).toDF.limit(0) + val emptyDf = Seq((true, 1, "str")).toDF().limit(0) emptyDf.write.format(format).save(path) val df = spark.read.format(format).load(path) From fb708b70fe4bb5d29ef55ace7fc0aae61e831c03 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 16 Jan 2018 10:09:19 -0800 Subject: [PATCH 5/7] Move SPARK-22146, too. 
---
 .../spark/sql/FileBasedDataSourceSuite.scala       | 14 +++++++++++++-
 .../spark/sql/hive/MetastoreDataSourcesSuite.scala | 12 ------------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
index 3f079204dc5f..148ae753bf4e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
@@ -31,7 +31,7 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
   }
 
   Seq("orc", "parquet", "csv", "json").foreach { format =>
-    test(s"Write and read back unicode schema - $format") {
+    test(s"SPARK-23072 Write and read back unicode schema - $format") {
       withTempPath { path =>
         val dir = path.getCanonicalPath
 
@@ -61,4 +61,16 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
       }
     }
   }
+
+  Seq("orc", "parquet", "csv", "json", "text").foreach { format =>
+    test(s"SPARK-22146 read files containing special characters using $format") {
+      val nameWithSpecialChars = s"sp&cial%chars"
+      withTempDir { dir =>
+        val tmpFile = s"$dir/$nameWithSpecialChars"
+        spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile)
+        val fileContent = spark.read.format(format).load(tmpFile)
+        checkAnswer(fileContent, Seq(Row("a"), Row("b")))
+      }
+    }
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index c8caba83bf36..572d03764c5f 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -1344,18 +1344,6 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
       }
     }
   }
-
-  Seq("orc", "parquet", "csv", "json", "text").foreach { format =>
-    test(s"SPARK-22146: read files containing special characters using $format") {
-      val nameWithSpecialChars = s"sp&cial%chars"
-      withTempDir { dir =>
-        val tmpFile = s"$dir/$nameWithSpecialChars"
-        spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile)
-        val fileContent = spark.read.format(format).load(tmpFile)
-        checkAnswer(fileContent, Seq(Row("a"), Row("b")))
-      }
-    }
-  }
-
   private def withDebugMode(f: => Unit): Unit = {
     val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE)
     try {

From 8fec65b163b32e7592b21b4a6c19c69352f41919 Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Tue, 16 Jan 2018 10:15:28 -0800
Subject: [PATCH 6/7] Remove unused imports.
---
 .../org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index 572d03764c5f..fade143a1755 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -23,14 +23,12 @@ import scala.collection.mutable.ArrayBuffer
 
 import org.apache.hadoop.fs.Path
 
-import org.apache.spark.SparkContext
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
 import org.apache.spark.sql.execution.command.CreateTableCommand
 import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.hive.HiveExternalCatalog._
-import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.StaticSQLConf._

From c67809c9dbe0a21011649dceededa84d73377d1c Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Tue, 16 Jan 2018 18:54:55 -0800
Subject: [PATCH 7/7] Address comments.

---
 .../spark/sql/FileBasedDataSourceSuite.scala | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
index 148ae753bf4e..22fb496bc838 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
@@ -22,7 +22,9 @@ import org.apache.spark.sql.test.SharedSQLContext
 class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
   import testImplicits._
 
-  Seq("orc", "parquet", "csv", "json", "text").foreach { format =>
+  private val allFileBasedDataSources = Seq("orc", "parquet", "csv", "json", "text")
+
+  allFileBasedDataSources.foreach { format =>
     test(s"Writing empty datasets should not fail - $format") {
       withTempPath { dir =>
         Seq("str").toDS().limit(0).write.format(format).save(dir.getCanonicalPath)
@@ -30,8 +32,9 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
     }
   }
 
-  Seq("orc", "parquet", "csv", "json").foreach { format =>
-    test(s"SPARK-23072 Write and read back unicode schema - $format") {
+  // `TEXT` data source always has a single column whose name is `value`.
+  allFileBasedDataSources.filterNot(_ == "text").foreach { format =>
+    test(s"SPARK-23072 Write and read back unicode column names - $format") {
       withTempPath { path =>
         val dir = path.getCanonicalPath
 
@@ -42,12 +45,14 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
         df.write.format(format).option("header", "true").save(dir)
         val answerDf = spark.read.format(format).option("header", "true").load(dir)
 
-        assert(df.schema === answerDf.schema)
+        assert(df.schema.sameType(answerDf.schema))
         checkAnswer(df, answerDf)
       }
     }
   }
 
+  // Only ORC/Parquet support this. `CSV` and `JSON` return an empty schema.
+  // `TEXT` data source always has a single column whose name is `value`.
Seq("orc", "parquet").foreach { format => test(s"SPARK-15474 Write and read back non-emtpy schema with empty dataframe - $format") { withTempPath { file => @@ -62,7 +67,7 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { } } - Seq("orc", "parquet", "csv", "json", "text").foreach { format => + allFileBasedDataSources.foreach { format => test(s"SPARK-22146 read files containing special characters using $format") { val nameWithSpecialChars = s"sp&cial%chars" withTempDir { dir =>