From f9a35f189c43e9962956ccb6cfae33c803e46b74 Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Sun, 14 Jan 2018 08:49:46 -0800
Subject: [PATCH 1/7] [SPARK-23072][SQL] Add a Unicode schema test for file-based data sources

---
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 96bf65fce9c4..a95d71f858c7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -2773,4 +2773,22 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       }
     }
   }
+
+  Seq("orc", "parquet", "csv", "json").foreach { format =>
+    test(s"Write and read back unicode schema - $format") {
+      withTempPath { path =>
+        val dir = path.getCanonicalPath
+
+        // scalastyle:off nonascii
+        val df = Seq("a").toDF("한글")
+        // scalastyle:on nonascii
+
+        df.write.format(format).option("header", "true").save(dir)
+        val answerDf = spark.read.format(format).option("header", "true").load(dir)
+
+        assert(df.schema === answerDf.schema)
+        checkAnswer(df, answerDf)
+      }
+    }
+  }
 }

From 60b8e43a6b60063690485a9061c7709b64adfa43 Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Mon, 15 Jan 2018 12:06:16 -0800
Subject: [PATCH 2/7] Create FileBasedDataSourceSuite.

---
 .../spark/FileBasedDataSourceSuite.scala      | 67 +++++++++++++++++++
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 34 ----------
 .../sql/hive/execution/SQLQuerySuite.scala    |  8 ---
 3 files changed, 67 insertions(+), 42 deletions(-)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala

diff --git a/sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala
new file mode 100644
index 000000000000..47269e340ff1
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.test.SharedSQLContext
+
+class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
+  import testImplicits._
+
+  Seq("orc", "parquet", "csv", "json", "text").foreach { format =>
+    test(s"Writing empty datasets should not fail - $format") {
+      withTempDir { dir =>
+        Seq("str").toDS.limit(0).write.format(format).save(dir.getCanonicalPath + "/tmp")
+      }
+    }
+  }
+
+  Seq("orc", "parquet", "csv", "json").foreach { format =>
+    test(s"Write and read back unicode schema - $format") {
+      withTempPath { path =>
+        val dir = path.getCanonicalPath
+
+        // scalastyle:off nonascii
+        val df = Seq("a").toDF("한글")
+        // scalastyle:on nonascii
+
+        df.write.format(format).option("header", "true").save(dir)
+        val answerDf = spark.read.format(format).option("header", "true").load(dir)
+
+        assert(df.schema === answerDf.schema)
+        checkAnswer(df, answerDf)
+      }
+    }
+  }
+
+  // Only New OrcFileFormat supports this
+  Seq(classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName,
+    "parquet").foreach { format =>
+    test(s"SPARK-15474 Write and read back non-empty schema with empty dataframe - $format") {
+      withTempPath { file =>
+        val path = file.getCanonicalPath
+        val emptyDf = Seq((true, 1, "str")).toDF.limit(0)
+        emptyDf.write.format(format).save(path)
+
+        val df = spark.read.format(format).load(path)
+        assert(df.schema.sameType(emptyDf.schema))
+        checkAnswer(df, emptyDf)
+      }
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index a95d71f858c7..7c9840a34eaa 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -2757,38 +2757,4 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       }
     }
   }
-
-  // Only New OrcFileFormat supports this
-  Seq(classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName,
-    "parquet").foreach { format =>
-    test(s"SPARK-15474 Write and read back non-empty schema with empty dataframe - $format") {
-      withTempPath { file =>
-        val path = file.getCanonicalPath
-        val emptyDf = Seq((true, 1, "str")).toDF.limit(0)
-        emptyDf.write.format(format).save(path)
-
-        val df = spark.read.format(format).load(path)
-        assert(df.schema.sameType(emptyDf.schema))
-        checkAnswer(df, emptyDf)
-      }
-    }
-  }
-
-  Seq("orc", "parquet", "csv", "json").foreach { format =>
-    test(s"Write and read back unicode schema - $format") {
-      withTempPath { path =>
-        val dir = path.getCanonicalPath
-
-        // scalastyle:off nonascii
-        val df = Seq("a").toDF("한글")
-        // scalastyle:on nonascii
-
-        df.write.format(format).option("header", "true").save(dir)
-        val answerDf = spark.read.format(format).option("header", "true").load(dir)
-
-        assert(df.schema === answerDf.schema)
-        checkAnswer(df, answerDf)
-      }
-    }
-  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 47adc77a52d5..33bcae91fdaf 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -2159,12 +2159,4 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       }
     }
   }
-
-  Seq("orc", "parquet", "csv", "json", "text").foreach { format =>
test(s"Writing empty datasets should not fail - $format") { - withTempDir { dir => - Seq("str").toDS.limit(0).write.format(format).save(dir.getCanonicalPath + "/tmp") - } - } - } } From 144c5965e20ae24d38deb033d7ce136a95212cac Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 15 Jan 2018 12:08:57 -0800 Subject: [PATCH 3/7] Move to under `sql` package. --- .../org/apache/spark/{ => sql}/FileBasedDataSourceSuite.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) rename sql/core/src/test/scala/org/apache/spark/{ => sql}/FileBasedDataSourceSuite.scala (97%) diff --git a/sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala similarity index 97% rename from sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index 47269e340ff1..9ca6a4bbadb8 100644 --- a/sql/core/src/test/scala/org/apache/spark/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -15,9 +15,8 @@ * limitations under the License. */ -package org.apache.spark +package org.apache.spark.sql -import org.apache.spark.sql.QueryTest import org.apache.spark.sql.test.SharedSQLContext class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { From 5afaa2836133cfc18a52de38d666817991d62c5d Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 15 Jan 2018 22:23:53 -0800 Subject: [PATCH 4/7] Address comments. --- .../apache/spark/sql/FileBasedDataSourceSuite.scala | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index 9ca6a4bbadb8..3f079204dc5f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -24,8 +24,8 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { Seq("orc", "parquet", "csv", "json", "text").foreach { format => test(s"Writing empty datasets should not fail - $format") { - withTempDir { dir => - Seq("str").toDS.limit(0).write.format(format).save(dir.getCanonicalPath + "/tmp") + withTempPath { dir => + Seq("str").toDS().limit(0).write.format(format).save(dir.getCanonicalPath) } } } @@ -48,13 +48,11 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { } } - // Only New OrcFileFormat supports this - Seq(classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName, - "parquet").foreach { format => + Seq("orc", "parquet").foreach { format => test(s"SPARK-15474 Write and read back non-emtpy schema with empty dataframe - $format") { withTempPath { file => val path = file.getCanonicalPath - val emptyDf = Seq((true, 1, "str")).toDF.limit(0) + val emptyDf = Seq((true, 1, "str")).toDF().limit(0) emptyDf.write.format(format).save(path) val df = spark.read.format(format).load(path) From fb708b70fe4bb5d29ef55ace7fc0aae61e831c03 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 16 Jan 2018 10:09:19 -0800 Subject: [PATCH 5/7] Move SPARK-22146, too. 
---
 .../spark/sql/FileBasedDataSourceSuite.scala       | 14 +++++++++++++-
 .../spark/sql/hive/MetastoreDataSourcesSuite.scala | 12 ------------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
index 3f079204dc5f..148ae753bf4e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
@@ -31,7 +31,7 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
   }
 
   Seq("orc", "parquet", "csv", "json").foreach { format =>
-    test(s"Write and read back unicode schema - $format") {
+    test(s"SPARK-23072 Write and read back unicode schema - $format") {
       withTempPath { path =>
         val dir = path.getCanonicalPath
 
@@ -61,4 +61,16 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
       }
     }
   }
+
+  Seq("orc", "parquet", "csv", "json", "text").foreach { format =>
+    test(s"SPARK-22146 read files containing special characters using $format") {
+      val nameWithSpecialChars = s"sp&cial%chars"
+      withTempDir { dir =>
+        val tmpFile = s"$dir/$nameWithSpecialChars"
+        spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile)
+        val fileContent = spark.read.format(format).load(tmpFile)
+        checkAnswer(fileContent, Seq(Row("a"), Row("b")))
+      }
+    }
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index c8caba83bf36..572d03764c5f 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -1344,18 +1344,6 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
       }
     }
   }
-
-  Seq("orc", "parquet", "csv", "json", "text").foreach { format =>
-    test(s"SPARK-22146: read files containing special characters using $format") {
-      val nameWithSpecialChars = s"sp&cial%chars"
-      withTempDir { dir =>
-        val tmpFile = s"$dir/$nameWithSpecialChars"
-        spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile)
-        val fileContent = spark.read.format(format).load(tmpFile)
-        checkAnswer(fileContent, Seq(Row("a"), Row("b")))
-      }
-    }
-  }
-
   private def withDebugMode(f: => Unit): Unit = {
     val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE)
     try {

From 8fec65b163b32e7592b21b4a6c19c69352f41919 Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Tue, 16 Jan 2018 10:15:28 -0800
Subject: [PATCH 6/7] Remove unused imports.
---
 .../org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index 572d03764c5f..fade143a1755 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -23,14 +23,12 @@ import scala.collection.mutable.ArrayBuffer
 
 import org.apache.hadoop.fs.Path
 
-import org.apache.spark.SparkContext
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
 import org.apache.spark.sql.execution.command.CreateTableCommand
 import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.hive.HiveExternalCatalog._
-import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.StaticSQLConf._

From c67809c9dbe0a21011649dceededa84d73377d1c Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Tue, 16 Jan 2018 18:54:55 -0800
Subject: [PATCH 7/7] Address comments.

---
 .../spark/sql/FileBasedDataSourceSuite.scala | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
index 148ae753bf4e..22fb496bc838 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
@@ -22,7 +22,9 @@ import org.apache.spark.sql.test.SharedSQLContext
 class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
   import testImplicits._
 
-  Seq("orc", "parquet", "csv", "json", "text").foreach { format =>
+  private val allFileBasedDataSources = Seq("orc", "parquet", "csv", "json", "text")
+
+  allFileBasedDataSources.foreach { format =>
     test(s"Writing empty datasets should not fail - $format") {
       withTempPath { dir =>
         Seq("str").toDS().limit(0).write.format(format).save(dir.getCanonicalPath)
@@ -30,8 +32,9 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
     }
   }
 
-  Seq("orc", "parquet", "csv", "json").foreach { format =>
-    test(s"SPARK-23072 Write and read back unicode schema - $format") {
+  // `TEXT` data source always has a single column whose name is `value`.
+  allFileBasedDataSources.filterNot(_ == "text").foreach { format =>
+    test(s"SPARK-23072 Write and read back unicode column names - $format") {
       withTempPath { path =>
         val dir = path.getCanonicalPath
 
@@ -42,12 +45,14 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
         df.write.format(format).option("header", "true").save(dir)
         val answerDf = spark.read.format(format).option("header", "true").load(dir)
 
-        assert(df.schema === answerDf.schema)
+        assert(df.schema.sameType(answerDf.schema))
         checkAnswer(df, answerDf)
       }
     }
   }
 
+  // Only ORC/Parquet support this. `CSV` and `JSON` return an empty schema.
+  // `TEXT` data source always has a single column whose name is `value`.
Seq("orc", "parquet").foreach { format => test(s"SPARK-15474 Write and read back non-emtpy schema with empty dataframe - $format") { withTempPath { file => @@ -62,7 +67,7 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { } } - Seq("orc", "parquet", "csv", "json", "text").foreach { format => + allFileBasedDataSources.foreach { format => test(s"SPARK-22146 read files containing special characters using $format") { val nameWithSpecialChars = s"sp&cial%chars" withTempDir { dir =>