Commit 77579aa
[SPARK-25389][SQL] INSERT OVERWRITE DIRECTORY STORED AS should prevent duplicate fields
## What changes were proposed in this pull request?

Like the `INSERT OVERWRITE DIRECTORY USING` syntax, `INSERT OVERWRITE DIRECTORY STORED AS` should not generate files with duplicate fields, because Spark cannot read those files back.

**INSERT OVERWRITE DIRECTORY USING**

```scala
scala> sql("INSERT OVERWRITE DIRECTORY 'file:///tmp/parquet' USING parquet SELECT 'id', 'id2' id")
...
ERROR InsertIntoDataSourceDirCommand: Failed to write to directory ...
org.apache.spark.sql.AnalysisException: Found duplicate column(s) when inserting into file:/tmp/parquet: `id`;
```

**INSERT OVERWRITE DIRECTORY STORED AS**

```scala
scala> sql("INSERT OVERWRITE DIRECTORY 'file:///tmp/parquet' STORED AS parquet SELECT 'id', 'id2' id")
// It generates corrupted files
scala> spark.read.parquet("/tmp/parquet").show
18/09/09 22:09:57 WARN DataSource: Found duplicate column(s) in the data schema and the partition schema: `id`;
```

## How was this patch tested?

Passed Jenkins with the newly added test cases.

Closes #22378 from dongjoon-hyun/SPARK-25389.

Authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
Parent: c9cb393

2 files changed: 29 additions & 0 deletions

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala

Lines changed: 5 additions & 0 deletions
```diff
@@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.hive.client.HiveClientImpl
+import org.apache.spark.sql.util.SchemaUtils
 
 /**
  * Command for writing the results of `query` to file system.
@@ -61,6 +62,10 @@ case class InsertIntoHiveDirCommand(
 
   override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
     assert(storage.locationUri.nonEmpty)
+    SchemaUtils.checkColumnNameDuplication(
+      outputColumnNames,
+      s"when inserting into ${storage.locationUri.get}",
+      sparkSession.sessionState.conf.caseSensitiveAnalysis)
 
     val hiveTable = HiveClientImpl.toHiveTable(CatalogTable(
       identifier = TableIdentifier(storage.locationUri.get.toString, Some("default")),
```
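The guard added above makes the `STORED AS` path reject duplicate output columns before any files are written, mirroring what the `USING` path already does. To illustrate the semantics of that check, here is a minimal, self-contained sketch in the spirit of `SchemaUtils.checkColumnNameDuplication`; the object name, exception type, and message wording below are illustrative assumptions, not Spark's exact internals:

```scala
// Hypothetical standalone sketch -- not Spark's actual SchemaUtils code.
object DuplicateColumnCheckSketch {

  // Fails if `columnNames` contains duplicates. Under case-insensitive
  // analysis (Spark's default), `id` and `ID` count as the same name.
  def checkColumnNameDuplication(
      columnNames: Seq[String],
      context: String,
      caseSensitive: Boolean): Unit = {
    val normalized =
      if (caseSensitive) columnNames else columnNames.map(_.toLowerCase)
    val duplicates = normalized.groupBy(identity).collect {
      case (name, occurrences) if occurrences.size > 1 => s"`$name`"
    }
    if (duplicates.nonEmpty) {
      throw new IllegalArgumentException(
        s"Found duplicate column(s) $context: ${duplicates.mkString(", ")}")
    }
  }

  def main(args: Array[String]): Unit = {
    // Passes: the names are distinct when case matters.
    checkColumnNameDuplication(
      Seq("id", "ID"), "when inserting into /tmp/parquet", caseSensitive = true)
    // Throws: `id` and `ID` collide once names are lowercased.
    checkColumnNameDuplication(
      Seq("id", "ID"), "when inserting into /tmp/parquet", caseSensitive = false)
  }
}
```

Note that the command passes `sparkSession.sessionState.conf.caseSensitiveAnalysis` through, so the same `SELECT 'id', 'id2' ID` query is rejected under the default case-insensitive analysis but accepted when `spark.sql.caseSensitive` is `true`.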

sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala

Lines changed: 24 additions & 0 deletions
```diff
@@ -26,6 +26,7 @@ import org.apache.spark.sql.{QueryTest, _}
 import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable
 import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -750,4 +751,27 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
       }
     }
   }
+
+  Seq("LOCAL", "").foreach { local =>
+    Seq(true, false).foreach { caseSensitivity =>
+      Seq("orc", "parquet").foreach { format =>
+        test(s"SPARK-25389 INSERT OVERWRITE $local DIRECTORY ... STORED AS with duplicated names" +
+          s"(caseSensitivity=$caseSensitivity, format=$format)") {
+          withTempDir { dir =>
+            withSQLConf(SQLConf.CASE_SENSITIVE.key -> s"$caseSensitivity") {
+              val m = intercept[AnalysisException] {
+                sql(
+                  s"""
+                     |INSERT OVERWRITE $local DIRECTORY '${dir.toURI}'
+                     |STORED AS $format
+                     |SELECT 'id', 'id2' ${if (caseSensitivity) "id" else "ID"}
+                   """.stripMargin)
+              }.getMessage
+              assert(m.contains("Found duplicate column(s) when inserting into"))
+            }
+          }
+        }
+      }
+    }
+  }
 }
```
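The three nested `Seq(...).foreach` loops expand into eight test cases (2 `LOCAL` variants × 2 case-sensitivity settings × 2 formats), and the `${if (caseSensitivity) "id" else "ID"}` alias ensures the duplicate is exact under case-sensitive analysis and differs only in case otherwise. With the patch applied, the `STORED AS` reproduction from the commit message should now fail analysis the same way the `USING` variant does; a spark-shell session along these lines (exact message formatting assumed to match the `USING` output above) would be:

```scala
scala> sql("INSERT OVERWRITE DIRECTORY 'file:///tmp/parquet' STORED AS parquet SELECT 'id', 'id2' id")
// org.apache.spark.sql.AnalysisException:
//   Found duplicate column(s) when inserting into file:/tmp/parquet: `id`;
```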
