38 changes: 23 additions & 15 deletions sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -114,22 +114,30 @@ class HadoopTableReader(
val tablePath = hiveTable.getPath
val inputPathStr = applyFilterIfNeeded(tablePath, filterOpt)

// logDebug("Table input: %s".format(tablePath))
val ifc = hiveTable.getInputFormatClass
.asInstanceOf[java.lang.Class[InputFormat[Writable, Writable]]]
val hadoopRDD = createHadoopRdd(tableDesc, inputPathStr, ifc)

val attrsWithIndex = attributes.zipWithIndex
val mutableRow = new SpecificInternalRow(attributes.map(_.dataType))

val deserializedHadoopRDD = hadoopRDD.mapPartitions { iter =>
val hconf = broadcastedHadoopConf.value.value
val deserializer = deserializerClass.newInstance()
deserializer.initialize(hconf, tableDesc.getProperties)
HadoopTableReader.fillObject(iter, deserializer, attrsWithIndex, mutableRow, deserializer)
}
val locationPath = new Path(inputPathStr)
val fs = locationPath.getFileSystem(sparkSession.sessionState.newHadoopConf())
Member: How about replacing sparkSession.sessionState.newHadoopConf() with broadcastedHadoopConf.value.value?

Contributor Author: ok~
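A minimal sketch of the reviewer's suggestion, reusing the Hadoop configuration already broadcast to the executors instead of building a fresh one on the driver (broadcastedHadoopConf is the reader's existing Broadcast[SerializableConfiguration] field, used in the mapPartitions block above):

```scala
// Obtain the FileSystem from the broadcast configuration instead of
// sparkSession.sessionState.newHadoopConf().
val locationPath = new Path(inputPathStr)
val fs = locationPath.getFileSystem(broadcastedHadoopConf.value.value)
```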


// if the table location is not exists, return an empty RDD
Member: is not exists -> does not exist

if (!fs.exists(locationPath)) {
Contributor: Hmm, what about Hive serde tables with a storage handler? Their data is stored somewhere else (maybe in HBase), and the table path is always non-existent.

Contributor Author: Good catch, thanks! I tested this in Hive: for a table created with STORED BY (e.g. the HBase handler), a table path is created under the warehouse path when the table is created, but no data files exist there after inserting into the table, and selecting data still works after the table path is deleted.
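For illustration, the kind of table this thread describes would be created in Hive itself roughly as follows (hypothetical HiveQL held in a Scala string; the table and column names are made up, and per the author the behavior was verified in Hive, not through spark.sql):

```scala
// Hypothetical DDL for a storage-handler table: its rows live in HBase, so
// the directory under the warehouse path can be absent without affecting reads.
val hiveDdl =
  """CREATE TABLE hbase_backed(key INT, value STRING)
    |STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
    |WITH SERDEPROPERTIES ('hbase.columns.mapping' = ':key,cf:value')""".stripMargin
```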

new EmptyRDD[InternalRow](sparkSession.sparkContext)
} else {
// logDebug("Table input: %s".format(tablePath))
val ifc = hiveTable.getInputFormatClass
.asInstanceOf[java.lang.Class[InputFormat[Writable, Writable]]]
val hadoopRDD = createHadoopRdd(tableDesc, inputPathStr, ifc)

deserializedHadoopRDD
val attrsWithIndex = attributes.zipWithIndex
val mutableRow = new SpecificInternalRow(attributes.map(_.dataType))

val deserializedHadoopRDD = hadoopRDD.mapPartitions { iter =>
val hconf = broadcastedHadoopConf.value.value
val deserializer = deserializerClass.newInstance()
deserializer.initialize(hconf, tableDesc.getProperties)
HadoopTableReader.fillObject(iter, deserializer, attrsWithIndex, mutableRow, deserializer)
}

deserializedHadoopRDD
}
}
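As an aside, EmptyRDD is private[spark], so the constructor call above only works inside Spark itself; a minimal sketch of the same pattern through the public API:

```scala
// An empty RDD has zero partitions, so any job over it finishes immediately.
val empty = spark.sparkContext.emptyRDD[String]
assert(empty.partitions.isEmpty)
assert(empty.count() == 0)
```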

override def makeRDDForPartitionedTable(partitions: Seq[HivePartition]): RDD[InternalRow] = {
@@ -20,13 +20,14 @@ package org.apache.spark.sql.hive.execution
import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.ql.metadata.{Partition => HivePartition}
import org.apache.hadoop.hive.serde.serdeConstants
import org.apache.hadoop.hive.serde2.objectinspector._
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.{EmptyRDD, RDD}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
@@ -35,6 +35,7 @@ import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.Utils

class HiveDDLSuite
extends QueryTest with SQLTestUtils with TestHiveSingleton with BeforeAndAfterEach {
@@ -1494,4 +1495,148 @@ class HiveDDLSuite
}
}
}

test("insert data to a hive serde table which has a not existed location should succeed") {
Contributor: nit: non-existing location

withTable("t") {
withTempDir { dir =>
spark.sql(
s"""
|CREATE TABLE t(a string, b int)
|USING hive
|OPTIONS(path "file:${dir.getCanonicalPath}")
""".stripMargin)
val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
val expectedPath = s"file:${dir.getAbsolutePath.stripSuffix("/")}"
assert(table.location.stripSuffix("/") == expectedPath)
Contributor: convert to Path and compare, please.
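Applied to the assertion above, the suggestion would read roughly like this (assuming org.apache.hadoop.fs.Path is imported):

```scala
// Comparing Path values normalizes trailing slashes, avoiding the
// stripSuffix("/") string manipulation on both sides.
assert(new Path(table.location) == new Path(expectedPath))
```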


val tableLocFile = new File(table.location.stripPrefix("file:"))
Contributor: new File(new URI(table.location))? Please avoid .stripPrefix("file:"), which looks very hacky.
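That suggestion in code, assuming java.net.URI is imported alongside java.io.File:

```scala
// Resolve the location through a URI instead of stripping the scheme by hand.
val tableLocFile = new File(new URI(table.location))
```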

tableLocFile.delete()
assert(!tableLocFile.exists())
spark.sql("INSERT INTO TABLE t SELECT 'c', 1")
assert(tableLocFile.exists())
checkAnswer(spark.table("t"), Row("c", 1) :: Nil)

Utils.deleteRecursively(dir)
assert(!tableLocFile.exists())
spark.sql("INSERT OVERWRITE TABLE t SELECT 'c', 1")
assert(tableLocFile.exists())
checkAnswer(spark.table("t"), Row("c", 1) :: Nil)

val newDirFile = new File(dir, "x")
spark.sql(s"ALTER TABLE t SET LOCATION '${newDirFile.getAbsolutePath}'")
spark.sessionState.catalog.refreshTable(TableIdentifier("t"))

val table1 = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
assert(table1.location.stripSuffix("/") == newDirFile.getAbsolutePath.stripSuffix("/"))
assert(!newDirFile.exists())

spark.sql("INSERT INTO TABLE t SELECT 'c', 1")
checkAnswer(spark.table("t"), Row("c", 1) :: Nil)
assert(newDirFile.exists())
}
}
}

test("insert into a hive serde table with no existed partition location should succeed") {
withTable("t") {
withTempDir { dir =>
spark.sql(
s"""
|CREATE TABLE t(a int, b int, c int, d int)
|USING hive
|PARTITIONED BY(a, b)
|LOCATION "file:${dir.getCanonicalPath}"
""".stripMargin)
val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
val expectedPath = s"file:${dir.getAbsolutePath.stripSuffix("/")}"
assert(table.location.stripSuffix("/") == expectedPath)

spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 3, 4")
checkAnswer(spark.table("t"), Row(3, 4, 1, 2) :: Nil)

val partLoc = new File(s"${dir.getAbsolutePath}/a=1")
Utils.deleteRecursively(partLoc)
assert(!partLoc.exists())
// insert overwrite into a partition whose location has been deleted.
spark.sql("INSERT OVERWRITE TABLE t PARTITION(a=1, b=2) SELECT 7, 8")
assert(partLoc.exists())
checkAnswer(spark.table("t"), Row(7, 8, 1, 2) :: Nil)

val newDirFile = new File(dir, "x")
spark.sql(s"ALTER TABLE t PARTITION(a=1, b=2) SET LOCATION " +
s"'${newDirFile.getAbsolutePath}'")
Member: shorten it to a single line?

assert(!newDirFile.exists())

// insert into a partition whose location does not exist.
spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 9, 10")
assert(newDirFile.exists())
checkAnswer(spark.table("t"), Row(9, 10, 1, 2) :: Nil)
}
}
}

test("read data from a hive serde table which has a not existed location should succeed") {
Member: This is the only test case that failed without this fix, right?

Contributor Author: Yes, it is~

withTable("t") {
withTempDir { dir =>
spark.sql(
s"""
|CREATE TABLE t(a string, b int)
|USING hive
|OPTIONS(path "file:${dir.getAbsolutePath}")
""".stripMargin)
val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
val expectedPath = s"file:${dir.getAbsolutePath.stripSuffix("/")}"
assert(table.location.stripSuffix("/") == expectedPath)

dir.delete()
checkAnswer(spark.table("t"), Nil)

val newDirFile = new File(dir, "x")
spark.sql(s"ALTER TABLE t SET LOCATION '${newDirFile.getAbsolutePath}'")

val table1 = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
assert(table1.location.stripSuffix("/") == newDirFile.getAbsolutePath.stripSuffix("/"))
assert(!newDirFile.exists())
checkAnswer(spark.table("t"), Nil)
}
}
}

test("read data from a hive serde table with no existed partition location should succeed") {
withTable("t") {
withTempDir { dir =>
spark.sql(
s"""
|CREATE TABLE t(a int, b int, c int, d int)
|USING hive
|PARTITIONED BY(a, b)
|LOCATION "file:${dir.getCanonicalPath}"
""".stripMargin)
val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
Member: useless?


spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 3, 4")
checkAnswer(spark.table("t"), Row(3, 4, 1, 2) :: Nil)

val newDirFile = new File(dir, "x")
spark.sql(s"ALTER TABLE t PARTITION(a=1, b=2) SET LOCATION " +
s"'${newDirFile.getAbsolutePath}'")
Member: shorten it to a single line?

Contributor Author: 101 characters... let me modify some code.

Member: Actually, 101 is still ok.

Contributor Author: Er... isn't the limit 100? Let me test it... I have modified some code to make it clearer.
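For reference, the single-line form under discussion, which the author measures at 101 characters with its indentation, one over Spark's 100-character Scala line limit:

```scala
spark.sql(s"ALTER TABLE t PARTITION(a=1, b=2) SET LOCATION '${newDirFile.getAbsolutePath}'")
```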

assert(!newDirFile.exists())
// select from a partition whose location has been changed to a non-existent location
withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") {
Contributor: why set this conf?

Contributor Author: If we don't set it, the query throws an exception; if we do, Spark checks whether the partition path exists and, rather than throwing, returns an empty RDD even when the path does not exist.

Contributor: Is this expected? I think Hive will always return an empty result, right?

Contributor: BTW this conf will be removed soon, as it has bugs.

Contributor Author: ok~ thanks~ Then do we also need to modify something here?

Contributor Author: Yes, Hive returns empty. If there is a bug here (could you describe what the bug is?), we could remove the conf and always return a result.
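A hedged sketch of the two behaviors the author describes, assuming the conf still exists and using the suite's ScalaTest helpers:

```scala
// With verification disabled (the default), scanning a partition whose
// directory is missing fails while resolving the input paths.
withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "false") {
  intercept[Exception](spark.sql("SELECT * FROM t WHERE a = 1 AND b = 2").collect())
}
// With verification enabled, missing partition paths are skipped and the
// query just returns an empty result.
withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") {
  checkAnswer(spark.sql("SELECT * FROM t WHERE a = 1 AND b = 2"), Nil)
}
```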

checkAnswer(spark.sql("select * from t where a=1 and b=2"), Nil)
}

spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 5, 6")
checkAnswer(spark.table("t"), Row(5, 6, 1, 2) :: Nil)
assert(newDirFile.exists())

// select from a partition whose location has been deleted.
Utils.deleteRecursively(newDirFile)
assert(!newDirFile.exists())
withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") {
checkAnswer(spark.sql("select * from t where a=1 and b=2"), Nil)
}
}
}
}
}