[SPARK-19575][SQL] Reading from or writing to a Hive serde table with a non-pre-existing location should succeed #16910
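As context for the title, here is a minimal sketch of the scenario it describes. The table name, path, and session setup are illustrative and not taken from the PR; it mirrors the flow exercised by the tests below.

```scala
import java.nio.file.Files
import org.apache.spark.sql.SparkSession

// Hypothetical repro: names and paths are illustrative, not taken from the PR.
val spark = SparkSession.builder()
  .master("local[1]")
  .appName("SPARK-19575 sketch")
  .enableHiveSupport()
  .getOrCreate()

val dir = Files.createTempDirectory("spark19575").toFile

// A Hive serde table whose location is the (currently existing) temp directory.
spark.sql(
  s"""CREATE TABLE demo_t(a STRING, b INT)
     |USING hive
     |OPTIONS(path "file:${dir.getCanonicalPath}")
   """.stripMargin)

// Remove the location so the table now points at a directory that no longer exists.
dir.delete()

// With this patch both operations succeed: the read returns an empty result,
// and the insert recreates the missing location.
spark.table("demo_t").show()
spark.sql("INSERT INTO TABLE demo_t SELECT 'c', 1")
spark.table("demo_t").show()
```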
Changes from 3 commits
HadoopTableReader (makeRDDForTable):

@@ -114,22 +114,30 @@ class HadoopTableReader(
     val tablePath = hiveTable.getPath
     val inputPathStr = applyFilterIfNeeded(tablePath, filterOpt)

-    // logDebug("Table input: %s".format(tablePath))
-    val ifc = hiveTable.getInputFormatClass
-      .asInstanceOf[java.lang.Class[InputFormat[Writable, Writable]]]
-    val hadoopRDD = createHadoopRdd(tableDesc, inputPathStr, ifc)
-
-    val attrsWithIndex = attributes.zipWithIndex
-    val mutableRow = new SpecificInternalRow(attributes.map(_.dataType))
-
-    val deserializedHadoopRDD = hadoopRDD.mapPartitions { iter =>
-      val hconf = broadcastedHadoopConf.value.value
-      val deserializer = deserializerClass.newInstance()
-      deserializer.initialize(hconf, tableDesc.getProperties)
-      HadoopTableReader.fillObject(iter, deserializer, attrsWithIndex, mutableRow, deserializer)
-    }
-
-    deserializedHadoopRDD
+    val locationPath = new Path(inputPathStr)
+    val fs = locationPath.getFileSystem(sparkSession.sessionState.newHadoopConf())
+
+    // if the table location is not exists, return an empty RDD
+    if (!fs.exists(locationPath)) {
+      new EmptyRDD[InternalRow](sparkSession.sparkContext)
+    } else {
+      // logDebug("Table input: %s".format(tablePath))
+      val ifc = hiveTable.getInputFormatClass
+        .asInstanceOf[java.lang.Class[InputFormat[Writable, Writable]]]
+      val hadoopRDD = createHadoopRdd(tableDesc, inputPathStr, ifc)
+
+      val attrsWithIndex = attributes.zipWithIndex
+      val mutableRow = new SpecificInternalRow(attributes.map(_.dataType))
+
+      val deserializedHadoopRDD = hadoopRDD.mapPartitions { iter =>
+        val hconf = broadcastedHadoopConf.value.value
+        val deserializer = deserializerClass.newInstance()
+        deserializer.initialize(hconf, tableDesc.getProperties)
+        HadoopTableReader.fillObject(iter, deserializer, attrsWithIndex, mutableRow, deserializer)
+      }
+
+      deserializedHadoopRDD
+    }
   }

   override def makeRDDForPartitionedTable(partitions: Seq[HivePartition]): RDD[InternalRow] = {
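As background for the check added above, here is a minimal standalone sketch of the same Hadoop FileSystem pattern. The helper name is illustrative; only `Path.getFileSystem` and `FileSystem.exists` are taken from the patch.

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Resolve the FileSystem that owns a location string and test for existence.
// This mirrors the check the patch performs before constructing the HadoopRDD.
def locationExists(location: String, hadoopConf: Configuration): Boolean = {
  val path = new Path(location)
  val fs: FileSystem = path.getFileSystem(hadoopConf)
  fs.exists(path)
}

// Example: locationExists("file:/tmp/some/table/dir", new Configuration())
```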
HiveDDLSuite:

@@ -35,6 +35,7 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types.StructType
+import org.apache.spark.util.Utils

 class HiveDDLSuite
   extends QueryTest with SQLTestUtils with TestHiveSingleton with BeforeAndAfterEach {
@@ -1494,4 +1495,148 @@ class HiveDDLSuite
      }
    }
  }

  test("insert data to a hive serde table which has a not existed location should succeed") {
    withTable("t") {
      withTempDir { dir =>
        spark.sql(
          s"""
             |CREATE TABLE t(a string, b int)
             |USING hive
             |OPTIONS(path "file:${dir.getCanonicalPath}")
           """.stripMargin)
        val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
        val expectedPath = s"file:${dir.getAbsolutePath.stripSuffix("/")}"
        assert(table.location.stripSuffix("/") == expectedPath)

        val tableLocFile = new File(table.location.stripPrefix("file:"))
        tableLocFile.delete()
        assert(!tableLocFile.exists())
        spark.sql("INSERT INTO TABLE t SELECT 'c', 1")
        assert(tableLocFile.exists())
        checkAnswer(spark.table("t"), Row("c", 1) :: Nil)

        Utils.deleteRecursively(dir)
        assert(!tableLocFile.exists())
        spark.sql("INSERT OVERWRITE TABLE t SELECT 'c', 1")
        assert(tableLocFile.exists())
        checkAnswer(spark.table("t"), Row("c", 1) :: Nil)

        val newDirFile = new File(dir, "x")
        spark.sql(s"ALTER TABLE t SET LOCATION '${newDirFile.getAbsolutePath}'")
        spark.sessionState.catalog.refreshTable(TableIdentifier("t"))

        val table1 = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
        assert(table1.location.stripSuffix("/") == newDirFile.getAbsolutePath.stripSuffix("/"))
        assert(!newDirFile.exists())

        spark.sql("INSERT INTO TABLE t SELECT 'c', 1")
        checkAnswer(spark.table("t"), Row("c", 1) :: Nil)
        assert(newDirFile.exists())
      }
    }
  }

  test("insert into a hive serde table with no existed partition location should succeed") {
    withTable("t") {
      withTempDir { dir =>
        spark.sql(
          s"""
             |CREATE TABLE t(a int, b int, c int, d int)
             |USING hive
             |PARTITIONED BY(a, b)
             |LOCATION "file:${dir.getCanonicalPath}"
           """.stripMargin)
        val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
        val expectedPath = s"file:${dir.getAbsolutePath.stripSuffix("/")}"
        assert(table.location.stripSuffix("/") == expectedPath)

        spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 3, 4")
        checkAnswer(spark.table("t"), Row(3, 4, 1, 2) :: Nil)

        val partLoc = new File(s"${dir.getAbsolutePath}/a=1")
        Utils.deleteRecursively(partLoc)
        assert(!partLoc.exists())
        // insert overwrite into a partition which location has been deleted.
        spark.sql("INSERT OVERWRITE TABLE t PARTITION(a=1, b=2) SELECT 7, 8")
        assert(partLoc.exists())
        checkAnswer(spark.table("t"), Row(7, 8, 1, 2) :: Nil)

        val newDirFile = new File(dir, "x")
        spark.sql(s"ALTER TABLE t PARTITION(a=1, b=2) SET LOCATION " +
          s"'${newDirFile.getAbsolutePath}'")
        assert(!newDirFile.exists())

        // insert into a partition which location does not exists.
        spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 9, 10")
        assert(newDirFile.exists())
        checkAnswer(spark.table("t"), Row(9, 10, 1, 2) :: Nil)
      }
    }
  }

  test("read data from a hive serde table which has a not existed location should succeed") {
    withTable("t") {
      withTempDir { dir =>
        spark.sql(
          s"""
             |CREATE TABLE t(a string, b int)
             |USING hive
             |OPTIONS(path "file:${dir.getAbsolutePath}")
           """.stripMargin)
        val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
        val expectedPath = s"file:${dir.getAbsolutePath.stripSuffix("/")}"
        assert(table.location.stripSuffix("/") == expectedPath)

        dir.delete()
        checkAnswer(spark.table("t"), Nil)

        val newDirFile = new File(dir, "x")
        spark.sql(s"ALTER TABLE t SET LOCATION '${newDirFile.getAbsolutePath}'")

        val table1 = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
        assert(table1.location.stripSuffix("/") == newDirFile.getAbsolutePath.stripSuffix("/"))
        assert(!newDirFile.exists())
        checkAnswer(spark.table("t"), Nil)
      }
    }
  }

  test("read data from a hive serde table with no existed partition location should succeed") {
    withTable("t") {
      withTempDir { dir =>
        spark.sql(
          s"""
             |CREATE TABLE t(a int, b int, c int, d int)
             |USING hive
             |PARTITIONED BY(a, b)
             |LOCATION "file:${dir.getCanonicalPath}"
           """.stripMargin)
        val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))

        spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 3, 4")
        checkAnswer(spark.table("t"), Row(3, 4, 1, 2) :: Nil)

        val newDirFile = new File(dir, "x")
        spark.sql(s"ALTER TABLE t PARTITION(a=1, b=2) SET LOCATION " +
          s"'${newDirFile.getAbsolutePath}'")
        assert(!newDirFile.exists())
        // select from a partition which location has changed to a not existed location
        withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") {
[Review thread on the withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH...) line above]

Contributor: Why set this conf?

Contributor (author): If we don't set it, the query throws an exception. If we set it, Spark checks whether the partition path exists and, instead of throwing, just returns an empty RDD even when the path does not exist.

Contributor: Is this expected? I think Hive will always return an empty result, right?

Contributor: BTW, this conf will be removed soon, as it has bugs.

Contributor (author): OK, thanks. Then do we also need to modify something here?

Contributor (author): Yes, Hive returns an empty result. If there is a bug here (could you describe what the bug is?), we can remove the conf and always return the result?
          checkAnswer(spark.sql("select * from t where a=1 and b=2"), Nil)
        }

        spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 5, 6")
        checkAnswer(spark.table("t"), Row(5, 6, 1, 2) :: Nil)
        assert(newDirFile.exists())

        // select from a partition which location has been deleted.
        Utils.deleteRecursively(newDirFile)
        assert(!newDirFile.exists())
        withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") {
          checkAnswer(spark.sql("select * from t where a=1 and b=2"), Nil)
        }
      }
    }
  }
}
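For reference, a hedged sketch of how the flag discussed in the thread above would be toggled outside of withSQLConf. The key string is assumed to be the one backing SQLConf.HIVE_VERIFY_PARTITION_PATH, and the table and predicate are illustrative.

```scala
// Illustrative only: enabling partition-path verification on a session so that a
// SELECT over a partition whose directory is missing returns an empty result
// instead of failing. The key corresponds to SQLConf.HIVE_VERIFY_PARTITION_PATH.
spark.conf.set("spark.sql.hive.verifyPartitionPath", "true")
spark.sql("SELECT * FROM t WHERE a = 1 AND b = 2").show()  // empty, not an error
```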
[Review thread on the getFileSystem call in HadoopTableReader above]

Reviewer: How about replacing sparkSession.sessionState.newHadoopConf() with broadcastedHadoopConf.value.value?

Author: OK.
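A minimal sketch of what this suggestion amounts to, assuming the Spark-internal types HadoopTableReader already holds (a Broadcast[SerializableConfiguration]); the helper is illustrative, not the committed change.

```scala
import org.apache.hadoop.fs.Path
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.util.SerializableConfiguration

// Reuse the Hadoop configuration that HadoopTableReader has already broadcast,
// instead of building a fresh one via sparkSession.sessionState.newHadoopConf().
def tableLocationExists(
    broadcastedHadoopConf: Broadcast[SerializableConfiguration],
    inputPathStr: String): Boolean = {
  val locationPath = new Path(inputPathStr)
  val fs = locationPath.getFileSystem(broadcastedHadoopConf.value.value)
  fs.exists(locationPath)
}
```

This avoids constructing a new Configuration on the driver just for the existence check, at the cost of depending on the broadcast variable being available at that point in the method.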