[SPARK-5498][SQL] Fix query exception when partition schema does not match table schema #4289
Changes from 7 commits
@@ -25,7 +25,7 @@ import org.apache.hadoop.hive.ql.exec.Utilities
 import org.apache.hadoop.hive.ql.metadata.{Partition => HivePartition, Table => HiveTable}
 import org.apache.hadoop.hive.ql.plan.{PlanUtils, TableDesc}
 import org.apache.hadoop.hive.serde2.Deserializer
-import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector
+import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorConverters, StructObjectInspector}
 import org.apache.hadoop.hive.serde2.objectinspector.primitive._
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf}
@@ -188,9 +188,13 @@ class HadoopTableReader(
       val hconf = broadcastedHiveConf.value.value
       val deserializer = localDeserializer.newInstance()
       deserializer.initialize(hconf, partProps)
+      // get the table deserializer
+      val tableSerDe = tableDesc.getDeserializerClass.newInstance()
+      tableSerDe.initialize(hconf, tableDesc.getProperties)

       // fill the non partition key attributes
-      HadoopTableReader.fillObject(iter, deserializer, nonPartitionKeyAttrs, mutableRow)
+      HadoopTableReader.fillObject(iter, deserializer, nonPartitionKeyAttrs,
+        mutableRow, Some(tableSerDe))
     }
   }.toSeq
@@ -264,15 +268,31 @@ private[hive] object HadoopTableReader extends HiveInspectors {
    * @param nonPartitionKeyAttrs Attributes that should be filled together with their corresponding
    *                             positions in the output schema
    * @param mutableRow A reusable `MutableRow` that should be filled
+   * @param convertdeserializer The `Deserializer` covert the `deserializer`
    * @return An `Iterator[Row]` transformed from `iterator`
    */
   def fillObject(
       iterator: Iterator[Writable],
       deserializer: Deserializer,
       nonPartitionKeyAttrs: Seq[(Attribute, Int)],
-      mutableRow: MutableRow): Iterator[Row] = {
+      mutableRow: MutableRow,
+      convertdeserializer: Option[Deserializer] = None): Iterator[Row] = {

+    val soi = convertdeserializer match {
+      case Some(convert) =>
+        // check need to convert
+        if (deserializer.getObjectInspector.equals(convert.getObjectInspector)) {
+          deserializer.getObjectInspector().asInstanceOf[StructObjectInspector]
+        }
+        else {
+          HiveShim.getConvertedOI(
+            deserializer.getObjectInspector(),
+            convert.getObjectInspector()).asInstanceOf[StructObjectInspector]
+        }
+      case None =>
+        deserializer.getObjectInspector().asInstanceOf[StructObjectInspector]
+    }

-    val soi = deserializer.getObjectInspector().asInstanceOf[StructObjectInspector]
     val (fieldRefs, fieldOrdinals) = nonPartitionKeyAttrs.map { case (attr, ordinal) =>
       soi.getStructFieldRef(attr.name) -> ordinal
     }.unzip
@@ -315,9 +335,23 @@ private[hive] object HadoopTableReader extends HiveInspectors {
         }
       }

+    val partTblObjectInspectorConverter = ObjectInspectorConverters.getConverter(
+      deserializer.getObjectInspector, soi)

     // Map each tuple to a row object
     iterator.map { value =>
-      val raw = deserializer.deserialize(value)
+      val raw = convertdeserializer match {

Contributor: In general, we'd better not do the pattern matching within the iterator; we can do that like:

Author: Thanks for the reminder.

+        case Some(convert) =>
+          if (deserializer.getObjectInspector.equals(convert.getObjectInspector)) {
+            deserializer.deserialize(value)
+          }
+          // If partition schema does not match table schema, update the row to match
+          else {
+            partTblObjectInspectorConverter.convert(deserializer.deserialize(value))
+          }
+        case None =>
+          deserializer.deserialize(value)
+      }
       var i = 0
       while (i < fieldRefs.length) {
         val fieldValue = soi.getStructFieldData(raw, fieldRefs(i))
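
To make the reviewer's suggestion above concrete, here is a minimal sketch of hoisting the schema check and the converter out of the per-row loop. It is illustrative only: `buildRowReader` is a hypothetical helper, the `getConvertedOI` parameter stands in for the `HiveShim.getConvertedOI` added in this patch, and the Spark-specific row filling is omitted.

import org.apache.hadoop.hive.serde2.Deserializer
import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ObjectInspectorConverters, StructObjectInspector}
import org.apache.hadoop.io.Writable

// Hypothetical helper: decide once, before iterating, whether conversion is needed, and return
// both the output inspector and a per-row read function, so the iterator body has no match.
def buildRowReader(
    deserializer: Deserializer,        // partition-side SerDe
    tableSerDe: Option[Deserializer],  // table-side SerDe, if the schemas may differ
    getConvertedOI: (ObjectInspector, ObjectInspector) => ObjectInspector)
  : (StructObjectInspector, Writable => AnyRef) = {
  val partOI = deserializer.getObjectInspector
  tableSerDe match {
    case Some(table) if !partOI.equals(table.getObjectInspector) =>
      // Partition schema differs from table schema: convert every deserialized row.
      val soi = getConvertedOI(partOI, table.getObjectInspector).asInstanceOf[StructObjectInspector]
      val converter = ObjectInspectorConverters.getConverter(partOI, soi)
      (soi, value => converter.convert(deserializer.deserialize(value)))
    case _ =>
      // Schemas already match (or no table SerDe was given): no per-row branching or conversion.
      (partOI.asInstanceOf[StructObjectInspector], value => deserializer.deserialize(value))
  }
}

With a helper shaped like this, the loop over the partition's rows reduces to a plain `iterator.map(readRow)` whether or not the partition and table schemas match.
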
@@ -172,4 +172,19 @@ class InsertIntoHiveTableSuite extends QueryTest {

     sql("DROP TABLE hiveTableWithStructValue")
   }
+
+  test("SPARK-5498:partition schema does not match table schema"){
+    val testData = TestHive.sparkContext.parallelize(
+      (1 to 10).map(i => TestData(i, i.toString)))
+    testData.registerTempTable("testData")
+    val tmpDir = Files.createTempDir()
+    sql(s"CREATE TABLE table_with_partition(key int,value string) PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
+    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') SELECT key,value FROM testData")
+    sql("ALTER TABLE table_with_partition CHANGE COLUMN key key BIGINT")

Contributor: I just checked the Hive documentation.

Author: I checked this query in Hive 0.11 and Hive 0.12 and it is OK; I will check this query in Hive 0.13.1 later.

+    checkAnswer(sql("select key,value from table_with_partition where ds='1' "),
+      testData.toSchemaRDD.collect.toSeq
+    )
+    sql("DROP TABLE table_with_partition")
+  }
 }
@@ -34,7 +34,7 @@ import org.apache.hadoop.hive.ql.plan.{CreateTableDesc, FileSinkDesc, TableDesc}
 import org.apache.hadoop.hive.ql.processors._
 import org.apache.hadoop.hive.ql.stats.StatsSetupConst
 import org.apache.hadoop.hive.serde2.{ColumnProjectionUtils, Deserializer, io => hiveIo}
-import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, PrimitiveObjectInspector}
+import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorConverters, ObjectInspector, PrimitiveObjectInspector}
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.{HiveDecimalObjectInspector, PrimitiveObjectInspectorFactory}
 import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfo, TypeInfoFactory}
@@ -187,7 +187,7 @@ private[hive] object HiveShim {

   def getStatsSetupConstRawDataSize = StatsSetupConst.RAW_DATA_SIZE

-  def createDefaultDBIfNeeded(context: HiveContext) = { }
+  def createDefaultDBIfNeeded(context: HiveContext) = {}

Contributor: This change is unneeded and will cause conflicts with other PRs in flight.

   def getCommandProcessor(cmd: Array[String], conf: HiveConf) = {
     CommandProcessorFactory.get(cmd(0), conf)
@@ -208,7 +208,7 @@ private[hive] object HiveShim {

   def getDataLocationPath(p: Partition) = p.getPartitionPath

-  def getAllPartitionsOf(client: Hive, tbl: Table) = client.getAllPartitionsForPruner(tbl)
+  def getAllPartitionsOf(client: Hive, tbl: Table) = client.getAllPartitionsForPruner(tbl)

   def compatibilityBlackList = Seq(
     "decimal_.*",
@@ -242,6 +242,11 @@ private[hive] object HiveShim {
     }
   }

+  // make getConvertedOI compatible between 0.12.0 and 0.13.1
+  def getConvertedOI(inputOI: ObjectInspector, outputOI: ObjectInspector): ObjectInspector = {
+    ObjectInspectorConverters.getConvertedOI(inputOI, outputOI, new java.lang.Boolean(true))
+  }
+
   def prepareWritable(w: Writable): Writable = {
     w
   }
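
Both shims (this hunk and the v0.13.1 one below) expose the same two-argument `getConvertedOI`, so the version-specific differences in Hive's `ObjectInspectorConverters` API stay out of the table reader. A minimal sketch of the intended call pattern, assuming the `HiveShim` object defined in these hunks is on the classpath (the helper name `tableShapedInspector` is hypothetical):

import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, StructObjectInspector}

// Build an inspector shaped like the table schema from a partition-side and a table-side
// inspector; which Hive overload runs underneath is the shim's concern, not the caller's.
def tableShapedInspector(partOI: ObjectInspector, tableOI: ObjectInspector): StructObjectInspector =
  HiveShim.getConvertedOI(partOI, tableOI).asInstanceOf[StructObjectInspector]
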
@@ -37,7 +37,7 @@ import org.apache.hadoop.hive.ql.plan.{CreateTableDesc, FileSinkDesc, TableDesc}
 import org.apache.hadoop.hive.ql.processors.CommandProcessorFactory
 import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfo, DecimalTypeInfo, TypeInfoFactory}
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.{HiveDecimalObjectInspector, PrimitiveObjectInspectorFactory}
-import org.apache.hadoop.hive.serde2.objectinspector.{PrimitiveObjectInspector, ObjectInspector}
+import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorConverters, PrimitiveObjectInspector, ObjectInspector}
 import org.apache.hadoop.hive.serde2.{Deserializer, ColumnProjectionUtils}
 import org.apache.hadoop.hive.serde2.{io => hiveIo}
 import org.apache.hadoop.hive.serde2.avro.AvroGenericRecordWritable
@@ -397,7 +397,12 @@ private[hive] object HiveShim {
       Decimal(hdoi.getPrimitiveJavaObject(data).bigDecimalValue(), hdoi.precision(), hdoi.scale())
     }
   }

+  // make getConvertedOI compatible between 0.12.0 and 0.13.1

Contributor: Nit: same here.

+  def getConvertedOI(inputOI: ObjectInspector, outputOI: ObjectInspector): ObjectInspector = {
+    ObjectInspectorConverters.getConvertedOI(inputOI, outputOI)
+  }
+
   /*
    * Bug introduced in hive-0.13. AvroGenericRecordWritable has a member recordReaderID that
    * is needed to initialize before serialization.

Contributor: Instead of passing the deserializer, how about taking the converter as the argument? By the way, I think Hive provides the IdentityConverter, which means we can make the parameter an ObjectInspectorConverters.Converter, not necessarily wrapped in an Option.

Author: But the val soi also needs a convert deserializer when the schema doesn't match.

Contributor: OK, you're right, forget about my comment above. :)

Contributor: Change the convertdeserializer to outputStructObjectInspector?

Contributor: The variable name should be in camel style. convertdeserializer => convertDeserializer? Or change it to a better name?
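
For illustration only, here is one way the outputStructObjectInspector and camelCase suggestions above could shape the signature. This is a hypothetical sketch, not the code in this PR: `deserializedRows` is an invented name, `HiveShim.getConvertedOI` is the shim added in this patch, and the Spark-specific plumbing (nonPartitionKeyAttrs, MutableRow) is omitted.

import org.apache.hadoop.hive.serde2.Deserializer
import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorConverters, StructObjectInspector}
import org.apache.hadoop.io.Writable

// Hypothetical variant: the caller initializes the table SerDe and passes only its inspector.
def deserializedRows(
    iterator: Iterator[Writable],
    deserializer: Deserializer,
    outputStructObjectInspector: Option[StructObjectInspector] = None): Iterator[AnyRef] = {

  val partOI = deserializer.getObjectInspector.asInstanceOf[StructObjectInspector]

  // Only build a converted inspector when a table-side inspector exists and differs.
  val soi = outputStructObjectInspector
    .filterNot(oi => partOI.equals(oi))
    .map(tableOI => HiveShim.getConvertedOI(partOI, tableOI).asInstanceOf[StructObjectInspector])
    .getOrElse(partOI)

  // When no conversion is needed, fall back to Hive's IdentityConverter, as mentioned above.
  val converter =
    if (soi eq partOI) new ObjectInspectorConverters.IdentityConverter
    else ObjectInspectorConverters.getConverter(partOI, soi)

  iterator.map(value => converter.convert(deserializer.deserialize(value)))
}

The author's objection still applies to this shape: someone has to initialize the table SerDe to obtain that inspector, which is why the patch passes the table deserializer itself into fillObject.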