-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables #29045
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
49a2350
195bc23
9b51d67
b9282d0
ff938a5
45be048
c2ca484
f8f7aff
c2d7a21
4469e6d
743ffe3
9de3516
f8ece1f
5b8715e
d0f6b9b
c79794f
cf68729
6150b08
c0f6209
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,7 +24,7 @@ import scala.collection.JavaConverters._ | |
|
|
||
| import org.apache.hadoop.conf.Configuration | ||
| import org.apache.hadoop.fs.{FileStatus, Path} | ||
| import org.apache.orc.{OrcFile, Reader, TypeDescription, Writer} | ||
| import org.apache.orc.{OrcConf, OrcFile, Reader, TypeDescription, Writer} | ||
|
|
||
| import org.apache.spark.{SPARK_VERSION_SHORT, SparkException} | ||
| import org.apache.spark.deploy.SparkHadoopUtil | ||
|
|
@@ -116,15 +116,17 @@ object OrcUtils extends Logging { | |
| } | ||
|
|
||
| /** | ||
| * Returns the requested column ids from the given ORC file. Column id can be -1, which means the | ||
| * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. | ||
| * @return Returns the combination of requested column ids from the given ORC file and | ||
| * boolean flag to find if the pruneCols is allowed or not. Requested Column id can be | ||
| * -1, which means the requested column doesn't exist in the ORC file. Returns None | ||
| * if the given ORC file is empty. | ||
| */ | ||
| def requestedColumnIds( | ||
| isCaseSensitive: Boolean, | ||
| dataSchema: StructType, | ||
| requiredSchema: StructType, | ||
| reader: Reader, | ||
| conf: Configuration): Option[Array[Int]] = { | ||
| conf: Configuration): Option[(Array[Int], Boolean)] = { | ||
| val orcFieldNames = reader.getSchema.getFieldNames.asScala | ||
cloud-fan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if (orcFieldNames.isEmpty) { | ||
| // SPARK-8501: Some old empty ORC files always have an empty schema stored in their footer. | ||
|
|
@@ -136,14 +138,18 @@ object OrcUtils extends Logging { | |
| assert(orcFieldNames.length <= dataSchema.length, "The given data schema " + | ||
| s"${dataSchema.catalogString} has less fields than the actual ORC physical schema, " + | ||
| "no idea which columns were dropped, fail to read.") | ||
| // for ORC file written by Hive, no field names | ||
| // in the physical schema, there is a need to send the | ||
| // entire dataSchema instead of required schema. | ||
| // So pruneCols is not done in this case | ||
| Some(requiredSchema.fieldNames.map { name => | ||
| val index = dataSchema.fieldIndex(name) | ||
| if (index < orcFieldNames.length) { | ||
| index | ||
| } else { | ||
| -1 | ||
cloud-fan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| }) | ||
| }, false) | ||
| } else { | ||
| if (isCaseSensitive) { | ||
| Some(requiredSchema.fieldNames.zipWithIndex.map { case (name, idx) => | ||
|
|
@@ -152,7 +158,7 @@ object OrcUtils extends Logging { | |
| } else { | ||
| -1 | ||
| } | ||
| }) | ||
| }, true) | ||
| } else { | ||
| // Do case-insensitive resolution only if in case-insensitive mode | ||
| val caseInsensitiveOrcFieldMap = orcFieldNames.groupBy(_.toLowerCase(Locale.ROOT)) | ||
|
|
@@ -170,7 +176,7 @@ object OrcUtils extends Logging { | |
| idx | ||
| } | ||
| }.getOrElse(-1) | ||
| }) | ||
| }, true) | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -199,4 +205,25 @@ object OrcUtils extends Logging { | |
| s"map<${orcTypeDescriptionString(m.keyType)},${orcTypeDescriptionString(m.valueType)}>" | ||
| case _ => dt.catalogString | ||
| } | ||
|
|
||
| /** | ||
| * @return Returns the ORC result schema string based on the canPruneCols flag. | ||
| * If canPruneCols is true, the schema string is built from resultSchema; | ||
| * otherwise it is built from the full dataSchema plus the partitionSchema. | ||
| * As a side effect, the schema string is also set in the Hadoop conf. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This description is not clear enough. This utility function also changed the value of `conf`. @SaurabhChawla100 Could you submit a follow-up PR to improve the description?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @gatorsmile - This is the new helper method that we have added as part of this PR. Sure, I will update the description in the follow-up PR. Shall I raise the PR against a new JIRA or with this same JIRA, since this JIRA is already resolved?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
It's okay to refer to this JIRA ticket. Then, please add |
||
| */ | ||
/**
 * Computes the ORC type-description string for the result of a read and records it
 * in the given Hadoop configuration under `OrcConf.MAPRED_INPUT_SCHEMA`.
 *
 * @param canPruneCols whether column pruning is permitted for this read; when false
 *                     (e.g. Hive-written files with no field names in the physical
 *                     schema) the full data schema plus partition columns is used
 *                     instead of the pruned result schema.
 * @param dataSchema the full schema of the data files.
 * @param resultSchema the (possibly pruned) schema requested by the query.
 * @param partitionSchema the partition columns appended when pruning is disallowed.
 * @param conf Hadoop configuration that is MUTATED: the computed schema string is
 *             stored in it as a side effect so the ORC reader picks it up.
 * @return the ORC type-description string that was set in `conf`.
 */
def orcResultSchemaString(
    canPruneCols: Boolean,
    dataSchema: StructType,
    resultSchema: StructType,
    partitionSchema: StructType,
    conf: Configuration): String = {
  // Choose which schema drives the ORC reader: the pruned result schema when
  // pruning is allowed, otherwise the complete data schema with partition columns.
  val effectiveSchema =
    if (canPruneCols) {
      resultSchema
    } else {
      StructType(dataSchema.fields ++ partitionSchema.fields)
    }
  val schemaString = OrcUtils.orcTypeDescriptionString(effectiveSchema)
  // Side effect: publish the schema to the Hadoop conf for the ORC input format.
  OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, schemaString)
  schemaString
}
| } | ||
Uh oh!
There was an error while loading. Please reload this page.