-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-8838][SQL] Add config to enable/disable merging part-files when merging parquet schema #7238
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
bbd4ce7
8bbebcb
4bdd7e0
3b6be5b
0e734e0
4caf293
47df981
a57be0e
ea8f6e5
4eb2f00
df43027
d4ed7e6
afc2fa1
dbc8e6b
8816f44
71d5b5f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -120,6 +120,9 @@ private[sql] class ParquetRelation2( | |
| .map(_.toBoolean) | ||
| .getOrElse(sqlContext.conf.getConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED)) | ||
|
|
||
| private val skipMergePartFiles = | ||
| sqlContext.conf.getConf(SQLConf.PARQUET_SCHEMA_SKIP_MERGE_PARTFILES) | ||
|
|
||
| private val maybeMetastoreSchema = parameters | ||
| .get(ParquetRelation2.METASTORE_SCHEMA) | ||
| .map(DataType.fromJson(_).asInstanceOf[StructType]) | ||
|
|
@@ -407,7 +410,17 @@ private[sql] class ParquetRelation2( | |
| val filesToTouch = | ||
| if (shouldMergeSchemas) { | ||
| // Also includes summary files, because there might be empty partition directories. | ||
| (metadataStatuses ++ commonMetadataStatuses ++ dataStatuses).toSeq | ||
|
|
||
| // If the skipMergePartFiles config is true, we assume that all part-files have the same | ||
| // schema as the summary files, so we ignore them when merging schemas. | ||
| // If the config is false, which is the default setting, we merge all part-files. | ||
| val needMerged: Seq[FileStatus] = | ||
|
||
| if (skipMergePartFiles) { | ||
| Seq() | ||
| } else { | ||
| dataStatuses | ||
| } | ||
| (metadataStatuses ++ commonMetadataStatuses ++ needMerged).toSeq | ||
|
||
| } else { | ||
| // Tries any "_common_metadata" first. Parquet files written by old versions of Parquet | ||
| // don't have this. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd propose renaming this configuration to `spark.sql.parquet.respectSummaryFiles`.