[SPARK-22790][SQL] add a configurable factor to describe HadoopFsRelation's size #20072
SQLConf.scala:

```diff
@@ -249,7 +249,7 @@ object SQLConf {
   val CONSTRAINT_PROPAGATION_ENABLED = buildConf("spark.sql.constraintPropagation.enabled")
     .internal()
     .doc("When true, the query optimizer will infer and propagate data constraints in the query " +
-      "plan to optimize them. Constraint propagation can sometimes be computationally expensive" +
+      "plan to optimize them. Constraint propagation can sometimes be computationally expensive " +
       "for certain kinds of query plans (such as those with a large number of predicates and " +
       "aliases) which might negatively impact overall runtime.")
     .booleanConf
```
```diff
@@ -263,6 +263,17 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)

+  val DISK_TO_MEMORY_SIZE_FACTOR = buildConf(
+    "spark.sql.sources.compressionFactor")
+    .internal()
+    .doc("The result of multiplying this factor with the size of data source files is propagated " +
+      "to serve as the stats to choose the best execution plan. In the case where the " +
+      "in-disk and in-memory size of data is significantly different, users can adjust this " +
+      "factor for a better choice of the execution plan. The default value is 1.0.")
+    .doubleConf
+    .checkValue(_ > 0, "the value of fileDataSizeFactor must be larger than 0")
```
Review thread on the `.checkValue` line:

> **Contributor:** maybe
>
> **Contributor:** BTW
> **Contributor (author):** It's not necessarily the case that Parquet is always smaller than the in-memory size. For some simple datasets (like the one used in the test), Parquet's overhead makes the overall size larger than the in-memory size, but with the TPC-DS dataset I observed that the Parquet size is much smaller than the in-memory size.
The hunk continues:

```diff
+    .createWithDefault(1.0)
+
   val PARQUET_SCHEMA_MERGING_ENABLED = buildConf("spark.sql.parquet.mergeSchema")
     .doc("When true, the Parquet data source merges schemas collected from all data files, " +
       "otherwise the schema is picked from the summary file or a random data file " +
```
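Taken together, the new entry is a session-level conf. As a hedged sketch of how a user might tune it, where the session setup, the `/tmp/events` path, and the `id` join column are illustrative assumptions rather than part of this PR:

```scala
import org.apache.spark.sql.SparkSession

// Assumption for this sketch: the Parquet data expands roughly 4x when
// decompressed in memory, so we scale the file-based estimate up to match.
val spark = SparkSession.builder()
  .appName("compression-factor-demo")
  .master("local[*]")
  .config("spark.sql.sources.compressionFactor", "4.0")
  .getOrCreate()

// "/tmp/events" and the "id" column are placeholders. With the inflated
// size estimate, the optimizer is less likely to pick a broadcast join.
val events = spark.read.parquet("/tmp/events")
events.join(events, "id").explain()
```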
```diff
@@ -1241,6 +1252,8 @@ class SQLConf extends Serializable with Logging {

   def escapedStringLiterals: Boolean = getConf(ESCAPED_STRING_LITERALS)

+  def diskToMemorySizeFactor: Double = getConf(DISK_TO_MEMORY_SIZE_FACTOR)
+
   def stringRedationPattern: Option[Regex] = SQL_STRING_REDACTION_PATTERN.readFrom(reader)

   /**
```
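The accessor returns the typed value, falling back to the `createWithDefault(1.0)` above when the key is unset. A minimal standalone sketch of that get-with-default pattern (`MiniConf` is hypothetical, a simplification and not Spark's actual `SQLConf` machinery):

```scala
import scala.collection.mutable

// Hypothetical, simplified stand-in for SQLConf's typed get-with-default.
class MiniConf {
  private val settings = mutable.Map[String, String]()
  def set(key: String, value: String): Unit = settings(key) = value
  def getDouble(key: String, default: Double): Double =
    settings.get(key).map(_.toDouble).getOrElse(default)
}

val conf = new MiniConf
// Unset key: the default (1.0, mirroring createWithDefault above) applies.
assert(conf.getDouble("spark.sql.sources.compressionFactor", 1.0) == 1.0)
// Set key: the stored value wins.
conf.set("spark.sql.sources.compressionFactor", "0.5")
assert(conf.getDouble("spark.sql.sources.compressionFactor", 1.0) == 0.5)
```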
HadoopFsRelation.scala:

```diff
@@ -82,7 +82,11 @@ case class HadoopFsRelation(
     }
   }

-  override def sizeInBytes: Long = location.sizeInBytes
+  override def sizeInBytes: Long = {
+    val sizeFactor = sqlContext.conf.diskToMemorySizeFactor
+    (location.sizeInBytes * sizeFactor).toLong
+  }

   override def inputFiles: Array[String] = location.inputFiles
 }
```
Review comment on the new `sizeInBytes`:

> **Contributor:** Rename this too, `FILE_COMRESSION_FACTOR`
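For intuition, the arithmetic of the new `sizeInBytes` in isolation (a standalone sketch, not Spark source):

```scala
// On-disk file bytes scaled by the configured factor and truncated to Long,
// mirroring (location.sizeInBytes * sizeFactor).toLong above.
def estimatedSizeInBytes(onDiskBytes: Long, factor: Double): Long =
  (onDiskBytes * factor).toLong

// 100 MB of Parquet assumed to expand ~4x in memory:
assert(estimatedSizeInBytes(100L * 1024 * 1024, 4.0) == 400L * 1024 * 1024)
// The default factor of 1.0 preserves the previous behavior:
assert(estimatedSizeInBytes(100L * 1024 * 1024, 1.0) == 100L * 1024 * 1024)
```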