From dd28b09d5a546a200717a46854a6085ed545fd8e Mon Sep 17 00:00:00 2001
From: turbofei
Date: Tue, 22 Oct 2019 12:09:42 +0800
Subject: [PATCH 1/6] [SPARK-29542] Make the description of spark.sql.files.maxPartitionBytes be clearly

---
 .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 4944099fcc0d..19f482bea68c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -980,7 +980,8 @@ object SQLConf {
     .createWithDefault(true)
 
   val FILES_MAX_PARTITION_BYTES = buildConf("spark.sql.files.maxPartitionBytes")
-    .doc("The maximum number of bytes to pack into a single partition when reading files.")
+    .doc("The maximum number of bytes to pack into a single partition when reading files" +
+      " for data source table.")
     .bytesConf(ByteUnit.BYTE)
     .createWithDefault(128 * 1024 * 1024) // parquet.block.size
 

From 7ecb7b602de82e70ccbd1cf11f86a635092d196d Mon Sep 17 00:00:00 2001
From: turbofei
Date: Tue, 22 Oct 2019 19:59:08 +0800
Subject: [PATCH 2/6] fix style

---
 .../apache/spark/sql/internal/SQLConf.scala | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 19f482bea68c..0924bc347a0f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -980,34 +980,36 @@ object SQLConf {
     .createWithDefault(true)
 
   val FILES_MAX_PARTITION_BYTES = buildConf("spark.sql.files.maxPartitionBytes")
-    .doc("The maximum number of bytes to pack into a single partition when reading files" +
-      " for data source table.")
+    .doc("The maximum number of bytes to pack into a single partition when Spark file-based" +
+      " sources are used to read files.")
     .bytesConf(ByteUnit.BYTE)
     .createWithDefault(128 * 1024 * 1024) // parquet.block.size
 
   val FILES_OPEN_COST_IN_BYTES = buildConf("spark.sql.files.openCostInBytes")
     .internal()
-    .doc("The estimated cost to open a file, measured by the number of bytes could be scanned in" +
-      " the same time. This is used when putting multiple files into a partition. It's better to" +
-      " over estimated, then the partitions with small files will be faster than partitions with" +
-      " bigger files (which is scheduled first).")
+    .doc("The estimated cost to open a file, measured by the number of bytes could be scanned in " +
+      "the same time. This is used when putting multiple file-source files into a partition. " +
+      "It's better to over estimated, then the partitions with small files will be faster than " +
+      "partitions with bigger files (which is scheduled first).")
     .longConf
     .createWithDefault(4 * 1024 * 1024)
 
   val IGNORE_CORRUPT_FILES = buildConf("spark.sql.files.ignoreCorruptFiles")
-    .doc("Whether to ignore corrupt files. If true, the Spark jobs will continue to run when " +
-      "encountering corrupted files and the contents that have been read will still be returned.")
+    .doc("Whether to ignore corrupt file-source files. If true, the Spark jobs will continue to " +
+      "run when encountering corrupted files and the contents that have been read will still be " +
+      "returned.")
     .booleanConf
     .createWithDefault(false)
 
   val IGNORE_MISSING_FILES = buildConf("spark.sql.files.ignoreMissingFiles")
-    .doc("Whether to ignore missing files. If true, the Spark jobs will continue to run when " +
-      "encountering missing files and the contents that have been read will still be returned.")
+    .doc("Whether to ignore missing file-source files. If true, the Spark jobs will continue to " +
+      "run when encountering missing files and the contents that have been read will still be " +
+      "returned.")
     .booleanConf
     .createWithDefault(false)
 
   val MAX_RECORDS_PER_FILE = buildConf("spark.sql.files.maxRecordsPerFile")
-    .doc("Maximum number of records to write out to a single file. " +
+    .doc("Maximum number of records to write out to a single file-source file. " +
       "If this value is zero or negative, there is no limit.")
     .longConf
     .createWithDefault(0)

From 4a56b2e3b397b029598814a9f2c0f43eb3aaa8b2 Mon Sep 17 00:00:00 2001
From: turbofei
Date: Wed, 23 Oct 2019 00:11:46 +0800
Subject: [PATCH 3/6] fix blank

---
 .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 0924bc347a0f..3cc590a4c017 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -980,8 +980,8 @@ object SQLConf {
     .createWithDefault(true)
 
   val FILES_MAX_PARTITION_BYTES = buildConf("spark.sql.files.maxPartitionBytes")
-    .doc("The maximum number of bytes to pack into a single partition when Spark file-based" +
-      " sources are used to read files.")
+    .doc("The maximum number of bytes to pack into a single partition when Spark file-based " +
+      "sources are used to read files.")
     .bytesConf(ByteUnit.BYTE)
     .createWithDefault(128 * 1024 * 1024) // parquet.block.size
 

From 849e99565e045878f46070d4930a774190282814 Mon Sep 17 00:00:00 2001
From: turbofei
Date: Wed, 23 Oct 2019 10:47:11 +0800
Subject: [PATCH 4/6] revert

---
 .../apache/spark/sql/internal/SQLConf.scala | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 3cc590a4c017..4944099fcc0d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -980,36 +980,33 @@ object SQLConf {
     .createWithDefault(true)
 
   val FILES_MAX_PARTITION_BYTES = buildConf("spark.sql.files.maxPartitionBytes")
-    .doc("The maximum number of bytes to pack into a single partition when Spark file-based " +
-      "sources are used to read files.")
+    .doc("The maximum number of bytes to pack into a single partition when reading files.")
    .bytesConf(ByteUnit.BYTE)
     .createWithDefault(128 * 1024 * 1024) // parquet.block.size
 
   val FILES_OPEN_COST_IN_BYTES = buildConf("spark.sql.files.openCostInBytes")
     .internal()
-    .doc("The estimated cost to open a file, measured by the number of bytes could be scanned in " +
-      "the same time. This is used when putting multiple file-source files into a partition. " +
-      "It's better to over estimated, then the partitions with small files will be faster than " +
-      "partitions with bigger files (which is scheduled first).")
+    .doc("The estimated cost to open a file, measured by the number of bytes could be scanned in" +
+      " the same time. This is used when putting multiple files into a partition. It's better to" +
+      " over estimated, then the partitions with small files will be faster than partitions with" +
+      " bigger files (which is scheduled first).")
     .longConf
     .createWithDefault(4 * 1024 * 1024)
 
   val IGNORE_CORRUPT_FILES = buildConf("spark.sql.files.ignoreCorruptFiles")
-    .doc("Whether to ignore corrupt file-source files. If true, the Spark jobs will continue to " +
-      "run when encountering corrupted files and the contents that have been read will still be " +
-      "returned.")
+    .doc("Whether to ignore corrupt files. If true, the Spark jobs will continue to run when " +
+      "encountering corrupted files and the contents that have been read will still be returned.")
     .booleanConf
     .createWithDefault(false)
 
   val IGNORE_MISSING_FILES = buildConf("spark.sql.files.ignoreMissingFiles")
-    .doc("Whether to ignore missing file-source files. If true, the Spark jobs will continue to " +
-      "run when encountering missing files and the contents that have been read will still be " +
-      "returned.")
+    .doc("Whether to ignore missing files. If true, the Spark jobs will continue to run when " +
+      "encountering missing files and the contents that have been read will still be returned.")
     .booleanConf
     .createWithDefault(false)
 
   val MAX_RECORDS_PER_FILE = buildConf("spark.sql.files.maxRecordsPerFile")
-    .doc("Maximum number of records to write out to a single file-source file. " +
+    .doc("Maximum number of records to write out to a single file. " +
       "If this value is zero or negative, there is no limit.")
     .longConf
     .createWithDefault(0)

From 1de48df5e9d9d3ca1b0ea648f7c1d9d05d9f4ffd Mon Sep 17 00:00:00 2001
From: turbofei
Date: Wed, 23 Oct 2019 10:51:48 +0800
Subject: [PATCH 5/6] refactor

---
 .../apache/spark/sql/internal/SQLConf.scala | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 4944099fcc0d..306d6edbcd8d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -980,7 +980,9 @@ object SQLConf {
     .createWithDefault(true)
 
   val FILES_MAX_PARTITION_BYTES = buildConf("spark.sql.files.maxPartitionBytes")
-    .doc("The maximum number of bytes to pack into a single partition when reading files.")
+    .doc("The maximum number of bytes to pack into a single partition when reading files. " +
+      "This configuration is effective only when using file-based sources such as Parquet, JSON " +
+      "and ORC.")
     .bytesConf(ByteUnit.BYTE)
     .createWithDefault(128 * 1024 * 1024) // parquet.block.size
 
@@ -989,25 +991,31 @@ object SQLConf {
     .doc("The estimated cost to open a file, measured by the number of bytes could be scanned in" +
       " the same time. This is used when putting multiple files into a partition. It's better to" +
       " over estimated, then the partitions with small files will be faster than partitions with" +
-      " bigger files (which is scheduled first).")
+      " bigger files (which is scheduled first). This configuration is effective only when using" +
+      " file-based sources such as Parquet, JSON and ORC.")
     .longConf
     .createWithDefault(4 * 1024 * 1024)
 
   val IGNORE_CORRUPT_FILES = buildConf("spark.sql.files.ignoreCorruptFiles")
     .doc("Whether to ignore corrupt files. If true, the Spark jobs will continue to run when " +
-      "encountering corrupted files and the contents that have been read will still be returned.")
+      "encountering corrupted files and the contents that have been read will still be returned. " +
+      "This configuration is effective only when using file-based sources such as Parquet, JSON " +
+      "and ORC.")
     .booleanConf
     .createWithDefault(false)
 
   val IGNORE_MISSING_FILES = buildConf("spark.sql.files.ignoreMissingFiles")
     .doc("Whether to ignore missing files. If true, the Spark jobs will continue to run when " +
-      "encountering missing files and the contents that have been read will still be returned.")
+      "encountering missing files and the contents that have been read will still be returned. " +
+      "This configuration is effective only when using file-based sources such as Parquet, JSON " +
+      "and ORC.")
     .booleanConf
     .createWithDefault(false)
 
   val MAX_RECORDS_PER_FILE = buildConf("spark.sql.files.maxRecordsPerFile")
     .doc("Maximum number of records to write out to a single file. " +
-      "If this value is zero or negative, there is no limit.")
+      "If this value is zero or negative, there is no limit. This configuration is " +
+      "effective only when using file-based sources such as Parquet, JSON and ORC.")
     .longConf
     .createWithDefault(0)
 

From 7e115d834971961957ec73ce0aa6c817b57d3ff2 Mon Sep 17 00:00:00 2001
From: turbofei
Date: Wed, 23 Oct 2019 12:14:45 +0800
Subject: [PATCH 6/6] revert maxRecordsPerFile

---
 .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 306d6edbcd8d..a02cb832cc53 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1014,8 +1014,7 @@ object SQLConf {
 
   val MAX_RECORDS_PER_FILE = buildConf("spark.sql.files.maxRecordsPerFile")
     .doc("Maximum number of records to write out to a single file. " +
-      "If this value is zero or negative, there is no limit. This configuration is " +
-      "effective only when using file-based sources such as Parquet, JSON and ORC.")
+      "If this value is zero or negative, there is no limit.")
     .longConf
     .createWithDefault(0)
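
For reference, the sketch below shows how the file-source options touched by this series can be set from application code. It is an illustrative example, not part of the patches: the config keys come from the diffs above, while the application name, paths, and values are made-up placeholders. Note that spark.sql.files.maxPartitionBytes is declared with bytesConf, so it accepts a size string such as "64MB", whereas spark.sql.files.openCostInBytes and spark.sql.files.maxRecordsPerFile are plain longConf values.

import org.apache.spark.sql.SparkSession

object FileSourceConfSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical application: the key names match the confs documented above,
    // every value and path is a placeholder.
    val spark = SparkSession.builder()
      .appName("file-source-conf-sketch")
      .master("local[*]")
      // bytesConf: pack at most 64 MB of file data into one read partition.
      .config("spark.sql.files.maxPartitionBytes", "64MB")
      // longConf: estimated cost (in bytes) of opening a file, used when packing files.
      .config("spark.sql.files.openCostInBytes", (8L * 1024 * 1024).toString)
      // booleanConf: skip corrupt or missing files instead of failing the job.
      .config("spark.sql.files.ignoreCorruptFiles", "true")
      .config("spark.sql.files.ignoreMissingFiles", "true")
      // longConf: cap the number of records per written file; 0 means no limit.
      .config("spark.sql.files.maxRecordsPerFile", "1000000")
      .getOrCreate()

    // As the patched descriptions state, these settings only take effect for
    // Spark's file-based sources such as Parquet, JSON and ORC.
    val df = spark.read.parquet("/tmp/events")            // placeholder input path
    println(df.count())
    df.write.mode("overwrite").parquet("/tmp/events_out") // placeholder output path

    spark.stop()
  }
}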