From 188a9476e3504c151ebe27b362a080469c262674 Mon Sep 17 00:00:00 2001 From: yucai Date: Fri, 26 Oct 2018 16:00:24 +0800 Subject: [PATCH 1/9] [SPARK-25850][SQL] Make the split threshold for the code generated method configurable --- .../expressions/codegen/CodeGenerator.scala | 3 ++- .../org/apache/spark/sql/internal/SQLConf.scala | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index d5857e060a2c4..b868a0f4fa284 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -910,12 +910,13 @@ class CodegenContext { val blocks = new ArrayBuffer[String]() val blockBuilder = new StringBuilder() var length = 0 + val splitThreshold = SQLConf.get.methodSplitThreshold for (code <- expressions) { // We can't know how many bytecode will be generated, so use the length of source code // as metric. A method should not go beyond 8K, otherwise it will not be JITted, should // also not be too small, or it will have many function calls (for wide table), see the // results in BenchmarkWideTable. 
- if (length > 1024) { + if (length > splitThreshold) { blocks += blockBuilder.toString() blockBuilder.clear() length = 0 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e8529550b8fca..9ac8a12004515 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -812,6 +812,17 @@ object SQLConf { .intConf .createWithDefault(65535) + val CODEGEN_METHOD_SPLIT_THRESHOLD = buildConf("spark.sql.codegen.methodSplitThreshold") + .internal() + .doc("The maximum source code length of a single Java function by codegen. When the " + + "generated Java function source code exceeds this threshold, it will be split into " + + "multiple small functions, each function length is spark.sql.codegen.methodSplitThreshold." + + " A function's bytecode should not go beyond 8KB, otherwise it will not be JITted, should " + + "also not be too small, or we will have many function calls. 
We can't know how many " + + "bytecode will be generated, so use the length of source code as metric.") + .intConf + .createWithDefault(1024) + val WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR = buildConf("spark.sql.codegen.splitConsumeFuncByOperator") .internal() @@ -1733,6 +1744,8 @@ class SQLConf extends Serializable with Logging { def hugeMethodLimit: Int = getConf(WHOLESTAGE_HUGE_METHOD_LIMIT) + def methodSplitThreshold: Int = getConf(CODEGEN_METHOD_SPLIT_THRESHOLD) + def wholeStageSplitConsumeFuncByOperator: Boolean = getConf(WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR) From ee986ccb4ec48337434f68a3e16953e6ebc2f67f Mon Sep 17 00:00:00 2001 From: yucai Date: Sat, 27 Oct 2018 22:27:19 +0800 Subject: [PATCH 2/9] improve desc --- .../org/apache/spark/sql/internal/SQLConf.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 9ac8a12004515..65d1f0043b49c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -814,12 +814,12 @@ object SQLConf { val CODEGEN_METHOD_SPLIT_THRESHOLD = buildConf("spark.sql.codegen.methodSplitThreshold") .internal() - .doc("The maximum source code length of a single Java function by codegen. When the " + - "generated Java function source code exceeds this threshold, it will be split into " + - "multiple small functions, each function length is spark.sql.codegen.methodSplitThreshold." + - " A function's bytecode should not go beyond 8KB, otherwise it will not be JITted, should " + - "also not be too small, or we will have many function calls. We can't know how many " + - "bytecode will be generated, so use the length of source code as metric.") + .doc("Splits the generated code of expressions into multiple functions by this threshold."
+ + "Each function's code length (without comments) is larger than but near to this value, " + + "except that the last one may be smaller. We can't know how many bytecode will be " + + "generated, so use the code length as split metric. A function's bytecode should not go " + + "beyond 8KB, otherwise it will not be JITed, should also not be too small, or we will " + + "have many function calls. ") .intConf .createWithDefault(1024) From b578dd45cb4e6831a4bb54ba4c0d9c8f5c84fec5 Mon Sep 17 00:00:00 2001 From: yucai Date: Sat, 27 Oct 2018 22:31:54 +0800 Subject: [PATCH 3/9] improve desc --- .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 65d1f0043b49c..55c8adb666878 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -818,8 +818,8 @@ object SQLConf { "Each function's code length (without comments) is larger than but near to this value, " + "except that the last one may be smaller. We can't know how many bytecode will be " + "generated, so use the code length as split metric. A function's bytecode should not go " + - "beyond 8KB, otherwise it will not be JITed, should also not be too small, or we will " + - "have many function calls. ") + "beyond 8KB, otherwise it will not be JITted; it also should not be too small, otherwise " + + "there will be many function calls. 
") .intConf .createWithDefault(1024) From 0db224f0eebc52a8fc1dc47fa03ff78151b3b6d9 Mon Sep 17 00:00:00 2001 From: yucai Date: Sat, 27 Oct 2018 22:59:34 +0800 Subject: [PATCH 4/9] address comments --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 55c8adb666878..1feb388b033c7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -814,12 +814,12 @@ object SQLConf { val CODEGEN_METHOD_SPLIT_THRESHOLD = buildConf("spark.sql.codegen.methodSplitThreshold") .internal() - .doc("Splits the generated code of expressions into multiple functions by this threshold." + - "Each function's code length (without comments) is larger than but near to this value, " + - "except that the last one may be smaller. We can't know how many bytecode will be " + - "generated, so use the code length as split metric. A function's bytecode should not go " + + .doc("The threshold of source code length without comment of a single Java function by " + + "codegen to be split. When the generated Java function source code exceeds this threshold" + + ", it will be split into multiple small functions. We can't know how many bytecode will " + + "be generated, so use the code length as metric. A function's bytecode should not go " + "beyond 8KB, otherwise it will not be JITted; it also should not be too small, otherwise " + - "there will be many function calls. 
") + "there will be many function calls.") .intConf .createWithDefault(1024) From b0ce2cae731620a0ce7417b2b46c24cffb023059 Mon Sep 17 00:00:00 2001 From: yucai Date: Mon, 29 Oct 2018 13:23:57 +0800 Subject: [PATCH 5/9] address comments --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 1feb388b033c7..aa4ec18909129 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -816,10 +816,10 @@ object SQLConf { .internal() .doc("The threshold of source code length without comment of a single Java function by " + "codegen to be split. When the generated Java function source code exceeds this threshold" + - ", it will be split into multiple small functions. We can't know how many bytecode will " + - "be generated, so use the code length as metric. A function's bytecode should not go " + - "beyond 8KB, otherwise it will not be JITted; it also should not be too small, otherwise " + - "there will be many function calls.") + ", it will be split into multiple small functions. We cannot know how many bytecode will " + + "be generated, so use the code length as metric. 
When running on HotSpot, a function's " + + "bytecode should not go beyond 8KB, otherwise it will not be JITted; it also should not " + + "be too small, otherwise there will be many function calls.") .intConf .createWithDefault(1024) From ba392c89f6c96b3fa64d22a564ac1742b7d0ff54 Mon Sep 17 00:00:00 2001 From: yucai Date: Wed, 31 Oct 2018 09:55:20 +0800 Subject: [PATCH 6/9] address comments --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index aa4ec18909129..e0f622ca7fda9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -821,6 +821,7 @@ object SQLConf { "bytecode should not go beyond 8KB, otherwise it will not be JITted; it also should not " + "be too small, otherwise there will be many function calls.") .intConf + .checkValue(threshold => threshold > 0, "The threshold must be a positive integer.") .createWithDefault(1024) val WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR = From 2fc6417e8e4b1e95be2d5f614811d4898cd696f3 Mon Sep 17 00:00:00 2001 From: yucai Date: Wed, 31 Oct 2018 14:15:03 +0800 Subject: [PATCH 7/9] address comments --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e0f622ca7fda9..b848b27c68e0c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -814,10 +814,10 @@ object SQLConf { val CODEGEN_METHOD_SPLIT_THRESHOLD = buildConf("spark.sql.codegen.methodSplitThreshold") .internal() - 
.doc("The threshold of source code length without comment of a single Java function by " + - "codegen to be split. When the generated Java function source code exceeds this threshold" + - ", it will be split into multiple small functions. We cannot know how many bytecode will " + - "be generated, so use the code length as metric. When running on HotSpot, a function's " + + .doc("The threshold of source-code splitting in the codegen. When the number of characters " + + "in a single JAVA function (without comment) exceeds the threshold, the function will be " + + "automatically split to multiple smaller ones. We cannot know how many bytecode will be " + + "generated, so use the code length as metric. When running on HotSpot, a function's " + "bytecode should not go beyond 8KB, otherwise it will not be JITted; it also should not " + "be too small, otherwise there will be many function calls.") .intConf From 8eeb5385943bab33fd9fa9fe132315b137c831ed Mon Sep 17 00:00:00 2001 From: yucai Date: Wed, 31 Oct 2018 15:08:31 +0800 Subject: [PATCH 8/9] minor --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index b848b27c68e0c..8d76c4dc4f69b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -815,7 +815,7 @@ object SQLConf { val CODEGEN_METHOD_SPLIT_THRESHOLD = buildConf("spark.sql.codegen.methodSplitThreshold") .internal() .doc("The threshold of source-code splitting in the codegen. 
When the number of characters " + - "in a single JAVA function (without comment) exceeds the threshold, the function will be " + + "in a single Java function (without comment) exceeds the threshold, the function will be " + "automatically split to multiple smaller ones. We cannot know how many bytecode will be " + "generated, so use the code length as metric. When running on HotSpot, a function's " + "bytecode should not go beyond 8KB, otherwise it will not be JITted; it also should not " + "be too small, otherwise there will be many function calls.") From 65a5355a352ad228786b930e41628e0f255e9b59 Mon Sep 17 00:00:00 2001 From: yucai Date: Fri, 2 Nov 2018 13:41:56 +0800 Subject: [PATCH 9/9] Expression.reduceCodeSize --- .../apache/spark/sql/catalyst/expressions/Expression.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index c215735ab1c98..ce0b8ed6d18d3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.trees.TreeNode +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -120,7 +121,8 @@ abstract class Expression extends TreeNode[Expression] { private def reduceCodeSize(ctx: CodegenContext, eval: ExprCode): Unit = { // TODO: support whole stage codegen too - if (eval.code.length > 1024 && ctx.INPUT_ROW != null && ctx.currentVars == null) { + val splitThreshold = SQLConf.get.methodSplitThreshold + if (eval.code.length > splitThreshold && ctx.INPUT_ROW != null && 
ctx.currentVars == null) { val setIsNull = if (!eval.isNull.isInstanceOf[LiteralValue]) { val globalIsNull = ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "globalIsNull") val localIsNull = eval.isNull