From 188a9476e3504c151ebe27b362a080469c262674 Mon Sep 17 00:00:00 2001 From: yucai Date: Fri, 26 Oct 2018 16:00:24 +0800 Subject: [PATCH 1/9] [SPARK-25850][SQL] Make the split threshold for the code generated method configurable --- .../expressions/codegen/CodeGenerator.scala | 3 ++- .../org/apache/spark/sql/internal/SQLConf.scala | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index d5857e060a2c4..b868a0f4fa284 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -910,12 +910,13 @@ class CodegenContext { val blocks = new ArrayBuffer[String]() val blockBuilder = new StringBuilder() var length = 0 + val splitThreshold = SQLConf.get.methodSplitThreshold for (code <- expressions) { // We can't know how many bytecode will be generated, so use the length of source code // as metric. A method should not go beyond 8K, otherwise it will not be JITted, should // also not be too small, or it will have many function calls (for wide table), see the // results in BenchmarkWideTable. 
- if (length > 1024) { + if (length > splitThreshold) { blocks += blockBuilder.toString() blockBuilder.clear() length = 0 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e8529550b8fca..9ac8a12004515 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -812,6 +812,17 @@ object SQLConf { .intConf .createWithDefault(65535) + val CODEGEN_METHOD_SPLIT_THRESHOLD = buildConf("spark.sql.codegen.methodSplitThreshold") + .internal() + .doc("The maximum source code length of a single Java function by codegen. When the " + + "generated Java function source code exceeds this threshold, it will be split into " + + "multiple small functions, each function length is spark.sql.codegen.methodSplitThreshold." + + " A function's bytecode should not go beyond 8KB, otherwise it will not be JITted, should " + + "also not be too small, or we will have many function calls. 
We can't know how many " + + "bytecode will be generated, so use the length of source code as metric.") + .intConf + .createWithDefault(1024) + val WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR = buildConf("spark.sql.codegen.splitConsumeFuncByOperator") .internal() @@ -1733,6 +1744,8 @@ class SQLConf extends Serializable with Logging { def hugeMethodLimit: Int = getConf(WHOLESTAGE_HUGE_METHOD_LIMIT) + def methodSplitThreshold: Int = getConf(CODEGEN_METHOD_SPLIT_THRESHOLD) + def wholeStageSplitConsumeFuncByOperator: Boolean = getConf(WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR) From ee986ccb4ec48337434f68a3e16953e6ebc2f67f Mon Sep 17 00:00:00 2001 From: yucai Date: Sat, 27 Oct 2018 22:27:19 +0800 Subject: [PATCH 2/9] improve desc --- .../org/apache/spark/sql/internal/SQLConf.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 9ac8a12004515..65d1f0043b49c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -814,12 +814,12 @@ object SQLConf { val CODEGEN_METHOD_SPLIT_THRESHOLD = buildConf("spark.sql.codegen.methodSplitThreshold") .internal() - .doc("The maximum source code length of a single Java function by codegen. When the " + - "generated Java function source code exceeds this threshold, it will be split into " + - "multiple small functions, each function length is spark.sql.codegen.methodSplitThreshold." + - " A function's bytecode should not go beyond 8KB, otherwise it will not be JITted, should " + - "also not be too small, or we will have many function calls. We can't know how many " + - "bytecode will be generated, so use the length of source code as metric.") + .doc("Splits the generated code of expressions into multiple functions by this threshold."
+ + "Each function's code length (without comments) is larger than but near to this value, " + + "except that the last one may be smaller. We can't know how many bytecode will be " + + "generated, so use the code length as split metric. A function's bytecode should not go " + + "beyond 8KB, otherwise it will not be JITed, should also not be too small, or we will " + + "have many function calls. ") .intConf .createWithDefault(1024) From b578dd45cb4e6831a4bb54ba4c0d9c8f5c84fec5 Mon Sep 17 00:00:00 2001 From: yucai Date: Sat, 27 Oct 2018 22:31:54 +0800 Subject: [PATCH 3/9] improve desc --- .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 65d1f0043b49c..55c8adb666878 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -818,8 +818,8 @@ object SQLConf { "Each function's code length (without comments) is larger than but near to this value, " + "except that the last one may be smaller. We can't know how many bytecode will be " + "generated, so use the code length as split metric. A function's bytecode should not go " + - "beyond 8KB, otherwise it will not be JITed, should also not be too small, or we will " + - "have many function calls. ") + "beyond 8KB, otherwise it will not be JITted; it also should not be too small, otherwise " + + "there will be many function calls. 
") .intConf .createWithDefault(1024) From 0db224f0eebc52a8fc1dc47fa03ff78151b3b6d9 Mon Sep 17 00:00:00 2001 From: yucai Date: Sat, 27 Oct 2018 22:59:34 +0800 Subject: [PATCH 4/9] address comments --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 55c8adb666878..1feb388b033c7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -814,12 +814,12 @@ object SQLConf { val CODEGEN_METHOD_SPLIT_THRESHOLD = buildConf("spark.sql.codegen.methodSplitThreshold") .internal() - .doc("Splits the generated code of expressions into multiple functions by this threshold." + - "Each function's code length (without comments) is larger than but near to this value, " + - "except that the last one may be smaller. We can't know how many bytecode will be " + - "generated, so use the code length as split metric. A function's bytecode should not go " + + .doc("The threshold of source code length without comment of a single Java function by " + + "codegen to be split. When the generated Java function source code exceeds this threshold" + + ", it will be split into multiple small functions. We can't know how many bytecode will " + + "be generated, so use the code length as metric. A function's bytecode should not go " + "beyond 8KB, otherwise it will not be JITted; it also should not be too small, otherwise " + - "there will be many function calls. 
") + "there will be many function calls.") .intConf .createWithDefault(1024) From b0ce2cae731620a0ce7417b2b46c24cffb023059 Mon Sep 17 00:00:00 2001 From: yucai Date: Mon, 29 Oct 2018 13:23:57 +0800 Subject: [PATCH 5/9] address comments --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 1feb388b033c7..aa4ec18909129 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -816,10 +816,10 @@ object SQLConf { .internal() .doc("The threshold of source code length without comment of a single Java function by " + "codegen to be split. When the generated Java function source code exceeds this threshold" + - ", it will be split into multiple small functions. We can't know how many bytecode will " + - "be generated, so use the code length as metric. A function's bytecode should not go " + - "beyond 8KB, otherwise it will not be JITted; it also should not be too small, otherwise " + - "there will be many function calls.") + ", it will be split into multiple small functions. We cannot know how many bytecode will " + + "be generated, so use the code length as metric. 
When running on HotSpot, a function's " + + "bytecode should not go beyond 8KB, otherwise it will not be JITted; it also should not " + + "be too small, otherwise there will be many function calls.") .intConf .createWithDefault(1024) From ba392c89f6c96b3fa64d22a564ac1742b7d0ff54 Mon Sep 17 00:00:00 2001 From: yucai Date: Wed, 31 Oct 2018 09:55:20 +0800 Subject: [PATCH 6/9] address comments --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index aa4ec18909129..e0f622ca7fda9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -821,6 +821,7 @@ object SQLConf { "bytecode should not go beyond 8KB, otherwise it will not be JITted; it also should not " + "be too small, otherwise there will be many function calls.") .intConf + .checkValue(threshold => threshold > 0, "The threshold must be a positive integer.") .createWithDefault(1024) val WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR = From 2fc6417e8e4b1e95be2d5f614811d4898cd696f3 Mon Sep 17 00:00:00 2001 From: yucai Date: Wed, 31 Oct 2018 14:15:03 +0800 Subject: [PATCH 7/9] address comments --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e0f622ca7fda9..b848b27c68e0c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -814,10 +814,10 @@ object SQLConf { val CODEGEN_METHOD_SPLIT_THRESHOLD = buildConf("spark.sql.codegen.methodSplitThreshold") .internal() - 
.doc("The threshold of source code length without comment of a single Java function by " + - "codegen to be split. When the generated Java function source code exceeds this threshold" + - ", it will be split into multiple small functions. We cannot know how many bytecode will " + - "be generated, so use the code length as metric. When running on HotSpot, a function's " + + .doc("The threshold of source-code splitting in the codegen. When the number of characters " + + "in a single JAVA function (without comment) exceeds the threshold, the function will be " + + "automatically split to multiple smaller ones. We cannot know how many bytecode will be " + + "generated, so use the code length as metric. When running on HotSpot, a function's " + "bytecode should not go beyond 8KB, otherwise it will not be JITted; it also should not " + "be too small, otherwise there will be many function calls.") .intConf From 8eeb5385943bab33fd9fa9fe132315b137c831ed Mon Sep 17 00:00:00 2001 From: yucai Date: Wed, 31 Oct 2018 15:08:31 +0800 Subject: [PATCH 8/9] minor --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index b848b27c68e0c..8d76c4dc4f69b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -815,7 +815,7 @@ object SQLConf { val CODEGEN_METHOD_SPLIT_THRESHOLD = buildConf("spark.sql.codegen.methodSplitThreshold") .internal() .doc("The threshold of source-code splitting in the codegen. 
When the number of characters " + - "in a single JAVA function (without comment) exceeds the threshold, the function will be " + + "in a single Java function (without comment) exceeds the threshold, the function will be " + "automatically split to multiple smaller ones. We cannot know how many bytecode will be " + "generated, so use the code length as metric. When running on HotSpot, a function's " + "bytecode should not go beyond 8KB, otherwise it will not be JITted; it also should not " + "be too small, otherwise there will be many function calls.") From 65a5355a352ad228786b930e41628e0f255e9b59 Mon Sep 17 00:00:00 2001 From: yucai Date: Fri, 2 Nov 2018 13:41:56 +0800 Subject: [PATCH 9/9] Expression.reduceCodeSize --- .../apache/spark/sql/catalyst/expressions/Expression.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index c215735ab1c98..ce0b8ed6d18d3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.trees.TreeNode +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -120,7 +121,8 @@ abstract class Expression extends TreeNode[Expression] { private def reduceCodeSize(ctx: CodegenContext, eval: ExprCode): Unit = { // TODO: support whole stage codegen too - if (eval.code.length > 1024 && ctx.INPUT_ROW != null && ctx.currentVars == null) { + val splitThreshold = SQLConf.get.methodSplitThreshold + if (eval.code.length > splitThreshold && ctx.INPUT_ROW != null && 
ctx.currentVars == null) { val setIsNull = if (!eval.isNull.isInstanceOf[LiteralValue]) { val globalIsNull = ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "globalIsNull") val localIsNull = eval.isNull