apache · mgaido91 · Oct 6, 2017 · Oct 9, 2017 · Oct 12, 2017 · Oct 12, 2017
diff --git a/...lyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/...lyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream
 import java.util.{Map => JavaMap}
 
 import scala.collection.JavaConverters._
+import scala.collection.immutable.ListMap
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 import scala.language.existentials
@@ -77,6 +78,20 @@ case class SubExprEliminationState(isNull: String, value: String)
  */
 case class SubExprCodes(codes: Seq[String], states: Map[Expression, SubExprEliminationState])
 
+/**
+ * The main information about a newly added function.
+ *
+ * @param functionName String representing the name of the function
+ * @param subclassName Optional value which is empty if the function is added to
+ *                     the outer class, otherwise it contains the name of the
+ *                     inner class in which the function has been added.
+ * @param subclassInstance Optional value which is empty if the function is added to
+ *                         the outer class, otherwise it contains the name of the
+ *                         instance of the inner class in the outer class.
+ */
+private[codegen] case class NewFunction(functionName: String, subclassName: Option[String],
+    subclassInstance: Option[String])
+
 /**
  * A context for codegen, tracking a list of objects that could be passed into generated Java
  * function.
@@ -277,13 +292,25 @@ class CodegenContext {
       funcName: String,
       funcCode: String,
       inlineToOuterClass: Boolean = false): String = {
+    val newFunction = addNewFunctionInternal(funcName, funcCode, inlineToOuterClass)
+    newFunction match {
+      case NewFunction(functionName, None, None) => functionName
+      case NewFunction(functionName, Some(_), Some(subclassInstance)) =>
+        subclassInstance + "." + functionName
+    }
+  }
+
+  private[this] def addNewFunctionInternal(
+      funcName: String,
+      funcCode: String,
+      inlineToOuterClass: Boolean): NewFunction = {
     // The number of named constants that can exist in the class is limited by the Constant Pool
     // limit, 65,536. We cannot know how many constants will be inserted for a class, so we use a
-    // threshold of 1600k bytes to determine when a function should be inlined to a private, nested
+    // threshold of 1000k bytes to determine when a function should be inlined to a private, nested
     // sub-class.
     val (className, classInstance) = if (inlineToOuterClass) {
       outerClassName -> ""
-    } else if (currClassSize > 1600000) {
+    } else if (currClassSize > 1000000) {
       val className = freshName("NestedClass")
       val classInstance = freshName("nestedClassInstance")
 
@@ -294,17 +321,23 @@ class CodegenContext {
       currClass()
     }
 
-    classSize(className) += funcCode.length
-    classFunctions(className) += funcName -> funcCode
+    addNewFunctionToClass(funcName, funcCode, className)
 
     if (className == outerClassName) {
-      funcName
+      NewFunction(funcName, None, None)
     } else {
-
-      s"$classInstance.$funcName"
+      NewFunction(funcName, Some(className), Some(classInstance))
     }
   }
 
+  private[this] def addNewFunctionToClass(
+      funcName: String,
+      funcCode: String,
+      className: String) = {
+    classSize(className) += funcCode.length
+    classFunctions(className) += funcName -> funcCode
+  }
+
   /**
    * Declares all function code. If the added functions are too many, split them into nested
    * sub-classes to avoid hitting Java compiler constant pool limitation.
@@ -798,10 +831,46 @@ class CodegenContext {
            |  ${makeSplitFunction(body)}
            |}
          """.stripMargin
-        addNewFunction(name, code)
+        addNewFunctionInternal(name, code, inlineToOuterClass = false)
       }
 
-      foldFunctions(functions.map(name => s"$name(${arguments.map(_._2).mkString(", ")})"))
+      // Here we store all the methods which have been added to the outer class.
+      val outerClassFunctions = functions
+        .filter(_.subclassName.isEmpty)
+        .map(_.functionName)
+
+      // Here we handle all the methods which have been added to the nested subclasses and
+      // not to the outer class.
+      // Since they can be many, their direct invocation in the outer class adds many entries
+      // to the outer class' constant pool, which can push the constant pool past the JVM limit.
+      // To avoid this problem, we group them and we call only the grouping methods in the
+      // outer class.
+      val innerClassFunctions = functions
+        .filter(_.subclassName.isDefined)
+        .foldLeft(ListMap.empty[(String, String), Seq[String]]) { case (acc, f) =>
+          val key = (f.subclassName.get, f.subclassInstance.get)
+          acc.updated(key, acc.getOrElse(key, Seq.empty[String]) ++ Seq(f.functionName))
+        }
+        .flatMap { case ((subclassName, subclassInstance), subclassFunctions) =>
+          if (subclassFunctions.size > CodeGenerator.MERGE_SPLIT_METHODS_THRESHOLD) {
+            // Adding a new function to each subclass which contains
+            // the invocation of all the ones which have been added to
+            // that subclass
+            val code = s"""
+                |private $returnType $func($argString) {
+                |  ${makeSplitFunction(foldFunctions(subclassFunctions.map(name =>
+                      s"$name(${arguments.map(_._2).mkString(", ")})")))}
+                |}
+              """.stripMargin
+            addNewFunctionToClass(func, code, subclassName)
+            Seq(s"$subclassInstance.$func")
+          } else {
+            subclassFunctions.map(f => s"$subclassInstance.$f")
+          }
+        }
+
+      foldFunctions((outerClassFunctions ++ innerClassFunctions).map(
+        name => s"$name(${arguments.map(_._2).mkString(", ")})"))
     }
   }
 
@@ -1010,6 +1079,10 @@ object CodeGenerator extends Logging {
   // This is the value of HugeMethodLimit in the OpenJDK JVM settings
   val DEFAULT_JVM_HUGE_METHOD_LIMIT = 8000
 
+  // This is the threshold over which the methods in an inner class are grouped into a single
+  // method which is going to be called by the outer class instead of the many small ones
+  val MERGE_SPLIT_METHODS_THRESHOLD = 3
+
   /**
    * Compile the Java source code into a Java class, using Janino.
    *

diff --git a/...talyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/...talyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
@@ -201,6 +201,23 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper {
     }
   }
 
+  test("SPARK-22226: group splitted expressions into one method per nested class") {
+    val length = 10000
+    val expressions = Seq.fill(length) {
+      ToUTCTimestamp(
+        Literal.create(Timestamp.valueOf("2017-10-10 00:00:00"), TimestampType),
+        Literal.create("PST", StringType))
+    }
+    val plan = GenerateMutableProjection.generate(expressions)
+    val actual = plan(new GenericInternalRow(length)).toSeq(expressions.map(_.dataType))
+    val expected = Seq.fill(length)(
+      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2017-10-10 07:00:00")))
+
+    if (actual != expected) {
+      fail(s"Incorrect Evaluation: expressions: $expressions, actual: $actual, expected: $expected")
+    }
+  }
+
   test("test generated safe and unsafe projection") {
     val schema = new StructType(Array(
       StructField("a", StringType, true),

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -2103,4 +2103,16 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
       testData2.select(lit(7), 'a, 'b).orderBy(lit(1), lit(2), lit(3)),
       Seq(Row(7, 1, 1), Row(7, 1, 2), Row(7, 2, 1), Row(7, 2, 2), Row(7, 3, 1), Row(7, 3, 2)))
   }
+
+  test("SPARK-22226: splitExpressions should not generate codes beyond 64KB") {
+    val colNumber = 10000
+    val input = spark.range(2).rdd.map(_ => Row(1 to colNumber: _*))
+    val df = sqlContext.createDataFrame(input, StructType(
+      (1 to colNumber).map(colIndex => StructField(s"_$colIndex", IntegerType, false))))
+    val newCols = (1 to colNumber).flatMap { colIndex =>
+      Seq(expr(s"if(1000 < _$colIndex, 1000, _$colIndex)"),
+        expr(s"sqrt(_$colIndex)"))
+    }
+    df.select(newCols: _*).collect()
+  }
 }