-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-29061][SQL] Prints bytecode statistics in debugCodegen #25766
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1211,6 +1211,16 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Java bytecode statistics of a compiled class by Janino. | ||
| */ | ||
| case class ByteCodeStats( | ||
| maxClassCodeSize: Int, maxMethodCodeSize: Int, maxConstPoolSize: Int, numInnerClasses: Int) | ||
|
|
||
| object ByteCodeStats { | ||
| val UNAVAILABLE = ByteCodeStats(-1, -1, -1, -1) | ||
| } | ||
|
|
||
| object CodeGenerator extends Logging { | ||
|
|
||
| // This is the default value of HugeMethodLimit in the OpenJDK HotSpot JVM, | ||
|
|
@@ -1220,6 +1230,9 @@ object CodeGenerator extends Logging { | |
| // The max valid length of method parameters in JVM. | ||
| final val MAX_JVM_METHOD_PARAMS_LENGTH = 255 | ||
|
|
||
| // The max number of constant pool entries in JVM. | ||
| final val MAX_JVM_CONSTANT_POOL_SIZE = 65535 | ||
|
|
||
| // This is the threshold over which the methods in an inner class are grouped in a single | ||
| // method which is going to be called by the outer class instead of the many small ones | ||
| final val MERGE_SPLIT_METHODS_THRESHOLD = 3 | ||
|
|
@@ -1244,7 +1257,7 @@ object CodeGenerator extends Logging { | |
| * | ||
| * @return a pair of a generated class and the max bytecode size of generated functions. | ||
| */ | ||
| def compile(code: CodeAndComment): (GeneratedClass, Int) = try { | ||
| def compile(code: CodeAndComment): (GeneratedClass, ByteCodeStats) = try { | ||
| cache.get(code) | ||
| } catch { | ||
| // Cache.get() may wrap the original exception. See the following URL | ||
|
|
@@ -1257,7 +1270,7 @@ object CodeGenerator extends Logging { | |
| /** | ||
| * Compile the Java source code into a Java class, using Janino. | ||
| */ | ||
| private[this] def doCompile(code: CodeAndComment): (GeneratedClass, Int) = { | ||
| private[this] def doCompile(code: CodeAndComment): (GeneratedClass, ByteCodeStats) = { | ||
| val evaluator = new ClassBodyEvaluator() | ||
|
|
||
| // A special classloader used to wrap the actual parent classloader of | ||
|
|
@@ -1318,10 +1331,11 @@ object CodeGenerator extends Logging { | |
| } | ||
|
|
||
| /** | ||
| * Returns the max bytecode size of the generated functions by inspecting janino private fields. | ||
| * Also, this method updates the metrics information. | ||
| * Returns the bytecode statistics (max class bytecode size, max method bytecode size, | ||
| * max constant pool size, and # of inner classes) of generated classes | ||
| * by inspecting Janino classes. Also, this method updates the metrics information. | ||
| */ | ||
| private def updateAndGetCompilationStats(evaluator: ClassBodyEvaluator): Int = { | ||
| private def updateAndGetCompilationStats(evaluator: ClassBodyEvaluator): ByteCodeStats = { | ||
| // First retrieve the generated classes. | ||
| val classes = { | ||
| val resultField = classOf[SimpleCompiler].getDeclaredField("result") | ||
|
|
@@ -1336,11 +1350,13 @@ object CodeGenerator extends Logging { | |
| val codeAttr = Utils.classForName("org.codehaus.janino.util.ClassFile$CodeAttribute") | ||
| val codeAttrField = codeAttr.getDeclaredField("code") | ||
| codeAttrField.setAccessible(true) | ||
| val codeSizes = classes.flatMap { case (_, classBytes) => | ||
| CodegenMetrics.METRIC_GENERATED_CLASS_BYTECODE_SIZE.update(classBytes.length) | ||
| val codeStats = classes.map { case (_, classBytes) => | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would like to make the code more readable, by |
||
| val classCodeSize = classBytes.length | ||
| CodegenMetrics.METRIC_GENERATED_CLASS_BYTECODE_SIZE.update(classCodeSize) | ||
| try { | ||
| val cf = new ClassFile(new ByteArrayInputStream(classBytes)) | ||
| val stats = cf.methodInfos.asScala.flatMap { method => | ||
| val constPoolSize = cf.getConstantPoolSize | ||
| val methodCodeSizes = cf.methodInfos.asScala.flatMap { method => | ||
| method.getAttributes().filter(_.getClass eq codeAttr).map { a => | ||
| val byteCodeSize = codeAttrField.get(a).asInstanceOf[Array[Byte]].length | ||
| CodegenMetrics.METRIC_GENERATED_METHOD_BYTECODE_SIZE.update(byteCodeSize) | ||
|
|
@@ -1353,19 +1369,21 @@ object CodeGenerator extends Logging { | |
| byteCodeSize | ||
| } | ||
| } | ||
| Some(stats) | ||
| (classCodeSize, methodCodeSizes.max, constPoolSize) | ||
|
||
| } catch { | ||
| case NonFatal(e) => | ||
| logWarning("Error calculating stats of compiled class.", e) | ||
| None | ||
| (classCodeSize, -1, -1) | ||
| } | ||
| }.flatten | ||
|
|
||
| if (codeSizes.nonEmpty) { | ||
| codeSizes.max | ||
| } else { | ||
| 0 | ||
| } | ||
|
|
||
| val (classSizes, maxMethodSizes, constPoolSize) = codeStats.unzip3 | ||
| ByteCodeStats( | ||
| maxClassCodeSize = classSizes.max, | ||
| maxMethodCodeSize = maxMethodSizes.max, | ||
| maxConstPoolSize = constPoolSize.max, | ||
| // Minus 2 for `GeneratedClass` and an outer-most generated class | ||
| numInnerClasses = classSizes.size - 2) | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -1380,8 +1398,8 @@ object CodeGenerator extends Logging { | |
| private val cache = CacheBuilder.newBuilder() | ||
| .maximumSize(SQLConf.get.codegenCacheMaxEntries) | ||
| .build( | ||
| new CacheLoader[CodeAndComment, (GeneratedClass, Int)]() { | ||
| override def load(code: CodeAndComment): (GeneratedClass, Int) = { | ||
| new CacheLoader[CodeAndComment, (GeneratedClass, ByteCodeStats)]() { | ||
| override def load(code: CodeAndComment): (GeneratedClass, ByteCodeStats) = { | ||
| val startTime = System.nanoTime() | ||
| val result = doCompile(code) | ||
| val endTime = System.nanoTime() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,14 +20,15 @@ package org.apache.spark.sql.execution | |
| import java.util.Collections | ||
|
|
||
| import scala.collection.JavaConverters._ | ||
| import scala.util.control.NonFatal | ||
|
|
||
| import org.apache.spark.broadcast.Broadcast | ||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.rdd.RDD | ||
| import org.apache.spark.sql._ | ||
| import org.apache.spark.sql.catalyst.InternalRow | ||
| import org.apache.spark.sql.catalyst.expressions.Attribute | ||
| import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, ExprCode} | ||
| import org.apache.spark.sql.catalyst.expressions.codegen.{ByteCodeStats, CodeFormatter, CodegenContext, CodeGenerator, ExprCode} | ||
| import org.apache.spark.sql.catalyst.plans.physical.Partitioning | ||
| import org.apache.spark.sql.catalyst.trees.TreeNodeRef | ||
| import org.apache.spark.sql.catalyst.util.StringUtils.StringConcat | ||
|
|
@@ -81,11 +82,21 @@ package object debug { | |
| def writeCodegen(append: String => Unit, plan: SparkPlan): Unit = { | ||
| val codegenSeq = codegenStringSeq(plan) | ||
| append(s"Found ${codegenSeq.size} WholeStageCodegen subtrees.\n") | ||
| for (((subtree, code), i) <- codegenSeq.zipWithIndex) { | ||
| append(s"== Subtree ${i + 1} / ${codegenSeq.size} ==\n") | ||
| for (((subtree, code, codeStats), i) <- codegenSeq.zipWithIndex) { | ||
| val usedConstPoolRatio = if (codeStats.maxConstPoolSize > 0) { | ||
| val rt = 100.0 * codeStats.maxConstPoolSize / CodeGenerator.MAX_JVM_CONSTANT_POOL_SIZE | ||
| "(%.2f%% used)".format(rt) | ||
| } else { | ||
| "" | ||
| } | ||
| val codeStatsStr = s"maxClassCodeSize:${codeStats.maxClassCodeSize}; " + | ||
|
||
| s"maxMethodCodeSize:${codeStats.maxMethodCodeSize}; " + | ||
| s"maxConstantPoolSize:${codeStats.maxConstPoolSize}$usedConstPoolRatio; " + | ||
| s"numInnerClasses:${codeStats.numInnerClasses}" | ||
| append(s"== Subtree ${i + 1} / ${codegenSeq.size} ($codeStatsStr) ==\n") | ||
| append(subtree) | ||
| append("\nGenerated code:\n") | ||
| append(s"${code}\n") | ||
| append(s"$code\n") | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -95,7 +106,7 @@ package object debug { | |
| * @param plan the query plan for codegen | ||
| * @return Sequence of WholeStageCodegen subtrees and corresponding codegen | ||
| */ | ||
| def codegenStringSeq(plan: SparkPlan): Seq[(String, String)] = { | ||
| def codegenStringSeq(plan: SparkPlan): Seq[(String, String, ByteCodeStats)] = { | ||
| val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() | ||
| plan transform { | ||
| case s: WholeStageCodegenExec => | ||
|
|
@@ -105,7 +116,13 @@ package object debug { | |
| } | ||
| codegenSubtrees.toSeq.map { subtree => | ||
| val (_, source) = subtree.doCodeGen() | ||
| (subtree.toString, CodeFormatter.format(source)) | ||
| val codeStats = try { | ||
| CodeGenerator.compile(source)._2 | ||
| } catch { | ||
| case NonFatal(_) => | ||
| ByteCodeStats.UNAVAILABLE | ||
| } | ||
| (subtree.toString, CodeFormatter.format(source), codeStats) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -130,7 +147,7 @@ package object debug { | |
| * @param query the streaming query for codegen | ||
| * @return Sequence of WholeStageCodegen subtrees and corresponding codegen | ||
| */ | ||
| def codegenStringSeq(query: StreamingQuery): Seq[(String, String)] = { | ||
| def codegenStringSeq(query: StreamingQuery): Seq[(String, String, ByteCodeStats)] = { | ||
| val w = asStreamExecution(query) | ||
| if (w.lastExecution != null) { | ||
| codegenStringSeq(w.lastExecution.executedPlan) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A ByteCodeStats matches to a compiled class? maxClassCodeSize is for max inner class code size?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The current code just collects the max size among a compiled class and inner classes. But, on second thoughs, I think now we don't need to print the class size cuz IIUC the size is not related to the JVM limits: https://docs.oracle.com/javase/specs/jvms/se8/html/jvms-4.html#jvms-4.11
#25766 (comment)
WDYT? @kiszk