apache · viirya · Apr 26, 2017 · Apr 27, 2017 · Apr 30, 2017 · May 3, 2017
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -72,6 +72,34 @@ object CurrentOrigin {
   }
 }
 
+/** Wraps the optional tree node at which tree traversal and transformation should stop. */
+case class Barrier(node: Option[TreeNode[_]] = None)
+
+/**
+ * Holds a thread-local [[Barrier]]; traversals do not descend below the barrier node.
+ */
+object CurrentBarrier {
+  private val value = new ThreadLocal[Barrier]() {
+    override def initialValue: Barrier = Barrier()
+  }
+
+  def get: Barrier = value.get()
+  def set(b: Barrier): Unit = value.set(b)
+
+  def reset(): Unit = value.set(Barrier())
+
+  /** Returns true if `currentNode` is the node of the current thread's barrier. */
+  def hitBarrier(currentNode: TreeNode[_]): Boolean =
+    get.node.exists(_ fastEquals currentNode)
+
+  /** Runs `f` with `b` installed as the barrier, restoring the previous one afterwards. */
+  def withBarrier[A](b: Barrier)(f: => A): A = {
+    val previous = get
+    set(b)
+    try f finally set(previous)
+  }
+}
+
 // scalastyle:off
 abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
 // scalastyle:on
@@ -115,15 +143,19 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
    */
   def foreach(f: BaseType => Unit): Unit = {
     f(this)
-    children.foreach(_.foreach(f))
+    if (!CurrentBarrier.hitBarrier(this)) {  // do not descend below the barrier node
+      children.foreach(_.foreach(f))
+    }
   }
 
   /**
    * Runs the given function recursively on [[children]] then on this node.
    * @param f the function to be applied to each node in the tree.
    */
   def foreachUp(f: BaseType => Unit): Unit = {
-    children.foreach(_.foreachUp(f))
+    if (!CurrentBarrier.hitBarrier(this)) {  // children of the barrier node are not visited
+      children.foreach(_.foreachUp(f))
+    }
     f(this)
   }
 
@@ -267,11 +299,19 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
       rule.applyOrElse(this, identity[BaseType])
     }
 
-    // Check if unchanged and then possibly return old copy to avoid gc churn.
-    if (this fastEquals afterRule) {
-      mapChildren(_.transformDown(rule))
+    if (CurrentBarrier.hitBarrier(this)) {
+      if (this fastEquals afterRule) {
+        this
+      } else {
+        afterRule
+      }
     } else {
-      afterRule.mapChildren(_.transformDown(rule))
+      // Check if unchanged and then possibly return old copy to avoid gc churn.
+      if (this fastEquals afterRule) {
+        mapChildren(_.transformDown(rule))
+      } else {
+        afterRule.mapChildren(_.transformDown(rule))
+      }
     }
   }
 
@@ -283,14 +323,20 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
    * @param rule the function use to transform this nodes children
    */
   def transformUp(rule: PartialFunction[BaseType, BaseType]): BaseType = {
-    val afterRuleOnChildren = mapChildren(_.transformUp(rule))
-    if (this fastEquals afterRuleOnChildren) {
+    if (CurrentBarrier.hitBarrier(this)) {  // barrier: rule applies here, children untouched
       CurrentOrigin.withOrigin(origin) {
         rule.applyOrElse(this, identity[BaseType])
       }
     } else {
-      CurrentOrigin.withOrigin(origin) {
-        rule.applyOrElse(afterRuleOnChildren, identity[BaseType])
+      val afterRuleOnChildren = mapChildren(_.transformUp(rule))
+      if (this fastEquals afterRuleOnChildren) {
+        CurrentOrigin.withOrigin(origin) {
+          rule.applyOrElse(this, identity[BaseType])
+        }
+      } else {
+        CurrentOrigin.withOrigin(origin) {
+          rule.applyOrElse(afterRuleOnChildren, identity[BaseType])
+        }
       }
     }
   }

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -46,6 +46,7 @@ import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, PartitioningCollection}
+import org.apache.spark.sql.catalyst.trees.{Barrier, CurrentBarrier}
 import org.apache.spark.sql.catalyst.util.{usePrettyExpression, DateTimeUtils}
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.command._
@@ -203,7 +204,7 @@ class Dataset[T] private[sql](
    * custom objects, e.g. collect.  Here we resolve and bind the encoder so that we can call its
    * `fromRow` method later.
    */
-  private val boundEnc =
+  private lazy val boundEnc =  // lazy: resolveAndBind runs on first access, not at construction
     exprEnc.resolveAndBind(logicalPlan.output, sparkSession.sessionState.analyzer)
 
   private implicit def classTag = exprEnc.clsTag
@@ -356,7 +357,11 @@ class Dataset[T] private[sql](
    */
   // This is declared with parentheses to prevent the Scala compiler from treating
   // `ds.toDF("1")` as invoking this toDF and then apply on the returned DataFrame.
-  def toDF(): DataFrame = new Dataset[Row](sparkSession, queryExecution, RowEncoder(schema))
+  def toDF(): DataFrame = {
+    CurrentBarrier.withBarrier(Barrier(Some(logicalPlan))) {  // barrier at this Dataset's plan
+      new Dataset[Row](sparkSession, queryExecution, RowEncoder(schema))
+    }
+  }
 
   /**
    * :: Experimental ::
@@ -2828,21 +2833,27 @@ class Dataset[T] private[sql](
 
   /** A convenient function to wrap a logical plan and produce a DataFrame. */
   @inline private def withPlan(logicalPlan: => LogicalPlan): DataFrame = {
-    Dataset.ofRows(sparkSession, logicalPlan)
+    CurrentBarrier.withBarrier(Barrier(Some(this.logicalPlan))) {  // this Dataset's own plan
+      Dataset.ofRows(sparkSession, logicalPlan)
+    }
   }
 
   /** A convenient function to wrap a logical plan and produce a Dataset. */
   @inline private def withTypedPlan[U : Encoder](logicalPlan: => LogicalPlan): Dataset[U] = {
-    Dataset(sparkSession, logicalPlan)
+    CurrentBarrier.withBarrier(Barrier(Some(this.logicalPlan))) {  // this Dataset's own plan
+      Dataset(sparkSession, logicalPlan)
+    }
   }
 
   /** A convenient function to wrap a set based logical plan and produce a Dataset. */
   @inline private def withSetOperator[U : Encoder](logicalPlan: => LogicalPlan): Dataset[U] = {
-    if (classTag.runtimeClass.isAssignableFrom(classOf[Row])) {
-      // Set operators widen types (change the schema), so we cannot reuse the row encoder.
-      Dataset.ofRows(sparkSession, logicalPlan).asInstanceOf[Dataset[U]]
-    } else {
-      Dataset(sparkSession, logicalPlan)
+    CurrentBarrier.withBarrier(Barrier(Some(this.logicalPlan))) {  // this Dataset's own plan
+      if (classTag.runtimeClass.isAssignableFrom(classOf[Row])) {
+        // Set operators widen types (change the schema), so we cannot reuse the row encoder.
+        Dataset.ofRows(sparkSession, logicalPlan).asInstanceOf[Dataset[U]]
+      } else {
+        Dataset(sparkSession, logicalPlan)
+      }
     }
   }
 }