apache · maryannxue · Jul 13, 2018 · Jul 18, 2018 · Jul 21, 2018 · Jul 21, 2018
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.optimizer
 
 import scala.collection.mutable
 
+import org.apache.spark.internal.Logging
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
@@ -36,7 +37,7 @@ import org.apache.spark.util.Utils
  * Optimizers can override this.
  */
 abstract class Optimizer(sessionCatalog: SessionCatalog)
-  extends RuleExecutor[LogicalPlan] {
+  extends RuleExecutor[LogicalPlan] with Logging {
 
   // Check for structural integrity of the plan in test mode. Currently we only check if a plan is
   // still resolved after the execution of each rule.
@@ -46,7 +47,23 @@ abstract class Optimizer(sessionCatalog: SessionCatalog)
 
   protected def fixedPoint = FixedPoint(SQLConf.get.optimizerMaxIterations)
 
-  def batches: Seq[Batch] = {
+  protected def postAnalysisBatches: Seq[Batch] = {
+    Batch("Eliminate Distinct", Once, EliminateDistinct) ::
+    // Technically some of the rules in Finish Analysis are not optimizer rules and belong more
+    // in the analyzer, because they are needed for correctness (e.g. ComputeCurrentTime).
+    // However, because we also use the analyzer to canonicalize queries (for view definition),
+    // we do not eliminate subqueries or compute current time in the analyzer.
+    Batch("Finish Analysis", Once,
+      EliminateSubqueryAliases,
+      EliminateView,
+      ReplaceExpressions,
+      ComputeCurrentTime,
+      GetCurrentDatabase(sessionCatalog),
+      RewriteDistinctAggregates,
+      ReplaceDeduplicateWithAggregate) :: Nil
+  }
+
+  protected def optimizationBatches: Seq[Batch] = {
     val operatorOptimizationRuleSet =
       Seq(
         // Operator push down
@@ -100,19 +117,6 @@ abstract class Optimizer(sessionCatalog: SessionCatalog)
         rulesWithoutInferFiltersFromConstraints: _*) :: Nil
     }
 
-    (Batch("Eliminate Distinct", Once, EliminateDistinct) ::
-    // Technically some of the rules in Finish Analysis are not optimizer rules and belong more
-    // in the analyzer, because they are needed for correctness (e.g. ComputeCurrentTime).
-    // However, because we also use the analyzer to canonicalized queries (for view definition),
-    // we do not eliminate subqueries or compute current time in the analyzer.
-    Batch("Finish Analysis", Once,
-      EliminateSubqueryAliases,
-      EliminateView,
-      ReplaceExpressions,
-      ComputeCurrentTime,
-      GetCurrentDatabase(sessionCatalog),
-      RewriteDistinctAggregates,
-      ReplaceDeduplicateWithAggregate) ::
     //////////////////////////////////////////////////////////////////////////////////////////
     // Optimizer rules start here
     //////////////////////////////////////////////////////////////////////////////////////////
@@ -121,7 +125,7 @@ abstract class Optimizer(sessionCatalog: SessionCatalog)
     //   extra operators between two adjacent Union operators.
     // - Call CombineUnions again in Batch("Operator Optimizations"),
     //   since the other rules might make two separate Unions operators adjacent.
-    Batch("Union", Once,
+    (Batch("Union", Once,
       CombineUnions) ::
     Batch("Pullup Correlated Expressions", Once,
       PullupCorrelatedPredicates) ::
@@ -175,6 +179,35 @@ abstract class Optimizer(sessionCatalog: SessionCatalog)
    * Override to provide additional rules for the operator optimization batch.
    */
   def extendedOperatorOptimizationRules: Seq[Rule[LogicalPlan]] = Nil
+
+  /**
+   * Post-analysis batches, unfiltered, then optimization batches minus the excluded rules.
+   */
+  override def batches: Seq[Batch] = {
+    // Names from the excluded-rules conf: comma-separated, trimmed, empty entries dropped.
+    val namesToExclude: Set[String] = SQLConf.get.optimizerExcludedRules
+      .map(_.split(",").map(_.trim).filter(_.nonEmpty).toSet)
+      .getOrElse(Set.empty)
+    // Removes excluded rules from one batch; None once no rule of the batch is left.
+    def prune(batch: Batch): Option[Batch] = {
+      val (dropped, kept) = batch.rules.partition(rule => namesToExclude.contains(rule.ruleName))
+      dropped.foreach { rule =>
+        logInfo(s"Optimization rule '${rule.ruleName}' is excluded from the optimizer.")
+      }
+      if (dropped.isEmpty) {
+        Some(batch)
+      } else if (kept.isEmpty) {
+        logInfo(s"Optimization batch '${batch.name}' is excluded from the optimizer " +
+          s"as all enclosed rules have been excluded.")
+        None
+      } else {
+        Some(Batch(batch.name, batch.strategy, kept: _*))
+      }
+    }
+    val retained =
+      if (namesToExclude.isEmpty) optimizationBatches else optimizationBatches.flatMap(prune(_))
+    postAnalysisBatches ++ retained
+  }
 }
 
 /**

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -127,6 +127,14 @@ object SQLConf {
     }
   }
 
+  val OPTIMIZER_EXCLUDED_RULES = buildConf("spark.sql.optimizer.excludedRules")
+    .doc("Configures a list of rules to be disabled in the optimizer, in which the rules are " +
+      "specified by their rule names and separated by comma. It is not guaranteed that all the " +
+      "rules in this configuration will eventually be excluded, as some rules are necessary " +
+      "for correctness. The optimizer will log the rules that have indeed been excluded.")
+    .stringConf
+    .createOptional // optional entry: the accessor reads it back as Option[String]
+
   val OPTIMIZER_MAX_ITERATIONS = buildConf("spark.sql.optimizer.maxIterations")
     .internal()
     .doc("The max number of iterations the optimizer and analyzer runs.")
@@ -1383,6 +1391,8 @@ class SQLConf extends Serializable with Logging {
 
   /** ************************ Spark SQL Params/Hints ******************* */
 
+  def optimizerExcludedRules: Option[String] = getConf(OPTIMIZER_EXCLUDED_RULES) // unparsed list
+
   def optimizerMaxIterations: Int = getConf(OPTIMIZER_MAX_ITERATIONS)
 
   def optimizerInSetConversionThreshold: Int = getConf(OPTIMIZER_INSET_CONVERSION_THRESHOLD)

diff --git a/.../src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerRuleExclusionSuite.scala b/.../src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerRuleExclusionSuite.scala
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.plans.PlanTest
+import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
+import org.apache.spark.sql.internal.SQLConf.OPTIMIZER_EXCLUDED_RULES
+
+
+/** Checks that rules named in `spark.sql.optimizer.excludedRules` are dropped from batches. */
+class OptimizerRuleExclusionSuite extends PlanTest {
+
+  val testRelation = LocalRelation('a.int, 'b.int, 'c.int)
+
+  /** The excluded-rules conf entry for `ruleNames`, joined with commas as the conf expects. */
+  private def excludedRulesConf(ruleNames: Seq[String]): (String, String) =
+    OPTIMIZER_EXCLUDED_RULES.key -> ruleNames.mkString(",")
+
+  /**
+   * Asserts that no rule in `excludedRuleNames`, and no batch made up only of them, is kept.
+   */
+  private def verifyExcludedRules(excludedRuleNames: Seq[String]): Unit = {
+    val optimizer = new SimpleTestOptimizer()
+    // Batches whose rules are all to be excluded should be removed as a whole.
+    val excludedBatchNames = optimizer.batches
+      .filter(batch => batch.rules.forall(rule => excludedRuleNames.contains(rule.ruleName)))
+      .map(_.name)
+
+    withSQLConf(excludedRulesConf(excludedRuleNames)) {
+      val batches = optimizer.batches
+      assert(batches.forall(batch => !excludedBatchNames.contains(batch.name)))
+      assert(batches.forall(_.rules.forall(rule => !excludedRuleNames.contains(rule.ruleName))))
+    }
+  }
+
+  test("Exclude a single rule from multiple batches") {
+    verifyExcludedRules(Seq(PushPredicateThroughJoin.ruleName))
+  }
+
+  test("Exclude multiple rules from single or multiple batches") {
+    verifyExcludedRules(Seq(
+      CombineUnions.ruleName,
+      RemoveLiteralFromGroupExpressions.ruleName,
+      RemoveRepetitionFromGroupExpressions.ruleName))
+  }
+
+  test("Exclude non-existent rule with other valid rules") {
+    verifyExcludedRules(Seq(
+      LimitPushDown.ruleName,
+      InferFiltersFromConstraints.ruleName,
+      "DummyRuleName"))
+  }
+
+  test("Verify optimized plan after excluding CombineUnions rule") {
+    val excludedRules = Seq(
+      ConvertToLocalRelation.ruleName,
+      PropagateEmptyRelation.ruleName,
+      CombineUnions.ruleName)
+    withSQLConf(excludedRulesConf(excludedRules)) {
+      val optimizer = new SimpleTestOptimizer()
+      val originalQuery = testRelation.union(testRelation.union(testRelation)).analyze
+      val optimized = optimizer.execute(originalQuery)
+      // The optimized plan is expected to equal the analyzed input: the unions stay as written.
+      comparePlans(originalQuery, optimized)
+    }
+  }
+}