From 8e16206aa7b8ece8521a64bfabdafbe925ce8e75 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 23 Mar 2015 17:58:54 +0800 Subject: [PATCH 1/3] Only keep necessary attribute output. --- .../sql/catalyst/analysis/Analyzer.scala | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 92d3db077c5e..f7d721684eac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -130,27 +130,34 @@ class Analyzer(catalog: Catalog, * expressions which equal GroupBy expressions with Literal(null), if those expressions * are not set for this grouping set (according to the bit mask). */ - private[this] def expand(g: GroupingSets): Seq[GroupExpression] = { + private[this] def expand(g: GroupingSets): (Seq[GroupExpression], Seq[Attribute]) = { val result = new scala.collection.mutable.ArrayBuffer[GroupExpression] + val allExprs = g.aggregations ++ g.groupByExprs + g.bitmasks.foreach { bitmask => // get the non selected grouping attributes according to the bit mask val nonSelectedGroupExprSet = buildNonSelectExprSet(bitmask, g.groupByExprs) - val substitution = (g.child.output :+ g.gid).map(expr => expr transformDown { + val substitution = (g.child.output :+ g.gid).collect { case x: Expression if nonSelectedGroupExprSet.contains(x) => // if the input attribute in the Invalid Grouping Expression set of for this group // replace it with constant null - Literal(null, expr.dataType) + Literal(null, x.dataType) + case x: Expression if allExprs.exists(_.references.contains(x)) => x case x if x == g.gid => // replace the groupingId with concrete value (the bit mask) Literal(bitmask, IntegerType) - }) + } result += GroupExpression(substitution) } - result.toSeq 
+ val output = g.child.output.collect { + case x: Expression if allExprs.exists(_.references.contains(x)) => x + } + + (result.toSeq, output) } def apply(plan: LogicalPlan): LogicalPlan = plan transform { @@ -159,10 +166,11 @@ class Analyzer(catalog: Catalog, case a: Rollup if a.resolved => GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations, a.gid) case x: GroupingSets if x.resolved => + val expanded = expand(x) Aggregate( x.groupByExprs :+ x.gid, x.aggregations, - Expand(expand(x), x.child.output :+ x.gid, x.child)) + Expand(expanded._1, expanded._2 :+ x.gid, x.child)) } } From a2734b3d88b145ceaadccf22a497bc84beb5f6a4 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 24 Mar 2015 18:26:58 +0800 Subject: [PATCH 2/3] Move it to Optimizer. --- .../sql/catalyst/analysis/Analyzer.scala | 20 ++++++------------- .../sql/catalyst/optimizer/Optimizer.scala | 19 ++++++++++++++++++ 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index f7d721684eac..92d3db077c5e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -130,34 +130,27 @@ class Analyzer(catalog: Catalog, * expressions which equal GroupBy expressions with Literal(null), if those expressions * are not set for this grouping set (according to the bit mask). 
*/ - private[this] def expand(g: GroupingSets): (Seq[GroupExpression], Seq[Attribute]) = { + private[this] def expand(g: GroupingSets): Seq[GroupExpression] = { val result = new scala.collection.mutable.ArrayBuffer[GroupExpression] - val allExprs = g.aggregations ++ g.groupByExprs - g.bitmasks.foreach { bitmask => // get the non selected grouping attributes according to the bit mask val nonSelectedGroupExprSet = buildNonSelectExprSet(bitmask, g.groupByExprs) - val substitution = (g.child.output :+ g.gid).collect { + val substitution = (g.child.output :+ g.gid).map(expr => expr transformDown { case x: Expression if nonSelectedGroupExprSet.contains(x) => // if the input attribute in the Invalid Grouping Expression set of for this group // replace it with constant null - Literal(null, x.dataType) - case x: Expression if allExprs.exists(_.references.contains(x)) => x + Literal(null, expr.dataType) case x if x == g.gid => // replace the groupingId with concrete value (the bit mask) Literal(bitmask, IntegerType) - } + }) result += GroupExpression(substitution) } - val output = g.child.output.collect { - case x: Expression if allExprs.exists(_.references.contains(x)) => x - } - - (result.toSeq, output) + result.toSeq } def apply(plan: LogicalPlan): LogicalPlan = plan transform { @@ -166,11 +159,10 @@ class Analyzer(catalog: Catalog, case a: Rollup if a.resolved => GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations, a.gid) case x: GroupingSets if x.resolved => - val expanded = expand(x) Aggregate( x.groupByExprs :+ x.gid, x.aggregations, - Expand(expanded._1, expanded._2 :+ x.gid, x.child)) + Expand(expand(x), x.child.output :+ x.gid, x.child)) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 1a75fcf3545b..3b4cd75db9c3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala 
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -115,6 +115,25 @@ object UnionPushdown extends Rule[LogicalPlan] { */ object ColumnPruning extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transform { + // Eliminate unneeded attributes from Expand that is used in GroupingSets + case a @ Aggregate(groupByExprs, aggregations, e @ Expand(projections, output, child)) + if (e.outputSet -- a.references).nonEmpty => + + val substitution = projections.map { groupExpr => + val newExprs = groupExpr.collect { + case x: NamedExpression if a.references.contains(x) => x + case l: Literal => l + } + GroupExpression(newExprs) + } + + val newOutput = output.collect { + case x: NamedExpression if a.references.contains(x) => x + case x: AttributeReference if x.name == VirtualColumn.groupingIdName => x + } + + Aggregate(groupByExprs, aggregations, Expand(substitution, newOutput, child)) + // Eliminate attributes that are not needed to calculate the specified aggregates. case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty => a.copy(child = Project(a.references.toSeq, child)) From d5dadecd940aa402366cde1918b5b96489d5eac8 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 28 Apr 2015 16:54:16 +0800 Subject: [PATCH 3/3] Add comment and unit test. 
--- .../sql/catalyst/optimizer/Optimizer.scala | 9 +-- .../hive/optimizer/FilterPushdownSuite.scala | 66 +++++++++++++++++++ 2 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/optimizer/FilterPushdownSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index b55f97663424..e0ee0ad18254 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -122,8 +122,9 @@ object ColumnPruning extends Rule[LogicalPlan] { // Eliminate unneeded attributes from Expand that is used in GroupingSets case a @ Aggregate(groupByExprs, aggregations, e @ Expand(projections, output, child)) if (e.outputSet -- a.references).nonEmpty => - - val substitution = projections.map { groupExpr => + // It is safe to remove these attributes because they are not referred in the wrapping + // Aggregate node. + val prunedProjections = projections.map { groupExpr => val newExprs = groupExpr.collect { case x: NamedExpression if a.references.contains(x) => x case l: Literal => l @@ -136,8 +137,8 @@ object ColumnPruning extends Rule[LogicalPlan] { case x: AttributeReference if x.name == VirtualColumn.groupingIdName => x } - Aggregate(groupByExprs, aggregations, Expand(substitution, newOutput, child)) - + Aggregate(groupByExprs, aggregations, Expand(prunedProjections, newOutput, child)) + // Eliminate attributes that are not needed to calculate the specified aggregates. 
case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty => a.copy(child = Project(a.references.toSeq, child)) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/optimizer/FilterPushdownSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/optimizer/FilterPushdownSuite.scala new file mode 100644 index 000000000000..57149914a32e --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/optimizer/FilterPushdownSuite.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.spark.sql.hive.optimizer
+
+import org.scalatest.BeforeAndAfter
+
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.execution.QueryExecutionException
+import org.apache.spark.sql.{QueryTest, _}
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.types._
+import org.apache.spark.util.Utils
+
+import org.apache.spark.sql.hive.test.TestHive._
+
+case class TestData(a: Int, b: Int, c: Int, d: Int)
+
+class FilterPushdownSuite extends QueryTest with BeforeAndAfter {
+  import org.apache.spark.sql.hive.test.TestHive.implicits._
+
+
+  val testData = TestHive.sparkContext.parallelize(
+    (1 to 100).map(i => TestData(i, i + 1, i + 10, i + 20))).toDF()
+
+  before {
+    // Since we are doing tests for DDL statements,
+    // it is better to reset before every test.
+    TestHive.reset()
+    // Register the testData, which will be used in every test.
+    testData.registerTempTable("testData")
+  }
+
+  test("Remove unnecessary attributes when resolving GroupingSets") {
+    val sqlString = "SELECT a, SUM(c) FROM testData GROUP BY a, b GROUPING SETS ((a, b), a)"
+    val queryExecution = sql(sqlString).queryExecution
+
+    // Since the field `d` is not referenced in the Aggregate node, it will be removed from
+    // the GroupExpressions in optimizedPlan
+    val groupExpressions = queryExecution.optimizedPlan.collect {
+      case e: Expand => e
+    } match {
+      case Seq(e: Expand) => e.projections // Expand.projections is Seq[GroupExpression]
+      case _ => fail(s"Zero or more than one Expand found\n$queryExecution")
+    }
+    groupExpressions.foreach(_.collect {
+      case ne: NamedExpression if ne.name == "d" =>
+        fail(s"Attribute ${ne.name} should not be found after optimization")
+    })
+  }
+}