diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 2d03fbfb0d31..e0ee0ad18254 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -119,6 +119,26 @@ object UnionPushdown extends Rule[LogicalPlan] { */ object ColumnPruning extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transform { + // Eliminate unneeded attributes from Expand that is used in GroupingSets + case a @ Aggregate(groupByExprs, aggregations, e @ Expand(projections, output, child)) + if (e.outputSet -- a.references).nonEmpty => + // It is safe to remove these attributes because they are not referred in the wrapping + // Aggregate node. + val prunedProjections = projections.map { groupExpr => + val newExprs = groupExpr.collect { + case x: NamedExpression if a.references.contains(x) => x + case l: Literal => l + } + GroupExpression(newExprs) + } + + val newOutput = output.collect { + case x: NamedExpression if a.references.contains(x) => x + case x: AttributeReference if x.name == VirtualColumn.groupingIdName => x + } + + Aggregate(groupByExprs, aggregations, Expand(prunedProjections, newOutput, child)) + // Eliminate attributes that are not needed to calculate the specified aggregates. 
case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty => a.copy(child = Project(a.references.toSeq, child)) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/optimizer/FilterPushdownSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/optimizer/FilterPushdownSuite.scala new file mode 100644 index 000000000000..57149914a32e --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/optimizer/FilterPushdownSuite.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.spark.sql.hive.optimizer
+
+import org.scalatest.BeforeAndAfter
+
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.execution.QueryExecutionException
+import org.apache.spark.sql.{QueryTest, _}
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.types._
+import org.apache.spark.util.Utils
+
+import org.apache.spark.sql.hive.test.TestHive._
+
+/** Row type backing the `testData` temp table used by this suite. */
+case class TestData(a: Int, b: Int, c: Int, d: Int)
+
+/**
+ * Checks that attributes unused by the Aggregate are pruned from Expand (GROUPING SETS).
+ */
+class FilterPushdownSuite extends QueryTest with BeforeAndAfter {
+  import org.apache.spark.sql.hive.test.TestHive.implicits._
+
+  // 100 rows in which b, c and d are fixed offsets (+1, +10, +20) of a.
+  val testData = TestHive.sparkContext.parallelize(
+    (1 to 100).map(i => TestData(i, i + 1, i + 10, i + 20))).toDF()
+
+  before {
+    // Reset TestHive before every test, then re-register testData, which every test uses.
+    TestHive.reset()
+    testData.registerTempTable("testData")
+  }
+
+  test("Remove unnecessary attributes when resolving GroupingSets") {
+    val sqlString = "SELECT a, SUM(c) FROM testData GROUP BY a, b GROUPING SETS ((a, b), a)"
+    val queryExecution = sql(sqlString).queryExecution
+
+    // `d` is not referenced by the Aggregate, so no Expand projection may still mention it.
+    val groupExpressions = queryExecution.optimizedPlan.collect { case e: Expand => e } match {
+      case Seq(e) => e.projections // Expand.projections is Seq[GroupExpression]
+      case found => fail(s"Expected exactly one Expand but found ${found.size}\n$queryExecution")
+    }
+    val leaked = groupExpressions.flatMap(_.collect {
+      case ne: NamedExpression if ne.name == "d" => ne
+    })
+    assert(leaked.isEmpty,
+      s"Attribute d should not be found after optimization\n$queryExecution")
+  }
+}