From 8e16206aa7b8ece8521a64bfabdafbe925ce8e75 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 23 Mar 2015 17:58:54 +0800 Subject: [PATCH 1/3] Only keep necessary attribute output. --- .../sql/catalyst/analysis/Analyzer.scala | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 92d3db077c5e..f7d721684eac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -130,27 +130,34 @@ class Analyzer(catalog: Catalog, * expressions which equal GroupBy expressions with Literal(null), if those expressions * are not set for this grouping set (according to the bit mask). */ - private[this] def expand(g: GroupingSets): Seq[GroupExpression] = { + private[this] def expand(g: GroupingSets): (Seq[GroupExpression], Seq[Attribute]) = { val result = new scala.collection.mutable.ArrayBuffer[GroupExpression] + val allExprs = g.aggregations ++ g.groupByExprs + g.bitmasks.foreach { bitmask => // get the non selected grouping attributes according to the bit mask val nonSelectedGroupExprSet = buildNonSelectExprSet(bitmask, g.groupByExprs) - val substitution = (g.child.output :+ g.gid).map(expr => expr transformDown { + val substitution = (g.child.output :+ g.gid).collect { case x: Expression if nonSelectedGroupExprSet.contains(x) => // if the input attribute in the Invalid Grouping Expression set of for this group // replace it with constant null - Literal(null, expr.dataType) + Literal(null, x.dataType) + case x: Expression if allExprs.exists(_.references.contains(x)) => x case x if x == g.gid => // replace the groupingId with concrete value (the bit mask) Literal(bitmask, IntegerType) - }) + } result += GroupExpression(substitution) } - result.toSeq 
+ val output = g.child.output.collect { + case x: Expression if allExprs.exists(_.references.contains(x)) => x + } + + (result.toSeq, output) } def apply(plan: LogicalPlan): LogicalPlan = plan transform { @@ -159,10 +166,11 @@ class Analyzer(catalog: Catalog, case a: Rollup if a.resolved => GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations, a.gid) case x: GroupingSets if x.resolved => + val expanded = expand(x) Aggregate( x.groupByExprs :+ x.gid, x.aggregations, - Expand(expand(x), x.child.output :+ x.gid, x.child)) + Expand(expanded._1, expanded._2 :+ x.gid, x.child)) } } From a2734b3d88b145ceaadccf22a497bc84beb5f6a4 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 24 Mar 2015 18:26:58 +0800 Subject: [PATCH 2/3] Move it to Optimizer. --- .../sql/catalyst/analysis/Analyzer.scala | 20 ++++++------------- .../sql/catalyst/optimizer/Optimizer.scala | 19 ++++++++++++++++++ 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index f7d721684eac..92d3db077c5e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -130,34 +130,27 @@ class Analyzer(catalog: Catalog, * expressions which equal GroupBy expressions with Literal(null), if those expressions * are not set for this grouping set (according to the bit mask). 
*/ - private[this] def expand(g: GroupingSets): (Seq[GroupExpression], Seq[Attribute]) = { + private[this] def expand(g: GroupingSets): Seq[GroupExpression] = { val result = new scala.collection.mutable.ArrayBuffer[GroupExpression] - val allExprs = g.aggregations ++ g.groupByExprs - g.bitmasks.foreach { bitmask => // get the non selected grouping attributes according to the bit mask val nonSelectedGroupExprSet = buildNonSelectExprSet(bitmask, g.groupByExprs) - val substitution = (g.child.output :+ g.gid).collect { + val substitution = (g.child.output :+ g.gid).map(expr => expr transformDown { case x: Expression if nonSelectedGroupExprSet.contains(x) => // if the input attribute in the Invalid Grouping Expression set of for this group // replace it with constant null - Literal(null, x.dataType) - case x: Expression if allExprs.exists(_.references.contains(x)) => x + Literal(null, expr.dataType) case x if x == g.gid => // replace the groupingId with concrete value (the bit mask) Literal(bitmask, IntegerType) - } + }) result += GroupExpression(substitution) } - val output = g.child.output.collect { - case x: Expression if allExprs.exists(_.references.contains(x)) => x - } - - (result.toSeq, output) + result.toSeq } def apply(plan: LogicalPlan): LogicalPlan = plan transform { @@ -166,11 +159,10 @@ class Analyzer(catalog: Catalog, case a: Rollup if a.resolved => GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations, a.gid) case x: GroupingSets if x.resolved => - val expanded = expand(x) Aggregate( x.groupByExprs :+ x.gid, x.aggregations, - Expand(expanded._1, expanded._2 :+ x.gid, x.child)) + Expand(expand(x), x.child.output :+ x.gid, x.child)) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 1a75fcf3545b..3b4cd75db9c3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala 
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -115,6 +115,25 @@ object UnionPushdown extends Rule[LogicalPlan] { */ object ColumnPruning extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transform { + // Eliminate unneeded attributes from Expand that is used in GroupingSets + case a @ Aggregate(groupByExprs, aggregations, e @ Expand(projections, output, child)) + if (e.outputSet -- a.references).nonEmpty => + + val substitution = projections.map { groupExpr => + val newExprs = groupExpr.collect { + case x: NamedExpression if a.references.contains(x) => x + case l: Literal => l + } + GroupExpression(newExprs) + } + + val newOutput = output.collect { + case x: NamedExpression if a.references.contains(x) => x + case x: AttributeReference if x.name == VirtualColumn.groupingIdName => x + } + + Aggregate(groupByExprs, aggregations, Expand(substitution, newOutput, child)) + // Eliminate attributes that are not needed to calculate the specified aggregates. case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty => a.copy(child = Project(a.references.toSeq, child)) From d5dadecd940aa402366cde1918b5b96489d5eac8 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 28 Apr 2015 16:54:16 +0800 Subject: [PATCH 3/3] Add comment and unit test. 
--- .../sql/catalyst/optimizer/Optimizer.scala | 9 +-- .../hive/optimizer/FilterPushdownSuite.scala | 66 +++++++++++++++++++ 2 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/optimizer/FilterPushdownSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index b55f97663424..e0ee0ad18254 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -122,8 +122,9 @@ object ColumnPruning extends Rule[LogicalPlan] { // Eliminate unneeded attributes from Expand that is used in GroupingSets case a @ Aggregate(groupByExprs, aggregations, e @ Expand(projections, output, child)) if (e.outputSet -- a.references).nonEmpty => - - val substitution = projections.map { groupExpr => + // It is safe to remove these attributes because they are not referred in the wrapping + // Aggregate node. + val prunedProjections = projections.map { groupExpr => val newExprs = groupExpr.collect { case x: NamedExpression if a.references.contains(x) => x case l: Literal => l @@ -136,8 +137,8 @@ object ColumnPruning extends Rule[LogicalPlan] { case x: AttributeReference if x.name == VirtualColumn.groupingIdName => x } - Aggregate(groupByExprs, aggregations, Expand(substitution, newOutput, child)) - + Aggregate(groupByExprs, aggregations, Expand(prunedProjections, newOutput, child)) + // Eliminate attributes that are not needed to calculate the specified aggregates. 
case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty => a.copy(child = Project(a.references.toSeq, child)) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/optimizer/FilterPushdownSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/optimizer/FilterPushdownSuite.scala new file mode 100644 index 000000000000..57149914a32e --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/optimizer/FilterPushdownSuite.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.spark.sql.hive.optimizer
+
+import org.scalatest.BeforeAndAfter
+
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.execution.QueryExecutionException
+import org.apache.spark.sql.{QueryTest, _}
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.types._
+import org.apache.spark.util.Utils
+
+import org.apache.spark.sql.hive.test.TestHive._
+
+case class TestData(a: Int, b: Int, c: Int, d: Int)
+
+class FilterPushdownSuite extends QueryTest with BeforeAndAfter {
+  import org.apache.spark.sql.hive.test.TestHive.implicits._
+
+
+  val testData = TestHive.sparkContext.parallelize(
+    (1 to 100).map(i => TestData(i, i + 1, i + 10, i + 20))).toDF()
+
+  before {
+    // Since we are doing tests for DDL statements,
+    // it is better to reset before every test.
+    TestHive.reset()
+    // Register the testData, which will be used in every test.
+    testData.registerTempTable("testData")
+  }
+
+  test("Remove unnecessary attributes when resolving GroupingSets") {
+    val sqlString = "SELECT a, SUM(c) FROM testData GROUP BY a, b GROUPING SETS ((a, b), a)"
+    val queryExecution = sql(sqlString).queryExecution
+
+    // Since the field `d` is not referenced in the Aggregate node, it will be removed from
+    // the GroupExpressions in optimizedPlan
+    val groupExpressions = queryExecution.optimizedPlan.collect {
+      case e: Expand => e
+    } match {
+      case Seq(e: Expand) => e.projections // Expand.projections is Seq[GroupExpression]
+      case _ => fail(s"Zero or more than one Expand found\n$queryExecution")
+    }
+    groupExpressions.foreach(_.collect {
+      case ne: NamedExpression if ne.name == "d" =>
+        fail(s"Attribute ${ne.name} should not be found after optimization")
+    })
+  }
+}