apache · yhuai · Jul 31, 2015 · Jul 31, 2015 · Jul 31, 2015 · Jul 31, 2015
diff --git a/...alyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala b/...alyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
@@ -110,7 +110,11 @@ abstract class AggregateFunction2
    * buffer value of `avg(x)` will be 0 and the position of the first buffer value of `avg(y)`
    * will be 2.
    */
-  var mutableBufferOffset: Int = 0
+  protected var mutableBufferOffset: Int = 0
+
+  def withNewMutableBufferOffset(newMutableBufferOffset: Int): Unit = {
+    mutableBufferOffset = newMutableBufferOffset  // updates this instance in place
+  }
 
   /**
    * The offset of this function's start buffer value in the
@@ -126,7 +130,11 @@ abstract class AggregateFunction2
    * buffer value of `avg(x)` will be 1 and the position of the first buffer value of `avg(y)`
    * will be 3 (position 0 is used for the value of key`).
    */
-  var inputBufferOffset: Int = 0
+  protected var inputBufferOffset: Int = 0
+
+  def withNewInputBufferOffset(newInputBufferOffset: Int): Unit = {
+    inputBufferOffset = newInputBufferOffset  // updates this instance in place
+  }
 
   /** The schema of the aggregation buffer. */
   def bufferSchema: StructType
@@ -195,11 +203,8 @@ abstract class AlgebraicAggregate extends AggregateFunction2 with Serializable w
   override def bufferSchema: StructType = StructType.fromAttributes(bufferAttributes)
 
   override def initialize(buffer: MutableRow): Unit = {
-    var i = 0
-    while (i < bufferAttributes.size) {
-      buffer(i + mutableBufferOffset) = initialValues(i).eval()
-      i += 1
-    }
+    throw new UnsupportedOperationException(
+      "AlgebraicAggregate's initialize should not be called directly")
   }
 
   override final def update(buffer: MutableRow, input: InternalRow): Unit = {

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
@@ -137,6 +137,8 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
   }
 
   override def keyExpressions: Seq[Expression] = expressions
+
+  override def toString: String = s"${super.toString} numPartitions=$numPartitions"
 }
 
 /**

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -409,6 +409,9 @@ private[spark] object SQLConf {
   val USE_SQL_AGGREGATE2 = booleanConf("spark.sql.useAggregate2",
     defaultValue = Some(true), doc = "<TODO>")
 
+  val USE_HYBRID_AGGREGATE = booleanConf("spark.sql.aggregate.useHybridAggregate",
+    defaultValue = Some(true), doc = "Use hash aggregation with sort-based fallback if supported.")
+
   val USE_SQL_SERIALIZER2 = booleanConf(
     "spark.sql.useSerializer2",
     defaultValue = Some(true), isPublic = false)
@@ -482,6 +485,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
 
   private[spark] def useSqlAggregate2: Boolean = getConf(USE_SQL_AGGREGATE2)
 
+  private[spark] def useHybridAggregate: Boolean = getConf(USE_HYBRID_AGGREGATE)
+
   private[spark] def useSqlSerializer2: Boolean = getConf(USE_SQL_SERIALIZER2)
 
   private[spark] def autoBroadcastJoinThreshold: Int = getConf(AUTO_BROADCASTJOIN_THRESHOLD)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -161,6 +161,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
                   allAggregates(rewrittenAggregateExpressions)) &&
                codegenEnabled &&
                !canBeConvertedToNewAggregation(plan) =>
+          logInfo(s"Using ${classOf[execution.GeneratedAggregate].getCanonicalName} as " +
+            s"the physical Aggregate Operator.")
           execution.GeneratedAggregate(
             partial = false,
             namedGroupingAttributes,
@@ -180,6 +182,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
              groupingExpressions,
              partialComputation,
              child) if !canBeConvertedToNewAggregation(plan) =>
+        logInfo(s"Using ${classOf[execution.Aggregate].getCanonicalName} as " +
+          s"the physical Aggregate Operator.")
         execution.Aggregate(
           partial = false,
           namedGroupingAttributes,
@@ -227,6 +231,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         converted match {
           case None => Nil // Cannot convert to new aggregation code path.
           case Some(logical.Aggregate(groupingExpressions, resultExpressions, child)) =>
+            logInfo(s"Using ${classOf[aggregate.Aggregate].getCanonicalName} as " +
+              s"the physical Aggregate Operator.")
             // Extracts all distinct aggregate expressions from the resultExpressions.
             val aggregateExpressions = resultExpressions.flatMap { expr =>
               expr.collect {
@@ -386,6 +392,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
           Nil
         } else {
           Utils.checkInvalidAggregateFunction2(a)
+          logInfo(s"Using ${classOf[execution.Aggregate].getCanonicalName} as " +
+            s"the physical Aggregate Operator.")
           execution.Aggregate(partial = false, group, agg, planLater(child)) :: Nil
         }
       }

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/Aggregate.scala
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.aggregate
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.errors._
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate._
+import org.apache.spark.sql.catalyst.plans.physical.{UnspecifiedDistribution, ClusteredDistribution, AllTuples, Distribution}
+import org.apache.spark.sql.execution.{UnsafeFixedWidthAggregationMap, SparkPlan, UnaryNode}
+import org.apache.spark.sql.types.StructType
+
+/**
+ * An Aggregate Operator used to evaluate [[AggregateFunction2]]. Based on the data types
+ * of the grouping expressions and aggregate functions, it determines whether to use
+ * sort-based aggregation or hybrid aggregation (hash-based with sort-based as the
+ * fallback) to process input rows.
+ */
+case class Aggregate(
+    requiredChildDistributionExpressions: Option[Seq[Expression]],
+    groupingExpressions: Seq[NamedExpression],
+    nonCompleteAggregateExpressions: Seq[AggregateExpression2],
+    nonCompleteAggregateAttributes: Seq[Attribute],
+    completeAggregateExpressions: Seq[AggregateExpression2],
+    completeAggregateAttributes: Seq[Attribute],
+    initialInputBufferOffset: Int,
+    resultExpressions: Seq[NamedExpression],
+    child: SparkPlan)
+  extends UnaryNode {
+
+  private[this] val allAggregateExpressions =
+    nonCompleteAggregateExpressions ++ completeAggregateExpressions
+
+  private[this] val hasNonAlgebricAggregateFunctions =
+    allAggregateExpressions.exists(!_.aggregateFunction.isInstanceOf[AlgebraicAggregate])
+
+  // The hybrid iterator is usable only if (1) unsafe is enabled, (2) the schemas of
+  // both the grouping key and the aggregation buffer are supported, and (3) all
+  // aggregate functions are algebraic.
+  private[this] val supportsHybridIterator: Boolean = {
+    val allBufferAttributes =
+      allAggregateExpressions.flatMap(aggExpr => aggExpr.aggregateFunction.bufferAttributes)
+    val bufferSchema = StructType.fromAttributes(allBufferAttributes)
+    val keySchema = StructType.fromAttributes(groupingExpressions.map(_.toAttribute))
+
+    val unsafeCanHandleSchemas =
+      UnsafeFixedWidthAggregationMap.supportsAggregationBufferSchema(bufferSchema) &&
+        UnsafeProjection.canSupport(keySchema)
+
+    // TODO: Use the hybrid iterator for non-algebraic aggregate functions.
+    val onlyAlgebraicFunctions = !hasNonAlgebricAggregateFunctions
+    sqlContext.conf.unsafeEnabled && unsafeCanHandleSchemas && onlyAlgebraicFunctions
+  }
+
+  private[this] val hybridAggregateEnabled = sqlContext.conf.useHybridAggregate
+
+  // Sorted input is needed only for a grouped aggregation that cannot go through the
+  // hybrid iterator, either because it is unsupported or because it is disabled.
+  private[this] val requiresSortedInput: Boolean = {
+    groupingExpressions.nonEmpty && !(supportsHybridIterator && hybridAggregateEnabled)
+  }
+
+  override def canProcessUnsafeRows: Boolean = !hasNonAlgebricAggregateFunctions
+
+  // Intended rule (currently disabled): emit unsafe rows when every result expression has a
+  // fixed-length data type. Fixed length is required, rather than UnsafeProjection.canSupport,
+  // because a mutable projection is used to generate the result. Always false for now.
+  override def outputsUnsafeRows: Boolean = {
+    // resultExpressions.map(_.dataType).forall(UnsafeRow.isFixedLength)
+    // TODO: Support generating UnsafeRows. We can just re-enable the line above and fix
+    // any issue we get.
+    false
+  }
+
+  override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute)
+
+  override def requiredChildDistribution: List[Distribution] =
+    requiredChildDistributionExpressions match {
+      // Nothing requested -> unspecified; empty list -> all tuples; otherwise clustered.
+      case None => UnspecifiedDistribution :: Nil
+      case Some(exprs) if exprs.isEmpty => AllTuples :: Nil
+      case Some(exprs) => ClusteredDistribution(exprs) :: Nil
+    }
+
+  /** Requires input sorted ascending by the grouping expressions when sorting is needed. */
+  override def requiredChildOrdering: Seq[Seq[SortOrder]] =
+    if (!requiresSortedInput) {
+      Seq.fill(children.size)(Nil)
+    } else {
+      // TODO: We should not sort the input rows if they are just in reversed order.
+      List(groupingExpressions.map(expr => SortOrder(expr, Ascending)))
+    }
+
+  /** Reports an ordering by the grouping expressions only when sorted input is required. */
+  override def outputOrdering: Seq[SortOrder] =
+    if (!requiresSortedInput) {
+      Nil
+    } else {
+      // The child's outputOrdering may merely start with the ordering we require
+      // (e.g. we require [a] as the sort expression while the child is sorted by
+      // [a, b]), so the only guarantee we can give is that the output rows are
+      // sorted by the values of groupingExpressions.
+      groupingExpressions.map(expr => SortOrder(expr, Ascending))
+    }
+
+  /**
+   * Aggregates each partition of the child's output, using the hybrid iterator when it is
+   * applicable to the partition and the sort-based iterator otherwise.
+   */
+  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {
+    child.execute().mapPartitions { iter =>
+      // The constructor of an aggregation iterator reads at least the first row, so
+      // whether this partition has any input must be captured before building one.
+      val hasInput = iter.hasNext
+      val isGrouped = groupingExpressions.nonEmpty
+      val useHybridIterator =
+        hasInput && supportsHybridIterator && isGrouped && hybridAggregateEnabled
+
+      if (useHybridIterator) {
+        UnsafeHybridAggregationIterator.createFromInputIterator(
+          groupingExpressions,
+          nonCompleteAggregateExpressions,
+          nonCompleteAggregateAttributes,
+          completeAggregateExpressions,
+          completeAggregateAttributes,
+          initialInputBufferOffset,
+          resultExpressions,
+          newMutableProjection _,
+          child.output,
+          iter,
+          outputsUnsafeRows)
+      } else if (!hasInput && isGrouped) {
+        // This is a grouped aggregate over an empty input iterator: produce no rows.
+        Iterator[InternalRow]()
+      } else {
+        val sortBasedIter = SortBasedAggregationIterator.createFromInputIterator(
+          groupingExpressions,
+          nonCompleteAggregateExpressions,
+          nonCompleteAggregateAttributes,
+          completeAggregateExpressions,
+          completeAggregateAttributes,
+          initialInputBufferOffset,
+          resultExpressions,
+          newMutableProjection _,
+          newProjection _,
+          child.output,
+          iter,
+          outputsUnsafeRows)
+        if (hasInput) {
+          sortBasedIter
+        } else {
+          // No input here implies no grouping expressions (the grouped case returned
+          // above), so a single row has to be output for the empty input.
+          Iterator[InternalRow](sortBasedIter.outputForEmptyGroupingKeyWithoutInput())
+        }
+      }
+    }
+  }
+
+  /** Plan description; reports the hybrid iterator only if it is supported AND enabled. */
+  override def simpleString: String = {
+    val iterator = if (groupingExpressions.nonEmpty && !requiresSortedInput) {
+      classOf[UnsafeHybridAggregationIterator].getSimpleName
+    } else {
+      classOf[SortBasedAggregationIterator].getSimpleName
+    }
+    s"""NewAggregate with $iterator ${groupingExpressions} ${allAggregateExpressions}"""
+  }
+}
-Original file line number
+Diff line change
@@ Expand Up @@
       }
       override def keyExpressions: Seq[Expression] = expressions
+      override def toString: String = s"${super.toString} numPartitions=$numPartitions"
     }
     /**
@@ Expand Down @@