From c2137d78ab474f2ccbcc944eb3654d9b1bdbdcef Mon Sep 17 00:00:00 2001
From: Henry Robinson <henry@cloudera.com>
Date: Tue, 27 Feb 2018 16:42:17 -0800
Subject: [PATCH 1/3] [SPARK-23500][SQL] Fix complex type simplification rules
 to apply to entire plan

## What changes were proposed in this pull request?

Complex type simplification optimizer rules were not applied to the
entire plan, just the expressions reachable from the root node. This
patch fixes the rules to transform the entire plan.

## How was this patch tested?

New unit test + sql / core tests.
---
 .../sql/catalyst/optimizer/ComplexTypes.scala | 50 ++++++-------------
 .../sql/catalyst/optimizer/Optimizer.scala    |  4 +-
 .../optimizer/complexTypesSuite.scala         | 42 ++++++++++++++--
 3 files changed, 53 insertions(+), 43 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
index be0009ec8c76..61946774dc42 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
@@ -22,35 +22,26 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 /**
-* push down operations into [[CreateNamedStructLike]].
-*/
-object SimplifyCreateStructOps extends Rule[LogicalPlan] {
-  override def apply(plan: LogicalPlan): LogicalPlan = {
-    plan.transformExpressionsUp {
-      // push down field extraction
+ * Simplify redundant [[CreateNamedStructLike]], [[CreateArray]] and [[CreateMap]] expressions.
+ */
+object SimplifyExtractValueOps extends Rule[LogicalPlan] {
+  override def apply(plan: LogicalPlan): LogicalPlan = plan transform { case p =>
+    p.transformExpressionsUp {
+      // Remove redundant field extraction.
       case GetStructField(createNamedStructLike: CreateNamedStructLike, ordinal, _) =>
         createNamedStructLike.valExprs(ordinal)
-    }
-  }
-}
 
-/**
-* push down operations into [[CreateArray]].
-*/
-object SimplifyCreateArrayOps extends Rule[LogicalPlan] {
-  override def apply(plan: LogicalPlan): LogicalPlan = {
-    plan.transformExpressionsUp {
-      // push down field selection (array of structs)
-      case GetArrayStructFields(CreateArray(elems), field, ordinal, numFields, containsNull) =>
-        // instead f selecting the field on the entire array,
-        // select it from each member of the array.
-        // pushing down the operation this way open other optimizations opportunities
-        // (i.e. struct(...,x,...).x)
+      // Remove redundant array indexing.
+      case GetArrayStructFields(CreateArray(elems), field, ordinal, _, _) =>
+        // Instead of selecting the field on the entire array, select it from each member
+        // of the array. Pushing down the operation this way may open other optimizations
+        // opportunities (i.e. struct(...,x,...).x)
         CreateArray(elems.map(GetStructField(_, ordinal, Some(field.name))))
-      // push down item selection.
+
+      // Remove redundant map lookup.
       case ga @ GetArrayItem(CreateArray(elems), IntegerLiteral(idx)) =>
-        // instead of creating the array and then selecting one row,
-        // remove array creation altgether.
+        // Instead of creating the array and then selecting one row, remove array creation
+        // altogether.
         if (idx >= 0 && idx < elems.size) {
           // valid index
           elems(idx)
@@ -58,18 +49,7 @@ object SimplifyCreateArrayOps extends Rule[LogicalPlan] {
           // out of bounds, mimic the runtime behavior and return null
           Literal(null, ga.dataType)
         }
-    }
-  }
-}
-
-/**
-* push down operations into [[CreateMap]].
-*/
-object SimplifyCreateMapOps extends Rule[LogicalPlan] {
-  override def apply(plan: LogicalPlan): LogicalPlan = {
-    plan.transformExpressionsUp {
       case GetMapValue(CreateMap(elems), key) => CaseKeyWhen(key, elems)
     }
   }
 }
-
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 91208479be03..2829d1d81eb1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -85,9 +85,7 @@ abstract class Optimizer(sessionCatalog: SessionCatalog)
         EliminateSerialization,
         RemoveRedundantAliases,
         RemoveRedundantProject,
-        SimplifyCreateStructOps,
-        SimplifyCreateArrayOps,
-        SimplifyCreateMapOps,
+        SimplifyExtractValueOps,
         CombineConcats) ++
         extendedOperatorOptimizationRules
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
index de544ac31478..08b0c80ef8c6 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
@@ -44,14 +44,13 @@ class ComplexTypesSuite extends PlanTest with ExpressionEvalHelper {
           BooleanSimplification,
           SimplifyConditionals,
           SimplifyBinaryComparison,
-          SimplifyCreateStructOps,
-          SimplifyCreateArrayOps,
-          SimplifyCreateMapOps) :: Nil
+          SimplifyExtractValueOps) :: Nil
   }
 
   val idAtt = ('id).long.notNull
+  val nullableIdAtt = ('nullable_id).long
 
-  lazy val relation = LocalRelation(idAtt )
+  lazy val relation = LocalRelation(idAtt, nullableIdAtt)
 
   test("explicit get from namedStruct") {
     val query = relation
@@ -321,7 +320,7 @@ class ComplexTypesSuite extends PlanTest with ExpressionEvalHelper {
       .select(
         CaseWhen(Seq(
           (EqualTo(2L, 'id), ('id + 1L)),
-          // these two are possible matches, we can't tell untill runtime
+          // these two are possible matches, we can't tell until runtime
           (EqualTo(2L, ('id + 1L)), ('id + 2L)),
           (EqualTo(2L, 'id + 2L), Literal.create(null, LongType)),
           // this is a definite match (two constants),
@@ -331,4 +330,37 @@ class ComplexTypesSuite extends PlanTest with ExpressionEvalHelper {
       .analyze
     comparePlans(Optimizer execute rel, expected)
   }
+
+  test("SPARK-23500: Simplify complex ops that aren't at the plan root") {
+    val structRel = relation
+      .select(GetStructField(CreateNamedStruct(Seq("att1", 'nullable_id)), 0, None) as "foo")
+      .groupBy($"foo")("1").analyze
+    val structExpected = relation
+      .select('nullable_id as "foo")
+      .groupBy($"foo")("1").analyze
+    comparePlans(Optimizer execute structRel, structExpected)
+
+    // These tests must use nullable attributes from the base relation for the following reason:
+    // in the 'original' plans below, the Aggregate node produced by groupBy() has a
+    // nullable AttributeReference to a1, because both array indexing and map lookup are
+    // nullable expressions. After optimization, the same attribute is now non-nullable,
+    // but the AttributeReference is not updated to reflect this. In the 'expected' plans,
+    // the grouping expressions have the same nullability as the original attribute in the
+    // relation. If that attribute is non-nullable, the tests will fail as the plans will
+    // compare differently, so for these tests we must use a nullable attribute. See
+    // SPARK-23634.
+    val arrayRel = relation
+      .select(GetArrayItem(CreateArray(Seq('nullable_id, 'nullable_id + 1L)), 0) as "a1")
+      .groupBy($"a1")("1").analyze
+    val arrayExpected = relation.select('nullable_id as "a1").groupBy($"a1")("1").analyze
+    comparePlans(Optimizer execute arrayRel, arrayExpected)
+
+    val mapRel = relation
+      .select(GetMapValue(CreateMap(Seq("id", 'nullable_id)), "id") as "m1")
+      .groupBy($"m1")("1").analyze
+    val mapExpected = relation
+      .select('nullable_id as "m1")
+      .groupBy($"m1")("1").analyze
+    comparePlans(Optimizer execute mapRel, mapExpected)
+  }
 }

From 8adaa4748b7b0d7990b45e58d82c7d9f2248923f Mon Sep 17 00:00:00 2001
From: Henry Robinson <henry@apache.org>
Date: Thu, 15 Mar 2018 15:57:52 -0700
Subject: [PATCH 2/3] Handle aggregation case

---
 .../sql/catalyst/optimizer/ComplexTypes.scala      | 14 ++++++++++++--
 .../sql/catalyst/optimizer/complexTypesSuite.scala | 10 ++++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
index 61946774dc42..a22141a3b542 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
@@ -19,14 +19,24 @@ package org.apache.spark.sql.catalyst.optimizer
 
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.Aggregate
 import org.apache.spark.sql.catalyst.rules.Rule
 
 /**
  * Simplify redundant [[CreateNamedStructLike]], [[CreateArray]] and [[CreateMap]] expressions.
  */
 object SimplifyExtractValueOps extends Rule[LogicalPlan] {
-  override def apply(plan: LogicalPlan): LogicalPlan = plan transform { case p =>
-    p.transformExpressionsUp {
+  override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    // One place where this optimization is invalid is an aggregation where the select
+    // list expression is a function of a grouping expression:
+    //
+    // SELECT struct(a,b).a FROM tbl GROUP BY struct(a,b)
+    //
+    // cannot be simplified to SELECT a FROM tbl GROUP BY struct(a,b). So just skip this
+    // optimization for Aggregates (although this misses some cases where the optimization
+    // can be made).
+    case a: Aggregate => a
+    case p => p.transformExpressionsUp {
       // Remove redundant field extraction.
       case GetStructField(createNamedStructLike: CreateNamedStructLike, ordinal, _) =>
         createNamedStructLike.valExprs(ordinal)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
index 08b0c80ef8c6..efc965e4b07a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
@@ -362,5 +362,15 @@ class ComplexTypesSuite extends PlanTest with ExpressionEvalHelper {
       .select('nullable_id as "m1")
       .groupBy($"m1")("1").analyze
     comparePlans(Optimizer execute mapRel, mapExpected)
+
+    // Make sure that aggregation exprs are correctly ignored. Maps can't be used in grouping exprs.
+    val structAggRel = relation.groupBy(
+      CreateNamedStruct(Seq("att1", 'nullable_id)))(
+      GetStructField(CreateNamedStruct(Seq("att1", 'nullable_id)), 0, None)).analyze
+    comparePlans(Optimizer execute structAggRel, structAggRel)
+
+    val arrayAggRel = relation.groupBy(
+      CreateArray(Seq('nullable_id)))(GetArrayItem(CreateArray(Seq('nullable_id)), 0)).analyze
+    comparePlans(Optimizer execute arrayAggRel, arrayAggRel)
   }
 }

From 592630148af19adbb72703dd1ff49f82c33087d2 Mon Sep 17 00:00:00 2001
From: Henry Robinson <henry@apache.org>
Date: Mon, 19 Mar 2018 16:18:55 -0700
Subject: [PATCH 3/3] Style fixes etc.

---
 .../apache/spark/sql/catalyst/optimizer/ComplexTypes.scala   | 3 +--
 .../spark/sql/catalyst/optimizer/complexTypesSuite.scala     | 5 ++++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
index a22141a3b542..db7d6d3254bd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
@@ -18,8 +18,7 @@
 package org.apache.spark.sql.catalyst.optimizer
 
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.plans.logical.Aggregate
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan}
 import org.apache.spark.sql.catalyst.rules.Rule
 
 /**
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
index efc965e4b07a..e44a6692ad8e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
@@ -362,8 +362,11 @@ class ComplexTypesSuite extends PlanTest with ExpressionEvalHelper {
       .select('nullable_id as "m1")
       .groupBy($"m1")("1").analyze
     comparePlans(Optimizer execute mapRel, mapExpected)
+  }
 
-    // Make sure that aggregation exprs are correctly ignored. Maps can't be used in grouping exprs.
+  test("SPARK-23500: Ensure that aggregation expressions are not simplified") {
+    // Make sure that aggregation exprs are correctly ignored. Maps can't be used in
+    // grouping exprs so aren't tested here.
     val structAggRel = relation.groupBy(
       CreateNamedStruct(Seq("att1", 'nullable_id)))(
       GetStructField(CreateNamedStruct(Seq("att1", 'nullable_id)), 0, None)).analyze