[SPARK-20688][SQL] correctly check analysis for scalar sub-queries

cloud-fan · cloud-fan · commit 789bdbe3d0d9 · 2017-05-10T19:30:00.000+08:00
## What changes were proposed in this pull request?

In `CheckAnalysis`, we should call `checkAnalysis` for `ScalarSubquery` at the beginning, because later we call `plan.output`, which is invalid if `plan` is not resolved.

## How was this patch tested?

New regression test.

Author: Wenchen Fan <wenchen@databricks.com>

Closes #17930 from cloud-fan/tmp.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -130,12 +130,13 @@ trait CheckAnalysis extends PredicateHelper {
             }
 
           case s @ ScalarSubquery(query, conditions, _) =>
+            checkAnalysis(query)
+
             // If no correlation, the output must be exactly one column
             if (conditions.isEmpty && query.output.size != 1) {
               failAnalysis(
                 s"Scalar subquery must return only one column, but got ${query.output.size}")
-            }
-            else if (conditions.nonEmpty) {
+            } else if (conditions.nonEmpty) {
               def checkAggregate(agg: Aggregate): Unit = {
                 // Make sure correlated scalar subqueries contain one row for every outer row by
                 // enforcing that they are aggregates containing exactly one aggregate expression.
@@ -179,7 +180,6 @@ trait CheckAnalysis extends PredicateHelper {
                 case fail => failAnalysis(s"Correlated scalar subqueries must be Aggregated: $fail")
               }
             }
-            checkAnalysis(query)
             s
 
           case s: SubqueryExpression =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
@@ -72,7 +72,7 @@ class SubquerySuite extends QueryTest with SharedSQLContext {
     }
   }
 
-  test("rdd deserialization does not crash [SPARK-15791]") {
+  test("SPARK-15791: rdd deserialization does not crash") {
     sql("select (select 1 as b) as b").rdd.count()
   }
 
@@ -867,4 +867,12 @@ class SubquerySuite extends QueryTest with SharedSQLContext {
       sql("select * from l, r where l.a = r.c + 1 AND (exists (select * from r) OR l.a = r.c)"),
       Row(3, 3.0, 2, 3.0) :: Row(3, 3.0, 2, 3.0) :: Nil)
   }
+
+  test("SPARK-20688: correctly check analysis for scalar sub-queries") {
+    withTempView("t") {
+      Seq(1 -> "a").toDF("i", "j").createTempView("t")
+      val e = intercept[AnalysisException](sql("SELECT (SELECT count(*) FROM t WHERE a = 1)"))
+      assert(e.message.contains("cannot resolve '`a`' given input columns: [i, j]"))
+    }
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -130,12 +130,13 @@ trait CheckAnalysis extends PredicateHelper {`
`130`	`130`	`}`
`131`	`131`
`132`	`132`	`case s @ ScalarSubquery(query, conditions, _) =>`
	`133`	`+ checkAnalysis(query)`
	`134`	`+`
`133`	`135`	`// If no correlation, the output must be exactly one column`
`134`	`136`	`if (conditions.isEmpty && query.output.size != 1) {`
`135`	`137`	`failAnalysis(`
`136`	`138`	`s"Scalar subquery must return only one column, but got ${query.output.size}")`
`137`		`- }`
`138`		`- else if (conditions.nonEmpty) {`
	`139`	`+ } else if (conditions.nonEmpty) {`
`139`	`140`	`def checkAggregate(agg: Aggregate): Unit = {`
`140`	`141`	`// Make sure correlated scalar subqueries contain one row for every outer row by`
`141`	`142`	`// enforcing that they are aggregates containing exactly one aggregate expression.`
`@@ -179,7 +180,6 @@ trait CheckAnalysis extends PredicateHelper {`
`179`	`180`	`case fail => failAnalysis(s"Correlated scalar subqueries must be Aggregated: $fail")`
`180`	`181`	`}`
`181`	`182`	`}`
`182`		`- checkAnalysis(query)`
`183`	`183`	`s`
`184`	`184`
`185`	`185`	`case s: SubqueryExpression =>`
Original file line number	Diff line number	Diff line change
`@@ -72,7 +72,7 @@ class SubquerySuite extends QueryTest with SharedSQLContext {`
`72`	`72`	`}`
`73`	`73`	`}`
`74`	`74`
`75`		`- test("rdd deserialization does not crash [SPARK-15791]") {`
	`75`	`+ test("SPARK-15791: rdd deserialization does not crash") {`
`76`	`76`	`sql("select (select 1 as b) as b").rdd.count()`
`77`	`77`	`}`
`78`	`78`
`@@ -867,4 +867,12 @@ class SubquerySuite extends QueryTest with SharedSQLContext {`
`867`	`867`	`sql("select * from l, r where l.a = r.c + 1 AND (exists (select * from r) OR l.a = r.c)"),`
`868`	`868`	`Row(3, 3.0, 2, 3.0) :: Row(3, 3.0, 2, 3.0) :: Nil)`
`869`	`869`	`}`
	`870`	`+`
	`871`	`+ test("SPARK-20688: correctly check analysis for scalar sub-queries") {`
	`872`	`+ withTempView("t") {`
	`873`	`+ Seq(1 -> "a").toDF("i", "j").createTempView("t")`
	`874`	`+ val e = intercept[AnalysisException](sql("SELECT (SELECT count(*) FROM t WHERE a = 1)"))`
	`875`	`` + assert(e.message.contains("cannot resolve '`a`' given input columns: [i, j]")) ``
	`876`	`+ }`
	`877`	`+ }`
`870`	`878`	`}`