Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import scala.collection.mutable.ArrayBuffer
import org.apache.spark.SparkException
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.SubExprUtils._
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.trees.TreePattern.OUTER_REFERENCE
Expand Down Expand Up @@ -462,22 +461,6 @@ object DecorrelateInnerQuery extends PredicateHelper {
p.mapChildren(rewriteDomainJoins(outerPlan, _, conditions))
}

private def isCountBugFree(aggregateExpressions: Seq[NamedExpression]): Boolean = {
  // Returns true iff every aggregate-list expression is guaranteed to evaluate to NULL on an
  // empty input, i.e. the subquery cannot exhibit the COUNT bug.
  // The COUNT bug only appears if an aggregate expression returns a non-NULL result on an empty
  // input.
  // Typical example (hence the name) is COUNT(*) that returns 0 from an empty result.
  // However, SUM(x) IS NULL is another case that returns 0, and in general any
  // IS NULL / IS NOT NULL and CASE expressions are suspect (and the combination of those).
  // For now we conservatively accept only those expressions that are guaranteed to be safe.
  aggregateExpressions.forall {
    // Bare column references and their aliases pass NULL through unchanged.
    case _: AttributeReference => true
    case Alias(_: AttributeReference, _) => true
    case Alias(_: Literal, _) => true
    // An aggregate whose function has no default result presumably yields NULL on empty input
    // (e.g. MAX/MIN, unlike COUNT) — TODO confirm against AggregateFunction.defaultResult docs.
    // Use Option.isEmpty rather than comparing an Option with `== None`.
    case Alias(a: AggregateExpression, _) => a.aggregateFunction.defaultResult.isEmpty
    // Anything else (CASE, IS NULL, arithmetic over aggregates, ...) is conservatively unsafe.
    case _ => false
  }
}

def apply(
innerPlan: LogicalPlan,
outerPlan: LogicalPlan,
Expand Down Expand Up @@ -727,8 +710,6 @@ object DecorrelateInnerQuery extends PredicateHelper {
case a @ Aggregate(groupingExpressions, aggregateExpressions, child) =>
val outerReferences = collectOuterReferences(a.expressions)
val newOuterReferences = parentOuterReferences ++ outerReferences
val countBugSusceptible = groupingExpressions.isEmpty &&
!isCountBugFree(aggregateExpressions)
val (newChild, joinCond, outerReferenceMap) =
decorrelate(child, newOuterReferences, aggregated = true, underSetOp)
// Replace all outer references in grouping and aggregate expressions, and keep
Expand Down Expand Up @@ -791,8 +772,7 @@ object DecorrelateInnerQuery extends PredicateHelper {
// | 0 | 2 | true | 2 |
// | 0 | null | null | 0 | <--- correct result
// +---+------+------------+--------------------------------+
// TODO(a.gubichev): retire the 'handleCountBug' parameter.
if (countBugSusceptible && handleCountBug) {
if (groupingExpressions.isEmpty && handleCountBug) {
// Evaluate the aggregate expressions with zero tuples.
val resultMap = RewriteCorrelatedScalarSubquery.evalAggregateOnZeroTups(newAggregate)
val alwaysTrue = Alias(Literal.TrueLiteral, "alwaysTrue")()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -345,3 +345,32 @@ Project [emp_name#x, bonus_amt#x]
+- Project [emp_name#x, bonus_amt#x]
+- SubqueryAlias BONUS
+- LocalRelation [emp_name#x, bonus_amt#x]


-- !query
SELECT tt1.emp_name
FROM EMP as tt1
WHERE EXISTS (
select max(tt2.id)
from EMP as tt2
where tt1.emp_name is null
)
-- !query analysis
Project [emp_name#x]
+- Filter exists#x [emp_name#x]
: +- Aggregate [max(id#x) AS max(id)#x]
: +- Filter isnull(outer(emp_name#x))
: +- SubqueryAlias tt2
: +- SubqueryAlias emp
: +- View (`EMP`, [id#x,emp_name#x,hiredate#x,salary#x,dept_id#x])
: +- Project [cast(id#x as int) AS id#x, cast(emp_name#x as string) AS emp_name#x, cast(hiredate#x as date) AS hiredate#x, cast(salary#x as double) AS salary#x, cast(dept_id#x as int) AS dept_id#x]
: +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x]
: +- SubqueryAlias EMP
: +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x]
+- SubqueryAlias tt1
+- SubqueryAlias emp
+- View (`EMP`, [id#x,emp_name#x,hiredate#x,salary#x,dept_id#x])
+- Project [cast(id#x as int) AS id#x, cast(emp_name#x as string) AS emp_name#x, cast(hiredate#x as date) AS hiredate#x, cast(salary#x as double) AS salary#x, cast(dept_id#x as int) AS dept_id#x]
+- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x]
+- SubqueryAlias EMP
+- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x]
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,23 @@ Project [c1#x, c2#x]
+- LocalRelation [col1#x, col2#x]


-- !query
select * from t1 where exists (select count(*) from t2 where t1.c1 = 100)
-- !query analysis
Project [c1#x, c2#x]
+- Filter exists#x [c1#x]
: +- Aggregate [count(1) AS count(1)#xL]
: +- Filter (outer(c1#x) = 100)
: +- SubqueryAlias t2
: +- View (`t2`, [c1#x,c2#x])
: +- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x]
: +- LocalRelation [col1#x, col2#x]
+- SubqueryAlias t1
+- View (`t1`, [c1#x,c2#x])
+- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x]
+- LocalRelation [col1#x, col2#x]


-- !query
set spark.sql.optimizer.decorrelateExistsSubqueryLegacyIncorrectCountHandling.enabled = true
-- !query analysis
Expand Down Expand Up @@ -240,6 +257,23 @@ Project [c1#x, c2#x]
+- LocalRelation [col1#x, col2#x]


-- !query
select * from t1 where exists (select count(*) from t2 where t1.c1 = 100)
-- !query analysis
Project [c1#x, c2#x]
+- Filter exists#x [c1#x]
: +- Aggregate [count(1) AS count(1)#xL]
: +- Filter (outer(c1#x) = 100)
: +- SubqueryAlias t2
: +- View (`t2`, [c1#x,c2#x])
: +- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x]
: +- LocalRelation [col1#x, col2#x]
+- SubqueryAlias t1
+- View (`t1`, [c1#x,c2#x])
+- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x]
+- LocalRelation [col1#x, col2#x]


-- !query
set spark.sql.optimizer.decorrelateExistsSubqueryLegacyIncorrectCountHandling.enabled = false
-- !query analysis
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,12 @@ FROM BONUS
WHERE EXISTS(SELECT RANK() OVER (PARTITION BY hiredate ORDER BY salary) AS s
FROM EMP, DEPT where EMP.dept_id = DEPT.dept_id
AND DEPT.dept_name < BONUS.emp_name);

-- SPARK-46468: Aggregate always returns 1 row, so EXISTS is always true.
SELECT tt1.emp_name
FROM EMP as tt1
WHERE EXISTS (
select max(tt2.id)
from EMP as tt2
where tt1.emp_name is null
);
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ select * from t1 where
not exists(select count(*) - 1 from t2 where t2.c1 = t1.c1)) AND
exists(select count(*) from t2 where t2.c1 = t1.c2);

select * from t1 where exists (select count(*) from t2 where t1.c1 = 100);


-- With legacy behavior flag set, some answers are not correct.
set spark.sql.optimizer.decorrelateExistsSubqueryLegacyIncorrectCountHandling.enabled = true;
select * from t1 where exists (select count(*) from t2 where t2.c1 = t1.c1);
Expand All @@ -34,4 +37,6 @@ select * from t1 where
exists(select count(*) + 1 from t2 where t2.c1 = t1.c1) OR
not exists (select count(*) - 1 from t2 where t2.c1 = t1.c1);

select * from t1 where exists (select count(*) from t2 where t1.c1 = 100);

set spark.sql.optimizer.decorrelateExistsSubqueryLegacyIncorrectCountHandling.enabled = false;
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,7 @@ SELECT * FROM t1, LATERAL (SELECT SUM(cnt) FROM (SELECT COUNT(*) cnt FROM t2 WHE
struct<c1:int,c2:int,sum(cnt):bigint>
-- !query output
0 1 2
1 2 NULL


-- !query
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,3 +197,25 @@ emp 3 300.0
emp 4 100.0
emp 5 1000.0
emp 6 - no dept 500.0


-- !query
SELECT tt1.emp_name
FROM EMP as tt1
WHERE EXISTS (
select max(tt2.id)
from EMP as tt2
where tt1.emp_name is null
)
-- !query schema
struct<emp_name:string>
-- !query output
emp 1
emp 1
emp 2
emp 3
emp 4
emp 5
emp 6 - no dept
emp 7
emp 8
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,15 @@ struct<c1:int,c2:int>
1 2


-- !query
select * from t1 where exists (select count(*) from t2 where t1.c1 = 100)
-- !query schema
struct<c1:int,c2:int>
-- !query output
0 1
1 2


-- !query
set spark.sql.optimizer.decorrelateExistsSubqueryLegacyIncorrectCountHandling.enabled = true
-- !query schema
Expand Down Expand Up @@ -134,6 +143,14 @@ struct<c1:int,c2:int>
1 2


-- !query
select * from t1 where exists (select count(*) from t2 where t1.c1 = 100)
-- !query schema
struct<c1:int,c2:int>
-- !query output



-- !query
set spark.sql.optimizer.decorrelateExistsSubqueryLegacyIncorrectCountHandling.enabled = false
-- !query schema
Expand Down