diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-group-by.sql index 393de498edfa6..0cc57c97b0202 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-group-by.sql @@ -20,29 +20,25 @@ SELECT 'foo', COUNT(udf(a)) FROM testData GROUP BY 1; SELECT 'foo' FROM testData WHERE a = 0 GROUP BY udf(1); -- Aggregate grouped by literals (hash aggregate). -SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 GROUP BY 1; +SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1); -- Aggregate grouped by literals (sort aggregate). -SELECT 'foo', MAX(STRUCT(udf(a))) FROM testData WHERE a = 0 GROUP BY 1; +SELECT 'foo', MAX(STRUCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1); -- Aggregate with complex GroupBy expressions. SELECT udf(a + b), udf(COUNT(b)) FROM testData GROUP BY a + b; SELECT udf(a + 2), udf(COUNT(b)) FROM testData GROUP BY a + 1; - --- [SPARK-28445] Inconsistency between Scala and Python/Panda udfs when groupby with udf() is used --- The following query will make Scala UDF work, but Python and Pandas udfs will fail with an AnalysisException. --- The query should be added after SPARK-28445. --- SELECT udf(a + 1), udf(COUNT(b)) FROM testData GROUP BY udf(a + 1); +SELECT udf(a + 1) + 1, udf(COUNT(b)) FROM testData GROUP BY udf(a + 1); -- Aggregate with nulls. SELECT SKEWNESS(udf(a)), udf(KURTOSIS(a)), udf(MIN(a)), MAX(udf(a)), udf(AVG(udf(a))), udf(VARIANCE(a)), STDDEV(udf(a)), udf(SUM(a)), udf(COUNT(a)) FROM testData; -- Aggregate with foldable input and multiple distinct groups. -SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a; +SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY udf(a); -- Aliases in SELECT could be used in GROUP BY -SELECT a AS k, COUNT(udf(b)) FROM testData GROUP BY k; +SELECT udf(a) AS k, COUNT(udf(b)) FROM testData GROUP BY k; SELECT a AS k, udf(COUNT(b)) FROM testData GROUP BY k HAVING k > 1; -- Aggregate functions cannot be used in GROUP BY @@ -51,7 +47,7 @@ SELECT udf(COUNT(b)) AS k FROM testData GROUP BY k; -- Test data. CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES (1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v); -SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY a; +SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY udf(a); -- turn off group by aliases set spark.sql.groupByAliases=false; @@ -60,7 +56,7 @@ set spark.sql.groupByAliases=false; SELECT a AS k, udf(COUNT(udf(b))) FROM testData GROUP BY k; -- Aggregate with empty input and non-empty GroupBy expressions. -SELECT a, COUNT(udf(1)) FROM testData WHERE false GROUP BY a; +SELECT udf(a), COUNT(udf(1)) FROM testData WHERE false GROUP BY udf(a); -- Aggregate with empty input and empty GroupBy expressions. SELECT udf(COUNT(1)) FROM testData WHERE false; @@ -104,21 +100,21 @@ SELECT udf(every(udf(v))), some(v), any(v) FROM test_agg WHERE k = 4; SELECT every(v), udf(some(v)), any(v) FROM test_agg WHERE k = 5; -- group by -SELECT k, every(v), udf(some(v)), any(v) FROM test_agg GROUP BY k; +SELECT udf(k), every(v), udf(some(v)), any(v) FROM test_agg GROUP BY udf(k); -- having SELECT udf(k), every(v) FROM test_agg GROUP BY k HAVING every(v) = false; -SELECT k, udf(every(v)) FROM test_agg GROUP BY k HAVING every(v) IS NULL; +SELECT udf(k), udf(every(v)) FROM test_agg GROUP BY udf(k) HAVING every(v) IS NULL; -- basic subquery path to make sure rewrite happens in both parent and child plans. -SELECT k, +SELECT udf(k), udf(Every(v)) AS every FROM test_agg WHERE k = 2 AND v IN (SELECT Any(v) FROM test_agg WHERE k = 1) -GROUP BY k; +GROUP BY udf(k); -- basic subquery path to make sure rewrite happens in both parent and child plans. SELECT udf(udf(k)), @@ -128,7 +124,7 @@ WHERE k = 2 AND v IN (SELECT Every(v) FROM test_agg WHERE k = 1) -GROUP BY k; +GROUP BY udf(udf(k)); -- input type checking Int SELECT every(udf(1)); diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out index 789c91fa3870b..febe47b5ba84e 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 51 +-- Number of queries: 52 -- !query 0 @@ -77,7 +77,7 @@ struct -- !query 8 -SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 GROUP BY 1 +SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1) -- !query 8 schema struct -- !query 8 output @@ -85,7 +85,7 @@ struct> -- !query 9 output @@ -114,111 +114,122 @@ expression 'testdata.`a`' is neither present in the group by, nor is it an aggre -- !query 12 -SELECT SKEWNESS(udf(a)), udf(KURTOSIS(a)), udf(MIN(a)), MAX(udf(a)), udf(AVG(udf(a))), udf(VARIANCE(a)), STDDEV(udf(a)), udf(SUM(a)), udf(COUNT(a)) -FROM testData +SELECT udf(a + 1) + 1, udf(COUNT(b)) FROM testData GROUP BY udf(a + 1) -- !query 12 schema -struct +struct<(CAST(udf(cast((a + 1) as string)) AS INT) + 1):int,CAST(udf(cast(count(b) as string)) AS BIGINT):bigint> -- !query 12 output --0.2723801058145729 -1.5069204152249134 1 3 2.142857142857143 0.8095238095238094 0.8997354108424372 15 7 +3 2 +4 2 +5 2 +NULL 1 -- !query 13 -SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a +SELECT SKEWNESS(udf(a)), udf(KURTOSIS(a)), udf(MIN(a)), MAX(udf(a)), udf(AVG(udf(a))), udf(VARIANCE(a)), STDDEV(udf(a)), udf(SUM(a)), udf(COUNT(a)) +FROM testData -- !query 13 schema -struct +struct -- !query 13 output -1 1 +-0.2723801058145729 -1.5069204152249134 1 3 2.142857142857143 0.8095238095238094 0.8997354108424372 15 7 -- !query 14 -SELECT a AS k, COUNT(udf(b)) FROM testData GROUP BY k +SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY udf(a) -- !query 14 schema -struct +struct -- !query 14 output +1 1 + + +-- !query 15 +SELECT udf(a) AS k, COUNT(udf(b)) FROM testData GROUP BY k +-- !query 15 schema +struct +-- !query 15 output 1 2 2 2 3 2 NULL 1 --- !query 15 +-- !query 16 SELECT a AS k, udf(COUNT(b)) FROM testData GROUP BY k HAVING k > 1 --- !query 15 schema +-- !query 16 schema struct --- !query 15 output +-- !query 16 output 2 2 3 2 --- !query 16 +-- !query 17 SELECT udf(COUNT(b)) AS k FROM testData GROUP BY k --- !query 16 schema +-- !query 17 schema struct<> --- !query 16 output +-- !query 17 output org.apache.spark.sql.AnalysisException aggregate functions are not allowed in GROUP BY, but found CAST(udf(cast(count(b) as string)) AS BIGINT); --- !query 17 +-- !query 18 CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES (1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v) --- !query 17 schema +-- !query 18 schema struct<> --- !query 17 output +-- !query 18 output --- !query 18 -SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY a --- !query 18 schema +-- !query 19 +SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY udf(a) +-- !query 19 schema struct<> --- !query 18 output +-- !query 19 output org.apache.spark.sql.AnalysisException expression 'testdatahassamenamewithalias.`k`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; --- !query 19 +-- !query 20 set spark.sql.groupByAliases=false --- !query 19 schema +-- !query 20 schema struct --- !query 19 output +-- !query 20 output spark.sql.groupByAliases false --- !query 20 +-- !query 21 SELECT a AS k, udf(COUNT(udf(b))) FROM testData GROUP BY k --- !query 20 schema +-- !query 21 schema struct<> --- !query 20 output +-- !query 21 output org.apache.spark.sql.AnalysisException cannot resolve '`k`' given input columns: [testdata.a, testdata.b]; line 1 pos 57 --- !query 21 -SELECT a, COUNT(udf(1)) FROM testData WHERE false GROUP BY a --- !query 21 schema -struct --- !query 21 output +-- !query 22 +SELECT udf(a), COUNT(udf(1)) FROM testData WHERE false GROUP BY udf(a) +-- !query 22 schema +struct +-- !query 22 output --- !query 22 +-- !query 23 SELECT udf(COUNT(1)) FROM testData WHERE false --- !query 22 schema +-- !query 23 schema struct --- !query 22 output +-- !query 23 output 0 --- !query 23 +-- !query 24 SELECT 1 FROM (SELECT udf(COUNT(1)) FROM testData WHERE false) t --- !query 23 schema +-- !query 24 schema struct<1:int> --- !query 23 output +-- !query 24 output 1 --- !query 24 +-- !query 25 SELECT 1 from ( SELECT 1 AS z, udf(MIN(a.x)) @@ -226,88 +237,88 @@ SELECT 1 from ( WHERE false ) b where b.z != b.z --- !query 24 schema +-- !query 25 schema struct<1:int> --- !query 24 output +-- !query 25 output --- !query 25 +-- !query 26 SELECT corr(DISTINCT x, y), udf(corr(DISTINCT y, x)), count(*) FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y) --- !query 25 schema +-- !query 26 schema struct --- !query 25 output +-- !query 26 output 1.0 1.0 3 --- !query 26 +-- !query 27 SELECT udf(1) FROM range(10) HAVING true --- !query 26 schema +-- !query 27 schema struct --- !query 26 output +-- !query 27 output 1 --- !query 27 +-- !query 28 SELECT udf(udf(1)) FROM range(10) HAVING MAX(id) > 0 --- !query 27 schema +-- !query 28 schema struct --- !query 27 output +-- !query 28 output 1 --- !query 28 +-- !query 29 SELECT udf(id) FROM range(10) HAVING id > 0 --- !query 28 schema +-- !query 29 schema struct<> --- !query 28 output +-- !query 29 output org.apache.spark.sql.AnalysisException grouping expressions sequence is empty, and '`id`' is not an aggregate function. Wrap '()' in windowing function(s) or wrap '`id`' in first() (or first_value) if you don't care which value you get.; --- !query 29 +-- !query 30 CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES (1, true), (1, false), (2, true), (3, false), (3, null), (4, null), (4, null), (5, null), (5, true), (5, false) AS test_agg(k, v) --- !query 29 schema +-- !query 30 schema struct<> --- !query 29 output +-- !query 30 output --- !query 30 +-- !query 31 SELECT udf(every(v)), udf(some(v)), any(v) FROM test_agg WHERE 1 = 0 --- !query 30 schema +-- !query 31 schema struct --- !query 30 output +-- !query 31 output NULL NULL NULL --- !query 31 +-- !query 32 SELECT udf(every(udf(v))), some(v), any(v) FROM test_agg WHERE k = 4 --- !query 31 schema +-- !query 32 schema struct --- !query 31 output +-- !query 32 output NULL NULL NULL --- !query 32 +-- !query 33 SELECT every(v), udf(some(v)), any(v) FROM test_agg WHERE k = 5 --- !query 32 schema +-- !query 33 schema struct --- !query 32 output +-- !query 33 output false true true --- !query 33 -SELECT k, every(v), udf(some(v)), any(v) FROM test_agg GROUP BY k --- !query 33 schema -struct --- !query 33 output +-- !query 34 +SELECT udf(k), every(v), udf(some(v)), any(v) FROM test_agg GROUP BY udf(k) +-- !query 34 schema +struct +-- !query 34 output 1 false true true 2 true true true 3 false false false @@ -315,40 +326,40 @@ struct --- !query 34 output +-- !query 35 output 1 false 3 false 5 false --- !query 35 -SELECT k, udf(every(v)) FROM test_agg GROUP BY k HAVING every(v) IS NULL --- !query 35 schema -struct --- !query 35 output +-- !query 36 +SELECT udf(k), udf(every(v)) FROM test_agg GROUP BY udf(k) HAVING every(v) IS NULL +-- !query 36 schema +struct +-- !query 36 output 4 NULL --- !query 36 -SELECT k, +-- !query 37 +SELECT udf(k), udf(Every(v)) AS every FROM test_agg WHERE k = 2 AND v IN (SELECT Any(v) FROM test_agg WHERE k = 1) -GROUP BY k --- !query 36 schema -struct --- !query 36 output +GROUP BY udf(k) +-- !query 37 schema +struct +-- !query 37 output 2 true --- !query 37 +-- !query 38 SELECT udf(udf(k)), Every(v) AS every FROM test_agg @@ -356,54 +367,54 @@ WHERE k = 2 AND v IN (SELECT Every(v) FROM test_agg WHERE k = 1) -GROUP BY k --- !query 37 schema +GROUP BY udf(udf(k)) +-- !query 38 schema struct --- !query 37 output +-- !query 38 output --- !query 38 +-- !query 39 SELECT every(udf(1)) --- !query 38 schema +-- !query 39 schema struct<> --- !query 38 output +-- !query 39 output org.apache.spark.sql.AnalysisException cannot resolve 'every(CAST(udf(cast(1 as string)) AS INT))' due to data type mismatch: Input to function 'every' should have been boolean, but it's [int].; line 1 pos 7 --- !query 39 +-- !query 40 SELECT some(udf(1S)) --- !query 39 schema +-- !query 40 schema struct<> --- !query 39 output +-- !query 40 output org.apache.spark.sql.AnalysisException cannot resolve 'some(CAST(udf(cast(1 as string)) AS SMALLINT))' due to data type mismatch: Input to function 'some' should have been boolean, but it's [smallint].; line 1 pos 7 --- !query 40 +-- !query 41 SELECT any(udf(1L)) --- !query 40 schema +-- !query 41 schema struct<> --- !query 40 output +-- !query 41 output org.apache.spark.sql.AnalysisException cannot resolve 'any(CAST(udf(cast(1 as string)) AS BIGINT))' due to data type mismatch: Input to function 'any' should have been boolean, but it's [bigint].; line 1 pos 7 --- !query 41 +-- !query 42 SELECT udf(every("true")) --- !query 41 schema +-- !query 42 schema struct<> --- !query 41 output +-- !query 42 output org.apache.spark.sql.AnalysisException cannot resolve 'every('true')' due to data type mismatch: Input to function 'every' should have been boolean, but it's [string].; line 1 pos 11 --- !query 42 +-- !query 43 SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg --- !query 42 schema +-- !query 43 schema struct --- !query 42 output +-- !query 43 output 1 false false 1 true false 2 true true @@ -416,11 +427,11 @@ struct --- !query 43 output +-- !query 44 output 1 false false 1 true true 2 true true @@ -433,11 +444,11 @@ struct --- !query 44 output +-- !query 45 output 1 false false 1 true true 2 true true @@ -450,37 +461,37 @@ struct 1L --- !query 45 schema +-- !query 46 schema struct --- !query 45 output +-- !query 46 output 10 --- !query 46 +-- !query 47 SELECT k, udf(max(v)) FROM test_agg GROUP BY k HAVING max(v) = true --- !query 46 schema +-- !query 47 schema struct --- !query 46 output +-- !query 47 output 1 true 2 true 5 true --- !query 47 +-- !query 48 SELECT * FROM (SELECT udf(COUNT(*)) AS cnt FROM test_agg) WHERE cnt > 1L --- !query 47 schema +-- !query 48 schema struct --- !query 47 output +-- !query 48 output 10 --- !query 48 +-- !query 49 SELECT udf(count(*)) FROM test_agg WHERE count(*) > 1L --- !query 48 schema +-- !query 49 schema struct<> --- !query 48 output +-- !query 49 output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. @@ -488,11 +499,11 @@ Expression in where clause: [(count(1) > 1L)] Invalid expressions: [count(1)]; --- !query 49 +-- !query 50 SELECT udf(count(*)) FROM test_agg WHERE count(*) + 1L > 1L --- !query 49 schema +-- !query 50 schema struct<> --- !query 49 output +-- !query 50 output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. @@ -500,11 +511,11 @@ Expression in where clause: [((count(1) + 1L) > 1L)] Invalid expressions: [count(1)]; --- !query 50 +-- !query 51 SELECT udf(count(*)) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1 --- !query 50 schema +-- !query 51 schema struct<> --- !query 50 output +-- !query 51 output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query.