diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_implicit.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_implicit.sql index 80edf12ef8ac..373896ccd167 100755 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_implicit.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_implicit.sql @@ -10,8 +10,6 @@ -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/select_implicit.sql -- -- This test file was converted from pgSQL/select_implicit.sql --- [SPARK-28445] Inconsistency between Scala and Python/Panda udfs when groupby with udf() is used --- TODO: We should add UDFs in GROUP BY clause when [SPARK-28445] is resolved. -- load test data CREATE TABLE test_missing_target (a int, b int, c string, d string) using parquet; @@ -29,29 +27,29 @@ INSERT INTO test_missing_target VALUES (9, 4, 'CCCC', 'j'); -- w/ existing GROUP BY target SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY -test_missing_target.c +udf(test_missing_target.c) ORDER BY udf(c); -- w/o existing GROUP BY target using a relation name in GROUP BY clause -SELECT udf(count(*)) FROM test_missing_target GROUP BY test_missing_target.c +SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(test_missing_target.c) ORDER BY udf(c); -- w/o existing GROUP BY target and w/o existing a different ORDER BY target -- failure expected -SELECT udf(count(*)) FROM test_missing_target GROUP BY a ORDER BY udf(b); +SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(a) ORDER BY udf(b); -- w/o existing GROUP BY target and w/o existing same ORDER BY target -SELECT udf(count(*)) FROM test_missing_target GROUP BY b ORDER BY udf(b); +SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b); -- w/ existing GROUP BY target using a relation name in target SELECT udf(test_missing_target.b), udf(count(*)) - FROM test_missing_target GROUP BY b ORDER BY udf(b); + FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b); -- w/o existing GROUP BY target SELECT udf(c) FROM test_missing_target ORDER BY udf(a); -- w/o existing ORDER BY target -SELECT udf(count(*)) FROM test_missing_target GROUP BY b ORDER BY udf(b) desc; +SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b) desc; -- group using reference number SELECT udf(count(*)) FROM test_missing_target ORDER BY udf(1) desc; @@ -67,7 +65,7 @@ SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY 3; -- failure expected SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) - GROUP BY b ORDER BY udf(b); + GROUP BY udf(b) ORDER BY udf(b); -- order w/ target under ambiguous condition -- failure NOT expected @@ -82,17 +80,17 @@ SELECT udf(udf(a)/2), udf(udf(a)/2) FROM test_missing_target -- group expression w/ target under ambiguous condition -- failure NOT expected SELECT udf(a/2), udf(a/2) FROM test_missing_target - GROUP BY a/2 ORDER BY udf(a/2); + GROUP BY udf(a/2) ORDER BY udf(a/2); -- group w/ existing GROUP BY target under ambiguous condition SELECT udf(x.b), udf(count(*)) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) - GROUP BY x.b ORDER BY udf(x.b); + GROUP BY udf(x.b) ORDER BY udf(x.b); -- group w/o existing GROUP BY target under ambiguous condition SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) - GROUP BY x.b ORDER BY udf(x.b); + GROUP BY udf(x.b) ORDER BY udf(x.b); -- [SPARK-28329] SELECT INTO syntax -- group w/o existing GROUP BY target under ambiguous condition @@ -107,50 +105,50 @@ SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y -- Functions and expressions -- w/ existing GROUP BY target -SELECT a%2, udf(count(udf(b))) FROM test_missing_target -GROUP BY test_missing_target.a%2 +SELECT udf(a%2), udf(count(udf(b))) FROM test_missing_target +GROUP BY udf(test_missing_target.a%2) ORDER BY udf(test_missing_target.a%2); -- w/o existing GROUP BY target using a relation name in GROUP BY clause SELECT udf(count(c)) FROM test_missing_target -GROUP BY lower(test_missing_target.c) +GROUP BY udf(lower(test_missing_target.c)) ORDER BY udf(lower(test_missing_target.c)); -- w/o existing GROUP BY target and w/o existing a different ORDER BY target -- failure expected -SELECT udf(count(udf(a))) FROM test_missing_target GROUP BY a ORDER BY udf(b); +SELECT udf(count(udf(a))) FROM test_missing_target GROUP BY udf(a) ORDER BY udf(b); -- w/o existing GROUP BY target and w/o existing same ORDER BY target -SELECT udf(count(b)) FROM test_missing_target GROUP BY b/2 ORDER BY udf(b/2); +SELECT udf(count(b)) FROM test_missing_target GROUP BY udf(b/2) ORDER BY udf(b/2); -- w/ existing GROUP BY target using a relation name in target SELECT udf(lower(test_missing_target.c)), udf(count(udf(c))) - FROM test_missing_target GROUP BY lower(c) ORDER BY udf(lower(c)); + FROM test_missing_target GROUP BY udf(lower(c)) ORDER BY udf(lower(c)); -- w/o existing GROUP BY target SELECT udf(a) FROM test_missing_target ORDER BY udf(upper(udf(d))); -- w/o existing ORDER BY target SELECT udf(count(b)) FROM test_missing_target - GROUP BY (b + 1) / 2 ORDER BY udf((b + 1) / 2) desc; + GROUP BY udf((b + 1) / 2) ORDER BY udf((b + 1) / 2) desc; -- group w/o existing GROUP BY and ORDER BY target under ambiguous condition -- failure expected SELECT udf(count(udf(x.a))) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) - GROUP BY b/2 ORDER BY udf(b/2); + GROUP BY udf(b/2) ORDER BY udf(b/2); -- group w/ existing GROUP BY target under ambiguous condition SELECT udf(x.b/2), udf(count(udf(x.b))) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) - GROUP BY x.b/2 ORDER BY udf(x.b/2); + GROUP BY udf(x.b/2) ORDER BY udf(x.b/2); -- group w/o existing GROUP BY target under ambiguous condition -- failure expected due to ambiguous b in count(b) SELECT udf(count(udf(b))) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) - GROUP BY x.b/2; + GROUP BY udf(x.b/2); -- [SPARK-28329] SELECT INTO syntax -- group w/o existing GROUP BY target under ambiguous condition diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_implicit.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_implicit.sql.out index e6a5995d24d2..a60cbf33b9b2 100755 --- a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_implicit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_implicit.sql.out @@ -92,7 +92,7 @@ struct<> -- !query 11 SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY -test_missing_target.c +udf(test_missing_target.c) ORDER BY udf(c) -- !query 11 schema struct @@ -106,7 +106,7 @@ cccc 2 -- !query 12 -SELECT udf(count(*)) FROM test_missing_target GROUP BY test_missing_target.c +SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(test_missing_target.c) ORDER BY udf(c) -- !query 12 schema struct @@ -120,16 +120,16 @@ struct -- !query 13 -SELECT udf(count(*)) FROM test_missing_target GROUP BY a ORDER BY udf(b) +SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(a) ORDER BY udf(b) -- !query 13 schema struct<> -- !query 13 output org.apache.spark.sql.AnalysisException -cannot resolve '`b`' given input columns: [CAST(udf(cast(count(1) as string)) AS BIGINT)]; line 1 pos 70 +cannot resolve '`b`' given input columns: [CAST(udf(cast(count(1) as string)) AS BIGINT)]; line 1 pos 75 -- !query 14 -SELECT udf(count(*)) FROM test_missing_target GROUP BY b ORDER BY udf(b) +SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b) -- !query 14 schema struct -- !query 14 output @@ -141,7 +141,7 @@ struct -- !query 15 SELECT udf(test_missing_target.b), udf(count(*)) - FROM test_missing_target GROUP BY b ORDER BY udf(b) + FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b) -- !query 15 schema struct -- !query 15 output @@ -169,7 +169,7 @@ CCCC -- !query 17 -SELECT udf(count(*)) FROM test_missing_target GROUP BY b ORDER BY udf(b) desc +SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b) desc -- !query 17 schema struct -- !query 17 output @@ -212,12 +212,12 @@ GROUP BY position 3 is not in select list (valid range is [1, 2]); line 1 pos 63 -- !query 21 SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) - GROUP BY b ORDER BY udf(b) + GROUP BY udf(b) ORDER BY udf(b) -- !query 21 schema struct<> -- !query 21 output org.apache.spark.sql.AnalysisException -Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 10 +Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 14 -- !query 22 @@ -258,7 +258,7 @@ struct -- !query 24 output @@ -272,7 +272,7 @@ struct -- !query 25 output @@ -285,7 +285,7 @@ struct -- !query 26 output @@ -296,11 +296,11 @@ struct -- !query 27 -SELECT a%2, udf(count(udf(b))) FROM test_missing_target -GROUP BY test_missing_target.a%2 +SELECT udf(a%2), udf(count(udf(b))) FROM test_missing_target +GROUP BY udf(test_missing_target.a%2) ORDER BY udf(test_missing_target.a%2) -- !query 27 schema -struct<(a % 2):int,CAST(udf(cast(count(cast(udf(cast(b as string)) as int)) as string)) AS BIGINT):bigint> +struct -- !query 27 output 0 5 1 5 @@ -308,7 +308,7 @@ struct<(a % 2):int,CAST(udf(cast(count(cast(udf(cast(b as string)) as int)) as s -- !query 28 SELECT udf(count(c)) FROM test_missing_target -GROUP BY lower(test_missing_target.c) +GROUP BY udf(lower(test_missing_target.c)) ORDER BY udf(lower(test_missing_target.c)) -- !query 28 schema struct @@ -320,16 +320,16 @@ struct -- !query 29 -SELECT udf(count(udf(a))) FROM test_missing_target GROUP BY a ORDER BY udf(b) +SELECT udf(count(udf(a))) FROM test_missing_target GROUP BY udf(a) ORDER BY udf(b) -- !query 29 schema struct<> -- !query 29 output org.apache.spark.sql.AnalysisException -cannot resolve '`b`' given input columns: [CAST(udf(cast(count(cast(udf(cast(a as string)) as int)) as string)) AS BIGINT)]; line 1 pos 75 +cannot resolve '`b`' given input columns: [CAST(udf(cast(count(cast(udf(cast(a as string)) as int)) as string)) AS BIGINT)]; line 1 pos 80 -- !query 30 -SELECT udf(count(b)) FROM test_missing_target GROUP BY b/2 ORDER BY udf(b/2) +SELECT udf(count(b)) FROM test_missing_target GROUP BY udf(b/2) ORDER BY udf(b/2) -- !query 30 schema struct -- !query 30 output @@ -340,7 +340,7 @@ struct -- !query 31 SELECT udf(lower(test_missing_target.c)), udf(count(udf(c))) - FROM test_missing_target GROUP BY lower(c) ORDER BY udf(lower(c)) + FROM test_missing_target GROUP BY udf(lower(c)) ORDER BY udf(lower(c)) -- !query 31 schema struct -- !query 31 output @@ -369,7 +369,7 @@ struct -- !query 33 SELECT udf(count(b)) FROM test_missing_target - GROUP BY (b + 1) / 2 ORDER BY udf((b + 1) / 2) desc + GROUP BY udf((b + 1) / 2) ORDER BY udf((b + 1) / 2) desc -- !query 33 schema struct -- !query 33 output @@ -380,19 +380,19 @@ struct -- !query 34 SELECT udf(count(udf(x.a))) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) - GROUP BY b/2 ORDER BY udf(b/2) + GROUP BY udf(b/2) ORDER BY udf(b/2) -- !query 34 schema struct<> -- !query 34 output org.apache.spark.sql.AnalysisException -Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 10 +Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 14 -- !query 35 SELECT udf(x.b/2), udf(count(udf(x.b))) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) - GROUP BY x.b/2 ORDER BY udf(x.b/2) + GROUP BY udf(x.b/2) ORDER BY udf(x.b/2) -- !query 35 schema struct -- !query 35 output @@ -404,7 +404,7 @@ struct -- !query 36 output