Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
-- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/select_implicit.sql
--
-- This test file was converted from pgSQL/select_implicit.sql
-- [SPARK-28445] Inconsistency between Scala and Python/Panda udfs when groupby with udf() is used
-- TODO: We should add UDFs in GROUP BY clause when [SPARK-28445] is resolved.

-- load test data
CREATE TABLE test_missing_target (a int, b int, c string, d string) using parquet;
Expand All @@ -29,29 +27,29 @@ INSERT INTO test_missing_target VALUES (9, 4, 'CCCC', 'j');

-- w/ existing GROUP BY target
SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY
test_missing_target.c
udf(test_missing_target.c)
ORDER BY udf(c);

-- w/o existing GROUP BY target using a relation name in GROUP BY clause
SELECT udf(count(*)) FROM test_missing_target GROUP BY test_missing_target.c
SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(test_missing_target.c)
ORDER BY udf(c);

-- w/o existing GROUP BY target and w/ a different ORDER BY target that is also not in the select list
-- failure expected
SELECT udf(count(*)) FROM test_missing_target GROUP BY a ORDER BY udf(b);
SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(a) ORDER BY udf(b);

-- w/o existing GROUP BY target and w/o existing same ORDER BY target
SELECT udf(count(*)) FROM test_missing_target GROUP BY b ORDER BY udf(b);
SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b);

-- w/ existing GROUP BY target using a relation name in target
SELECT udf(test_missing_target.b), udf(count(*))
FROM test_missing_target GROUP BY b ORDER BY udf(b);
FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b);

-- w/o GROUP BY, ordering by a target that is not in the select list
SELECT udf(c) FROM test_missing_target ORDER BY udf(a);

-- w/o existing ORDER BY target
SELECT udf(count(*)) FROM test_missing_target GROUP BY b ORDER BY udf(b) desc;
SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b) desc;

-- order using reference number (no GROUP BY; the ordinal is wrapped in udf() here)
SELECT udf(count(*)) FROM test_missing_target ORDER BY udf(1) desc;
Expand All @@ -67,7 +65,7 @@ SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY 3;
-- failure expected
SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y
WHERE udf(x.a) = udf(y.a)
GROUP BY b ORDER BY udf(b);
GROUP BY udf(b) ORDER BY udf(b);

-- order w/ target under ambiguous condition
-- failure NOT expected
Expand All @@ -82,17 +80,17 @@ SELECT udf(udf(a)/2), udf(udf(a)/2) FROM test_missing_target
-- group expression w/ target under ambiguous condition
-- failure NOT expected
SELECT udf(a/2), udf(a/2) FROM test_missing_target
GROUP BY a/2 ORDER BY udf(a/2);
GROUP BY udf(a/2) ORDER BY udf(a/2);

-- group w/ existing GROUP BY target under ambiguous condition
SELECT udf(x.b), udf(count(*)) FROM test_missing_target x, test_missing_target y
WHERE udf(x.a) = udf(y.a)
GROUP BY x.b ORDER BY udf(x.b);
GROUP BY udf(x.b) ORDER BY udf(x.b);

-- group w/o existing GROUP BY target under ambiguous condition
SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y
WHERE udf(x.a) = udf(y.a)
GROUP BY x.b ORDER BY udf(x.b);
GROUP BY udf(x.b) ORDER BY udf(x.b);

-- [SPARK-28329] SELECT INTO syntax
-- group w/o existing GROUP BY target under ambiguous condition
Expand All @@ -107,50 +105,50 @@ SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y
-- Functions and expressions

-- w/ existing GROUP BY target
SELECT a%2, udf(count(udf(b))) FROM test_missing_target
GROUP BY test_missing_target.a%2
SELECT udf(a%2), udf(count(udf(b))) FROM test_missing_target
GROUP BY udf(test_missing_target.a%2)
ORDER BY udf(test_missing_target.a%2);

-- w/o existing GROUP BY target using a relation name in GROUP BY clause
SELECT udf(count(c)) FROM test_missing_target
GROUP BY lower(test_missing_target.c)
GROUP BY udf(lower(test_missing_target.c))
ORDER BY udf(lower(test_missing_target.c));

-- w/o existing GROUP BY target and w/ a different ORDER BY target that is also not in the select list
-- failure expected
SELECT udf(count(udf(a))) FROM test_missing_target GROUP BY a ORDER BY udf(b);
SELECT udf(count(udf(a))) FROM test_missing_target GROUP BY udf(a) ORDER BY udf(b);

-- w/o existing GROUP BY target and w/o existing same ORDER BY target
SELECT udf(count(b)) FROM test_missing_target GROUP BY b/2 ORDER BY udf(b/2);
SELECT udf(count(b)) FROM test_missing_target GROUP BY udf(b/2) ORDER BY udf(b/2);

-- w/ existing GROUP BY target using a relation name in target
SELECT udf(lower(test_missing_target.c)), udf(count(udf(c)))
FROM test_missing_target GROUP BY lower(c) ORDER BY udf(lower(c));
FROM test_missing_target GROUP BY udf(lower(c)) ORDER BY udf(lower(c));

-- w/o GROUP BY, ordering by an expression that is not in the select list
SELECT udf(a) FROM test_missing_target ORDER BY udf(upper(udf(d)));

-- w/o existing ORDER BY target
SELECT udf(count(b)) FROM test_missing_target
GROUP BY (b + 1) / 2 ORDER BY udf((b + 1) / 2) desc;
GROUP BY udf((b + 1) / 2) ORDER BY udf((b + 1) / 2) desc;

-- group w/o existing GROUP BY and ORDER BY target under ambiguous condition
-- failure expected
SELECT udf(count(udf(x.a))) FROM test_missing_target x, test_missing_target y
WHERE udf(x.a) = udf(y.a)
GROUP BY b/2 ORDER BY udf(b/2);
GROUP BY udf(b/2) ORDER BY udf(b/2);

-- group w/ existing GROUP BY target under ambiguous condition
SELECT udf(x.b/2), udf(count(udf(x.b))) FROM test_missing_target x,
test_missing_target y
WHERE udf(x.a) = udf(y.a)
GROUP BY x.b/2 ORDER BY udf(x.b/2);
GROUP BY udf(x.b/2) ORDER BY udf(x.b/2);

-- group w/o existing GROUP BY target under ambiguous condition
-- failure expected due to ambiguous b in count(b)
SELECT udf(count(udf(b))) FROM test_missing_target x, test_missing_target y
WHERE udf(x.a) = udf(y.a)
GROUP BY x.b/2;
GROUP BY udf(x.b/2);

-- [SPARK-28329] SELECT INTO syntax
-- group w/o existing GROUP BY target under ambiguous condition
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ struct<>

-- !query 11
SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY
test_missing_target.c
udf(test_missing_target.c)
ORDER BY udf(c)
-- !query 11 schema
struct<CAST(udf(cast(c as string)) AS STRING):string,CAST(udf(cast(count(1) as string)) AS BIGINT):bigint>
Expand All @@ -106,7 +106,7 @@ cccc 2


-- !query 12
SELECT udf(count(*)) FROM test_missing_target GROUP BY test_missing_target.c
SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(test_missing_target.c)
ORDER BY udf(c)
-- !query 12 schema
struct<CAST(udf(cast(count(1) as string)) AS BIGINT):bigint>
Expand All @@ -120,16 +120,16 @@ struct<CAST(udf(cast(count(1) as string)) AS BIGINT):bigint>


-- !query 13
SELECT udf(count(*)) FROM test_missing_target GROUP BY a ORDER BY udf(b)
SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(a) ORDER BY udf(b)
-- !query 13 schema
struct<>
-- !query 13 output
org.apache.spark.sql.AnalysisException
cannot resolve '`b`' given input columns: [CAST(udf(cast(count(1) as string)) AS BIGINT)]; line 1 pos 70
cannot resolve '`b`' given input columns: [CAST(udf(cast(count(1) as string)) AS BIGINT)]; line 1 pos 75


-- !query 14
SELECT udf(count(*)) FROM test_missing_target GROUP BY b ORDER BY udf(b)
SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b)
-- !query 14 schema
struct<CAST(udf(cast(count(1) as string)) AS BIGINT):bigint>
-- !query 14 output
Expand All @@ -141,7 +141,7 @@ struct<CAST(udf(cast(count(1) as string)) AS BIGINT):bigint>

-- !query 15
SELECT udf(test_missing_target.b), udf(count(*))
FROM test_missing_target GROUP BY b ORDER BY udf(b)
FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b)
-- !query 15 schema
struct<CAST(udf(cast(b as string)) AS INT):int,CAST(udf(cast(count(1) as string)) AS BIGINT):bigint>
-- !query 15 output
Expand Down Expand Up @@ -169,7 +169,7 @@ CCCC


-- !query 17
SELECT udf(count(*)) FROM test_missing_target GROUP BY b ORDER BY udf(b) desc
SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b) desc
-- !query 17 schema
struct<CAST(udf(cast(count(1) as string)) AS BIGINT):bigint>
-- !query 17 output
Expand Down Expand Up @@ -212,12 +212,12 @@ GROUP BY position 3 is not in select list (valid range is [1, 2]); line 1 pos 63
-- !query 21
SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y
WHERE udf(x.a) = udf(y.a)
GROUP BY b ORDER BY udf(b)
GROUP BY udf(b) ORDER BY udf(b)
-- !query 21 schema
struct<>
-- !query 21 output
org.apache.spark.sql.AnalysisException
Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 10
Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 14


-- !query 22
Expand Down Expand Up @@ -258,7 +258,7 @@ struct<CAST(udf(cast((cast(udf(cast(a as string)) as int) div 2) as string)) AS

-- !query 24
SELECT udf(a/2), udf(a/2) FROM test_missing_target
GROUP BY a/2 ORDER BY udf(a/2)
GROUP BY udf(a/2) ORDER BY udf(a/2)
-- !query 24 schema
struct<CAST(udf(cast((a div 2) as string)) AS INT):int,CAST(udf(cast((a div 2) as string)) AS INT):int>
-- !query 24 output
Expand All @@ -272,7 +272,7 @@ struct<CAST(udf(cast((a div 2) as string)) AS INT):int,CAST(udf(cast((a div 2) a
-- !query 25
SELECT udf(x.b), udf(count(*)) FROM test_missing_target x, test_missing_target y
WHERE udf(x.a) = udf(y.a)
GROUP BY x.b ORDER BY udf(x.b)
GROUP BY udf(x.b) ORDER BY udf(x.b)
-- !query 25 schema
struct<CAST(udf(cast(b as string)) AS INT):int,CAST(udf(cast(count(1) as string)) AS BIGINT):bigint>
-- !query 25 output
Expand All @@ -285,7 +285,7 @@ struct<CAST(udf(cast(b as string)) AS INT):int,CAST(udf(cast(count(1) as string)
-- !query 26
SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y
WHERE udf(x.a) = udf(y.a)
GROUP BY x.b ORDER BY udf(x.b)
GROUP BY udf(x.b) ORDER BY udf(x.b)
-- !query 26 schema
struct<CAST(udf(cast(count(1) as string)) AS BIGINT):bigint>
-- !query 26 output
Expand All @@ -296,19 +296,19 @@ struct<CAST(udf(cast(count(1) as string)) AS BIGINT):bigint>


-- !query 27
SELECT a%2, udf(count(udf(b))) FROM test_missing_target
GROUP BY test_missing_target.a%2
SELECT udf(a%2), udf(count(udf(b))) FROM test_missing_target
GROUP BY udf(test_missing_target.a%2)
ORDER BY udf(test_missing_target.a%2)
-- !query 27 schema
struct<(a % 2):int,CAST(udf(cast(count(cast(udf(cast(b as string)) as int)) as string)) AS BIGINT):bigint>
struct<CAST(udf(cast((a % 2) as string)) AS INT):int,CAST(udf(cast(count(cast(udf(cast(b as string)) as int)) as string)) AS BIGINT):bigint>
-- !query 27 output
0 5
1 5


-- !query 28
SELECT udf(count(c)) FROM test_missing_target
GROUP BY lower(test_missing_target.c)
GROUP BY udf(lower(test_missing_target.c))
ORDER BY udf(lower(test_missing_target.c))
-- !query 28 schema
struct<CAST(udf(cast(count(c) as string)) AS BIGINT):bigint>
Expand All @@ -320,16 +320,16 @@ struct<CAST(udf(cast(count(c) as string)) AS BIGINT):bigint>


-- !query 29
SELECT udf(count(udf(a))) FROM test_missing_target GROUP BY a ORDER BY udf(b)
SELECT udf(count(udf(a))) FROM test_missing_target GROUP BY udf(a) ORDER BY udf(b)
-- !query 29 schema
struct<>
-- !query 29 output
org.apache.spark.sql.AnalysisException
cannot resolve '`b`' given input columns: [CAST(udf(cast(count(cast(udf(cast(a as string)) as int)) as string)) AS BIGINT)]; line 1 pos 75
cannot resolve '`b`' given input columns: [CAST(udf(cast(count(cast(udf(cast(a as string)) as int)) as string)) AS BIGINT)]; line 1 pos 80


-- !query 30
SELECT udf(count(b)) FROM test_missing_target GROUP BY b/2 ORDER BY udf(b/2)
SELECT udf(count(b)) FROM test_missing_target GROUP BY udf(b/2) ORDER BY udf(b/2)
-- !query 30 schema
struct<CAST(udf(cast(count(b) as string)) AS BIGINT):bigint>
-- !query 30 output
Expand All @@ -340,7 +340,7 @@ struct<CAST(udf(cast(count(b) as string)) AS BIGINT):bigint>

-- !query 31
SELECT udf(lower(test_missing_target.c)), udf(count(udf(c)))
FROM test_missing_target GROUP BY lower(c) ORDER BY udf(lower(c))
FROM test_missing_target GROUP BY udf(lower(c)) ORDER BY udf(lower(c))
-- !query 31 schema
struct<CAST(udf(cast(lower(c) as string)) AS STRING):string,CAST(udf(cast(count(cast(udf(cast(c as string)) as string)) as string)) AS BIGINT):bigint>
-- !query 31 output
Expand Down Expand Up @@ -369,7 +369,7 @@ struct<CAST(udf(cast(a as string)) AS INT):int>

-- !query 33
SELECT udf(count(b)) FROM test_missing_target
GROUP BY (b + 1) / 2 ORDER BY udf((b + 1) / 2) desc
GROUP BY udf((b + 1) / 2) ORDER BY udf((b + 1) / 2) desc
-- !query 33 schema
struct<CAST(udf(cast(count(b) as string)) AS BIGINT):bigint>
-- !query 33 output
Expand All @@ -380,19 +380,19 @@ struct<CAST(udf(cast(count(b) as string)) AS BIGINT):bigint>
-- !query 34
SELECT udf(count(udf(x.a))) FROM test_missing_target x, test_missing_target y
WHERE udf(x.a) = udf(y.a)
GROUP BY b/2 ORDER BY udf(b/2)
GROUP BY udf(b/2) ORDER BY udf(b/2)
-- !query 34 schema
struct<>
-- !query 34 output
org.apache.spark.sql.AnalysisException
Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 10
Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 14


-- !query 35
SELECT udf(x.b/2), udf(count(udf(x.b))) FROM test_missing_target x,
test_missing_target y
WHERE udf(x.a) = udf(y.a)
GROUP BY x.b/2 ORDER BY udf(x.b/2)
GROUP BY udf(x.b/2) ORDER BY udf(x.b/2)
-- !query 35 schema
struct<CAST(udf(cast((b div 2) as string)) AS INT):int,CAST(udf(cast(count(cast(udf(cast(b as string)) as int)) as string)) AS BIGINT):bigint>
-- !query 35 output
Expand All @@ -404,7 +404,7 @@ struct<CAST(udf(cast((b div 2) as string)) AS INT):int,CAST(udf(cast(count(cast(
-- !query 36
SELECT udf(count(udf(b))) FROM test_missing_target x, test_missing_target y
WHERE udf(x.a) = udf(y.a)
GROUP BY x.b/2
GROUP BY udf(x.b/2)
-- !query 36 schema
struct<>
-- !query 36 output
Expand Down