From aefdf4fccf6f6ae6ac524dd00da6c6430907d2ab Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 10 Jul 2019 17:41:08 -0700 Subject: [PATCH 1/2] [SPARK-28285][SQL][PYTHON][TESTS] Convert and port 'outer-join.sql' into UDF test base --- .../sql-tests/inputs/udf/udf-outer-join.sql | 45 ++++++++++ .../results/udf/udf-outer-join.sql.out | 88 +++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql new file mode 100644 index 0000000000000..0fe27ac906970 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql @@ -0,0 +1,45 @@ +-- This test file was converted from outer-join.sql. +-- List of configuration the test suite is run against: +--SET spark.sql.autoBroadcastJoinThreshold=10485760 +--SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true +--SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false + +-- SPARK-17099: Incorrect result when HAVING clause is added to group by query +CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES +(-234), (145), (367), (975), (298) +as t1(int_col1); + +CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES +(-769, -244), (-800, -409), (940, 86), (-507, 304), (-367, 158) +as t2(int_col0, int_col1); + +SELECT + (udf(SUM(COALESCE(t1.int_col1, t2.int_col0)))), + (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2) +FROM t1 +RIGHT JOIN t2 + ON (t2.int_col0) = (t1.int_col1) +GROUP BY udf(GREATEST(COALESCE(t2.int_col1, 109), COALESCE(t1.int_col1, -449))), + COALESCE(t1.int_col1, t2.int_col0) +HAVING (udf(SUM(COALESCE(t1.int_col1, t2.int_col0)))) + > (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2); + + +-- SPARK-17120: Analyzer incorrectly optimizes plan to empty LocalRelation +CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (97) as t1(int_col1); + +CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (0) as t2(int_col1); + +-- Set the cross join enabled flag for the LEFT JOIN test since there's no join condition. +-- Ultimately the join should be optimized away. +set spark.sql.crossJoin.enabled = true; +SELECT * +FROM ( +SELECT + udf(COALESCE(t2.int_col1, udf(t1.int_col1))) AS int_col + FROM t1 + LEFT JOIN t2 ON false +) t where (udf(t.int_col)) is not null; +set spark.sql.crossJoin.enabled = false; + + diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out new file mode 100644 index 0000000000000..6394dad0f4acc --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out @@ -0,0 +1,88 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 8 + + +-- !query 0 +CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES +(-234), (145), (367), (975), (298) +as t1(int_col1) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES +(-769, -244), (-800, -409), (940, 86), (-507, 304), (-367, 158) +as t2(int_col0, int_col1) +-- !query 1 schema +struct<> +-- !query 1 output + + + +-- !query 2 +SELECT + (udf(SUM(COALESCE(t1.int_col1, t2.int_col0)))), + (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2) +FROM t1 +RIGHT JOIN t2 + ON (t2.int_col0) = (t1.int_col1) +GROUP BY udf(GREATEST(COALESCE(t2.int_col1, 109), COALESCE(t1.int_col1, -449))), + COALESCE(t1.int_col1, t2.int_col0) +HAVING (udf(SUM(COALESCE(t1.int_col1, t2.int_col0)))) + > (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2) +-- !query 2 schema +struct +-- !query 2 output +-367 -734.0 +-507 -1014.0 +-769 -1538.0 +-800 -1600.0 + + +-- !query 3 +CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (97) as t1(int_col1) +-- !query 3 schema +struct<> +-- !query 3 output + + + +-- !query 4 +CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (0) as t2(int_col1) +-- !query 4 schema +struct<> +-- !query 4 output + + + +-- !query 5 +set spark.sql.crossJoin.enabled = true +-- !query 5 schema +struct +-- !query 5 output +spark.sql.crossJoin.enabled true + + +-- !query 6 +SELECT * +FROM ( +SELECT + udf(COALESCE(t2.int_col1, udf(t1.int_col1))) AS int_col + FROM t1 + LEFT JOIN t2 ON false +) t where (udf(t.int_col)) is not null +-- !query 6 schema +struct +-- !query 6 output +97 + + +-- !query 7 +set spark.sql.crossJoin.enabled = false +-- !query 7 schema +struct +-- !query 7 output +spark.sql.crossJoin.enabled false From 5955d46daddcb8b875dffbf645af1c0f5e63a986 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 17 Jul 2019 14:48:51 -0700 Subject: [PATCH 2/2] add a few more udf --- .../sql-tests/inputs/udf/udf-outer-join.sql | 10 ++++----- .../results/udf/udf-outer-join.sql.out | 22 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql index 0fe27ac906970..4eb0805c9cc67 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql @@ -14,14 +14,14 @@ CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES as t2(int_col0, int_col1); SELECT - (udf(SUM(COALESCE(t1.int_col1, t2.int_col0)))), + (udf(SUM(udf(COALESCE(t1.int_col1, t2.int_col0))))), (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2) FROM t1 RIGHT JOIN t2 - ON (t2.int_col0) = (t1.int_col1) -GROUP BY udf(GREATEST(COALESCE(t2.int_col1, 109), COALESCE(t1.int_col1, -449))), + ON udf(t2.int_col0) = udf(t1.int_col1) +GROUP BY udf(GREATEST(COALESCE(udf(t2.int_col1), 109), COALESCE(t1.int_col1, udf(-449)))), COALESCE(t1.int_col1, t2.int_col0) -HAVING (udf(SUM(COALESCE(t1.int_col1, t2.int_col0)))) +HAVING (udf(SUM(COALESCE(udf(t1.int_col1), udf(t2.int_col0))))) > (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2); @@ -36,7 +36,7 @@ set spark.sql.crossJoin.enabled = true; SELECT * FROM ( SELECT - udf(COALESCE(t2.int_col1, udf(t1.int_col1))) AS int_col + udf(COALESCE(udf(t2.int_col1), udf(t1.int_col1))) AS int_col FROM t1 LEFT JOIN t2 ON false ) t where (udf(t.int_col)) is not null; diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out index 6394dad0f4acc..819f786070882 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out @@ -24,22 +24,22 @@ struct<> -- !query 2 SELECT - (udf(SUM(COALESCE(t1.int_col1, t2.int_col0)))), + (udf(SUM(udf(COALESCE(t1.int_col1, t2.int_col0))))), (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2) FROM t1 RIGHT JOIN t2 - ON (t2.int_col0) = (t1.int_col1) -GROUP BY udf(GREATEST(COALESCE(t2.int_col1, 109), COALESCE(t1.int_col1, -449))), + ON udf(t2.int_col0) = udf(t1.int_col1) +GROUP BY udf(GREATEST(COALESCE(udf(t2.int_col1), 109), COALESCE(t1.int_col1, udf(-449)))), COALESCE(t1.int_col1, t2.int_col0) -HAVING (udf(SUM(COALESCE(t1.int_col1, t2.int_col0)))) +HAVING (udf(SUM(COALESCE(udf(t1.int_col1), udf(t2.int_col0))))) > (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2) -- !query 2 schema -struct +struct -- !query 2 output --367 -734.0 --507 -1014.0 --769 -1538.0 --800 -1600.0 +-367 -734 +-507 -1014 +-769 -1538 +-800 -1600 -- !query 3 @@ -70,12 +70,12 @@ spark.sql.crossJoin.enabled true SELECT * FROM ( SELECT - udf(COALESCE(t2.int_col1, udf(t1.int_col1))) AS int_col + udf(COALESCE(udf(t2.int_col1), udf(t1.int_col1))) AS int_col FROM t1 LEFT JOIN t2 ON false ) t where (udf(t.int_col)) is not null -- !query 6 schema -struct +struct -- !query 6 output 97