From 9be50d5d770556e9d98bf317fae8457f6749217e Mon Sep 17 00:00:00 2001 From: chitralverma Date: Fri, 12 Jul 2019 04:01:09 +0530 Subject: [PATCH 1/6] Port 'pivot.sql' to UDF test base --- .../sql-tests/inputs/udf/udf-pivot.sql | 306 +++++++++++ .../sql-tests/results/udf/udf-pivot.sql.out | 494 ++++++++++++++++++ 2 files changed, 800 insertions(+) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql new file mode 100644 index 000000000000..24d73fdf5dbe --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql @@ -0,0 +1,306 @@ +-- This test file was converted from pivot.sql. + +-- Note that currently registered UDF returns a string. So there are some differences, for instance +-- in string cast within UDF in Scala and Python. + +create temporary view courseSales as select * from values + ("dotNET", 2012, 10000), + ("Java", 2012, 20000), + ("dotNET", 2012, 5000), + ("dotNET", 2013, 48000), + ("Java", 2013, 30000) + as courseSales(course, year, earnings); + +create temporary view years as select * from values + (2012, 1), + (2013, 2) + as years(y, s); + +create temporary view yearsWithComplexTypes as select * from values + (2012, array(1, 1), map('1', 1), struct(1, 'a')), + (2013, array(2, 2), map('2', 2), struct(2, 'b')) + as yearsWithComplexTypes(y, a, m, s); + +-- pivot courses +SELECT * FROM ( + SELECT udf(year), course, earnings FROM courseSales +) +PIVOT ( + udf(sum(earnings)) + FOR course IN ('dotNET', 'Java') +); + +-- pivot years with no subquery +SELECT * FROM courseSales +PIVOT ( + udf(sum(earnings)) + FOR year IN (2012, 2013) +); + +-- pivot courses with multiple aggregations +SELECT * FROM ( + SELECT year, course, earnings FROM courseSales +) +PIVOT ( + udf(sum(earnings)), udf(avg(earnings)) + FOR course IN ('dotNET', 'Java') +); + +-- pivot with no group by column +SELECT * FROM ( + SELECT udf(course) as course, earnings FROM courseSales +) +PIVOT ( + udf(sum(earnings)) + FOR course IN ('dotNET', 'Java') +); + +-- pivot with no group by column and with multiple aggregations on different columns +SELECT * FROM ( + SELECT year, course, earnings FROM courseSales +) +PIVOT ( + udf(sum(earnings)), udf(min(year)) + FOR course IN ('dotNET', 'Java') +); + +-- pivot on join query with multiple group by columns +SELECT * FROM ( + SELECT course, year, earnings, udf(s) as s + FROM courseSales + JOIN years ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR s IN (1, 2) +); + +-- pivot on join query with multiple aggregations on different columns +SELECT * FROM ( + SELECT course, year, earnings, s + FROM courseSales + JOIN years ON year = y +) +PIVOT ( + udf(sum(earnings)), udf(min(s)) + FOR course IN ('dotNET', 'Java') +); + +-- pivot on join query with multiple columns in one aggregation +SELECT * FROM ( + SELECT course, year, earnings, s + FROM courseSales + JOIN years ON year = y +) +PIVOT ( + udf(sum(earnings * s)) + FOR course IN ('dotNET', 'Java') +); + +-- pivot with aliases and projection +SELECT 2012_s, 2013_s, 2012_a, 2013_a, c FROM ( + SELECT year y, course c, earnings e FROM courseSales +) +PIVOT ( + udf(sum(e)) s, udf(avg(e)) a + FOR y IN (2012, 2013) +); + +-- pivot with projection and value aliases +SELECT firstYear_s, secondYear_s, firstYear_a, secondYear_a, c FROM ( + SELECT year y, course c, earnings e FROM courseSales +) +PIVOT ( + udf(sum(e)) s, udf(avg(e)) a + FOR y IN (2012 as firstYear, 2013 secondYear) +); + +-- pivot years with non-aggregate function +SELECT * FROM courseSales +PIVOT ( + udf(abs(earnings)) + FOR year IN (2012, 2013) +); + +-- pivot with one of the expressions as non-aggregate function +SELECT * FROM ( + SELECT year, course, earnings FROM courseSales +) +PIVOT ( + udf(sum(earnings)), year + FOR course IN ('dotNET', 'Java') +); + +-- pivot with unresolvable columns +SELECT * FROM ( + SELECT course, earnings FROM courseSales +) +PIVOT ( + udf(sum(earnings)) + FOR year IN (2012, 2013) +); + +-- pivot with complex aggregate expressions +SELECT * FROM ( + SELECT year, course, earnings FROM courseSales +) +PIVOT ( + udf(ceil(udf(sum(earnings)))), avg(earnings) + 1 as a1 + FOR course IN ('dotNET', 'Java') +); + +-- pivot with invalid arguments in aggregate expressions +SELECT * FROM ( + SELECT year, course, earnings FROM courseSales +) +PIVOT ( + sum(udf(avg(earnings))) + FOR course IN ('dotNET', 'Java') +); + +-- pivot on multiple pivot columns +SELECT * FROM ( + SELECT course, year, earnings, s + FROM courseSales + JOIN years ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR (course, year) IN (('dotNET', 2012), ('Java', 2013)) +); + +-- pivot on multiple pivot columns with aliased values +SELECT * FROM ( + SELECT course, year, earnings, s + FROM courseSales + JOIN years ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR (course, s) IN (('dotNET', 2) as c1, ('Java', 1) as c2) +); + +-- pivot on multiple pivot columns with values of wrong data types +SELECT * FROM ( + SELECT course, year, earnings, s + FROM courseSales + JOIN years ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR (course, year) IN ('dotNET', 'Java') +); + +-- pivot with unresolvable values +SELECT * FROM courseSales +PIVOT ( + udf(sum(earnings)) + FOR year IN (s, 2013) +); + +-- pivot with non-literal values +SELECT * FROM courseSales +PIVOT ( + udf(sum(earnings)) + FOR year IN (course, 2013) +); + +-- pivot on join query with columns of complex data types +SELECT * FROM ( + SELECT course, year, a + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(min(a)) + FOR course IN ('dotNET', 'Java') +); + +-- pivot on multiple pivot columns with agg columns of complex data types +SELECT * FROM ( + SELECT course, year, y, a + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(max(a)) + FOR (y, course) IN ((2012, 'dotNET'), (2013, 'Java')) +); + +-- pivot on pivot column of array type +SELECT * FROM ( + SELECT earnings, year, a + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR a IN (array(1, 1), array(2, 2)) +); + +-- pivot on multiple pivot columns containing array type +SELECT * FROM ( + SELECT course, earnings, year, a + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR (course, a) IN (('dotNET', array(1, 1)), ('Java', array(2, 2))) +); + +-- pivot on pivot column of struct type +SELECT * FROM ( + SELECT earnings, year, s + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR s IN ((1, 'a'), (2, 'b')) +); + +-- pivot on multiple pivot columns containing struct type +SELECT * FROM ( + SELECT course, earnings, year, s + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR (course, s) IN (('dotNET', (1, 'a')), ('Java', (2, 'b'))) +); + +-- pivot on pivot column of map type +SELECT * FROM ( + SELECT earnings, year, m + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR m IN (map('1', 1), map('2', 2)) +); + +-- pivot on multiple pivot columns containing map type +SELECT * FROM ( + SELECT course, earnings, year, m + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR (course, m) IN (('dotNET', map('1', 1)), ('Java', map('2', 2))) +); + +-- grouping columns output in the same order as input +-- correctly handle pivot columns with different cases +SELECT * FROM ( + SELECT course, earnings, udf("a") as a, udf("z") as z, udf("b") as b, udf("y") as y, + udf("c") as c, udf("x") as x, udf("d") as d, udf("w") as w + FROM courseSales +) +PIVOT ( + udf(sum(Earnings)) + FOR Course IN ('dotNET', 'Java') +); diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out new file mode 100644 index 000000000000..bea1da6afc12 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out @@ -0,0 +1,494 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 32 + + +-- !query 0 +create temporary view courseSales as select * from values + ("dotNET", 2012, 10000), + ("Java", 2012, 20000), + ("dotNET", 2012, 5000), + ("dotNET", 2013, 48000), + ("Java", 2013, 30000) + as courseSales(course, year, earnings) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +create temporary view years as select * from values + (2012, 1), + (2013, 2) + as years(y, s) +-- !query 1 schema +struct<> +-- !query 1 output + + + +-- !query 2 +create temporary view yearsWithComplexTypes as select * from values + (2012, array(1, 1), map('1', 1), struct(1, 'a')), + (2013, array(2, 2), map('2', 2), struct(2, 'b')) + as yearsWithComplexTypes(y, a, m, s) +-- !query 2 schema +struct<> +-- !query 2 output + + + +-- !query 3 +SELECT * FROM ( + SELECT udf(year), course, earnings FROM courseSales +) +PIVOT ( + udf(sum(earnings)) + FOR course IN ('dotNET', 'Java') +) +-- !query 3 schema +struct +-- !query 3 output +2012 15000 20000 +2013 48000 30000 + + +-- !query 4 +SELECT * FROM courseSales +PIVOT ( + udf(sum(earnings)) + FOR year IN (2012, 2013) +) +-- !query 4 schema +struct +-- !query 4 output +Java 20000 30000 +dotNET 15000 48000 + + +-- !query 5 +SELECT * FROM ( + SELECT year, course, earnings FROM courseSales +) +PIVOT ( + udf(sum(earnings)), udf(avg(earnings)) + FOR course IN ('dotNET', 'Java') +) +-- !query 5 schema +struct +-- !query 5 output +2012 15000 7500.0 20000 20000.0 +2013 48000 48000.0 30000 30000.0 + + +-- !query 6 +SELECT * FROM ( + SELECT udf(course) as course, earnings FROM courseSales +) +PIVOT ( + udf(sum(earnings)) + FOR course IN ('dotNET', 'Java') +) +-- !query 6 schema +struct +-- !query 6 output +63000 50000 + + +-- !query 7 +SELECT * FROM ( + SELECT year, course, earnings FROM courseSales +) +PIVOT ( + udf(sum(earnings)), udf(min(year)) + FOR course IN ('dotNET', 'Java') +) +-- !query 7 schema +struct +-- !query 7 output +63000 2012 50000 2012 + + +-- !query 8 +SELECT * FROM ( + SELECT course, year, earnings, udf(s) as s + FROM courseSales + JOIN years ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR s IN (1, 2) +) +-- !query 8 schema +struct +-- !query 8 output +Java 2012 20000 nan +Java 2013 nan 30000 +dotNET 2012 15000 nan +dotNET 2013 nan 48000 + + +-- !query 9 +SELECT * FROM ( + SELECT course, year, earnings, s + FROM courseSales + JOIN years ON year = y +) +PIVOT ( + udf(sum(earnings)), udf(min(s)) + FOR course IN ('dotNET', 'Java') +) +-- !query 9 schema +struct +-- !query 9 output +2012 15000 1 20000 1 +2013 48000 2 30000 2 + + +-- !query 10 +SELECT * FROM ( + SELECT course, year, earnings, s + FROM courseSales + JOIN years ON year = y +) +PIVOT ( + udf(sum(earnings * s)) + FOR course IN ('dotNET', 'Java') +) +-- !query 10 schema +struct +-- !query 10 output +2012 15000 20000 +2013 96000 60000 + + +-- !query 11 +SELECT 2012_s, 2013_s, 2012_a, 2013_a, c FROM ( + SELECT year y, course c, earnings e FROM courseSales +) +PIVOT ( + udf(sum(e)) s, udf(avg(e)) a + FOR y IN (2012, 2013) +) +-- !query 11 schema +struct<2012_s:string,2013_s:string,2012_a:string,2013_a:string,c:string> +-- !query 11 output +15000 48000 7500.0 48000.0 dotNET +20000 30000 20000.0 30000.0 Java + + +-- !query 12 +SELECT firstYear_s, secondYear_s, firstYear_a, secondYear_a, c FROM ( + SELECT year y, course c, earnings e FROM courseSales +) +PIVOT ( + udf(sum(e)) s, udf(avg(e)) a + FOR y IN (2012 as firstYear, 2013 secondYear) +) +-- !query 12 schema +struct +-- !query 12 output +15000 48000 7500.0 48000.0 dotNET +20000 30000 20000.0 30000.0 Java + + +-- !query 13 +SELECT * FROM courseSales +PIVOT ( + udf(abs(earnings)) + FOR year IN (2012, 2013) +) +-- !query 13 schema +struct<> +-- !query 13 output +org.apache.spark.sql.AnalysisException +Aggregate expression required for pivot, but 'coursesales.`earnings`' did not appear in any aggregate function.; + + +-- !query 14 +SELECT * FROM ( + SELECT year, course, earnings FROM courseSales +) +PIVOT ( + udf(sum(earnings)), year + FOR course IN ('dotNET', 'Java') +) +-- !query 14 schema +struct<> +-- !query 14 output +org.apache.spark.sql.AnalysisException +Aggregate expression required for pivot, but '__auto_generated_subquery_name.`year`' did not appear in any aggregate function.; + + +-- !query 15 +SELECT * FROM ( + SELECT course, earnings FROM courseSales +) +PIVOT ( + udf(sum(earnings)) + FOR year IN (2012, 2013) +) +-- !query 15 schema +struct<> +-- !query 15 output +org.apache.spark.sql.AnalysisException +cannot resolve '`year`' given input columns: [__auto_generated_subquery_name.course, __auto_generated_subquery_name.earnings]; line 4 pos 0 + + +-- !query 16 +SELECT * FROM ( + SELECT year, course, earnings FROM courseSales +) +PIVOT ( + udf(ceil(udf(sum(earnings)))), avg(earnings) + 1 as a1 + FOR course IN ('dotNET', 'Java') +) +-- !query 16 schema +struct +-- !query 16 output +2012 15000 7501.0 20000 20001.0 +2013 48000 48001.0 30000 30001.0 + + +-- !query 17 +SELECT * FROM ( + SELECT year, course, earnings FROM courseSales +) +PIVOT ( + sum(udf(avg(earnings))) + FOR course IN ('dotNET', 'Java') +) +-- !query 17 schema +struct<> +-- !query 17 output +org.apache.spark.sql.AnalysisException +It is not allowed to use an aggregate function in the argument of another aggregate function. Please use the inner aggregate function in a sub-query.; + + +-- !query 18 +SELECT * FROM ( + SELECT course, year, earnings, s + FROM courseSales + JOIN years ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR (course, year) IN (('dotNET', 2012), ('Java', 2013)) +) +-- !query 18 schema +struct +-- !query 18 output +1 15000 nan +2 nan 30000 + + +-- !query 19 +SELECT * FROM ( + SELECT course, year, earnings, s + FROM courseSales + JOIN years ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR (course, s) IN (('dotNET', 2) as c1, ('Java', 1) as c2) +) +-- !query 19 schema +struct +-- !query 19 output +2012 nan 20000 +2013 48000 nan + + +-- !query 20 +SELECT * FROM ( + SELECT course, year, earnings, s + FROM courseSales + JOIN years ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR (course, year) IN ('dotNET', 'Java') +) +-- !query 20 schema +struct<> +-- !query 20 output +org.apache.spark.sql.AnalysisException +Invalid pivot value 'dotNET': value data type string does not match pivot column data type struct; + + +-- !query 21 +SELECT * FROM courseSales +PIVOT ( + udf(sum(earnings)) + FOR year IN (s, 2013) +) +-- !query 21 schema +struct<> +-- !query 21 output +org.apache.spark.sql.AnalysisException +cannot resolve '`s`' given input columns: [coursesales.course, coursesales.earnings, coursesales.year]; line 4 pos 15 + + +-- !query 22 +SELECT * FROM courseSales +PIVOT ( + udf(sum(earnings)) + FOR year IN (course, 2013) +) +-- !query 22 schema +struct<> +-- !query 22 output +org.apache.spark.sql.AnalysisException +Literal expressions required for pivot values, found 'course#x'; + + +-- !query 23 +SELECT * FROM ( + SELECT course, year, a + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(min(a)) + FOR course IN ('dotNET', 'Java') +) +-- !query 23 schema +struct +-- !query 23 output +2012 [1 1] [1 1] +2013 [2 2] [2 2] + + +-- !query 24 +SELECT * FROM ( + SELECT course, year, y, a + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(max(a)) + FOR (y, course) IN ((2012, 'dotNET'), (2013, 'Java')) +) +-- !query 24 schema +struct +-- !query 24 output +2012 [1 1] None +2013 None [2 2] + + +-- !query 25 +SELECT * FROM ( + SELECT earnings, year, a + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR a IN (array(1, 1), array(2, 2)) +) +-- !query 25 schema +struct +-- !query 25 output +2012 35000 nan +2013 nan 78000 + + +-- !query 26 +SELECT * FROM ( + SELECT course, earnings, year, a + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR (course, a) IN (('dotNET', array(1, 1)), ('Java', array(2, 2))) +) +-- !query 26 schema +struct +-- !query 26 output +2012 15000 nan +2013 nan 30000 + + +-- !query 27 +SELECT * FROM ( + SELECT earnings, year, s + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR s IN ((1, 'a'), (2, 'b')) +) +-- !query 27 schema +struct +-- !query 27 output +2012 35000 nan +2013 nan 78000 + + +-- !query 28 +SELECT * FROM ( + SELECT course, earnings, year, s + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR (course, s) IN (('dotNET', (1, 'a')), ('Java', (2, 'b'))) +) +-- !query 28 schema +struct +-- !query 28 output +2012 15000 nan +2013 nan 30000 + + +-- !query 29 +SELECT * FROM ( + SELECT earnings, year, m + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR m IN (map('1', 1), map('2', 2)) +) +-- !query 29 schema +struct<> +-- !query 29 output +org.apache.spark.sql.AnalysisException +Invalid pivot column 'm#x'. Pivot columns must be comparable.; + + +-- !query 30 +SELECT * FROM ( + SELECT course, earnings, year, m + FROM courseSales + JOIN yearsWithComplexTypes ON year = y +) +PIVOT ( + udf(sum(earnings)) + FOR (course, m) IN (('dotNET', map('1', 1)), ('Java', map('2', 2))) +) +-- !query 30 schema +struct<> +-- !query 30 output +org.apache.spark.sql.AnalysisException +Invalid pivot column 'named_struct(course, course#x, m, m#x)'. Pivot columns must be comparable.; + + +-- !query 31 +SELECT * FROM ( + SELECT course, earnings, udf("a") as a, udf("z") as z, udf("b") as b, udf("y") as y, + udf("c") as c, udf("x") as x, udf("d") as d, udf("w") as w + FROM courseSales +) +PIVOT ( + udf(sum(Earnings)) + FOR Course IN ('dotNET', 'Java') +) +-- !query 31 schema +struct +-- !query 31 output +a z b y c x d w 63000 50000 From b0336e0db3cdc5409f55abdb75ba973a5edab02c Mon Sep 17 00:00:00 2001 From: chitralverma Date: Fri, 12 Jul 2019 16:45:13 +0530 Subject: [PATCH 2/6] Marked and commented problematic test cases for now --- .../sql-tests/inputs/udf/udf-pivot.sql | 51 ++++++++----- .../sql-tests/results/udf/udf-pivot.sql.out | 76 +++++-------------- 2 files changed, 52 insertions(+), 75 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql index 24d73fdf5dbe..8319ec252b25 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql @@ -3,6 +3,8 @@ -- Note that currently registered UDF returns a string. So there are some differences, for instance -- in string cast within UDF in Scala and Python. +--Note some test cases have been commented as the current integrated UDFs cannot handle complex types + create temporary view courseSales as select * from values ("dotNET", 2012, 10000), ("Java", 2012, 20000), @@ -64,6 +66,7 @@ PIVOT ( FOR course IN ('dotNET', 'Java') ); +--todo nan fix -- pivot on join query with multiple group by columns SELECT * FROM ( SELECT course, year, earnings, udf(s) as s @@ -158,6 +161,7 @@ PIVOT ( FOR course IN ('dotNET', 'Java') ); +--todo nan fix -- pivot on multiple pivot columns SELECT * FROM ( SELECT course, year, earnings, s @@ -169,6 +173,7 @@ PIVOT ( FOR (course, year) IN (('dotNET', 2012), ('Java', 2013)) ); +--todo nan fix -- pivot on multiple pivot columns with aliased values SELECT * FROM ( SELECT course, year, earnings, s @@ -205,28 +210,31 @@ PIVOT ( FOR year IN (course, 2013) ); +--todo complex type fix -- pivot on join query with columns of complex data types -SELECT * FROM ( - SELECT course, year, a - FROM courseSales - JOIN yearsWithComplexTypes ON year = y -) -PIVOT ( - udf(min(a)) - FOR course IN ('dotNET', 'Java') -); - +--SELECT * FROM ( +-- SELECT course, year, a +-- FROM courseSales +-- JOIN yearsWithComplexTypes ON year = y +--) +--PIVOT ( +-- udf(min(a)) +-- FOR course IN ('dotNET', 'Java') +--); + +--todo complex type fix -- pivot on multiple pivot columns with agg columns of complex data types -SELECT * FROM ( - SELECT course, year, y, a - FROM courseSales - JOIN yearsWithComplexTypes ON year = y -) -PIVOT ( - udf(max(a)) - FOR (y, course) IN ((2012, 'dotNET'), (2013, 'Java')) -); - +--SELECT * FROM ( +-- SELECT course, year, y, a +-- FROM courseSales +-- JOIN yearsWithComplexTypes ON year = y +--) +--PIVOT ( +-- udf(max(a)) +-- FOR (y, course) IN ((2012, 'dotNET'), (2013, 'Java')) +--); + +--todo nan fix -- pivot on pivot column of array type SELECT * FROM ( SELECT earnings, year, a @@ -238,6 +246,7 @@ PIVOT ( FOR a IN (array(1, 1), array(2, 2)) ); +--todo nan fix -- pivot on multiple pivot columns containing array type SELECT * FROM ( SELECT course, earnings, year, a @@ -249,6 +258,7 @@ PIVOT ( FOR (course, a) IN (('dotNET', array(1, 1)), ('Java', array(2, 2))) ); +--todo nan fix -- pivot on pivot column of struct type SELECT * FROM ( SELECT earnings, year, s @@ -260,6 +270,7 @@ PIVOT ( FOR s IN ((1, 'a'), (2, 'b')) ); +--todo nan fix -- pivot on multiple pivot columns containing struct type SELECT * FROM ( SELECT course, earnings, year, s diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out index bea1da6afc12..d786afab05a6 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 32 +-- Number of queries: 30 -- !query 0 @@ -343,40 +343,6 @@ Literal expressions required for pivot values, found 'course#x'; -- !query 23 -SELECT * FROM ( - SELECT course, year, a - FROM courseSales - JOIN yearsWithComplexTypes ON year = y -) -PIVOT ( - udf(min(a)) - FOR course IN ('dotNET', 'Java') -) --- !query 23 schema -struct --- !query 23 output -2012 [1 1] [1 1] -2013 [2 2] [2 2] - - --- !query 24 -SELECT * FROM ( - SELECT course, year, y, a - FROM courseSales - JOIN yearsWithComplexTypes ON year = y -) -PIVOT ( - udf(max(a)) - FOR (y, course) IN ((2012, 'dotNET'), (2013, 'Java')) -) --- !query 24 schema -struct --- !query 24 output -2012 [1 1] None -2013 None [2 2] - - --- !query 25 SELECT * FROM ( SELECT earnings, year, a FROM courseSales @@ -386,14 +352,14 @@ PIVOT ( udf(sum(earnings)) FOR a IN (array(1, 1), array(2, 2)) ) --- !query 25 schema +-- !query 23 schema struct --- !query 25 output +-- !query 23 output 2012 35000 nan 2013 nan 78000 --- !query 26 +-- !query 24 SELECT * FROM ( SELECT course, earnings, year, a FROM courseSales @@ -403,14 +369,14 @@ PIVOT ( udf(sum(earnings)) FOR (course, a) IN (('dotNET', array(1, 1)), ('Java', array(2, 2))) ) --- !query 26 schema +-- !query 24 schema struct --- !query 26 output +-- !query 24 output 2012 15000 nan 2013 nan 30000 --- !query 27 +-- !query 25 SELECT * FROM ( SELECT earnings, year, s FROM courseSales @@ -420,14 +386,14 @@ PIVOT ( udf(sum(earnings)) FOR s IN ((1, 'a'), (2, 'b')) ) --- !query 27 schema +-- !query 25 schema struct --- !query 27 output +-- !query 25 output 2012 35000 nan 2013 nan 78000 --- !query 28 +-- !query 26 SELECT * FROM ( SELECT course, earnings, year, s FROM courseSales @@ -437,14 +403,14 @@ PIVOT ( udf(sum(earnings)) FOR (course, s) IN (('dotNET', (1, 'a')), ('Java', (2, 'b'))) ) --- !query 28 schema +-- !query 26 schema struct --- !query 28 output +-- !query 26 output 2012 15000 nan 2013 nan 30000 --- !query 29 +-- !query 27 SELECT * FROM ( SELECT earnings, year, m FROM courseSales @@ -454,14 +420,14 @@ PIVOT ( udf(sum(earnings)) FOR m IN (map('1', 1), map('2', 2)) ) --- !query 29 schema +-- !query 27 schema struct<> --- !query 29 output +-- !query 27 output org.apache.spark.sql.AnalysisException Invalid pivot column 'm#x'. Pivot columns must be comparable.; --- !query 30 +-- !query 28 SELECT * FROM ( SELECT course, earnings, year, m FROM courseSales @@ -471,14 +437,14 @@ PIVOT ( udf(sum(earnings)) FOR (course, m) IN (('dotNET', map('1', 1)), ('Java', map('2', 2))) ) --- !query 30 schema +-- !query 28 schema struct<> --- !query 30 output +-- !query 28 output org.apache.spark.sql.AnalysisException Invalid pivot column 'named_struct(course, course#x, m, m#x)'. Pivot columns must be comparable.; --- !query 31 +-- !query 29 SELECT * FROM ( SELECT course, earnings, udf("a") as a, udf("z") as z, udf("b") as b, udf("y") as y, udf("c") as c, udf("x") as x, udf("d") as d, udf("w") as w @@ -488,7 +454,7 @@ PIVOT ( udf(sum(Earnings)) FOR Course IN ('dotNET', 'Java') ) --- !query 31 schema +-- !query 29 schema struct --- !query 31 output +-- !query 29 output a z b y c x d w 63000 50000 From efb5ece5df5a7a1e4f6938d85c1d31efc5bd148c Mon Sep 17 00:00:00 2001 From: chitralverma Date: Thu, 18 Jul 2019 13:14:10 +0530 Subject: [PATCH 3/6] fix breaking tests --- .../sql-tests/inputs/udf/udf-pivot.sql | 10 --- .../sql-tests/results/udf/udf-pivot.sql.out | 68 +++++++++---------- 2 files changed, 34 insertions(+), 44 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql index 8319ec252b25..58d650c47e2d 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql @@ -1,8 +1,5 @@ -- This test file was converted from pivot.sql. --- Note that currently registered UDF returns a string. So there are some differences, for instance --- in string cast within UDF in Scala and Python. - --Note some test cases have been commented as the current integrated UDFs cannot handle complex types create temporary view courseSales as select * from values @@ -66,7 +63,6 @@ PIVOT ( FOR course IN ('dotNET', 'Java') ); ---todo nan fix -- pivot on join query with multiple group by columns SELECT * FROM ( SELECT course, year, earnings, udf(s) as s @@ -161,7 +157,6 @@ PIVOT ( FOR course IN ('dotNET', 'Java') ); ---todo nan fix -- pivot on multiple pivot columns SELECT * FROM ( SELECT course, year, earnings, s @@ -173,7 +168,6 @@ PIVOT ( FOR (course, year) IN (('dotNET', 2012), ('Java', 2013)) ); ---todo nan fix -- pivot on multiple pivot columns with aliased values SELECT * FROM ( SELECT course, year, earnings, s @@ -234,7 +228,6 @@ PIVOT ( -- FOR (y, course) IN ((2012, 'dotNET'), (2013, 'Java')) --); ---todo nan fix -- pivot on pivot column of array type SELECT * FROM ( SELECT earnings, year, a @@ -246,7 +239,6 @@ PIVOT ( FOR a IN (array(1, 1), array(2, 2)) ); ---todo nan fix -- pivot on multiple pivot columns containing array type SELECT * FROM ( SELECT course, earnings, year, a @@ -258,7 +250,6 @@ PIVOT ( FOR (course, a) IN (('dotNET', array(1, 1)), ('Java', array(2, 2))) ); ---todo nan fix -- pivot on pivot column of struct type SELECT * FROM ( SELECT earnings, year, s @@ -270,7 +261,6 @@ PIVOT ( FOR s IN ((1, 'a'), (2, 'b')) ); ---todo nan fix -- pivot on multiple pivot columns containing struct type SELECT * FROM ( SELECT course, earnings, year, s diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out index d786afab05a6..2f1365ee19aa 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out @@ -47,7 +47,7 @@ PIVOT ( FOR course IN ('dotNET', 'Java') ) -- !query 3 schema -struct +struct -- !query 3 output 2012 15000 20000 2013 48000 30000 @@ -60,7 +60,7 @@ PIVOT ( FOR year IN (2012, 2013) ) -- !query 4 schema -struct +struct -- !query 4 output Java 20000 30000 dotNET 15000 48000 @@ -75,7 +75,7 @@ PIVOT ( FOR course IN ('dotNET', 'Java') ) -- !query 5 schema -struct +struct -- !query 5 output 2012 15000 7500.0 20000 20000.0 2013 48000 48000.0 30000 30000.0 @@ -90,7 +90,7 @@ PIVOT ( FOR course IN ('dotNET', 'Java') ) -- !query 6 schema -struct +struct -- !query 6 output 63000 50000 @@ -104,7 +104,7 @@ PIVOT ( FOR course IN ('dotNET', 'Java') ) -- !query 7 schema -struct +struct -- !query 7 output 63000 2012 50000 2012 @@ -120,12 +120,12 @@ PIVOT ( FOR s IN (1, 2) ) -- !query 8 schema -struct +struct -- !query 8 output -Java 2012 20000 nan -Java 2013 nan 30000 -dotNET 2012 15000 nan -dotNET 2013 nan 48000 +Java 2012 20000 NULL +Java 2013 NULL 30000 +dotNET 2012 15000 NULL +dotNET 2013 NULL 48000 -- !query 9 @@ -139,7 +139,7 @@ PIVOT ( FOR course IN ('dotNET', 'Java') ) -- !query 9 schema -struct +struct -- !query 9 output 2012 15000 1 20000 1 2013 48000 2 30000 2 @@ -156,7 +156,7 @@ PIVOT ( FOR course IN ('dotNET', 'Java') ) -- !query 10 schema -struct +struct -- !query 10 output 2012 15000 20000 2013 96000 60000 @@ -171,7 +171,7 @@ PIVOT ( FOR y IN (2012, 2013) ) -- !query 11 schema -struct<2012_s:string,2013_s:string,2012_a:string,2013_a:string,c:string> +struct<2012_s:bigint,2013_s:bigint,2012_a:double,2013_a:double,c:string> -- !query 11 output 15000 48000 7500.0 48000.0 dotNET 20000 30000 20000.0 30000.0 Java @@ -186,7 +186,7 @@ PIVOT ( FOR y IN (2012 as firstYear, 2013 secondYear) ) -- !query 12 schema -struct +struct -- !query 12 output 15000 48000 7500.0 48000.0 dotNET 20000 30000 20000.0 30000.0 Java @@ -244,7 +244,7 @@ PIVOT ( FOR course IN ('dotNET', 'Java') ) -- !query 16 schema -struct +struct -- !query 16 output 2012 15000 7501.0 20000 20001.0 2013 48000 48001.0 30000 30001.0 @@ -276,10 +276,10 @@ PIVOT ( FOR (course, year) IN (('dotNET', 2012), ('Java', 2013)) ) -- !query 18 schema -struct +struct -- !query 18 output -1 15000 nan -2 nan 30000 +1 15000 NULL +2 NULL 30000 -- !query 19 @@ -293,10 +293,10 @@ PIVOT ( FOR (course, s) IN (('dotNET', 2) as c1, ('Java', 1) as c2) ) -- !query 19 schema -struct +struct -- !query 19 output -2012 nan 20000 -2013 48000 nan +2012 NULL 20000 +2013 48000 NULL -- !query 20 @@ -353,10 +353,10 @@ PIVOT ( FOR a IN (array(1, 1), array(2, 2)) ) -- !query 23 schema -struct +struct -- !query 23 output -2012 35000 nan -2013 nan 78000 +2012 35000 NULL +2013 NULL 78000 -- !query 24 @@ -370,10 +370,10 @@ PIVOT ( FOR (course, a) IN (('dotNET', array(1, 1)), ('Java', array(2, 2))) ) -- !query 24 schema -struct +struct -- !query 24 output -2012 15000 nan -2013 nan 30000 +2012 15000 NULL +2013 NULL 30000 -- !query 25 @@ -387,10 +387,10 @@ PIVOT ( FOR s IN ((1, 'a'), (2, 'b')) ) -- !query 25 schema -struct +struct -- !query 25 output -2012 35000 nan -2013 nan 78000 +2012 35000 NULL +2013 NULL 78000 -- !query 26 @@ -404,10 +404,10 @@ PIVOT ( FOR (course, s) IN (('dotNET', (1, 'a')), ('Java', (2, 'b'))) ) -- !query 26 schema -struct +struct -- !query 26 output -2012 15000 nan -2013 nan 30000 +2012 15000 NULL +2013 NULL 30000 -- !query 27 @@ -455,6 +455,6 @@ PIVOT ( FOR Course IN ('dotNET', 'Java') ) -- !query 29 schema -struct +struct -- !query 29 output a z b y c x d w 63000 50000 From ae938b2734b30969c83d0a858fd799b2d73da9fd Mon Sep 17 00:00:00 2001 From: chitralverma Date: Thu, 18 Jul 2019 14:24:36 +0530 Subject: [PATCH 4/6] incorporate review comments --- .../src/test/resources/sql-tests/inputs/udf/udf-pivot.sql | 4 ++-- .../test/resources/sql-tests/results/udf/udf-pivot.sql.out | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql index 58d650c47e2d..215b438d7207 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql @@ -59,7 +59,7 @@ SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) PIVOT ( - udf(sum(earnings)), udf(min(year)) + udf(sum(udf(earnings))), udf(min(year)) FOR course IN ('dotNET', 'Java') ); @@ -241,7 +241,7 @@ PIVOT ( -- pivot on multiple pivot columns containing array type SELECT * FROM ( - SELECT course, earnings, year, a + SELECT course, earnings, udf(year) as year, a FROM courseSales JOIN yearsWithComplexTypes ON year = y ) diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out index 2f1365ee19aa..cb9e4d736c9a 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out @@ -100,11 +100,11 @@ SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) PIVOT ( - udf(sum(earnings)), udf(min(year)) + udf(sum(udf(earnings))), udf(min(year)) FOR course IN ('dotNET', 'Java') ) -- !query 7 schema -struct +struct -- !query 7 output 63000 2012 50000 2012 @@ -361,7 +361,7 @@ struct -- !query 24 SELECT * FROM ( - SELECT course, earnings, year, a + SELECT course, earnings, udf(year) as year, a FROM courseSales JOIN yearsWithComplexTypes ON year = y ) From 46120b6824bf755a9fbb466a601c648e5d34b3e8 Mon Sep 17 00:00:00 2001 From: chitralverma Date: Thu, 18 Jul 2019 17:16:43 +0530 Subject: [PATCH 5/6] change to comments --- .../test/resources/sql-tests/inputs/udf/udf-pivot.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql index 215b438d7207..d40f60a28bfd 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql @@ -1,6 +1,6 @@ -- This test file was converted from pivot.sql. ---Note some test cases have been commented as the current integrated UDFs cannot handle complex types +-- Note some test cases have been commented as the current integrated UDFs cannot handle complex types create temporary view courseSales as select * from values ("dotNET", 2012, 10000), @@ -204,9 +204,9 @@ PIVOT ( FOR year IN (course, 2013) ); ---todo complex type fix +-- todo complex type fix -- pivot on join query with columns of complex data types ---SELECT * FROM ( +-- SELECT * FROM ( -- SELECT course, year, a -- FROM courseSales -- JOIN yearsWithComplexTypes ON year = y @@ -216,9 +216,9 @@ PIVOT ( -- FOR course IN ('dotNET', 'Java') --); ---todo complex type fix +-- todo complex type fix -- pivot on multiple pivot columns with agg columns of complex data types ---SELECT * FROM ( +-- SELECT * FROM ( -- SELECT course, year, y, a -- FROM courseSales -- JOIN yearsWithComplexTypes ON year = y From f979a47674c589b34fcf7e988311e89b1c67821c Mon Sep 17 00:00:00 2001 From: chitralverma Date: Thu, 18 Jul 2019 17:30:35 +0530 Subject: [PATCH 6/6] Changed comment for complex types --- .../src/test/resources/sql-tests/inputs/udf/udf-pivot.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql index d40f60a28bfd..93937930de7f 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-pivot.sql @@ -204,7 +204,7 @@ PIVOT ( FOR year IN (course, 2013) ); --- todo complex type fix +-- Complex type is not supported in the current UDF. Skipped for now. -- pivot on join query with columns of complex data types -- SELECT * FROM ( -- SELECT course, year, a @@ -216,7 +216,7 @@ PIVOT ( -- FOR course IN ('dotNET', 'Java') --); --- todo complex type fix +-- Complex type is not supported in the current UDF. Skipped for now. -- pivot on multiple pivot columns with agg columns of complex data types -- SELECT * FROM ( -- SELECT course, year, y, a