diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part1.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part1.sql new file mode 100644 index 0000000000000..bab15a02cd5df --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part1.sql @@ -0,0 +1,156 @@ +-- +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- +-- AGGREGATES [Part 1] +-- https://github.com/postgres/postgres/blob/REL_12_BETA1/src/test/regress/sql/aggregates.sql#L1-L143 + +-- avoid bit-exact output here because operations may not be bit-exact. +-- SET extra_float_digits = 0; + +-- This test file was converted from pgSQL/aggregates_part1.sql. +-- Note that currently registered UDF returns a string. So there are some differences, for instance +-- in string cast within UDF in Scala and Python. + +SELECT avg(udf(four)) AS avg_1 FROM onek; + +SELECT udf(avg(a)) AS avg_32 FROM aggtest WHERE a < 100; + +-- In 7.1, avg(float4) is computed using float8 arithmetic. +-- Round the result to 3 digits to avoid platform-specific results. 
+ +select CAST(avg(udf(b)) AS Decimal(10,3)) AS avg_107_943 FROM aggtest; +-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766] +-- SELECT avg(gpa) AS avg_3_4 FROM ONLY student; + +SELECT sum(udf(four)) AS sum_1500 FROM onek; +SELECT udf(sum(a)) AS sum_198 FROM aggtest; +SELECT udf(udf(sum(b))) AS avg_431_773 FROM aggtest; +-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766] +-- SELECT sum(gpa) AS avg_6_8 FROM ONLY student; + +SELECT udf(max(four)) AS max_3 FROM onek; +SELECT max(udf(a)) AS max_100 FROM aggtest; +SELECT CAST(udf(udf(max(aggtest.b))) AS int) AS max_324_78 FROM aggtest; +-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766] +-- SELECT max(student.gpa) AS max_3_7 FROM student; + +SELECT CAST(stddev_pop(udf(b)) AS int) FROM aggtest; +SELECT udf(stddev_samp(b)) FROM aggtest; +SELECT CAST(var_pop(udf(b)) as int) FROM aggtest; +SELECT udf(var_samp(b)) FROM aggtest; + +SELECT udf(stddev_pop(CAST(b AS Decimal(38,0)))) FROM aggtest; +SELECT stddev_samp(CAST(udf(b) AS Decimal(38,0))) FROM aggtest; +SELECT udf(var_pop(CAST(b AS Decimal(38,0)))) FROM aggtest; +SELECT var_samp(udf(CAST(b AS Decimal(38,0)))) FROM aggtest; + +-- population variance is defined for a single tuple, sample variance +-- is not +SELECT udf(var_pop(1.0)), var_samp(udf(2.0)); +SELECT stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))), stddev_samp(CAST(udf(4.0) AS Decimal(38,0))); + + +-- verify correct results for null and NaN inputs +select sum(udf(CAST(null AS int))) from range(1,4); +select sum(udf(CAST(null AS long))) from range(1,4); +select sum(udf(CAST(null AS Decimal(38,0)))) from range(1,4); +select sum(udf(CAST(null AS DOUBLE))) from range(1,4); +select avg(udf(CAST(null AS int))) from range(1,4); +select avg(udf(CAST(null AS long))) from range(1,4); +select avg(udf(CAST(null AS Decimal(38,0)))) from range(1,4); +select avg(udf(CAST(null AS DOUBLE))) from 
range(1,4); +select sum(CAST(udf('NaN') AS DOUBLE)) from range(1,4); +select avg(CAST(udf('NaN') AS DOUBLE)) from range(1,4); + +-- [SPARK-27768] verify correct results for infinite inputs +-- [SPARK-28291] UDFs cannot be evaluated within inline table definition +-- SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) +-- FROM (VALUES (CAST(udf('1') AS DOUBLE)), (CAST(udf('Infinity') AS DOUBLE))) v(x); +SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) +FROM (VALUES ('Infinity'), ('1')) v(x); +SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) +FROM (VALUES ('Infinity'), ('Infinity')) v(x); +SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) +FROM (VALUES ('-Infinity'), ('Infinity')) v(x); + + +-- test accuracy with a large input offset +SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE))) +FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x); +SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE))) +FROM (VALUES (7000000000005), (7000000000007)) v(x); + +-- SQL2003 binary aggregates [SPARK-23907] +-- SELECT regr_count(b, a) FROM aggtest; +-- SELECT regr_sxx(b, a) FROM aggtest; +-- SELECT regr_syy(b, a) FROM aggtest; +-- SELECT regr_sxy(b, a) FROM aggtest; +-- SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest; +-- SELECT regr_r2(b, a) FROM aggtest; +-- SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest; +SELECT CAST(udf(covar_pop(b, udf(a))) AS int), CAST(covar_samp(udf(b), a) as int) FROM aggtest; +SELECT corr(b, udf(a)) FROM aggtest; + + +-- test accum and combine functions directly [SPARK-23907] +-- CREATE TABLE regr_test (x float8, y float8); +-- INSERT INTO regr_test VALUES (10,150),(20,250),(30,350),(80,540),(100,200); +-- SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x) +-- FROM regr_test WHERE x IN (10,20,30,80); +-- SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x) +-- FROM regr_test; +-- 
SELECT float8_accum('{4,140,2900}'::float8[], 100); +-- SELECT float8_regr_accum('{4,140,2900,1290,83075,15050}'::float8[], 200, 100); +-- SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x) +-- FROM regr_test WHERE x IN (10,20,30); +-- SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x) +-- FROM regr_test WHERE x IN (80,100); +-- SELECT float8_combine('{3,60,200}'::float8[], '{0,0,0}'::float8[]); +-- SELECT float8_combine('{0,0,0}'::float8[], '{2,180,200}'::float8[]); +-- SELECT float8_combine('{3,60,200}'::float8[], '{2,180,200}'::float8[]); +-- SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[], +-- '{0,0,0,0,0,0}'::float8[]); +-- SELECT float8_regr_combine('{0,0,0,0,0,0}'::float8[], +-- '{2,180,200,740,57800,-3400}'::float8[]); +-- SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[], +-- '{2,180,200,740,57800,-3400}'::float8[]); +-- DROP TABLE regr_test; + + +-- test count, distinct +SELECT count(udf(four)) AS cnt_1000 FROM onek; +SELECT udf(count(DISTINCT four)) AS cnt_4 FROM onek; + +select ten, udf(count(*)), sum(udf(four)) from onek +group by ten order by ten; + +select ten, count(udf(four)), udf(sum(DISTINCT four)) from onek +group by ten order by ten; + +-- user-defined aggregates +-- SELECT newavg(four) AS avg_1 FROM onek; +-- SELECT newsum(four) AS sum_1500 FROM onek; +-- SELECT newcnt(four) AS cnt_1000 FROM onek; +-- SELECT newcnt(*) AS cnt_1000 FROM onek; +-- SELECT oldcnt(*) AS cnt_1000 FROM onek; +-- SELECT sum2(q1,q2) FROM int8_tbl; + +-- test for outer-level aggregates + +-- this should work +select ten, udf(sum(distinct four)) from onek a +group by ten +having exists (select 1 from onek b where udf(sum(distinct a.four)) = b.four); + +-- this should fail because subquery has an agg of its own in WHERE +select ten, sum(distinct four) from onek a +group by ten +having exists (select 1 from onek b + where sum(distinct a.four + b.four) = 
udf(b.four)); + +-- [SPARK-27769] Test handling of sublinks within outer-level aggregates. +-- Per bug report from Daniel Grace. +select + (select udf(max((select i.unique2 from tenk1 i where i.unique1 = o.unique1)))) +from tenk1 o; diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out new file mode 100644 index 0000000000000..32be362d87cad --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out @@ -0,0 +1,387 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 43 + + +-- !query 0 +SELECT avg(udf(four)) AS avg_1 FROM onek +-- !query 0 schema +struct +-- !query 0 output +1.5 + + +-- !query 1 +SELECT udf(avg(a)) AS avg_32 FROM aggtest WHERE a < 100 +-- !query 1 schema +struct +-- !query 1 output +32.666666666666664 + + +-- !query 2 +select CAST(avg(udf(b)) AS Decimal(10,3)) AS avg_107_943 FROM aggtest +-- !query 2 schema +struct +-- !query 2 output +107.943 + + +-- !query 3 +SELECT sum(udf(four)) AS sum_1500 FROM onek +-- !query 3 schema +struct +-- !query 3 output +1500.0 + + +-- !query 4 +SELECT udf(sum(a)) AS sum_198 FROM aggtest +-- !query 4 schema +struct +-- !query 4 output +198 + + +-- !query 5 +SELECT udf(udf(sum(b))) AS avg_431_773 FROM aggtest +-- !query 5 schema +struct +-- !query 5 output +431.77260909229517 + + +-- !query 6 +SELECT udf(max(four)) AS max_3 FROM onek +-- !query 6 schema +struct +-- !query 6 output +3 + + +-- !query 7 +SELECT max(udf(a)) AS max_100 FROM aggtest +-- !query 7 schema +struct +-- !query 7 output +56 + + +-- !query 8 +SELECT CAST(udf(udf(max(aggtest.b))) AS int) AS max_324_78 FROM aggtest +-- !query 8 schema +struct +-- !query 8 output +324 + + +-- !query 9 +SELECT CAST(stddev_pop(udf(b)) AS int) FROM aggtest +-- !query 9 schema +struct +-- !query 9 output +131 + + +-- !query 10 +SELECT udf(stddev_samp(b)) FROM aggtest +-- !query 
10 schema +struct +-- !query 10 output +151.38936080399804 + + +-- !query 11 +SELECT CAST(var_pop(udf(b)) as int) FROM aggtest +-- !query 11 schema +struct +-- !query 11 output +17189 + + +-- !query 12 +SELECT udf(var_samp(b)) FROM aggtest +-- !query 12 schema +struct +-- !query 12 output +22918.738564643096 + + +-- !query 13 +SELECT udf(stddev_pop(CAST(b AS Decimal(38,0)))) FROM aggtest +-- !query 13 schema +struct +-- !query 13 output +131.18117242958306 + + +-- !query 14 +SELECT stddev_samp(CAST(udf(b) AS Decimal(38,0))) FROM aggtest +-- !query 14 schema +struct +-- !query 14 output +151.47497042966097 + + +-- !query 15 +SELECT udf(var_pop(CAST(b AS Decimal(38,0)))) FROM aggtest +-- !query 15 schema +struct +-- !query 15 output +17208.5 + + +-- !query 16 +SELECT var_samp(udf(CAST(b AS Decimal(38,0)))) FROM aggtest +-- !query 16 schema +struct +-- !query 16 output +22944.666666666668 + + +-- !query 17 +SELECT udf(var_pop(1.0)), var_samp(udf(2.0)) +-- !query 17 schema +struct +-- !query 17 output +0.0 NaN + + +-- !query 18 +SELECT stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))), stddev_samp(CAST(udf(4.0) AS Decimal(38,0))) +-- !query 18 schema +struct +-- !query 18 output +0.0 NaN + + +-- !query 19 +select sum(udf(CAST(null AS int))) from range(1,4) +-- !query 19 schema +struct +-- !query 19 output +NULL + + +-- !query 20 +select sum(udf(CAST(null AS long))) from range(1,4) +-- !query 20 schema +struct +-- !query 20 output +NULL + + +-- !query 21 +select sum(udf(CAST(null AS Decimal(38,0)))) from range(1,4) +-- !query 21 schema +struct +-- !query 21 output +NULL + + +-- !query 22 +select sum(udf(CAST(null AS DOUBLE))) from range(1,4) +-- !query 22 schema +struct +-- !query 22 output +NULL + + +-- !query 23 +select avg(udf(CAST(null AS int))) from range(1,4) +-- !query 23 schema +struct +-- !query 23 output +NULL + + +-- !query 24 +select avg(udf(CAST(null AS long))) from range(1,4) +-- !query 24 schema +struct +-- !query 24 output +NULL + + +-- !query 25 +select 
avg(udf(CAST(null AS Decimal(38,0)))) from range(1,4) +-- !query 25 schema +struct +-- !query 25 output +NULL + + +-- !query 26 +select avg(udf(CAST(null AS DOUBLE))) from range(1,4) +-- !query 26 schema +struct +-- !query 26 output +NULL + + +-- !query 27 +select sum(CAST(udf('NaN') AS DOUBLE)) from range(1,4) +-- !query 27 schema +struct +-- !query 27 output +NaN + + +-- !query 28 +select avg(CAST(udf('NaN') AS DOUBLE)) from range(1,4) +-- !query 28 schema +struct +-- !query 28 output +NaN + + +-- !query 29 +SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) +FROM (VALUES ('Infinity'), ('1')) v(x) +-- !query 29 schema +struct +-- !query 29 output +Infinity NaN + + +-- !query 30 +SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) +FROM (VALUES ('Infinity'), ('Infinity')) v(x) +-- !query 30 schema +struct +-- !query 30 output +Infinity NaN + + +-- !query 31 +SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) +FROM (VALUES ('-Infinity'), ('Infinity')) v(x) +-- !query 31 schema +struct +-- !query 31 output +NaN NaN + + +-- !query 32 +SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE))) +FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x) +-- !query 32 schema +struct +-- !query 32 output +1.00000005E8 2.5 + + +-- !query 33 +SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE))) +FROM (VALUES (7000000000005), (7000000000007)) v(x) +-- !query 33 schema +struct +-- !query 33 output +7.000000000006E12 1.0 + + +-- !query 34 +SELECT CAST(udf(covar_pop(b, udf(a))) AS int), CAST(covar_samp(udf(b), a) as int) FROM aggtest +-- !query 34 schema +struct +-- !query 34 output +653 871 + + +-- !query 35 +SELECT corr(b, udf(a)) FROM aggtest +-- !query 35 schema +struct +-- !query 35 output +0.1396345165178734 + + +-- !query 36 +SELECT count(udf(four)) AS cnt_1000 FROM onek +-- !query 36 schema +struct +-- !query 36 output +1000 + + +-- !query 37 +SELECT udf(count(DISTINCT four)) AS cnt_4 
FROM onek +-- !query 37 schema +struct +-- !query 37 output +4 + + +-- !query 38 +select ten, udf(count(*)), sum(udf(four)) from onek +group by ten order by ten +-- !query 38 schema +struct +-- !query 38 output +0 100 100.0 +1 100 200.0 +2 100 100.0 +3 100 200.0 +4 100 100.0 +5 100 200.0 +6 100 100.0 +7 100 200.0 +8 100 100.0 +9 100 200.0 + + +-- !query 39 +select ten, count(udf(four)), udf(sum(DISTINCT four)) from onek +group by ten order by ten +-- !query 39 schema +struct +-- !query 39 output +0 100 2 +1 100 4 +2 100 2 +3 100 4 +4 100 2 +5 100 4 +6 100 2 +7 100 4 +8 100 2 +9 100 4 + + +-- !query 40 +select ten, udf(sum(distinct four)) from onek a +group by ten +having exists (select 1 from onek b where udf(sum(distinct a.four)) = b.four) +-- !query 40 schema +struct +-- !query 40 output +0 2 +2 2 +4 2 +6 2 +8 2 + + +-- !query 41 +select ten, sum(distinct four) from onek a +group by ten +having exists (select 1 from onek b + where sum(distinct a.four + b.four) = udf(b.four)) +-- !query 41 schema +struct<> +-- !query 41 output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. 
+Expression in where clause: [(sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT)) = CAST(udf(four) AS BIGINT))] +Invalid expressions: [sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT))]; + + +-- !query 42 +select + (select udf(max((select i.unique2 from tenk1 i where i.unique1 = o.unique1)))) +from tenk1 o +-- !query 42 schema +struct<> +-- !query 42 output +org.apache.spark.sql.AnalysisException +cannot resolve '`o.unique1`' given input columns: [i.even, i.fivethous, i.four, i.hundred, i.odd, i.string4, i.stringu1, i.stringu2, i.ten, i.tenthous, i.thousand, i.twenty, i.two, i.twothousand, i.unique1, i.unique2]; line 2 pos 67 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala index 7caf6241bb984..e379d6df867c1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala @@ -247,7 +247,7 @@ object IntegratedUDFTestUtils extends SQLHelper { /** * A Scala UDF that takes one column and returns a string column. - * Equivalent to `udf((input: Any) => input.toString)`. + * Equivalent to `udf((input: Any) => String.valueOf(input))`. */ case class TestScalaUDF(name: String) extends TestUDF { private[IntegratedUDFTestUtils] lazy val udf = SparkUserDefinedFunction(