Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,18 @@ trait CheckAnalysis extends PredicateHelper {
s"appear in the arguments of an aggregate function.")
}
}
// An attribute reached here while the grouping expression list is empty: fail analysis with
// a message that names the attribute and the aggregate expressions it was selected with.
case e: Attribute if groupingExprs.isEmpty =>
// Keep only the entries of `aggregateExprs` that contain at least one
// [[AggregateExpression]] somewhere in their tree; these are quoted in the error message.
val aggExprs = aggregateExprs.filter(_.collect {
case a: AggregateExpression => a
}.nonEmpty)
failAnalysis(
s"grouping expressions sequence is empty, " +
s"and '${e.sql}' is not an aggregate function. " +
// The matching aggregate expressions are rendered as one parenthesised, comma-separated list.
s"Wrap '${aggExprs.map(_.sql).mkString("(", ", ", ")")}' in windowing " +
s"function(s) or wrap '${e.sql}' in first() (or first_value) " +
s"if you don't care which value you get."
)
case e: Attribute if !groupingExprs.exists(_.semanticEquals(e)) =>
failAnalysis(
s"expression '${e.sql}' is neither present in the group by, " +
Expand Down
41 changes: 29 additions & 12 deletions sql/core/src/test/resources/sql-tests/inputs/group-by.sql
Original file line number Diff line number Diff line change
@@ -1,17 +1,34 @@
-- Temporary data.
create temporary view myview as values 128, 256 as v(int_col);
-- Test data.
CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
AS testData(a, b);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, can we just augment the initial dataset rather than introducing a new testData?

It'd be better to use one dataset.


-- group by should produce all input rows,
select int_col, count(*) from myview group by int_col;
-- Aggregate with empty GroupBy expressions.
SELECT a, COUNT(b) FROM testData;
SELECT COUNT(a), COUNT(b) FROM testData;

-- group by should produce a single row.
select 'foo', count(*) from myview group by 1;
-- Aggregate with non-empty GroupBy expressions.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are already tested earlier, aren't they?

SELECT a, COUNT(b) FROM testData GROUP BY a;
SELECT a, COUNT(b) FROM testData GROUP BY b;
SELECT COUNT(a), COUNT(b) FROM testData GROUP BY a;

-- group-by should not produce any rows (whole stage code generation).
select 'foo' from myview where int_col == 0 group by 1;
-- Aggregate grouped by literals.
SELECT 'foo', COUNT(a) FROM testData GROUP BY 1;

-- group-by should not produce any rows (hash aggregate).
select 'foo', approx_count_distinct(int_col) from myview where int_col == 0 group by 1;
-- Aggregate grouped by literals (whole stage code generation).
SELECT 'foo' FROM testData WHERE a = 0 GROUP BY 1;

-- group-by should not produce any rows (sort aggregate).
select 'foo', max(struct(int_col)) from myview where int_col == 0 group by 1;
-- Aggregate grouped by literals (hash aggregate).
SELECT 'foo', APPROX_COUNT_DISTINCT(a) FROM testData WHERE a = 0 GROUP BY 1;

-- Aggregate grouped by literals (sort aggregate).
SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1;

-- Aggregate with complex GroupBy expressions.
SELECT a + b, COUNT(b) FROM testData GROUP BY a + b;
SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1;
SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1;

-- Aggregate with nulls.
SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
FROM testData;
116 changes: 99 additions & 17 deletions sql/core/src/test/resources/sql-tests/results/group-by.sql.out
Original file line number Diff line number Diff line change
@@ -1,51 +1,133 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 6
-- Number of queries: 14


-- !query 0
create temporary view myview as values 128, 256 as v(int_col)
CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
AS testData(a, b)
-- !query 0 schema
struct<>
-- !query 0 output



-- !query 1
select int_col, count(*) from myview group by int_col
SELECT a, COUNT(b) FROM testData
-- !query 1 schema
struct<int_col:int,count(1):bigint>
struct<>
-- !query 1 output
128 1
256 1
org.apache.spark.sql.AnalysisException
grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate function. Wrap '(count(testdata.`b`) AS `count(b)`)' in windowing function(s) or wrap 'testdata.`a`' in first() (or first_value) if you don't care which value you get.;


-- !query 2
select 'foo', count(*) from myview group by 1
SELECT COUNT(a), COUNT(b) FROM testData
-- !query 2 schema
struct<foo:string,count(1):bigint>
struct<count(a):bigint,count(b):bigint>
-- !query 2 output
foo 2
7 7


-- !query 3
select 'foo' from myview where int_col == 0 group by 1
SELECT a, COUNT(b) FROM testData GROUP BY a
-- !query 3 schema
struct<foo:string>
struct<a:int,count(b):bigint>
-- !query 3 output

1 2
2 2
3 2
NULL 1


-- !query 4
select 'foo', approx_count_distinct(int_col) from myview where int_col == 0 group by 1
SELECT a, COUNT(b) FROM testData GROUP BY b
-- !query 4 schema
struct<foo:string,approx_count_distinct(int_col):bigint>
struct<>
-- !query 4 output

org.apache.spark.sql.AnalysisException
expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;


-- !query 5
select 'foo', max(struct(int_col)) from myview where int_col == 0 group by 1
SELECT COUNT(a), COUNT(b) FROM testData GROUP BY a
-- !query 5 schema
struct<foo:string,max(struct(int_col)):struct<int_col:int>>
struct<count(a):bigint,count(b):bigint>
-- !query 5 output
0 1
2 2
2 2
3 2


-- !query 6
SELECT 'foo', COUNT(a) FROM testData GROUP BY 1
-- !query 6 schema
struct<foo:string,count(a):bigint>
-- !query 6 output
foo 7


-- !query 7
SELECT 'foo' FROM testData WHERE a = 0 GROUP BY 1
-- !query 7 schema
struct<foo:string>
-- !query 7 output



-- !query 8
SELECT 'foo', APPROX_COUNT_DISTINCT(a) FROM testData WHERE a = 0 GROUP BY 1
-- !query 8 schema
struct<foo:string,approx_count_distinct(a):bigint>
-- !query 8 output



-- !query 9
SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1
-- !query 9 schema
struct<foo:string,max(struct(a)):struct<a:int>>
-- !query 9 output



-- !query 10
SELECT a + b, COUNT(b) FROM testData GROUP BY a + b
-- !query 10 schema
struct<(a + b):int,count(b):bigint>
-- !query 10 output
2 1
3 2
4 2
5 1
NULL 1


-- !query 11
SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1
-- !query 11 schema
struct<>
-- !query 11 output
org.apache.spark.sql.AnalysisException
expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;


-- !query 12
SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1
-- !query 12 schema
struct<((a + 1) + 1):int,count(b):bigint>
-- !query 12 output
3 2
4 2
5 2
NULL 1


-- !query 13
SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
FROM testData
-- !query 13 schema
struct<skewness(CAST(a AS DOUBLE)):double,kurtosis(CAST(a AS DOUBLE)):double,min(a):int,max(a):int,avg(a):double,var_samp(CAST(a AS DOUBLE)):double,stddev_samp(CAST(a AS DOUBLE)):double,sum(a):bigint,count(a):bigint>
-- !query 13 output
-0.2723801058145729 -1.5069204152249134 1 3 2.142857142857143 0.8095238095238094 0.8997354108424372 15 7
35 changes: 0 additions & 35 deletions sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -463,20 +463,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
)
}

// Groups testData2 by `a`, sums `b` per group, and compares against the three expected rows.
test("agg") {
  val expectedRows = Seq(Row(1, 3), Row(2, 3), Row(3, 3))
  // The query is built inline in the argument position, exactly as before.
  checkAnswer(sql("SELECT a, SUM(b) FROM testData2 GROUP BY a"), expectedRows)
}

// Runs a set of aggregate functions over column `a` of nullInts and checks the single result row.
test("aggregates with nulls") {
  val aggregateQuery =
    "SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a)," +
      "AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a) FROM nullInts"
  // One value per aggregate, in the same order as the SELECT list.
  val expectedRow = Row(0, -1.5, 1, 3, 2, 1.0, 1, 6, 3)
  checkAnswer(sql(aggregateQuery), expectedRow)
}

test("select *") {
checkAnswer(
sql("SELECT * FROM testData"),
Expand Down Expand Up @@ -1178,27 +1164,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
Row(1))
}

// Checks which aggregate queries pass analysis and which are rejected because a selected
// attribute is neither grouped nor aggregated.
test("throw errors for non-aggregate attributes with aggregation") {
  /**
   * Forces analysis of `query`.
   *
   * @param query          SQL text to analyze.
   * @param isInvalidQuery when true, analysis must throw an AnalysisException whose message
   *                       contains "group by"; when false, analysis must complete without
   *                       throwing.
   */
  def checkAggregation(query: String, isInvalidQuery: Boolean = true): Unit = {
    if (isInvalidQuery) {
      val e = intercept[AnalysisException](sql(query).queryExecution.analyzed)
      assert(e.getMessage.contains("group by"))
    } else {
      // Should not throw
      sql(query).queryExecution.analyzed
    }
  }

  // No GROUP BY clause: a bare column next to an aggregate is invalid.
  checkAggregation("SELECT key, COUNT(*) FROM testData")
  checkAggregation("SELECT COUNT(key), COUNT(*) FROM testData", isInvalidQuery = false)

  // GROUP BY a column: selecting a different, non-aggregated column is invalid.
  checkAggregation("SELECT value, COUNT(*) FROM testData GROUP BY key")
  checkAggregation("SELECT COUNT(value), SUM(key) FROM testData GROUP BY key", isInvalidQuery = false)

  // GROUP BY an expression: the select list must be built from that grouping expression.
  checkAggregation("SELECT key + 2, COUNT(*) FROM testData GROUP BY key + 1")
  checkAggregation("SELECT key + 1 + 1, COUNT(*) FROM testData GROUP BY key + 1", isInvalidQuery = false)
}

testQuietly(
"SPARK-16748: SparkExceptions during planning should not wrapped in TreeNodeException") {
intercept[SparkException] {
Expand Down