Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add more sqllogictests for parquet_sorted_statistics #10381

Closed
wants to merge 3 commits into from
Closed
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
Original file line number Diff line number Diff line change
Expand Up @@ -260,3 +260,77 @@ physical_plan
01)SortPreservingMergeExec: [constant_col@0 ASC NULLS LAST]
02)--SortExec: expr=[constant_col@0 ASC NULLS LAST], preserve_partitioning=[true]
03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col]

# Drop the earlier test_table so the name can be reused by the
# external table created further down in this file.
statement ok
DROP TABLE test_table;

# Source rows for the two Parquet files written by the COPY statements below
statement ok
CREATE TABLE src_table2 (
  a INT,
  b INT,
  c INT
) AS VALUES
  -- rows that go into the first file (a <= 3)
  (1, 100, 0),
  (2, 200, 0),
  (3, 300, 0),
  -- rows that go into the second file (a > 6)
  (7, 700, 0),
  (8, 800, 0),
  (9, 900, 0);

# Create two files, 0.parquet and 1.parquet, both sorted on a.
# 0.parquet has its columns in the order a, b, c; 1.parquet has them in the order c, b, a.
# The key ranges of a in the two files do not overlap (a <= 3 vs. a > 6).

# Create 0.parquet: rows with a <= 3, columns in the order a, b, c, sorted on a.
# COPY reports a single row-count column, hence the one-column type string.
query I
COPY (SELECT * FROM src_table2 WHERE a <= 3 ORDER BY a)
TO 'test_files/scratch/parquet_sorted_statistics/test_table1/0.parquet'
STORED AS PARQUET;
----
3

# Create 1.parquet: rows with a > 6, columns in the order c, b, a, sorted on a.
# COPY reports a single row-count column, hence the one-column type string.
query I
COPY (SELECT c, b, a FROM src_table2 WHERE a > 6 ORDER BY a)
TO 'test_files/scratch/parquet_sorted_statistics/test_table1/1.parquet'
STORED AS PARQUET;
----
3

# Create an external table over the test_table1 directory with columns a, b, c
# and an explicit declared ordering on a (WITH ORDER).
# NOTE(review): the table declares partition_col and PARTITIONED BY (partition_col),
# but the COPY statements above write directly under test_table1/ with no
# partition_col=... directory — confirm this is intended.
statement ok
CREATE EXTERNAL TABLE test_table (
partition_col TEXT NOT NULL,
a INT NOT NULL,
b INT NOT NULL,
c INT NOT NULL
)
STORED AS PARQUET
PARTITIONED BY (partition_col)
WITH ORDER (a ASC NULLS LAST)
LOCATION 'test_files/scratch/parquet_sorted_statistics/test_table1';

# Order by a, the column named in the table's WITH ORDER clause.
# The expected physical plan is a single ParquetExec reporting output_ordering
# on a, with no SortExec or SortPreservingMergeExec.
query TT
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @yyy1000, would you mind adding a description of what exactly it tests and what is notable in the output we are looking for?

Copy link
Contributor Author

@yyy1000 yyy1000 May 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, the test is mainly from #9593 (comment), I think it will test the case when schema and query order to verify sort preserving merge would not be used.
Will add the description soon.

EXPLAIN SELECT *
FROM test_table
ORDER BY a;
----
logical_plan
01)Sort: test_table.a ASC NULLS LAST
02)--TableScan: test_table projection=[a, b, c, partition_col]
physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/1.parquet]]}, projection=[a, b, c, partition_col], output_ordering=[a@0 ASC NULLS LAST]

# Order by b, which is not the table's declared ordering (a).
# The expected physical plan therefore has a SortExec on b on top of the
# ParquetExec, which still reports output_ordering on a.
query TT
EXPLAIN SELECT *
FROM test_table
ORDER BY b;
----
logical_plan
01)Sort: test_table.b ASC NULLS LAST
02)--TableScan: test_table projection=[a, b, c, partition_col]
physical_plan
01)SortExec: expr=[b@1 ASC NULLS LAST], preserve_partitioning=[false]
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/1.parquet]]}, projection=[a, b, c, partition_col], output_ordering=[a@0 ASC NULLS LAST]
Loading