Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add more StringView comparison test coverage #10997

Merged
merged 3 commits into from
Jun 19, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
270 changes: 211 additions & 59 deletions datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -15,99 +15,251 @@
# specific language governing permissions and limitations
# under the License.

########
## Test setup
########

# test StringViewArray with Utf8View columns
statement ok
create table test as values (arrow_cast('Andrew', 'Utf8View'), arrow_cast('X', 'Utf8View')),
(arrow_cast('Xiangpeng', 'Utf8View'), arrow_cast('Xiangpeng', 'Utf8View')),
(arrow_cast('Raphael', 'Utf8View'), arrow_cast('R', 'Utf8View')),
(arrow_cast(NULL, 'Utf8View'), arrow_cast('R', 'Utf8View'));
create table test_source as values
('Andrew', 'X'),
('Xiangpeng', 'Xiangpeng'),
('Raphael', 'R'),
(NULL, 'R')
;

query B
select arrow_cast('NULL', 'Utf8View') = arrow_cast('Andrew', 'Utf8View');
----
false
# Table with the different combination of column types
statement ok
create table test as
SELECT
arrow_cast(column1, 'Utf8') as column1_utf8,
arrow_cast(column2, 'Utf8') as column2_utf8,
arrow_cast(column1, 'Utf8View') as column1_utf8view,
arrow_cast(column2, 'Utf8View') as column2_utf8view,
arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1_dict,
arrow_cast(column2, 'Dictionary(Int32, Utf8)') as column2_dict
FROM test_source;

query B
select arrow_cast('NULL', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View');
----
true
statement ok
drop table test_source

query B
select arrow_cast('Andrew', 'Utf8View') = arrow_cast('Andrew', 'Utf8View');
----
true
########
## StringView to StringView
########

query B
select arrow_cast('Xiangpeng', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View');
# StringView scalar to StringView scalar

query BBBB
select
arrow_cast('NULL', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'),
arrow_cast('NULL', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View'),
arrow_cast('Andrew', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'),
arrow_cast('Xiangpeng', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View');
----
true
false true true true


# StringView column to StringView column comparison as filters

query ??
select * from test where column1 = column2;
query TT
select column1_utf8, column2_utf8 from test where column1_utf8view = column2_utf8view;
----
Xiangpeng Xiangpeng

query ??
select * from test where column1 <> column2;
query TT
select column1_utf8, column2_utf8 from test where column1_utf8view <> column2_utf8view;
----
Andrew X
Raphael R

query ??
select * from test where column1 = arrow_cast('Andrew', 'Utf8View');
# StringView column to StringView column
query TTBB
select
column1_utf8, column2_utf8,
column1_utf8view = column2_utf8view,
column1_utf8view <> column2_utf8view
from test;
----
Andrew X
Andrew X false true
Xiangpeng Xiangpeng true false
Raphael R false true
NULL R NULL NULL

query ??
select * from test where column1 = 'Andrew';
# StringView column to StringView scalar comparison
query TTBBBB
select
column1_utf8, column2_utf8,
column1_utf8view = arrow_cast('Andrew', 'Utf8View'),
arrow_cast('Andrew', 'Utf8View') = column1_utf8view,
column1_utf8view <> arrow_cast('Andrew', 'Utf8View'),
arrow_cast('Andrew', 'Utf8View') <> column1_utf8view
from test;
----
Andrew X
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

query ??
select * from test where column1 <> arrow_cast('Andrew', 'Utf8View');
########
## StringView to String
########

# test StringViewArray with Utf8 columns
query TTBBBB
select
column1_utf8, column2_utf8,
column1_utf8view = column2_utf8,
column2_utf8 = column1_utf8view,
column1_utf8view <> column2_utf8,
column2_utf8 <> column1_utf8view
from test;
----
Xiangpeng Xiangpeng
Raphael R
Andrew X false false true true
Xiangpeng Xiangpeng true true false false
Raphael R false false true true
NULL R NULL NULL NULL NULL

query ??
select * from test where column1 <> 'Andrew';
# StringView column to String scalar
query TTBBBB
select
column1_utf8, column2_utf8,
column1_utf8view = arrow_cast('Andrew', 'Utf8'),
arrow_cast('Andrew', 'Utf8') = column1_utf8view,
column1_utf8view <> arrow_cast('Andrew', 'Utf8'),
arrow_cast('Andrew', 'Utf8') <> column1_utf8view
from test;
----
Xiangpeng Xiangpeng
Raphael R
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

statement ok
drop table test;
# String column to StringView scalar
query TTBBBB
select
column1_utf8, column2_utf8,
column1_utf8 = arrow_cast('Andrew', 'Utf8View'),
arrow_cast('Andrew', 'Utf8View') = column1_utf8,
column1_utf8 <> arrow_cast('Andrew', 'Utf8View'),
arrow_cast('Andrew', 'Utf8View') <> column1_utf8
from test;
----
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL


########
## StringView to Dictionary
########

# test StringViewArray with Dictionary columns
query TTBBBB
select
column1_utf8, column2_utf8,
column1_utf8view = column2_dict,
column2_dict = column1_utf8view,
column1_utf8view <> column2_dict,
column2_dict <> column1_utf8view
from test;
----
Andrew X false false true true
Xiangpeng Xiangpeng true true false false
Raphael R false false true true
NULL R NULL NULL NULL NULL

# StringView column to Dict scalar
query TTBBBB
select
column1_utf8, column2_utf8,
column1_utf8view = arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'),
arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') = column1_utf8view,
column1_utf8view <> arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'),
arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') <> column1_utf8view
from test;
----
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

# Dict column to StringView scalar
query TTBBBB
select
column1_utf8, column2_utf8,
column1_dict = arrow_cast('Andrew', 'Utf8View'),
arrow_cast('Andrew', 'Utf8View') = column1_dict,
column1_dict <> arrow_cast('Andrew', 'Utf8View'),
arrow_cast('Andrew', 'Utf8View') <> column1_dict
from test;
----
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL


########
## Coercion Rules
########


# test StringViewArray with Utf8 and Utf8View columns
statement ok
create table test as values ('Andrew', arrow_cast('X', 'Utf8View')),
('Xiangpeng', arrow_cast('Xiangpeng', 'Utf8View')),
('Raphael', arrow_cast('R', 'Utf8View')),
(NULL, arrow_cast('R', 'Utf8View'));
set datafusion.explain.logical_plan_only = true;

query T?
select * from test where column1 = column2;

# Filter should have a StringView literal and no column cast
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wrote several plans here showing what coercion is happening, and they mostly look good to me.

query TT
explain SELECT column1_utf8 from test where column1_utf8view = 'Andrew';
----
Xiangpeng Xiangpeng
logical_plan
01)Projection: test.column1_utf8
02)--Filter: test.column1_utf8view = Utf8View("Andrew")
03)----TableScan: test projection=[column1_utf8, column1_utf8view]

query T?
select * from test where column1 <> column2;
# reverse order should be the same
query TT
explain SELECT column1_utf8 from test where 'Andrew' = column1_utf8view;
----
Andrew X
Raphael R
logical_plan
01)Projection: test.column1_utf8
02)--Filter: test.column1_utf8view = Utf8View("Andrew")
03)----TableScan: test projection=[column1_utf8, column1_utf8view]

query T?
select * from test where column1 = arrow_cast('Andrew', 'Utf8View');
# should not be casting the column: https://github.com/apache/datafusion/issues/10998
query TT
explain SELECT column1_utf8 from test where column1_utf8 = arrow_cast('Andrew', 'Utf8View');
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This plan is non ideal as it casts the column rather than the constant

Instead of

CAST(test.column1_utf8 AS Utf8View) = Utf8View("Andrew")

the filter should be

test.column1_utf8 =  Utf8("Andrew")

I will file a ticket about this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

----
Andrew X
logical_plan
01)Filter: CAST(test.column1_utf8 AS Utf8View) = Utf8View("Andrew")
02)--TableScan: test projection=[column1_utf8]

query T?
select * from test where column1 <> arrow_cast('Andrew', 'Utf8View');
query TT
explain SELECT column1_utf8 from test where column1_utf8view = arrow_cast('Andrew', 'Dictionary(Int32, Utf8)');
----
Xiangpeng Xiangpeng
Raphael R
logical_plan
01)Projection: test.column1_utf8
02)--Filter: test.column1_utf8view = Utf8View("Andrew")
03)----TableScan: test projection=[column1_utf8, column1_utf8view]

# compare string / stringview
# Should cast string -> stringview (which is cheap), not stringview -> string (which is not)
query TT
explain SELECT column1_utf8 from test where column1_utf8view = column2_utf8;
----
logical_plan
01)Projection: test.column1_utf8
02)--Filter: test.column1_utf8view = CAST(test.column2_utf8 AS Utf8View)
03)----TableScan: test projection=[column1_utf8, column2_utf8, column1_utf8view]

query TT
explain SELECT column1_utf8 from test where column2_utf8 = column1_utf8view;
----
logical_plan
01)Projection: test.column1_utf8
02)--Filter: CAST(test.column2_utf8 AS Utf8View) = test.column1_utf8view
03)----TableScan: test projection=[column1_utf8, column2_utf8, column1_utf8view]


statement ok
drop table test;