From 10db53ce91051c3bcaa958d3e6b40d234b4a0e1b Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Wed, 19 Jun 2024 07:30:46 -0400
Subject: [PATCH 1/3] Add more StringView comparison test coverage

---

Reviewer note: replaces the two hand-built `test` tables with a single table
derived from `test_source` that exposes every value as Utf8, Utf8View and
Dictionary(Int32, Utf8), then adds EXPLAIN checks of the coercion applied to
mixed-type comparisons.

 .../sqllogictest/test_files/string_view.slt   | 259 +++++++++++++-----
 1 file changed, 196 insertions(+), 63 deletions(-)

diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt
index 3be3c94770db..3fd0bf0db802 100644
--- a/datafusion/sqllogictest/test_files/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string_view.slt
@@ -15,99 +15,232 @@
 # specific language governing permissions and limitations
 # under the License.
 
+########
+## Test setup
+########
 
-# test StringViewArray with Utf8View columns
 statement ok
-create table test as values (arrow_cast('Andrew', 'Utf8View'), arrow_cast('X', 'Utf8View')),
-                            (arrow_cast('Xiangpeng', 'Utf8View'), arrow_cast('Xiangpeng', 'Utf8View')),
-                            (arrow_cast('Raphael', 'Utf8View'), arrow_cast('R', 'Utf8View')),
-                            (arrow_cast(NULL, 'Utf8View'), arrow_cast('R', 'Utf8View'));
+create table test_source as values
+  ('Andrew', 'X'),
+  ('Xiangpeng', 'Xiangpeng'),
+  ('Raphael', 'R'),
+  (NULL, 'R')
+;
+
+# Table with the different combination of column types
+statement ok
+create table test as
+SELECT
+  arrow_cast(column1, 'Utf8') as column1_utf8,
+  arrow_cast(column2, 'Utf8') as column2_utf8,
+  arrow_cast(column1, 'Utf8View') as column1_utf8view,
+  arrow_cast(column2, 'Utf8View') as column2_utf8view,
+  arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1_dict,
+  arrow_cast(column2, 'Dictionary(Int32, Utf8)') as column2_dict
+FROM test_source;
 
-query B
-select arrow_cast('NULL', 'Utf8View') = arrow_cast('Andrew', 'Utf8View');
-----
-false
+statement ok
+drop table test_source
 
-query B
-select arrow_cast('NULL', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View');
-----
-true
+########
+## StringView to StringView
+########
 
-query B
-select arrow_cast('Andrew', 'Utf8View') = arrow_cast('Andrew', 'Utf8View');
-----
-true
+# StringView scalar to StringView scalar
 
-query B
-select arrow_cast('Xiangpeng', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View');
+query BBBB
+select
+  arrow_cast('NULL', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'),
+  arrow_cast('NULL', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View'),
+  arrow_cast('Andrew', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'),
+  arrow_cast('Xiangpeng', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View');
 ----
-true
+false true true true
+
 
-query ??
-select * from test where column1 = column2;
+# StringView column to StringView column comparison as filters
+
+query TT
+select column1_utf8, column2_utf8 from test where column1_utf8view = column2_utf8view;
 ----
 Xiangpeng Xiangpeng
 
-query ??
-select * from test where column1 <> column2;
+query TT
+select column1_utf8, column2_utf8 from test where column1_utf8view <> column2_utf8view;
 ----
 Andrew X
 Raphael R
 
-query ??
-select * from test where column1 = arrow_cast('Andrew', 'Utf8View');
+# StringView column to StringView column
+query TTBB
+select
+  column1_utf8, column2_utf8,
+  column1_utf8view = column2_utf8view,
+  column1_utf8view <> column2_utf8view
+from test;
 ----
-Andrew X
-
-query ??
-select * from test where column1 = 'Andrew';
+Andrew X false true
+Xiangpeng Xiangpeng true false
+Raphael R false true
+NULL R NULL NULL
+
+# StringView column to StringView scalar comparison
+query TTBBBB
+select
+  column1_utf8, column2_utf8,
+  column1_utf8view = arrow_cast('Andrew', 'Utf8View'),
+  arrow_cast('Andrew', 'Utf8View') = column1_utf8view,
+  column1_utf8view <> arrow_cast('Andrew', 'Utf8View'),
+  arrow_cast('Andrew', 'Utf8View') <> column1_utf8view
+from test;
 ----
-Andrew X
-
-query ??
-select * from test where column1 <> arrow_cast('Andrew', 'Utf8View');
+Andrew X true true false false
+Xiangpeng Xiangpeng false false true true
+Raphael R false false true true
+NULL R NULL NULL NULL NULL
+
+########
+## StringView to String
+########
+
+# test StringViewArray with Utf8 columns
+query TTBBBB
+select
+  column1_utf8, column2_utf8,
+  column1_utf8view = column2_utf8,
+  column2_utf8 = column1_utf8view,
+  column1_utf8view <> column2_utf8,
+  column2_utf8 <> column1_utf8view
+from test;
 ----
-Xiangpeng Xiangpeng
-Raphael R
-
-query ??
-select * from test where column1 <> 'Andrew';
+Andrew X false false true true
+Xiangpeng Xiangpeng true true false false
+Raphael R false false true true
+NULL R NULL NULL NULL NULL
+
+# StringView column to String scalar
+query TTBBBB
+select
+  column1_utf8, column2_utf8,
+  column1_utf8view = arrow_cast('Andrew', 'Utf8'),
+  arrow_cast('Andrew', 'Utf8') = column1_utf8view,
+  column1_utf8view <> arrow_cast('Andrew', 'Utf8'),
+  arrow_cast('Andrew', 'Utf8') <> column1_utf8view
+from test;
 ----
-Xiangpeng Xiangpeng
-Raphael R
+Andrew X true true false false
+Xiangpeng Xiangpeng false false true true
+Raphael R false false true true
+NULL R NULL NULL NULL NULL
+
+# String column to StringView scalar
+query TTBBBB
+select
+  column1_utf8, column2_utf8,
+  column1_utf8 = arrow_cast('Andrew', 'Utf8View'),
+  arrow_cast('Andrew', 'Utf8View') = column1_utf8,
+  column1_utf8 <> arrow_cast('Andrew', 'Utf8View'),
+  arrow_cast('Andrew', 'Utf8View') <> column1_utf8
+from test;
+----
+Andrew X true true false false
+Xiangpeng Xiangpeng false false true true
+Raphael R false false true true
+NULL R NULL NULL NULL NULL
+
+
+########
+## StringView to Dictionary
+########
+
+# test StringViewArray with Dictionary columns
+query TTBBBB
+select
+  column1_utf8, column2_utf8,
+  column1_utf8view = column2_dict,
+  column2_dict = column1_utf8view,
+  column1_utf8view <> column2_dict,
+  column2_dict <> column1_utf8view
+from test;
+----
+Andrew X false false true true
+Xiangpeng Xiangpeng true true false false
+Raphael R false false true true
+NULL R NULL NULL NULL NULL
+
+# StringView column to Dict scalar
+query TTBBBB
+select
+  column1_utf8, column2_utf8,
+  column1_utf8view = arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'),
+  arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') = column1_utf8view,
+  column1_utf8view <> arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'),
+  arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') <> column1_utf8view
+from test;
+----
+Andrew X true true false false
+Xiangpeng Xiangpeng false false true true
+Raphael R false false true true
+NULL R NULL NULL NULL NULL
+
+# Dict column to StringView scalar
+query TTBBBB
+select
+  column1_utf8, column2_utf8,
+  column1_dict = arrow_cast('Andrew', 'Utf8View'),
+  arrow_cast('Andrew', 'Utf8View') = column1_dict,
+  column1_dict <> arrow_cast('Andrew', 'Utf8View'),
+  arrow_cast('Andrew', 'Utf8View') <> column1_dict
+from test;
+----
+Andrew X true true false false
+Xiangpeng Xiangpeng false false true true
+Raphael R false false true true
+NULL R NULL NULL NULL NULL
 
-statement ok
-drop table test;
+
+########
+## Coercion Rules
+########
 
 
-# test StringViewArray with Utf8 and Utf8View columns
 statement ok
-create table test as values ('Andrew', arrow_cast('X', 'Utf8View')),
-                            ('Xiangpeng', arrow_cast('Xiangpeng', 'Utf8View')),
-                            ('Raphael', arrow_cast('R', 'Utf8View')),
-                            (NULL, arrow_cast('R', 'Utf8View'));
+set datafusion.explain.logical_plan_only = true;
 
-query T?
-select * from test where column1 = column2;
-----
-Xiangpeng Xiangpeng
 
-query T?
-select * from test where column1 <> column2;
+# Filter should have a StringView literal and no column cast
+query TT
+explain SELECT column1_utf8 from test where column1_utf8view = 'Andrew';
 ----
-Andrew X
-Raphael R
+logical_plan
+01)Projection: test.column1_utf8
+02)--Filter: test.column1_utf8view = Utf8View("Andrew")
+03)----TableScan: test projection=[column1_utf8, column1_utf8view]
+
+# reverse order should be the same
+query TT
+explain SELECT column1_utf8 from test where 'Andrew' = column1_utf8view;
+----
+logical_plan
+01)Projection: test.column1_utf8
+02)--Filter: test.column1_utf8view = Utf8View("Andrew")
+03)----TableScan: test projection=[column1_utf8, column1_utf8view]
 
-query T?
-select * from test where column1 = arrow_cast('Andrew', 'Utf8View');
+query TT
+explain SELECT column1_utf8 from test where column1_utf8 = arrow_cast('Andrew', 'Utf8View');
 ----
-Andrew X
+logical_plan
+01)Filter: CAST(test.column1_utf8 AS Utf8View) = Utf8View("Andrew")
+02)--TableScan: test projection=[column1_utf8]
 
-query T?
-select * from test where column1 <> arrow_cast('Andrew', 'Utf8View');
+query TT
+explain SELECT column1_utf8 from test where column1_utf8view = arrow_cast('Andrew', 'Dictionary(Int32, Utf8)');
 ----
-Xiangpeng Xiangpeng
-Raphael R
+logical_plan
+01)Projection: test.column1_utf8
+02)--Filter: test.column1_utf8view = Utf8View("Andrew")
+03)----TableScan: test projection=[column1_utf8, column1_utf8view]
+
 
 statement ok
 drop table test;

From b3e54352d75cde93d22d2e84a02e062655f75b73 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Wed, 19 Jun 2024 08:00:11 -0400
Subject: [PATCH 2/3] add reference

---

Reviewer note: comment-only change; it ties the one expected plan that still
casts the Utf8 column to its tracking issue.

 datafusion/sqllogictest/test_files/string_view.slt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt
index 3fd0bf0db802..5fca18d0307a 100644
--- a/datafusion/sqllogictest/test_files/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string_view.slt
@@ -226,6 +226,7 @@ logical_plan
 02)--Filter: test.column1_utf8view = Utf8View("Andrew")
 03)----TableScan: test projection=[column1_utf8, column1_utf8view]
 
+# should not be casting the column: https://github.com/apache/datafusion/issues/10998
 query TT
 explain SELECT column1_utf8 from test where column1_utf8 = arrow_cast('Andrew', 'Utf8View');
 ----

From a578c4029c94627f4bebcc938dfe397fea51a341 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Wed, 19 Jun 2024 09:51:03 -0400
Subject: [PATCH 3/3] Add another test showing casting on columns works
 correctly

---

Reviewer note: pins the logical plans for Utf8View-vs-Utf8 column comparisons
in both operand orders; each expected plan places the CAST on the Utf8 column.

 .../sqllogictest/test_files/string_view.slt   | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt
index 5fca18d0307a..f8824b23d1b9 100644
--- a/datafusion/sqllogictest/test_files/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string_view.slt
@@ -242,6 +242,24 @@ logical_plan
 02)--Filter: test.column1_utf8view = Utf8View("Andrew")
 03)----TableScan: test projection=[column1_utf8, column1_utf8view]
 
+# compare string / stringview
+# Should cast string -> stringview (which is cheap), not stringview -> string (which is not)
+query TT
+explain SELECT column1_utf8 from test where column1_utf8view = column2_utf8;
+----
+logical_plan
+01)Projection: test.column1_utf8
+02)--Filter: test.column1_utf8view = CAST(test.column2_utf8 AS Utf8View)
+03)----TableScan: test projection=[column1_utf8, column2_utf8, column1_utf8view]
+
+query TT
+explain SELECT column1_utf8 from test where column2_utf8 = column1_utf8view;
+----
+logical_plan
+01)Projection: test.column1_utf8
+02)--Filter: CAST(test.column2_utf8 AS Utf8View) = test.column1_utf8view
+03)----TableScan: test projection=[column1_utf8, column2_utf8, column1_utf8view]
+
 
 statement ok
 drop table test;