From 5db4b414ffaea4659c5ab700203ff098ce2a69f8 Mon Sep 17 00:00:00 2001 From: "wangguangxin.cn" Date: Thu, 6 Aug 2020 16:39:56 +0800 Subject: [PATCH] Fix the trim logic in UTF8String.toInt/toLong that didn't handle Chinese characters correctly --- .../apache/spark/unsafe/types/UTF8String.java | 12 +++--- .../test/resources/sql-tests/inputs/cast.sql | 5 +++ .../resources/sql-tests/inputs/datetime.sql | 2 + .../resources/sql-tests/inputs/interval.sql | 1 + .../sql-tests/results/ansi/datetime.sql.out | 30 ++++++++++++- .../sql-tests/results/ansi/interval.sql.out | 16 ++++++- .../resources/sql-tests/results/cast.sql.out | 42 ++++++++++++++++++- .../sql-tests/results/datetime-legacy.sql.out | 30 ++++++++++++- .../sql-tests/results/datetime.sql.out | 30 ++++++++++++- .../sql-tests/results/interval.sql.out | 16 ++++++- 10 files changed, 172 insertions(+), 12 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 7205293aa48c..43bd7976c5d3 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -575,14 +575,14 @@ public UTF8String trim() { public UTF8String trimAll() { int s = 0; // skip all of the whitespaces (<=0x20) in the left side - while (s < this.numBytes && getByte(s) <= ' ') s++; + while (s < this.numBytes && Character.isWhitespace(getByte(s))) s++; if (s == this.numBytes) { // Everything trimmed return EMPTY_UTF8; } // skip all of the whitespaces (<=0x20) in the right side int e = this.numBytes - 1; - while (e > s && getByte(e) <= ' ') e--; + while (e > s && Character.isWhitespace(getByte(e))) e--; if (s == 0 && e == numBytes - 1) { // Nothing trimmed return this; @@ -1119,11 +1119,11 @@ public boolean toLong(LongWrapper toLongResult) { private boolean toLong(LongWrapper toLongResult, boolean allowDecimal) { int offset = 0; - while 
(offset < this.numBytes && getByte(offset) <= ' ') offset++; + while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++; if (offset == this.numBytes) return false; int end = this.numBytes - 1; - while (end > offset && getByte(end) <= ' ') end--; + while (end > offset && Character.isWhitespace(getByte(end))) end--; byte b = getByte(offset); final boolean negative = b == '-'; @@ -1216,11 +1216,11 @@ public boolean toInt(IntWrapper intWrapper) { private boolean toInt(IntWrapper intWrapper, boolean allowDecimal) { int offset = 0; - while (offset < this.numBytes && getByte(offset) <= ' ') offset++; + while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++; if (offset == this.numBytes) return false; int end = this.numBytes - 1; - while (end > offset && getByte(end) <= ' ') end--; + while (end > offset && Character.isWhitespace(getByte(end))) end--; byte b = getByte(offset); final boolean negative = b == '-'; diff --git a/sql/core/src/test/resources/sql-tests/inputs/cast.sql b/sql/core/src/test/resources/sql-tests/inputs/cast.sql index 972ebdd01f61..81c741a5ca8e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/cast.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/cast.sql @@ -70,6 +70,11 @@ select cast(' 1' as bigint); select cast(' 1' as float); select cast(' 1 ' as DOUBLE); select cast('1.0 ' as DEC); +select cast('1中文' as tinyint); +select cast('1中文' as smallint); +select cast('1中文' as INT); +select cast('中文1' as bigint); +select cast('1中文' as bigint); -- trim string before cast to boolean select cast('\t\t true \n\r ' as boolean); diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql index bdf11f51db53..0445c7864946 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql @@ -48,6 +48,8 @@ select year('1500-01-01'), month('1500-01-01'), 
dayOfYear('1500-01-01'); select date '2019-01-01\t'; select timestamp '2019-01-01\t'; +select date '2020-01-01中文'; +select timestamp '2019-01-01中文'; -- time add/sub select timestamp'2011-11-11 11:11:11' + interval '2' day; diff --git a/sql/core/src/test/resources/sql-tests/inputs/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/interval.sql index e881250ed7be..7173863313b2 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/interval.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/interval.sql @@ -171,6 +171,7 @@ select interval '2-2\t' year to month; select interval '-\t2-2\t' year to month; select interval '\n0 12:34:46.789\t' day to second; select interval '\n-\t10\t 12:34:46.789\t' day to second; +select interval '中文 interval 1 day'; -- interval overflow if (ansi) exception else NULL select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out index 484b67677a91..d2d66713780d 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 106 +-- Number of queries: 108 -- !query @@ -226,6 +226,34 @@ struct 2019-01-01 00:00:00 +-- !query +select date '2020-01-01中文' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 2020-01-01中文(line 1, pos 7) + +== SQL == +select date '2020-01-01中文' +-------^^^ + + +-- !query +select timestamp '2019-01-01中文' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the TIMESTAMP value: 2019-01-01中文(line 1, pos 7) + +== SQL == +select timestamp '2019-01-01中文' +-------^^^ + + -- !query select timestamp'2011-11-11 
11:11:11' + interval '2' day -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out index 8644c668782a..f80bea1d3273 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 93 +-- Number of queries: 94 -- !query @@ -891,6 +891,20 @@ select interval '\n-\t10\t 12:34:46.789\t' day to second ----------------^^^ +-- !query +select interval '中文 interval 1 day' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 中文 interval 1 day(line 1, pos 7) + +== SQL == +select interval '中文 interval 1 day' +-------^^^ + + -- !query select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/cast.sql.out b/sql/core/src/test/resources/sql-tests/results/cast.sql.out index 35b4c0e79720..d4872ca03199 100644 --- a/sql/core/src/test/resources/sql-tests/results/cast.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cast.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 46 +-- Number of queries: 51 -- !query @@ -353,6 +353,46 @@ struct 1 +-- !query +select cast('1中文' as tinyint) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select cast('1中文' as smallint) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select cast('1中文' as INT) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select cast('中文1' as bigint) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select cast('1中文' as bigint) +-- !query schema +struct +-- !query output +NULL + + -- !query select cast('\t\t true \n\r ' as boolean) 
-- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out index edb49e575f52..3806764856f5 100644 --- a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 106 +-- Number of queries: 108 -- !query @@ -200,6 +200,34 @@ struct 2019-01-01 00:00:00 +-- !query +select date '2020-01-01中文' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 2020-01-01中文(line 1, pos 7) + +== SQL == +select date '2020-01-01中文' +-------^^^ + + +-- !query +select timestamp '2019-01-01中文' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the TIMESTAMP value: 2019-01-01中文(line 1, pos 7) + +== SQL == +select timestamp '2019-01-01中文' +-------^^^ + + -- !query select timestamp'2011-11-11 11:11:11' + interval '2' day -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out index 9f9351a4809a..5feeaa9addef 100755 --- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 106 +-- Number of queries: 108 -- !query @@ -200,6 +200,34 @@ struct 2019-01-01 00:00:00 +-- !query +select date '2020-01-01中文' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 2020-01-01中文(line 1, pos 7) + +== SQL == +select date '2020-01-01中文' +-------^^^ + + +-- !query +select timestamp '2019-01-01中文' +-- !query schema +struct<> +-- !query output 
+org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the TIMESTAMP value: 2019-01-01中文(line 1, pos 7) + +== SQL == +select timestamp '2019-01-01中文' +-------^^^ + + -- !query select timestamp'2011-11-11 11:11:11' + interval '2' day -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/interval.sql.out index 438a2766061b..297c4fcd0cb9 100644 --- a/sql/core/src/test/resources/sql-tests/results/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 93 +-- Number of queries: 94 -- !query @@ -868,6 +868,20 @@ select interval '\n-\t10\t 12:34:46.789\t' day to second ----------------^^^ +-- !query +select interval '中文 interval 1 day' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 中文 interval 1 day(line 1, pos 7) + +== SQL == +select interval '中文 interval 1 day' +-------^^^ + + -- !query select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) -- !query schema