diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index d7a498d1c1c2..deecd4f01582 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -1063,7 +1063,7 @@ public static class IntWrapper implements Serializable { } /** - * Parses this UTF8String to long. + * Parses this UTF8String(trimmed if needed) to long. * * Note that, in this method we accumulate the result in negative format, and convert it to * positive format at the end, if this string is not started with '-'. This is because min value @@ -1077,18 +1077,20 @@ public static class IntWrapper implements Serializable { * @return true if the parsing was successful else false */ public boolean toLong(LongWrapper toLongResult) { - if (numBytes == 0) { - return false; - } + int offset = 0; + while (offset < this.numBytes && getByte(offset) <= ' ') offset++; + if (offset == this.numBytes) return false; - byte b = getByte(0); + int end = this.numBytes - 1; + while (end > offset && getByte(end) <= ' ') end--; + + byte b = getByte(offset); final boolean negative = b == '-'; - int offset = 0; if (negative || b == '+') { - offset++; - if (numBytes == 1) { + if (end - offset == 0) { return false; } + offset++; } final byte separator = '.'; @@ -1096,7 +1098,7 @@ public boolean toLong(LongWrapper toLongResult) { final long stopValue = Long.MIN_VALUE / radix; long result = 0; - while (offset < numBytes) { + while (offset <= end) { b = getByte(offset); offset++; if (b == separator) { @@ -1131,7 +1133,7 @@ public boolean toLong(LongWrapper toLongResult) { // This is the case when we've encountered a decimal separator. The fractional // part will not change the number, but we will verify that the fractional part // is well formed. 
- while (offset < numBytes) { + while (offset <= end) { byte currentByte = getByte(offset); if (currentByte < '0' || currentByte > '9') { return false; @@ -1151,7 +1153,7 @@ public boolean toLong(LongWrapper toLongResult) { } /** - * Parses this UTF8String to int. + * Parses this UTF8String(trimmed if needed) to int. * * Note that, in this method we accumulate the result in negative format, and convert it to * positive format at the end, if this string is not started with '-'. This is because min value @@ -1168,18 +1170,20 @@ public boolean toLong(LongWrapper toLongResult) { * @return true if the parsing was successful else false */ public boolean toInt(IntWrapper intWrapper) { - if (numBytes == 0) { - return false; - } + int offset = 0; + while (offset < this.numBytes && getByte(offset) <= ' ') offset++; + if (offset == this.numBytes) return false; - byte b = getByte(0); + int end = this.numBytes - 1; + while (end > offset && getByte(end) <= ' ') end--; + + byte b = getByte(offset); final boolean negative = b == '-'; - int offset = 0; if (negative || b == '+') { - offset++; - if (numBytes == 1) { + if (end - offset == 0) { return false; } + offset++; } final byte separator = '.'; @@ -1187,7 +1191,7 @@ public boolean toInt(IntWrapper intWrapper) { final int stopValue = Integer.MIN_VALUE / radix; int result = 0; - while (offset < numBytes) { + while (offset <= end) { b = getByte(offset); offset++; if (b == separator) { @@ -1222,7 +1226,7 @@ public boolean toInt(IntWrapper intWrapper) { // This is the case when we've encountered a decimal separator. The fractional // part will not change the number, but we will verify that the fractional part // is well formed. 
- while (offset < numBytes) { + while (offset <= end) { byte currentByte = getByte(offset); if (currentByte < '0' || currentByte > '9') { return false; } offset++; } diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 2d5afa919e66..6fc78893e688 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -222,6 +222,8 @@ license: | - Since Spark 3.0, when casting interval values to string type, there is no "interval" prefix, e.g. `1 days 2 hours`. In Spark version 2.4 and earlier, the string contains the "interval" prefix like `interval 1 days 2 hours`. + - Since Spark 3.0, when casting string values to integral types, including tinyint, smallint, int and bigint type, the leading and trailing white spaces (<= ASCII 32) will be trimmed before being converted to integral values, e.g. `cast(' 1 ' as int)` results in `1`. In Spark version 2.4 and earlier, the result will be `null`. + ## Upgrading from Spark SQL 2.4 to 2.4.1 - The value of `spark.executor.heartbeatInterval`, when specified without units like "30" rather than "30s", was diff --git a/sql/core/src/test/resources/sql-tests/inputs/cast.sql b/sql/core/src/test/resources/sql-tests/inputs/cast.sql index 8a035f594be5..3c1702e6f837 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/cast.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/cast.sql @@ -60,3 +60,13 @@ DESC FUNCTION EXTENDED boolean; -- cast string to interval and interval to string SELECT CAST('interval 3 month 1 hour' AS interval); SELECT CAST(interval 3 month 1 hour AS string); + +-- trim string before cast to numeric +select cast(' 1' as tinyint); +select cast(' 1\t' as tinyint); +select cast(' 1' as smallint); +select cast(' 1' as INT); +select cast(' 1' as bigint); +select cast(' 1' as float); +select cast(' 1 ' as DOUBLE); +select cast('1.0 ' as DEC); \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/comparator.sql b/sql/core/src/test/resources/sql-tests/inputs/comparator.sql index 
3e2447723e57..70af4f75ac43 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/comparator.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/comparator.sql @@ -1,3 +1,13 @@ -- binary type select x'00' < x'0f'; select x'00' < x'ff'; + +-- trim string to numeric +select '1 ' = 1Y; +select '\t1 ' = 1Y; +select '1 ' = 1S; +select '1 ' = 1; +select ' 1' = 1L; +select ' 1' = cast(1.0 as float); +select ' 1.0 ' = 1.0D; +select ' 1.0 ' = 1.0BD; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/cast.sql.out b/sql/core/src/test/resources/sql-tests/results/cast.sql.out index 609d283da555..bf1e873a48d1 100644 --- a/sql/core/src/test/resources/sql-tests/results/cast.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cast.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 35 +-- Number of queries: 43 -- !query 0 @@ -287,3 +287,67 @@ SELECT CAST(interval 3 month 1 hour AS string) struct -- !query 34 output 3 months 1 hours + + +-- !query 35 +select cast(' 1' as tinyint) +-- !query 35 schema +struct +-- !query 35 output +1 + + +-- !query 36 +select cast(' 1\t' as tinyint) +-- !query 36 schema +struct +-- !query 36 output +1 + + +-- !query 37 +select cast(' 1' as smallint) +-- !query 37 schema +struct +-- !query 37 output +1 + + +-- !query 38 +select cast(' 1' as INT) +-- !query 38 schema +struct +-- !query 38 output +1 + + +-- !query 39 +select cast(' 1' as bigint) +-- !query 39 schema +struct +-- !query 39 output +1 + + +-- !query 40 +select cast(' 1' as float) +-- !query 40 schema +struct +-- !query 40 output +1.0 + + +-- !query 41 +select cast(' 1 ' as DOUBLE) +-- !query 41 schema +struct +-- !query 41 output +1.0 + + +-- !query 42 +select cast('1.0 ' as DEC) +-- !query 42 schema +struct +-- !query 42 output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/comparator.sql.out b/sql/core/src/test/resources/sql-tests/results/comparator.sql.out index 
afc7b5448b7b..a182da2a7963 100644 --- a/sql/core/src/test/resources/sql-tests/results/comparator.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/comparator.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 2 +-- Number of queries: 10 -- !query 0 @@ -16,3 +16,67 @@ select x'00' < x'ff' struct<(X'00' < X'FF'):boolean> -- !query 1 output true + + +-- !query 2 +select '1 ' = 1Y +-- !query 2 schema +struct<(CAST(1 AS TINYINT) = 1):boolean> +-- !query 2 output +true + + +-- !query 3 +select '\t1 ' = 1Y +-- !query 3 schema +struct<(CAST( 1 AS TINYINT) = 1):boolean> +-- !query 3 output +true + + +-- !query 4 +select '1 ' = 1S +-- !query 4 schema +struct<(CAST(1 AS SMALLINT) = 1):boolean> +-- !query 4 output +true + + +-- !query 5 +select '1 ' = 1 +-- !query 5 schema +struct<(CAST(1 AS INT) = 1):boolean> +-- !query 5 output +true + + +-- !query 6 +select ' 1' = 1L +-- !query 6 schema +struct<(CAST( 1 AS BIGINT) = 1):boolean> +-- !query 6 output +true + + +-- !query 7 +select ' 1' = cast(1.0 as float) +-- !query 7 schema +struct<(CAST( 1 AS FLOAT) = CAST(1.0 AS FLOAT)):boolean> +-- !query 7 output +true + + +-- !query 8 +select ' 1.0 ' = 1.0D +-- !query 8 schema +struct<(CAST( 1.0 AS DOUBLE) = 1.0):boolean> +-- !query 8 output +true + + +-- !query 9 +select ' 1.0 ' = 1.0BD +-- !query 9 schema +struct<(CAST( 1.0 AS DOUBLE) = CAST(1.0 AS DOUBLE)):boolean> +-- !query 9 output +true