Skip to content
Closed
Original file line number Diff line number Diff line change
Expand Up @@ -1063,7 +1063,7 @@ public static class IntWrapper implements Serializable {
}

/**
* Parses this UTF8String to long.
* Parses this UTF8String (trimmed if needed) to long.
*
* Note that, in this method we accumulate the result in negative format, and convert it to
* positive format at the end, if this string is not started with '-'. This is because min value
Expand All @@ -1077,26 +1077,28 @@ public static class IntWrapper implements Serializable {
* @return true if the parsing was successful else false
*/
public boolean toLong(LongWrapper toLongResult) {
if (numBytes == 0) {
return false;
}
int offset = 0;
while (offset < this.numBytes && getByte(offset) <= ' ') offset++;
if (offset == this.numBytes) return false;

byte b = getByte(0);
int end = this.numBytes - 1;
while (end > offset && getByte(end) <= ' ') end--;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need to trim from the right explicitly here. Just break inside the loop https://github.com/apache/spark/pull/26622/files#diff-d2b5337b91f684b9e7fd5cc101e93fc8R1104 if b == ' '

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, I guess not; how would you know whether the ' ' is in the middle or at the end?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see


byte b = getByte(offset);
final boolean negative = b == '-';
int offset = 0;
if (negative || b == '+') {
offset++;
if (numBytes == 1) {
if (end - offset == 0) {
return false;
}
offset++;
}

final byte separator = '.';
final int radix = 10;
final long stopValue = Long.MIN_VALUE / radix;
long result = 0;

while (offset < numBytes) {
while (offset <= end) {
b = getByte(offset);
offset++;
if (b == separator) {
Expand Down Expand Up @@ -1131,7 +1133,7 @@ public boolean toLong(LongWrapper toLongResult) {
// This is the case when we've encountered a decimal separator. The fractional
// part will not change the number, but we will verify that the fractional part
// is well formed.
while (offset < numBytes) {
while (offset <= end) {
byte currentByte = getByte(offset);
if (currentByte < '0' || currentByte > '9') {
return false;
Expand All @@ -1151,7 +1153,7 @@ public boolean toLong(LongWrapper toLongResult) {
}

/**
* Parses this UTF8String to int.
* Parses this UTF8String (trimmed if needed) to int.
*
* Note that, in this method we accumulate the result in negative format, and convert it to
* positive format at the end, if this string is not started with '-'. This is because min value
Expand All @@ -1168,26 +1170,28 @@ public boolean toLong(LongWrapper toLongResult) {
* @return true if the parsing was successful else false
*/
public boolean toInt(IntWrapper intWrapper) {
if (numBytes == 0) {
return false;
}
int offset = 0;
while (offset < this.numBytes && getByte(offset) <= ' ') offset++;
if (offset == this.numBytes) return false;

byte b = getByte(0);
int end = this.numBytes - 1;
while (end > offset && getByte(end) <= ' ') end--;

byte b = getByte(offset);
final boolean negative = b == '-';
int offset = 0;
if (negative || b == '+') {
offset++;
if (numBytes == 1) {
if (end - offset == 0) {
return false;
}
offset++;
}

final byte separator = '.';
final int radix = 10;
final int stopValue = Integer.MIN_VALUE / radix;
int result = 0;

while (offset < numBytes) {
while (offset <= end) {
b = getByte(offset);
offset++;
if (b == separator) {
Expand Down Expand Up @@ -1222,7 +1226,7 @@ public boolean toInt(IntWrapper intWrapper) {
// This is the case when we've encountered a decimal separator. The fractional
// part will not change the number, but we will verify that the fractional part
// is well formed.
while (offset < numBytes) {
while (offset <= end) {
byte currentByte = getByte(offset);
if (currentByte < '0' || currentByte > '9') {
return false;
Expand Down
2 changes: 2 additions & 0 deletions docs/sql-migration-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,8 @@ license: |

- Since Spark 3.0, when casting interval values to string type, there is no "interval" prefix, e.g. `1 days 2 hours`. In Spark version 2.4 and earlier, the string contains the "interval" prefix like `interval 1 days 2 hours`.

- Since Spark 3.0, when casting a string value to an integral type (tinyint, smallint, int or bigint), leading and trailing whitespace characters (<= ASCII 32) are trimmed before the value is converted, e.g. `cast(' 1 ' as int)` results in `1`. In Spark version 2.4 and earlier, the result is `null`.

## Upgrading from Spark SQL 2.4 to 2.4.1

- The value of `spark.executor.heartbeatInterval`, when specified without units like "30" rather than "30s", was
Expand Down
10 changes: 10 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/cast.sql
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,13 @@ DESC FUNCTION EXTENDED boolean;
-- cast string to interval and interval to string
SELECT CAST('interval 3 month 1 hour' AS interval);
SELECT CAST(interval 3 month 1 hour AS string);

-- trim string before cast to numeric
select cast(' 1' as tinyint);
select cast(' 1\t' as tinyint);
select cast(' 1' as smallint);
select cast(' 1' as INT);
select cast(' 1' as bigint);
select cast(' 1' as float);
select cast(' 1 ' as DOUBLE);
select cast('1.0 ' as DEC);
10 changes: 10 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/comparator.sql
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
-- binary type
select x'00' < x'0f';
select x'00' < x'ff';

-- trim string to numeric
select '1 ' = 1Y;
select '\t1 ' = 1Y;
select '1 ' = 1S;
select '1 ' = 1;
select ' 1' = 1L;
select ' 1' = cast(1.0 as float);
select ' 1.0 ' = 1.0D;
select ' 1.0 ' = 1.0BD;
66 changes: 65 additions & 1 deletion sql/core/src/test/resources/sql-tests/results/cast.sql.out
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 35
-- Number of queries: 43


-- !query 0
Expand Down Expand Up @@ -287,3 +287,67 @@ SELECT CAST(interval 3 month 1 hour AS string)
struct<CAST(INTERVAL '3 months 1 hours' AS STRING):string>
-- !query 34 output
3 months 1 hours


-- !query 35
select cast(' 1' as tinyint)
-- !query 35 schema
struct<CAST( 1 AS TINYINT):tinyint>
-- !query 35 output
1


-- !query 36
select cast(' 1\t' as tinyint)
-- !query 36 schema
struct<CAST( 1 AS TINYINT):tinyint>
-- !query 36 output
1


-- !query 37
select cast(' 1' as smallint)
-- !query 37 schema
struct<CAST( 1 AS SMALLINT):smallint>
-- !query 37 output
1


-- !query 38
select cast(' 1' as INT)
-- !query 38 schema
struct<CAST( 1 AS INT):int>
-- !query 38 output
1


-- !query 39
select cast(' 1' as bigint)
-- !query 39 schema
struct<CAST( 1 AS BIGINT):bigint>
-- !query 39 output
1


-- !query 40
select cast(' 1' as float)
-- !query 40 schema
struct<CAST( 1 AS FLOAT):float>
-- !query 40 output
1.0


-- !query 41
select cast(' 1 ' as DOUBLE)
-- !query 41 schema
struct<CAST( 1 AS DOUBLE):double>
-- !query 41 output
1.0


-- !query 42
select cast('1.0 ' as DEC)
-- !query 42 schema
struct<CAST(1.0 AS DECIMAL(10,0)):decimal(10,0)>
-- !query 42 output
NULL
66 changes: 65 additions & 1 deletion sql/core/src/test/resources/sql-tests/results/comparator.sql.out
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 2
-- Number of queries: 10


-- !query 0
Expand All @@ -16,3 +16,67 @@ select x'00' < x'ff'
struct<(X'00' < X'FF'):boolean>
-- !query 1 output
true


-- !query 2
select '1 ' = 1Y
-- !query 2 schema
struct<(CAST(1 AS TINYINT) = 1):boolean>
-- !query 2 output
true


-- !query 3
select '\t1 ' = 1Y
-- !query 3 schema
struct<(CAST( 1 AS TINYINT) = 1):boolean>
-- !query 3 output
true


-- !query 4
select '1 ' = 1S
-- !query 4 schema
struct<(CAST(1 AS SMALLINT) = 1):boolean>
-- !query 4 output
true


-- !query 5
select '1 ' = 1
-- !query 5 schema
struct<(CAST(1 AS INT) = 1):boolean>
-- !query 5 output
true


-- !query 6
select ' 1' = 1L
-- !query 6 schema
struct<(CAST( 1 AS BIGINT) = 1):boolean>
-- !query 6 output
true


-- !query 7
select ' 1' = cast(1.0 as float)
-- !query 7 schema
struct<(CAST( 1 AS FLOAT) = CAST(1.0 AS FLOAT)):boolean>
-- !query 7 output
true


-- !query 8
select ' 1.0 ' = 1.0D
-- !query 8 schema
struct<(CAST( 1.0 AS DOUBLE) = 1.0):boolean>
-- !query 8 output
true


-- !query 9
select ' 1.0 ' = 1.0BD
-- !query 9 schema
struct<(CAST( 1.0 AS DOUBLE) = CAST(1.0 AS DOUBLE)):boolean>
-- !query 9 output
true