Skip to content
Closed
Original file line number Diff line number Diff line change
Expand Up @@ -1062,6 +1062,19 @@ public static class IntWrapper implements Serializable {
public transient int value = 0;
}

/**
 * Strips leading and trailing ASCII space bytes (0x20) from this string in place by
 * advancing {@code offset} and shrinking {@code numBytes}.
 *
 * Both boundaries are located before any field is modified. {@code getByte(i)} is used
 * by the callers as an index relative to the current start of the string (for example
 * {@code getByte(0)} for the first byte), so moving {@code offset} while the scan is
 * still running would shift every subsequent read and skip bytes.
 *
 * A string consisting only of spaces (or an empty one) ends up with {@code numBytes == 0}.
 *
 * NOTE(review): this mutates the receiver; confirm that no caller relies on the string
 * being unchanged after toLong/toInt.
 */
private void insideTrim() {
  // Index of the first byte to keep; equals numBytes when every byte is a space.
  int start = 0;
  while (start < this.numBytes && getByte(start) == 0x20) {
    start++;
  }
  // Index of the last byte to keep; stays at start - 1 when nothing is left to keep.
  int end = this.numBytes - 1;
  while (end > start && getByte(end) == 0x20) {
    end--;
  }
  // Apply both adjustments at once so the length accounts for the dropped prefix too.
  this.offset += start;
  this.numBytes = end - start + 1;
}

/**
* Parses this UTF8String to long.
*
Expand All @@ -1077,6 +1090,7 @@ public static class IntWrapper implements Serializable {
* @return true if the parsing was successful else false
*/
public boolean toLong(LongWrapper toLongResult) {
insideTrim();
if (numBytes == 0) {
return false;
}
Expand Down Expand Up @@ -1168,10 +1182,10 @@ public boolean toLong(LongWrapper toLongResult) {
* @return true if the parsing was successful else false
*/
public boolean toInt(IntWrapper intWrapper) {
insideTrim();
if (numBytes == 0) {
return false;
}

byte b = getByte(0);
final boolean negative = b == '-';
int offset = 0;
Expand Down
21 changes: 21 additions & 0 deletions sql/core/benchmarks/CastBenchmark-results.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
================================================================================================
Cast String to Numeric
================================================================================================

Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.1
Intel(R) Core(TM) i5-5287U CPU @ 2.90GHz
Cast String to Numeric: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
cast(trim(str) as int) as c_int 6559 9435 NaN 1.2 800.6 1.0X
cast(trim(str) as long) as c_long 6841 9341 1416 1.2 835.1 1.0X

Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.1
Intel(R) Core(TM) i5-5287U CPU @ 2.90GHz
Cast String to Numeric: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
cast(trim(str) as int) as c_int 4259 5270 1110 1.9 519.9 1.0X
cast(trim(str) as long) as c_long 4081 5663 1372 2.0 498.2 1.0X
cast(str as int) as c_int 4071 4254 176 2.0 496.9 1.0X
cast(str as long) as c_long 4121 5087 1272 2.0 503.1 1.0X


4 changes: 4 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/cast.sql
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,7 @@ DESC FUNCTION EXTENDED boolean;
-- cast string to interval and interval to string
SELECT CAST('interval 3 month 1 hour' AS interval);
SELECT CAST(interval 3 month 1 hour AS string);

-- cast string with leading whitespace to numeric types
select cast(' 1' as int);
select cast(' 1' as bigint);
select cast(' 1' as double);
26 changes: 25 additions & 1 deletion sql/core/src/test/resources/sql-tests/results/cast.sql.out
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 35
-- Number of queries: 38


-- !query 0
Expand Down Expand Up @@ -287,3 +287,27 @@ SELECT CAST(interval 3 month 1 hour AS string)
struct<CAST(INTERVAL '3 months 1 hours' AS STRING):string>
-- !query 34 output
3 months 1 hours


-- !query 35
select cast(' 1' as int)
-- !query 35 schema
struct<CAST( 1 AS INT):int>
-- !query 35 output
1


-- !query 36
select cast(' 1' as bigint)
-- !query 36 schema
struct<CAST( 1 AS BIGINT):bigint>
-- !query 36 output
1


-- !query 37
select cast(' 1' as double)
-- !query 37 schema
struct<CAST( 1 AS DOUBLE):double>
-- !query 37 output
1.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.benchmark

import org.apache.spark.benchmark.Benchmark

/**
* Benchmark for casting a string to an integral type (int/long), comparing an explicit
* trim(str) before the cast against casting the untrimmed string directly.
* To run this benchmark:
* {{{
* 1. without sbt:
* bin/spark-submit --class <this class> --jars <spark core test jar> <spark sql test jar>
* 2. build/sbt "sql/test:runMain <this class>"
* 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
* Results will be written to "benchmarks/CastBenchmark-results.txt".
* }}}
*/
object CastBenchmark extends SqlBasedBenchmark {

  /**
   * Writes a parquet file of numeric strings padded with trailing spaces, then measures
   * `cast(trim(str) as T)` against `cast(str as T)` for T in (int, long).
   *
   * All four cases are registered on a single [[Benchmark]] and executed once, so every
   * case appears exactly once in the generated result table.
   */
  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
    val title = "Cast String to Integral"
    runBenchmark(title) {
      withTempPath { dir =>
        val N = 500L << 14
        val path = dir.getCanonicalPath
        val padding = " " * 5

        // Input column: the row id rendered as a string, followed by five spaces.
        spark.range(N)
          .selectExpr(s"concat(id, '$padding') as str")
          .write.mode("overwrite").parquet(path)

        val benchmark = new Benchmark(title, N, minNumIters = 5, output = output)
        for {
          trimFirst <- Seq(true, false)
          t <- Seq("int", "long")
        } {
          val str = if (trimFirst) "trim(str)" else "str"
          val expr = s"cast($str as $t) as c_$t"
          benchmark.addCase(expr) { _ =>
            spark.read.parquet(path).selectExpr(expr).collect()
          }
        }

        // Run once after every case has been added; running inside the loop would
        // re-measure the already registered cases and emit a duplicate, partial table.
        benchmark.run()
      }
    }
  }
}