diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 2a8dec9e64a6..f1762f4eac76 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -19,13 +19,12 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.net.{URI, URISyntaxException}
 import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols}
+import java.util.{Base64 => JBase64}
 import java.util.{HashMap, Locale, Map => JMap}
 import java.util.regex.Pattern
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.apache.commons.codec.binary.{Base64 => CommonsBase64}
-
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, FunctionRegistry, TypeCheckResult}
 import org.apache.spark.sql.catalyst.expressions.codegen._
@@ -2345,13 +2344,16 @@ case class Base64(child: Expression)
   override def inputTypes: Seq[DataType] = Seq(BinaryType)
 
   protected override def nullSafeEval(bytes: Any): Any = {
-    UTF8String.fromBytes(CommonsBase64.encodeBase64(bytes.asInstanceOf[Array[Byte]]))
+    // Basic (RFC 4648) encoder: one unbroken line, matching the previous commons-codec output.
+    // The MIME encoder would insert "\r\n" after every 76 characters of output.
+    UTF8String.fromBytes(JBase64.getEncoder.encode(bytes.asInstanceOf[Array[Byte]]))
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    // Keep in sync with nullSafeEval: same encoder, so no line separators in the output.
     nullSafeCodeGen(ctx, ev, (child) => {
       s"""${ev.value} = UTF8String.fromBytes(
-            ${classOf[CommonsBase64].getName}.encodeBase64($child));
+            ${classOf[JBase64].getName}.getEncoder().encode($child));
        """})
   }
 
@@ -2377,12 +2379,13 @@ case class UnBase64(child: Expression)
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
   protected override def nullSafeEval(string: Any): Any =
-    CommonsBase64.decodeBase64(string.asInstanceOf[UTF8String].toString)
+    // The MIME decoder ignores line separators and other characters outside the alphabet.
+    JBase64.getMimeDecoder.decode(string.asInstanceOf[UTF8String].toString)
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     nullSafeCodeGen(ctx, ev, (child) => {
       s"""
-         ${ev.value} = ${classOf[CommonsBase64].getName}.decodeBase64($child.toString());
+         ${ev.value} = ${classOf[JBase64].getName}.getMimeDecoder().decode($child.toString());
        """})
   }
 
diff --git a/sql/core/benchmarks/Base64Benchmark-jdk11-results.txt b/sql/core/benchmarks/Base64Benchmark-jdk11-results.txt
new file mode 100644
index 000000000000..eeff53679522
--- /dev/null
+++ b/sql/core/benchmarks/Base64Benchmark-jdk11-results.txt
@@ -0,0 +1,56 @@
+OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
+encode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+java 4000 4121 204 5.0 200.0 1.0X
+apache 34197 34280 71 0.6 1709.9 0.1X
+
+OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
+encode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+java 4696 4761 62 4.3 234.8 1.0X
+apache 35117 35342 262 0.6 1755.9 0.1X
+
+OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
+encode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+java 6059 6192 120 3.3 303.0 1.0X
+apache 36995 37108 109 0.5 1849.8 0.2X
+
+OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
+encode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s)
Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 6993 7032 52 2.9 349.6 1.0X +apache 37686 37888 198 0.5 1884.3 0.2X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +decode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 5322 5503 162 3.8 266.1 1.0X +apache 35180 35391 195 0.6 1759.0 0.2X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +decode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 6780 6814 38 2.9 339.0 1.0X +apache 35161 35279 102 0.6 1758.1 0.2X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +decode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 8941 9068 130 2.2 447.1 1.0X +apache 41628 41704 122 0.5 2081.4 0.2X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +decode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 10248 10336 77 2.0 512.4 1.0X +apache 42702 42732 47 0.5 2135.1 0.2X + diff --git a/sql/core/benchmarks/Base64Benchmark-jdk17-results.txt b/sql/core/benchmarks/Base64Benchmark-jdk17-results.txt new file mode 100644 index 000000000000..f9cc647bbb8d --- /dev/null 
+++ b/sql/core/benchmarks/Base64Benchmark-jdk17-results.txt @@ -0,0 +1,56 @@ +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +encode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 3787 3862 75 5.3 189.3 1.0X +apache 28972 29107 153 0.7 1448.6 0.1X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +encode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 4732 4741 8 4.2 236.6 1.0X +apache 31133 31330 230 0.6 1556.6 0.2X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +encode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 5928 5940 11 3.4 296.4 1.0X +apache 31932 31981 47 0.6 1596.6 0.2X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +encode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 6290 6312 36 3.2 314.5 1.0X +apache 33568 33677 107 0.6 1678.4 0.2X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +decode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 5087 5162 67 
3.9 254.4 1.0X +apache 30471 30598 161 0.7 1523.6 0.2X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +decode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 6362 6384 22 3.1 318.1 1.0X +apache 32436 32560 107 0.6 1621.8 0.2X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +decode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 8808 8812 5 2.3 440.4 1.0X +apache 37324 37537 215 0.5 1866.2 0.2X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +decode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 9904 9915 11 2.0 495.2 1.0X +apache 39963 40190 215 0.5 1998.2 0.2X + diff --git a/sql/core/benchmarks/Base64Benchmark-results.txt b/sql/core/benchmarks/Base64Benchmark-results.txt new file mode 100644 index 000000000000..2b18dbbc17ef --- /dev/null +++ b/sql/core/benchmarks/Base64Benchmark-results.txt @@ -0,0 +1,56 @@ +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +encode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 5408 5970 745 3.7 270.4 1.0X +apache 35038 35285 216 0.6 1751.9 0.2X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 
2.60GHz +encode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 5950 6191 209 3.4 297.5 1.0X +apache 37222 37440 191 0.5 1861.1 0.2X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +encode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 7472 7815 363 2.7 373.6 1.0X +apache 40215 40300 143 0.5 2010.7 0.2X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +encode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 9548 9721 296 2.1 477.4 1.0X +apache 40876 41011 143 0.5 2043.8 0.2X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +decode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 6835 7203 624 2.9 341.8 1.0X +apache 37065 37202 184 0.5 1853.3 0.2X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +decode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 8151 8292 187 2.5 407.5 1.0X +apache 39188 39455 262 0.5 1959.4 0.2X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +decode for 
5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 11225 11582 429 1.8 561.2 1.0X +apache 42835 42987 145 0.5 2141.8 0.3X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +decode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java 13722 13987 301 1.5 686.1 1.0X +apache 44221 44443 354 0.5 2211.0 0.3X + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala new file mode 100644 index 000000000000..eb0b896574a6 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.spark.sql.execution.benchmark
+
+import org.apache.spark.benchmark.Benchmark
+
+/**
+ * Benchmark for measuring perf of different Base64 implementations
+ * To run this benchmark:
+ * {{{
+ *   1. without sbt:
+ *      bin/spark-submit --class <this class> --jars <spark core test jar> <spark sql test jar>
+ *   2. build/sbt "sql/test:runMain <this class>"
+ *   3. generate result:
+ *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
+ *      Results will be written to "benchmarks/Base64Benchmark-results.txt".
+ * }}}
+ */
+object Base64Benchmark extends SqlBasedBenchmark {
+  import spark.implicits._
+  private val N = 20L * 1000 * 1000
+
+  /** Feeds the bytes of "Spark" * len to `f` for each of N rows, discarding the result. */
+  private def doEncode(len: Int, f: Array[Byte] => Array[Byte]): Unit = {
+    spark.range(N).map(_ => "Spark" * len).foreach { s =>
+      f(s.getBytes)
+      ()
+    }
+  }
+
+  /** Feeds the MIME-encoded bytes of "Spark" * len to `f` for each of N rows. */
+  private def doDecode(len: Int, f: Array[Byte] => Array[Byte]): Unit = {
+    spark.range(N).map(_ => "Spark" * len).map { s =>
+      // using the same encode func
+      java.util.Base64.getMimeEncoder.encode(s.getBytes)
+    }.foreach { s =>
+      f(s)
+      ()
+    }
+  }
+
+  /**
+   * Builds and runs one benchmark per input size (1, 3, 5, 7), timing the JDK codec ("java")
+   * against the commons-codec one ("apache"); `drive` pushes the rows through each codec.
+   */
+  private def compare(op: String, drive: (Int, Array[Byte] => Array[Byte]) => Unit)(
+      jdk: Array[Byte] => Array[Byte], commons: Array[Byte] => Array[Byte]): Unit = {
+    Seq(1, 3, 5, 7).foreach { len =>
+      val benchmark = new Benchmark(s"$op for $len", N, output = output)
+      benchmark.addCase("java", 3)(_ => drive(len, jdk))
+      benchmark.addCase("apache", 3)(_ => drive(len, commons))
+      benchmark.run()
+    }
+  }
+
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    import java.util.{Base64 => JBase64}
+    import org.apache.commons.codec.binary.{Base64 => CommonsBase64}
+
+    compare("encode", doEncode)(x => JBase64.getMimeEncoder.encode(x), CommonsBase64.encodeBase64)
+    compare("decode", doDecode)(x => JBase64.getMimeDecoder.decode(x), CommonsBase64.decodeBase64)
+  }
+}