From 15c00945ce7e3261acf0141e4c62fd31ce7defb3 Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Wed, 6 Mar 2024 18:19:59 +0000
Subject: [PATCH 01/16] move to RFC4648

---
 .../spark/sql/catalyst/expressions/stringExpressions.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 259556826ad9..852ed7da72da 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2433,13 +2433,13 @@ case class Base64(child: Expression)
   override def inputTypes: Seq[DataType] = Seq(BinaryType)
 
   protected override def nullSafeEval(bytes: Any): Any = {
-    UTF8String.fromBytes(JBase64.getMimeEncoder.encode(bytes.asInstanceOf[Array[Byte]]))
+    UTF8String.fromBytes(JBase64.getEncoder.encode(bytes.asInstanceOf[Array[Byte]]))
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     nullSafeCodeGen(ctx, ev, (child) => {
       s"""${ev.value} = UTF8String.fromBytes(
-        ${classOf[JBase64].getName}.getMimeEncoder().encode($child));
+        ${classOf[JBase64].getName}.getEncoder().encode($child));
       """})
   }

From b2019e9362395db79a6bf9fbbe6336ee78ea769b Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Wed, 6 Mar 2024 18:22:40 +0000
Subject: [PATCH 02/16] remove others

---
 .../spark/sql/execution/benchmark/Base64Benchmark.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
index 3ad6baea84f2..efbf0132831c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
@@ -45,7 +45,7 @@ object Base64Benchmark extends SqlBasedBenchmark {
   private def doDecode(len: Int, f: Array[Byte] => Array[Byte]): Unit = {
     spark.range(N).map(_ => "Spark" * len).map { s =>
       // using the same encode func
-      java.util.Base64.getMimeEncoder.encode(s.getBytes)
+      java.util.Base64.getEncoder.encode(s.getBytes)
     }.foreach { s =>
       f(s)
       ()
@@ -56,7 +56,7 @@
     Seq(1, 3, 5, 7).map { len =>
       val benchmark = new Benchmark(s"encode for $len", N, output = output)
       benchmark.addCase("java", 3) { _ =>
-        doEncode(len, x => java.util.Base64.getMimeEncoder().encode(x))
+        doEncode(len, x => java.util.Base64.getEncoder.encode(x))
       }
       benchmark.addCase(s"apache", 3) { _ =>
         doEncode(len, org.apache.commons.codec.binary.Base64.encodeBase64)
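The change in these first two patches hinges on the difference between the two JDK encoders: `java.util.Base64.getEncoder` implements RFC 4648 and emits one unbroken string, while `getMimeEncoder` implements RFC 2045 and inserts a CRLF after every 76 output characters. A minimal standalone sketch of the observable difference (plain Scala, no Spark dependencies):

```scala
import java.util.Base64

object EncoderDiff {
  def main(args: Array[String]): Unit = {
    // 60 input bytes produce 80 base64 characters, past the 76-char MIME limit.
    val input = Array.fill[Byte](60)('a'.toByte)

    val basic = new String(Base64.getEncoder.encode(input))
    val mime = new String(Base64.getMimeEncoder.encode(input))

    println(basic.contains("\r\n")) // false: RFC 4648, one unbroken run
    println(mime.contains("\r\n"))  // true: RFC 2045, wrapped at 76 chars
  }
}
```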
From c2e960502a91f4dbeb0f52635299e666a5167f1b Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Wed, 6 Mar 2024 18:29:20 +0000
Subject: [PATCH 03/16] do decoders too

---
 .../spark/sql/catalyst/expressions/stringExpressions.scala | 4 ++--
 .../spark/sql/execution/benchmark/Base64Benchmark.scala    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 852ed7da72da..1874b856acb1 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2476,7 +2476,7 @@ case class UnBase64(child: Expression, failOnError: Boolean = false)
           UTF8String.fromString("BASE64"), "try_to_binary")
     }
-    JBase64.getMimeDecoder.decode(string.asInstanceOf[UTF8String].toString)
+    JBase64.getDecoder.decode(string.asInstanceOf[UTF8String].toString)
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
@@ -2498,7 +2498,7 @@ case class UnBase64(child: Expression, failOnError: Boolean = false)
       }
       s"""
          $maybeValidateInputCode
-         ${ev.value} = ${classOf[JBase64].getName}.getMimeDecoder().decode($child.toString());
+         ${ev.value} = ${classOf[JBase64].getName}.getDecoder().decode($child.toString());
        """})
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
index efbf0132831c..2f10283b4a8b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
@@ -67,7 +67,7 @@ object Base64Benchmark extends SqlBasedBenchmark {
     Seq(1, 3, 5, 7).map { len =>
       val benchmark = new Benchmark(s"decode for $len", N, output = output)
       benchmark.addCase("java", 3) { _ =>
-        doDecode(len, x => java.util.Base64.getMimeDecoder.decode(x))
+        doDecode(len, x => java.util.Base64.getDecoder.decode(x))
       }
       benchmark.addCase(s"apache", 3) { _ =>
         doDecode(len, org.apache.commons.codec.binary.Base64.decodeBase64)

From 81621b17eb3226279ae4f6c8bc41da6bcce67cb6 Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Wed, 6 Mar 2024 19:10:21 +0000
Subject: [PATCH 04/16] add test

---
 .../sql/catalyst/expressions/StringExpressionsSuite.scala | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 98f33e209994..04c44d04a71f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -506,6 +506,13 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     GenerateUnsafeProjection.generate(StringDecode(b, Literal("\"quote")) :: Nil)
   }
 
+  test("SPARK-47307: base64 encoding without line breaks") {
+    val longString = "a" * 58
+    val encoded = "YWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYQ=="
+
+    checkEvaluation(Base64(Literal(longString.getBytes)), encoded, create_row("abcdefgh"))
+  }
+
   test("initcap unit test") {
     checkEvaluation(InitCap(Literal.create(null, StringType)), null)
     checkEvaluation(InitCap(Literal("a b")), "A B")
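The test vector added above can be checked outside Spark: 58 input bytes encode to 80 base64 characters, one full MIME line of 76 plus four more, so the old MIME-based implementation would have inserted a line break exactly where this expectation requires none. A quick standalone check:

```scala
import java.util.Base64

object TestVectorCheck {
  def main(args: Array[String]): Unit = {
    val longString = "a" * 58
    val encoded = Base64.getEncoder.encodeToString(longString.getBytes)
    println(encoded.length)            // 80
    println(encoded.contains("\r\n"))  // false
    println(encoded.endsWith("YQ=="))  // true: 19 full "aaa" groups, then "a" padded
  }
}
```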
From 2bd8b8962b8002f861582cc0bbf4e1905863383a Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Wed, 6 Mar 2024 19:08:52 +0000
Subject: [PATCH 05/16] fix upstream

---
 .../spark/sql/catalyst/expressions/stringExpressions.scala | 5 +++--
 .../spark/sql/execution/benchmark/Base64Benchmark.scala    | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 1874b856acb1..b130258ec49d 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2433,13 +2433,14 @@ case class Base64(child: Expression)
   override def inputTypes: Seq[DataType] = Seq(BinaryType)
 
   protected override def nullSafeEval(bytes: Any): Any = {
-    UTF8String.fromBytes(JBase64.getEncoder.encode(bytes.asInstanceOf[Array[Byte]]))
+    UTF8String.fromBytes(
+      JBase64.getMimeEncoder(-1, Array()).encode(bytes.asInstanceOf[Array[Byte]]))
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     nullSafeCodeGen(ctx, ev, (child) => {
       s"""${ev.value} = UTF8String.fromBytes(
-        ${classOf[JBase64].getName}.getEncoder().encode($child));
+        ${classOf[JBase64].getName}.getMimeEncoder(-1, new byte[0]).encode($child));
       """})
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
index 2f10283b4a8b..e9e40b38d781 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
@@ -45,7 +45,7 @@ object Base64Benchmark extends SqlBasedBenchmark {
   private def doDecode(len: Int, f: Array[Byte] => Array[Byte]): Unit = {
     spark.range(N).map(_ => "Spark" * len).map { s =>
       // using the same encode func
-      java.util.Base64.getEncoder.encode(s.getBytes)
+      java.util.Base64.getMimeEncoder(-1, new byte[0]).encode(s.getBytes)
     }.foreach { s =>
       f(s)
       ()
@@ -56,7 +56,7 @@
     Seq(1, 3, 5, 7).map { len =>
       val benchmark = new Benchmark(s"encode for $len", N, output = output)
       benchmark.addCase("java", 3) { _ =>
-        doEncode(len, x => java.util.Base64.getEncoder.encode(x))
+        doEncode(len, x => java.util.Base64.getMimeEncoder(-1, new byte[0]).encode(x))
      }
      benchmark.addCase(s"apache", 3) { _ =>
        doEncode(len, org.apache.commons.codec.binary.Base64.encodeBase64)

From a36161a9ea692a829bf43226d4eab9e3493af760 Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Thu, 7 Mar 2024 08:02:12 +0000
Subject: [PATCH 06/16] mime to decode

---
 .../spark/sql/catalyst/expressions/stringExpressions.scala | 4 ++--
 .../spark/sql/execution/benchmark/Base64Benchmark.scala    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index b130258ec49d..37e19dc0760f 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2477,7 +2477,7 @@ case class UnBase64(child: Expression, failOnError: Boolean = false)
           UTF8String.fromString("BASE64"), "try_to_binary")
     }
-    JBase64.getDecoder.decode(string.asInstanceOf[UTF8String].toString)
+    JBase64.getMimeDecoder.decode(string.asInstanceOf[UTF8String].toString)
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
@@ -2499,7 +2499,7 @@ case class UnBase64(child: Expression, failOnError: Boolean = false)
       }
       s"""
          $maybeValidateInputCode
-         ${ev.value} = ${classOf[JBase64].getName}.getDecoder().decode($child.toString());
+         ${ev.value} = ${classOf[JBase64].getName}.getMimeDecoder().decode($child.toString());
        """})
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
index e9e40b38d781..a4b788eb335b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
@@ -67,7 +67,7 @@ object Base64Benchmark extends SqlBasedBenchmark {
     Seq(1, 3, 5, 7).map { len =>
       val benchmark = new Benchmark(s"decode for $len", N, output = output)
       benchmark.addCase("java", 3) { _ =>
-        doDecode(len, x => java.util.Base64.getDecoder.decode(x))
+        doDecode(len, x => java.util.Base64.getMimeDecoder.decode(x))
       }
       benchmark.addCase(s"apache", 3) { _ =>
         doDecode(len, org.apache.commons.codec.binary.Base64.decodeBase64)
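Patch 06 deliberately keeps the lenient decoder even though the encoder no longer chunks, presumably so that values written by older releases (which may contain CRLFs) stay decodable: the strict RFC 4648 decoder rejects any character outside the base64 alphabet, while the MIME decoder silently skips such characters. A sketch of the asymmetry:

```scala
import java.util.Base64

object DecoderLeniency {
  def main(args: Array[String]): Unit = {
    // Output style of the previous, chunking encoder.
    val chunked = "YWFh\r\nYWFh"

    // The MIME decoder skips the CRLF and decodes the rest.
    println(new String(Base64.getMimeDecoder.decode(chunked))) // aaaaaa

    // The strict RFC 4648 decoder throws on the '\r'.
    try Base64.getDecoder.decode(chunked)
    catch { case e: IllegalArgumentException => println(e.getMessage) }
  }
}
```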
From 4016eb3e11a0885515df7405adad5868ef638b32 Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Thu, 7 Mar 2024 08:03:05 +0000
Subject: [PATCH 07/16] scala array

---
 .../spark/sql/execution/benchmark/Base64Benchmark.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
index a4b788eb335b..29b09682b503 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
@@ -45,7 +45,7 @@ object Base64Benchmark extends SqlBasedBenchmark {
   private def doDecode(len: Int, f: Array[Byte] => Array[Byte]): Unit = {
     spark.range(N).map(_ => "Spark" * len).map { s =>
       // using the same encode func
-      java.util.Base64.getMimeEncoder(-1, new byte[0]).encode(s.getBytes)
+      java.util.Base64.getMimeEncoder(-1, Array()).encode(s.getBytes)
     }.foreach { s =>
       f(s)
       ()
@@ -56,7 +56,7 @@
     Seq(1, 3, 5, 7).map { len =>
       val benchmark = new Benchmark(s"encode for $len", N, output = output)
       benchmark.addCase("java", 3) { _ =>
-        doEncode(len, x => java.util.Base64.getMimeEncoder(-1, new byte[0]).encode(x))
+        doEncode(len, x => java.util.Base64.getMimeEncoder(-1, Array()).encode(x))
       }
       benchmark.addCase(s"apache", 3) { _ =>
         doEncode(len, org.apache.commons.codec.binary.Base64.encodeBase64)
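The one-line fix in patch 07 is pure syntax: `new byte[0]` is Java array syntax and does not compile in a Scala source file. An empty `Array()` literal works because the expected parameter type of `getMimeEncoder` (`byte[]`) pins the element type to `Byte`, and a non-positive line length disables chunking entirely:

```scala
import java.util.Base64

object EmptySeparator {
  def main(args: Array[String]): Unit = {
    // Array() is inferred as Array[Byte] from getMimeEncoder's byte[] parameter;
    // with lineLength <= 0 the encoder never inserts a separator at all.
    val encoder = Base64.getMimeEncoder(-1, Array())
    val out = encoder.encodeToString(Array.fill[Byte](60)('a'.toByte))
    println(out.length)            // 80
    println(out.contains("\r\n"))  // false
  }
}
```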
From 109b5148763bcf162617e7fe0408e7f9e1ef73d3 Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Fri, 8 Mar 2024 11:52:06 +0000
Subject: [PATCH 08/16] make it configurable

---
 .../expressions/stringExpressions.scala         | 17 +++++++++++------
 .../org/apache/spark/sql/internal/SQLConf.scala | 11 ++++++++++-
 .../expressions/StringExpressionsSuite.scala    |  4 ++--
 .../execution/benchmark/Base64Benchmark.scala   |  4 ++--
 4 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 37e19dc0760f..8a4a198d1f3a 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2426,22 +2426,27 @@ case class Chr(child: Expression)
   """,
   since = "1.5.0",
   group = "string_funcs")
-case class Base64(child: Expression)
+case class Base64(child: Expression, chunkBase64: Boolean)
   extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant {
 
+  def this(child: Expression) = this(child, SQLConf.get.base64Chunking)
+
+  lazy val encoder: JBase64.Encoder = if (chunkBase64) {
+    JBase64.getMimeEncoder
+  } else {
+    JBase64.getMimeEncoder(-1, Array())
+  }
+
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(BinaryType)
 
   protected override def nullSafeEval(bytes: Any): Any = {
-    UTF8String.fromBytes(
-      JBase64.getMimeEncoder(-1, Array()).encode(bytes.asInstanceOf[Array[Byte]]))
+    UTF8String.fromBytes(encoder.encode(bytes.asInstanceOf[Array[Byte]]))
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     nullSafeCodeGen(ctx, ev, (child) => {
-      s"""${ev.value} = UTF8String.fromBytes(
-        ${classOf[JBase64].getName}.getMimeEncoder(-1, new byte[0]).encode($child));
-      """})
+      s"${ev.value} = UTF8String.fromBytes($encoder}.encode($child));"
+    })
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index bc4734775c77..2a39ef629f3b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3293,7 +3293,7 @@ object SQLConf {
       "standard directly, but their behaviors align with ANSI SQL's style")
     .version("3.0.0")
     .booleanConf
-    .createWithDefault(sys.env.get("SPARK_ANSI_SQL_MODE").contains("true"))
+    .createWithDefault(sys.env.get("SPARK_ANSI_SQL_MODE").contains("true")
 
   val ENFORCE_RESERVED_KEYWORDS = buildConf("spark.sql.ansi.enforceReservedKeywords")
     .doc(s"When true and '${ANSI_ENABLED.key}' is true, the Spark SQL parser enforces the ANSI " +
@@ -3318,6 +3318,13 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
+  val BASE_64_CHUNKING = buildConf("spark.sql.base64.chunking")
+    .doc("When true, base64 strings generated by the base64 function are chunked into lines of " +
+      "at most 76 characters. When false, the base64 strings are not chunked.")
+    .version("3.5.2")
+    .booleanConf
+    .createWithDefault(true)
+
   val ENABLE_DEFAULT_COLUMNS =
     buildConf("spark.sql.defaultColumn.enabled")
       .internal()
@@ -5398,6 +5405,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
 
   def ansiRelationPrecedence: Boolean = ansiEnabled && getConf(ANSI_RELATION_PRECEDENCE)
 
+  def base64Chunking: Boolean = getConf(BASE_64_CHUNKING)
+
   def timestampType: AtomicType = getConf(TIMESTAMP_TYPE) match {
     case "TIMESTAMP_LTZ" =>
       // For historical reason, the TimestampType maps to TIMESTAMP WITH LOCAL TIME ZONE
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 04c44d04a71f..a3b6b4ac865b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -506,11 +506,11 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     GenerateUnsafeProjection.generate(StringDecode(b, Literal("\"quote")) :: Nil)
   }
 
-  test("SPARK-47307: base64 encoding without line breaks") {
+  test("SPARK-47307: base64 encoding without chunking") {
     val longString = "a" * 58
     val encoded = "YWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYQ=="
 
-    checkEvaluation(Base64(Literal(longString.getBytes)), encoded, create_row("abcdefgh"))
+    checkEvaluation(Base64(Literal(longString.getBytes), false), encoded, create_row("abcdefgh"))
   }
 
   test("initcap unit test") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
index 29b09682b503..a021477e88a0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/Base64Benchmark.scala
@@ -45,7 +45,7 @@ object Base64Benchmark extends SqlBasedBenchmark {
   private def doDecode(len: Int, f: Array[Byte] => Array[Byte]): Unit = {
     spark.range(N).map(_ => "Spark" * len).map { s =>
       // using the same encode func
-      java.util.Base64.getMimeEncoder(-1, Array()).encode(s.getBytes)
+      java.util.Base64.getMimeEncoder.encode(s.getBytes)
     }.foreach { s =>
       f(s)
       ()
@@ -56,7 +56,7 @@
     Seq(1, 3, 5, 7).map { len =>
       val benchmark = new Benchmark(s"encode for $len", N, output = output)
       benchmark.addCase("java", 3) { _ =>
-        doEncode(len, x => java.util.Base64.getMimeEncoder(-1, Array()).encode(x))
+        doEncode(len, x => java.util.Base64.getMimeEncoder.encode(x))
       }
       benchmark.addCase(s"apache", 3) { _ =>
         doEncode(len, org.apache.commons.codec.binary.Base64.encodeBase64)
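With patch 08 the behavior becomes session-configurable. A hedged usage sketch; the key shown is the one introduced here, and a later patch in the series renames it to `spark.sql.chunkBase64String.enabled`:

```scala
import org.apache.spark.sql.SparkSession

object ChunkingFlagDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").getOrCreate()

    // Opt out of 76-character chunking for this session.
    spark.conf.set("spark.sql.base64.chunking", "false")
    spark.sql("SELECT base64(repeat('a', 58))").show(truncate = false)

    spark.stop()
  }
}
```

Note that this revision also carries the two slips the following patches repair: the interpolation `$encoder}` in `doGenCode` splices the `Encoder` object's default `toString` (plus a stray brace) into the generated Java, and the SQLConf hunk accidentally drops the closing parenthesis of the ANSI default; patches 09 and 11 undo them.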
From 946af79e97533edb95c48729a874254e9b0d4614 Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Fri, 8 Mar 2024 12:45:01 +0000
Subject: [PATCH 09/16] fix the codegen

---
 .../sql/catalyst/expressions/stringExpressions.scala | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 8a4a198d1f3a..bb9e103f6b35 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2445,7 +2445,15 @@ case class Base64(child: Expression, chunkBase64: Boolean)
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     nullSafeCodeGen(ctx, ev, (child) => {
-      s"${ev.value} = UTF8String.fromBytes($encoder}.encode($child));"
+      s"""
+        if ($chunkBase64) {
+          ${ev.value} = UTF8String.fromBytes(
+            ${classOf[JBase64].getName}.getMimeEncoder().encode($child));
+        } else {
+          ${ev.value} = UTF8String.fromBytes(
+            ${classOf[JBase64].getName}.getMimeEncoder(-1, new byte[0]).encode($child));
+        }
+      """
     })
   }
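The codegen fix works because `$chunkBase64` interpolates a Scala `Boolean` into the generated Java source, so each compiled expression carries `if (true)` or `if (false)` around the two encoder calls. The mechanics can be replayed by hand; `value_0` and `child_0` below are stand-ins for the variable names codegen would supply:

```scala
import java.util.{Base64 => JBase64}

object TemplateExpansion {
  def main(args: Array[String]): Unit = {
    val chunkBase64 = false
    val ev = "value_0"    // stand-in for the codegen result variable
    val child = "child_0" // stand-in for the codegen input variable
    val code =
      s"""
        if ($chunkBase64) {
          $ev = UTF8String.fromBytes(
            ${classOf[JBase64].getName}.getMimeEncoder().encode($child));
        } else {
          $ev = UTF8String.fromBytes(
            ${classOf[JBase64].getName}.getMimeEncoder(-1, new byte[0]).encode($child));
        }
      """
    // Prints Java that branches on a literal; contrast with the previous
    // revision, whose $encoder interpolation spliced in Encoder#toString.
    println(code)
  }
}
```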
From 0dadce5f2c3de2dec8f8230c635a14e18f98b476 Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Fri, 8 Mar 2024 12:53:38 +0000
Subject: [PATCH 10/16] reorg

---
 .../spark/sql/catalyst/expressions/stringExpressions.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index bb9e103f6b35..6bd44e3fd858 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2429,13 +2429,13 @@ case class Chr(child: Expression)
 case class Base64(child: Expression, chunkBase64: Boolean)
   extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant {
 
-  def this(child: Expression) = this(child, SQLConf.get.base64Chunking)
-
   lazy val encoder: JBase64.Encoder = if (chunkBase64) {
     JBase64.getMimeEncoder
   } else {
     JBase64.getMimeEncoder(-1, Array())
   }
+
+  def this(child: Expression) = this(child, SQLConf.get.base64Chunking)
 
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(BinaryType)

From 55b6bf22e585c3f2e9548176d9d67532c2fd23b6 Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Fri, 8 Mar 2024 13:32:29 +0000
Subject: [PATCH 11/16] undo bad diff

---
 .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 2a39ef629f3b..d1992f9c6f04 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3293,7 +3293,7 @@ object SQLConf {
       "standard directly, but their behaviors align with ANSI SQL's style")
     .version("3.0.0")
     .booleanConf
-    .createWithDefault(sys.env.get("SPARK_ANSI_SQL_MODE").contains("true")
+    .createWithDefault(sys.env.get("SPARK_ANSI_SQL_MODE").contains("true"))
 
   val ENFORCE_RESERVED_KEYWORDS = buildConf("spark.sql.ansi.enforceReservedKeywords")
     .doc(s"When true and '${ANSI_ENABLED.key}' is true, the Spark SQL parser enforces the ANSI " +
From 8c308691ec30cc38ae9e578a8950625a4bcbb638 Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Fri, 8 Mar 2024 14:12:03 +0000
Subject: [PATCH 12/16] try that

---
 .../spark/sql/catalyst/expressions/stringExpressions.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 6bd44e3fd858..bf44df91c2b1 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2426,7 +2426,7 @@ case class Chr(child: Expression)
   """,
   since = "1.5.0",
   group = "string_funcs")
-case class Base64(child: Expression, chunkBase64: Boolean)
+case class Base64(child: Expression, chunkBase64: Boolean = SQLConf.get.base64Chunking)
   extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant {
 
   lazy val encoder: JBase64.Encoder = if (chunkBase64) {
@@ -2435,7 +2435,6 @@ case class Base64(child: Expression, chunkBase64: Boolean)
     JBase64.getMimeEncoder(-1, Array())
   }
 
-  def this(child: Expression) = this(child, SQLConf.get.base64Chunking)
 
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(BinaryType)

From ac92c02da3350f7f8bbf0cee71395d929af40de3 Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Fri, 8 Mar 2024 16:11:51 +0000
Subject: [PATCH 13/16] do chunk if outside codegen

---
 .../expressions/stringExpressions.scala | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index bf44df91c2b1..e5a8c7fd8bd0 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2444,15 +2444,15 @@ case class Base64(child: Expression, chunkBase64: Boolean = SQLConf.get.base64Ch
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     nullSafeCodeGen(ctx, ev, (child) => {
-      s"""
-        if ($chunkBase64) {
-          ${ev.value} = UTF8String.fromBytes(
-            ${classOf[JBase64].getName}.getMimeEncoder().encode($child));
-        } else {
-          ${ev.value} = UTF8String.fromBytes(
-            ${classOf[JBase64].getName}.getMimeEncoder(-1, new byte[0]).encode($child));
-        }
-      """
+      if (chunkBase64) {
+        s"""${ev.value} = UTF8String.fromBytes(
+          ${classOf[JBase64].getName}.getMimeEncoder().encode($child));
+        """
+      } else {
+        s"""${ev.value} = UTF8String.fromBytes(
+          ${classOf[JBase64].getName}.getMimeEncoder(-1, new byte[0]).encode($child));
+        """
+      }
     })
   }
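Patch 13 moves the branch from the generated Java back into the Scala that assembles it, so only the fragment matching the session's setting is emitted at all. In isolation the idea looks like this (hypothetical helper, not Spark API):

```scala
object EmitSketch {
  // Decide at code-generation time which Java statement to emit, instead of
  // emitting both branches plus a runtime test on a constant.
  def emitEncoderCall(chunk: Boolean, ev: String, child: String): String =
    if (chunk) {
      s"$ev = UTF8String.fromBytes(java.util.Base64.getMimeEncoder().encode($child));"
    } else {
      s"$ev = UTF8String.fromBytes(java.util.Base64.getMimeEncoder(-1, new byte[0]).encode($child));"
    }

  def main(args: Array[String]): Unit =
    println(emitEncoderCall(chunk = false, "value_0", "child_0"))
}
```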
From 3f854f7a104fc8ae29fdbbb0ce586b679adf8a16 Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Mon, 11 Mar 2024 11:25:12 +0000
Subject: [PATCH 14/16] comments

---
 .../spark/sql/catalyst/expressions/stringExpressions.scala | 5 ++++-
 .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 6 +++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index e5a8c7fd8bd0..b529d366d48b 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2426,9 +2426,12 @@ case class Chr(child: Expression)
   """,
   since = "1.5.0",
   group = "string_funcs")
-case class Base64(child: Expression, chunkBase64: Boolean = SQLConf.get.base64Chunking)
+case class Base64(child: Expression, chunkBase64: Boolean)
   extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant {
 
+  // Secondary constructor with a default value for chunkBase64
+  def this(child: Expression) = this(child, SQLConf.get.chunkBase64StringEnabled)
+
   lazy val encoder: JBase64.Encoder = if (chunkBase64) {
     JBase64.getMimeEncoder
   } else {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index d1992f9c6f04..f1ece91f25e9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3318,10 +3318,10 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
-  val BASE_64_CHUNKING = buildConf("spark.sql.base64.chunking")
+  val CHUNK_BASE_64_STRING_ENABLED = buildConf("spark.sql.chunkBase64String.enabled")
     .doc("When true, base64 strings generated by the base64 function are chunked into lines of " +
       "at most 76 characters. When false, the base64 strings are not chunked.")
-    .version("3.5.2")
+    .version("4.0.0")
     .booleanConf
     .createWithDefault(true)
 
@@ -5405,7 +5405,7 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
 
   def ansiRelationPrecedence: Boolean = ansiEnabled && getConf(ANSI_RELATION_PRECEDENCE)
 
-  def base64Chunking: Boolean = getConf(BASE_64_CHUNKING)
+  def chunkBase64StringEnabled: Boolean = getConf(CHUNK_BASE_64_STRING_ENABLED)
 
   def timestampType: AtomicType = getConf(TIMESTAMP_TYPE) match {
     case "TIMESTAMP_LTZ" =>

From ef05b34a1b4b3acb75cbb72761f27a5762b9da0a Mon Sep 17 00:00:00 2001
From: Ted Jenks
Date: Mon, 11 Mar 2024 12:54:52 +0000
Subject: [PATCH 15/16] do that

---
 .../spark/sql/catalyst/expressions/stringExpressions.scala | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index b529d366d48b..f2237540afde 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2426,12 +2426,9 @@ case class Chr(child: Expression)
   """,
   since = "1.5.0",
   group = "string_funcs")
-case class Base64(child: Expression, chunkBase64: Boolean)
+case class Base64(child: Expression, chunkBase64: Boolean = SQLConf.get.chunkBase64StringEnabled)
   extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant {
 
-  // Secondary constructor with a default value for chunkBase64
-  def this(child: Expression) = this(child, SQLConf.get.chunkBase64StringEnabled)
-
   lazy val encoder: JBase64.Encoder = if (chunkBase64) {
     JBase64.getMimeEncoder
   } else {
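The golden-file fallout in the next patch is consistent with how Scala compiles default arguments: a default does not add a unary constructor, it leaves a single `(Expression, Boolean)` constructor plus a default-getter on the companion, which a reflective lookup that scans for all-`Expression` parameter lists can no longer match. A stand-in class makes the point without Spark; the registry behavior itself is an assumption here, not shown by these diffs:

```scala
object ConstructorScan {
  class Base64Like(child: String, chunkBase64: Boolean = true)

  def main(args: Array[String]): Unit = {
    // Prints only "(String, boolean)": no one-argument constructor exists,
    // despite the default value for chunkBase64.
    classOf[Base64Like].getConstructors.foreach { c =>
      println(c.getParameterTypes.map(_.getSimpleName).mkString("(", ", ", ")"))
    }
  }
}
```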
From 5435698a526f28d8615b289f6734742ca2960579 Mon Sep 17 00:00:00 2001
From: Ted Chester Jenks
Date: Tue, 12 Mar 2024 13:45:33 +0000
Subject: [PATCH 16/16] to gcp

---
 .../analyzer-results/charvarchar.sql.out  | 22 +++++++++++---
 .../sql-tests/results/charvarchar.sql.out | 29 ++++++++++++-------
 2 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/charvarchar.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/charvarchar.sql.out
index 4f556d6dbc0b..d1f304c66c32 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/charvarchar.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/charvarchar.sql.out
@@ -458,10 +458,24 @@ Project [ascii(c7#x) AS ascii(c7)#x, ascii(c8#x) AS ascii(c8)#x, ascii(v#x) AS a
 -- !query
 select base64(c7), base64(c8), base64(v), ascii(s) from char_tbl4
 -- !query analysis
-Project [base64(cast(c7#x as binary)) AS base64(c7)#x, base64(cast(c8#x as binary)) AS base64(c8)#x, base64(cast(v#x as binary)) AS base64(v)#x, ascii(s#x) AS ascii(s)#x]
-+- SubqueryAlias spark_catalog.default.char_tbl4
-   +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x]
-      +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "WRONG_NUM_ARGS.WITHOUT_SUGGESTION",
+  "sqlState" : "42605",
+  "messageParameters" : {
+    "actualNum" : "1",
+    "docroot" : "https://spark.apache.org/docs/latest",
+    "expectedNum" : "0",
+    "functionName" : "`base64`"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 8,
+    "stopIndex" : 17,
+    "fragment" : "base64(c7)"
+  } ]
+}
 
 
 -- !query
diff --git a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out
index 3ad363abd31b..fcc7ffe3ac2f 100644
--- a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out
@@ -839,17 +839,26 @@ NULL	NULL	NULL	NULL
 -- !query
 select base64(c7), base64(c8), base64(v), ascii(s) from char_tbl4
 -- !query schema
-struct<base64(c7):string,base64(c8):string,base64(v):string,ascii(s):int>
+struct<>
 -- !query output
-NULL	NULL	NULL	NULL
-NULL	NULL	Uw==	NULL
-TiAgICAgIA==	TiAgICAgICA=	TiA=	78
-TmUgICAgIA==	TmUgICAgICA=	U3A=	78
-TmV0ICAgIA==	TmV0ICAgICA=	U3BhICA=	78
-TmV0RSAgIA==	TmV0RSAgICA=	U3Bhcg==	78
-TmV0RWEgIA==	TmV0RWEgICA=	U3Bhcmsg	78
-TmV0RWFzIA==	TmV0RWFzICA=	U3Bhcms=	78
-TmV0RWFzZQ==	TmV0RWFzZSA=	U3Bhcmst	78
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "WRONG_NUM_ARGS.WITHOUT_SUGGESTION",
+  "sqlState" : "42605",
+  "messageParameters" : {
+    "actualNum" : "1",
+    "docroot" : "https://spark.apache.org/docs/latest",
+    "expectedNum" : "0",
+    "functionName" : "`base64`"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 8,
+    "stopIndex" : 17,
+    "fragment" : "base64(c7)"
+  } ]
+}
 
 
 -- !query