From 76af1970d68abe1a05e69182ecd5f70583041986 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Wed, 23 Aug 2023 17:18:28 +0300 Subject: [PATCH 1/6] Fix examples --- .../expressions/numberFormatExpressions.scala | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala index 3a424ac21c50e..c749104e3c54e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGe import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper import org.apache.spark.sql.catalyst.util.ToNumberParser import org.apache.spark.sql.errors.QueryCompilationErrors -import org.apache.spark.sql.types.{AbstractDataType, DataType, DatetimeType, Decimal, DecimalType, StringType} +import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, DatetimeType, Decimal, DecimalType, StringType} import org.apache.spark.unsafe.types.UTF8String abstract class ToNumberBase(left: Expression, right: Expression, errorOnFail: Boolean) @@ -209,6 +209,10 @@ case class TryToNumber(left: Expression, right: Expression) wrapped by angle brackets if the input value is negative. ('<1>'). If `expr` is a datetime, `format` shall be a valid datetime pattern, see Datetime Patterns. + If `expr` is a binary, it is converted to a string in one of the formats: + 'base64': a base 64 string. + 'hex': a string in the hexadecimal format. + 'utf-8': the input binary is decoded to UTF-8 string. 
""", examples = """ Examples: @@ -224,6 +228,12 @@ case class TryToNumber(left: Expression, right: Expression) 12,454.8- > SELECT _FUNC_(date'2016-04-08', 'y'); 2016 + > SELECT _FUNC_(x'537061726b2053514c', 'base64'); + U3BhcmsgU1FM + > SELECT _FUNC_(x'537061726b2053514c', 'hex'); + 537061726B2053514C + > SELECT _FUNC_(encode('abc', 'utf-8'), 'utf-8'); + abc """, since = "3.4.0", group = "string_funcs") @@ -232,9 +242,18 @@ object ToCharacterBuilder extends ExpressionBuilder { override def build(funcName: String, expressions: Seq[Expression]): Expression = { val numArgs = expressions.length if (expressions.length == 2) { - val inputExpr = expressions.head + val (inputExpr, format) = (expressions(0), expressions(1)) inputExpr.dataType match { case _: DatetimeType => DateFormatClass(inputExpr, expressions(1)) + case _: BinaryType => + if (!(format.dataType == StringType && format.foldable)) { + throw QueryCompilationErrors.requireLiteralParameter(funcName, "format", "string") + } + format.eval().asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT).trim match { + case "base64" => Base64(inputExpr) + case "hex" => Hex(inputExpr) + case "utf-8" => new Decode(Seq(inputExpr, format)) + } case _ => ToCharacter(inputExpr, expressions(1)) } } else { From 89e766085efb9139212ca4ea1f4038bf0514a7db Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sat, 26 Aug 2023 11:20:45 +0300 Subject: [PATCH 2/6] Add a test for _LEGACY_ERROR_TEMP_1100 --- .../org/apache/spark/sql/StringFunctionsSuite.scala | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 88c9e15570e30..f6a472815207b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -864,6 +864,17 @@ class StringFunctionsSuite extends QueryTest with 
SharedSparkSession { df.select(to_char(col("a"), lit("$99.99"))), Seq(Row("$78.12")) ) + + val df2 = Seq((Array(1.toByte), "base64")).toDF("input", "format") + checkError( + exception = intercept[AnalysisException] { + df2.select(to_char(col("input"), col("format"))).collect() + }, + errorClass = "_LEGACY_ERROR_TEMP_1100", + parameters = Map( + "argName" -> "format", + "funcName" -> "to_char", + "requiredType" -> "string")) } test("to_varchar") { From b7987e10a6c2d6685f6c2c28e02c3e0e6d7e2d1a Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sat, 26 Aug 2023 12:09:55 +0300 Subject: [PATCH 3/6] Add tests for errors --- .../main/resources/error/error-classes.json | 5 +++++ ...ons-invalid-parameter-value-error-class.md | 4 ++++ .../expressions/numberFormatExpressions.scala | 5 +++-- .../sql/errors/QueryCompilationErrors.scala | 9 +++++++++ .../spark/sql/StringFunctionsSuite.scala | 20 +++++++++++++++++++ 5 files changed, 41 insertions(+), 2 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 632c449b99209..53c596c00fc37 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -1788,6 +1788,11 @@ "expects a binary value with 16, 24 or 32 bytes, but got <actualLength> bytes." ] }, + "BINARY_FORMAT" : { + "message" : [ + "expects one of binary formats 'base64', 'hex', 'utf-8', but got <invalidFormat>." + ] + }, "DATETIME_UNIT" : { "message" : [ "expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal <invalidValue>."
diff --git a/docs/sql-error-conditions-invalid-parameter-value-error-class.md b/docs/sql-error-conditions-invalid-parameter-value-error-class.md index 370e6da336293..96829e564aa5c 100644 --- a/docs/sql-error-conditions-invalid-parameter-value-error-class.md +++ b/docs/sql-error-conditions-invalid-parameter-value-error-class.md @@ -37,6 +37,10 @@ supports 16-byte CBC IVs and 12-byte GCM IVs, but got `<actualLength>` bytes for expects a binary value with 16, 24 or 32 bytes, but got `<actualLength>` bytes. +## BINARY_FORMAT + +expects one of binary formats 'base64', 'hex', 'utf-8', but got `<invalidFormat>`. + ## DATETIME_UNIT expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal `<invalidValue>`. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala index c749104e3c54e..7875ed8fe20fe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala @@ -244,7 +244,7 @@ object ToCharacterBuilder extends ExpressionBuilder { if (expressions.length == 2) { val (inputExpr, format) = (expressions(0), expressions(1)) inputExpr.dataType match { - case _: DatetimeType => DateFormatClass(inputExpr, expressions(1)) + case _: DatetimeType => DateFormatClass(inputExpr, format) case _: BinaryType => if (!(format.dataType == StringType && format.foldable)) { throw QueryCompilationErrors.requireLiteralParameter(funcName, "format", "string") @@ -253,8 +253,9 @@ object ToCharacterBuilder extends ExpressionBuilder { case "base64" => Base64(inputExpr) case "hex" => Hex(inputExpr) case "utf-8" => new Decode(Seq(inputExpr, format)) + case invalid => throw QueryCompilationErrors.binaryFormatError(funcName, invalid) } - case _ =>
ToCharacter(inputExpr, expressions(1)) + case _ => ToCharacter(inputExpr, format) } } else { throw QueryCompilationErrors.wrongNumArgsError(funcName, Seq(2), numArgs) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 3d2c59cebd8ea..e579e5cf565b2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -157,6 +157,15 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat "functionName" -> toSQLId("format_string"))) } + def binaryFormatError(funcName: String, invalidFormat: String): Throwable = { + new AnalysisException( + errorClass = "INVALID_PARAMETER_VALUE.BINARY_FORMAT", + messageParameters = Map( + "parameter" -> toSQLId("format"), + "functionName" -> toSQLId(funcName), + "invalidFormat" -> toSQLValue(invalidFormat, StringType))) + } + def unorderablePivotColError(pivotCol: Expression): Throwable = { new AnalysisException( errorClass = "INCOMPARABLE_PIVOT_COLUMN", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index f6a472815207b..12c10eb8626ed 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -875,6 +875,26 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { "argName" -> "format", "funcName" -> "to_char", "requiredType" -> "string")) + checkError( + exception = intercept[AnalysisException] { + df2.select(to_char(col("input"), lit("invalid_format"))).collect() + }, + errorClass = "INVALID_PARAMETER_VALUE.BINARY_FORMAT", + parameters = Map( + "parameter" -> "`format`", + "functionName" -> "`to_char`", + 
"invalidFormat" -> "'invalid_format'")) + checkError( + exception = intercept[AnalysisException] { + sql("select to_char('a', 'b', 'c')") + }, + errorClass = "WRONG_NUM_ARGS.WITHOUT_SUGGESTION", + parameters = Map( + "functionName" -> "`to_char`", + "expectedNum" -> "2", + "actualNum" -> "3", + "docroot" -> SPARK_DOC_ROOT), + context = ExpectedContext("", "", 7, 28, "to_char('a', 'b', 'c')")) } test("to_varchar") { From c13f1822b6a9c40949b71702d08266faa54afe75 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sat, 26 Aug 2023 17:33:18 +0300 Subject: [PATCH 4/6] Trigger build From 1fa15d71fb805b25d445f4f8d23d64689bd9f4e3 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Mon, 28 Aug 2023 10:50:37 +0300 Subject: [PATCH 5/6] Add more tests --- .../spark/sql/StringFunctionsSuite.scala | 100 ++++++++---------- 1 file changed, 47 insertions(+), 53 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 12c10eb8626ed..12881f4a22a6c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -854,59 +854,53 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { ) } - test("to_char") { - val df = Seq(78.12).toDF("a") - checkAnswer( - df.selectExpr("to_char(a, '$99.99')"), - Seq(Row("$78.12")) - ) - checkAnswer( - df.select(to_char(col("a"), lit("$99.99"))), - Seq(Row("$78.12")) - ) - - val df2 = Seq((Array(1.toByte), "base64")).toDF("input", "format") - checkError( - exception = intercept[AnalysisException] { - df2.select(to_char(col("input"), col("format"))).collect() - }, - errorClass = "_LEGACY_ERROR_TEMP_1100", - parameters = Map( - "argName" -> "format", - "funcName" -> "to_char", - "requiredType" -> "string")) - checkError( - exception = intercept[AnalysisException] { - df2.select(to_char(col("input"), 
lit("invalid_format"))).collect() - }, - errorClass = "INVALID_PARAMETER_VALUE.BINARY_FORMAT", - parameters = Map( - "parameter" -> "`format`", - "functionName" -> "`to_char`", - "invalidFormat" -> "'invalid_format'")) - checkError( - exception = intercept[AnalysisException] { - sql("select to_char('a', 'b', 'c')") - }, - errorClass = "WRONG_NUM_ARGS.WITHOUT_SUGGESTION", - parameters = Map( - "functionName" -> "`to_char`", - "expectedNum" -> "2", - "actualNum" -> "3", - "docroot" -> SPARK_DOC_ROOT), - context = ExpectedContext("", "", 7, 28, "to_char('a', 'b', 'c')")) - } - - test("to_varchar") { - val df = Seq(78.12).toDF("a") - checkAnswer( - df.selectExpr("to_varchar(a, '$99.99')"), - Seq(Row("$78.12")) - ) - checkAnswer( - df.select(to_varchar(col("a"), lit("$99.99"))), - Seq(Row("$78.12")) - ) + test("to_char/to_varchar") { + Seq( + "to_char" -> ((e: Column, fmt: Column) => to_char(e, fmt)), + "to_varchar" -> ((e: Column, fmt: Column) => to_varchar(e, fmt)) + ).foreach { case (funcName, func) => + val df = Seq(78.12).toDF("a") + checkAnswer(df.selectExpr(s"$funcName(a, '$$99.99')"), Seq(Row("$78.12"))) + checkAnswer(df.select(func(col("a"), lit("$99.99"))), Seq(Row("$78.12"))) + + val df2 = Seq((Array(100.toByte), "base64")).toDF("input", "format") + checkAnswer(df2.selectExpr(s"$funcName(input, 'hex')"), Seq(Row("64"))) + checkAnswer(df2.select(func(col("input"), lit("hex"))), Seq(Row("64"))) + checkAnswer(df2.selectExpr(s"$funcName(input, 'base64')"), Seq(Row("ZA=="))) + checkAnswer(df2.select(func(col("input"), lit("base64"))), Seq(Row("ZA=="))) + checkAnswer(df2.selectExpr(s"$funcName(input, 'utf-8')"), Seq(Row("d"))) + checkAnswer(df2.select(func(col("input"), lit("utf-8"))), Seq(Row("d"))) + + checkError( + exception = intercept[AnalysisException] { + df2.select(func(col("input"), col("format"))).collect() + }, + errorClass = "_LEGACY_ERROR_TEMP_1100", + parameters = Map( + "argName" -> "format", + "funcName" -> "to_char", + "requiredType" -> "string")) 
+ checkError( + exception = intercept[AnalysisException] { + df2.select(func(col("input"), lit("invalid_format"))).collect() + }, + errorClass = "INVALID_PARAMETER_VALUE.BINARY_FORMAT", + parameters = Map( + "parameter" -> "`format`", + "functionName" -> "`to_char`", + "invalidFormat" -> "'invalid_format'")) + checkError( + exception = intercept[AnalysisException] { + sql(s"select $funcName('a', 'b', 'c')") + }, + errorClass = "WRONG_NUM_ARGS.WITHOUT_SUGGESTION", + parameters = Map( + "functionName" -> s"`$funcName`", + "expectedNum" -> "2", + "actualNum" -> "3", + "docroot" -> SPARK_DOC_ROOT), + context = ExpectedContext("", "", 7, 21 + funcName.length, s"$funcName('a', 'b', 'c')")) + } } test("to_number") { From ec790bbb07d3285bc058b33d9ac97482be0aca25 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Mon, 28 Aug 2023 13:05:56 +0300 Subject: [PATCH 6/6] Trigger build