From bd014e7001c4c3a320b9edd6b9f4f4545e69d7e7 Mon Sep 17 00:00:00 2001 From: yangjie01 <yangjie01@baidu.com> Date: Mon, 21 Nov 2022 15:01:03 +0800 Subject: [PATCH 1/5] fix --- .../main/resources/error/error-classes.json | 5 ++ .../expressions/stringExpressions.scala | 85 +++++++++++++------ .../results/ansi/string-functions.sql.out | 38 +++++++-- .../results/string-functions.sql.out | 38 +++++++-- 4 files changed, 127 insertions(+), 39 deletions(-) diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index 4da9d2f9fbca..9e58437d5b9a 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -269,6 +269,11 @@ "Input to the function <functionName> cannot contain elements of the \"MAP\" type. In Spark, same maps may have different hashcode, thus hash expressions are prohibited on \"MAP\" elements. To restore previous behavior set \"spark.sql.legacy.allowHashOnMapType\" to \"true\"." ] }, + "INVALID_ARG_VALUE" : { + "message" : [ + "The <inputName> value must to be a <requireType> literal of <validValues>, but got <inputValue>." ] + }, "INVALID_JSON_MAP_KEY_TYPE" : { "message" : [ "Input schema <jsonSchema> can only contain STRING as a key type for a MAP." 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 60b56f4fef79..3a1db2ce1b8b 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2620,39 +2620,30 @@ case class ToBinary( nullOnInvalidFormat: Boolean = false) extends RuntimeReplaceable with ImplicitCastInputTypes { - override lazy val replacement: Expression = format.map { f => - assert(f.foldable && (f.dataType == StringType || f.dataType == NullType)) + @transient lazy val fmt: String = format.map { f => val value = f.eval() if (value == null) { - Literal(null, BinaryType) + null } else { - value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT) match { - case "hex" => Unhex(expr, failOnError = true) - case "utf-8" | "utf8" => Encode(expr, Literal("UTF-8")) - case "base64" => UnBase64(expr, failOnError = true) - case _ if nullOnInvalidFormat => Literal(null, BinaryType) - case other => throw QueryCompilationErrors.invalidStringLiteralParameter( - "to_binary", - "format", - other, - Some( - "The value has to be a case-insensitive string literal of " + - "'hex', 'utf-8', 'utf8', or 'base64'.")) - } + value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT) + } + }.getOrElse("hex") + + override lazy val replacement: Expression = if (fmt == null) { + Literal(null, BinaryType) + } else { + fmt match { + case "hex" => Unhex(expr, failOnError = true) + case "utf-8" | "utf8" => Encode(expr, Literal("UTF-8")) + case "base64" => UnBase64(expr, failOnError = true) + case _ => Literal(null, BinaryType) } - }.getOrElse(Unhex(expr, failOnError = true)) + } def this(expr: Expression) = this(expr, None, false) def this(expr: Expression, format: Expression) = - this(expr, Some({ - // We perform this 
check in the constructor to make it eager and not go through type coercion. - if (format.foldable && (format.dataType == StringType || format.dataType == NullType)) { - format - } else { - throw QueryCompilationErrors.requireLiteralParameter("to_binary", "format", "string") - } - }), false) + this(expr, Some(format), false) override def prettyName: String = "to_binary" @@ -2660,6 +2651,50 @@ case class ToBinary( override def inputTypes: Seq[AbstractDataType] = children.map(_ => StringType) + override def checkInputDataTypes(): TypeCheckResult = { + def isValidFormat: Boolean = { + fmt == null || Set("hex", "utf-8", "utf8", "base64").contains(fmt) + } + format match { + case Some(f) => + if (f.foldable && (f.dataType == StringType || f.dataType == NullType)) { + if (isValidFormat || nullOnInvalidFormat) { + super.checkInputDataTypes() + } else { + DataTypeMismatch( + errorSubClass = "INVALID_ARG_VALUE", + messageParameters = Map( + "inputName" -> "fmt", + "requireType" -> s"case-insensitive ${toSQLType(StringType)}", + "validValues" -> "'hex', 'utf-8', 'utf8', or 'base64'", + "inputValue" -> toSQLValue(fmt, StringType) + ) + ) + } + } else if (!f.foldable) { + DataTypeMismatch( + errorSubClass = "NON_FOLDABLE_INPUT", + messageParameters = Map( + "inputName" -> "fmt", + "inputType" -> toSQLType(StringType), + "inputExpr" -> toSQLExpr(f) + ) + ) + } else { + DataTypeMismatch( + errorSubClass = "INVALID_ARG_VALUE", + messageParameters = Map( + "inputName" -> "fmt", + "requireType" -> s"case-insensitive ${toSQLType(StringType)}", + "validValues" -> "'hex', 'utf-8', 'utf8', or 'base64'", + "inputValue" -> toSQLValue(f.eval(), f.dataType) + ) + ) + } + case _ => super.checkInputDataTypes() + } + } + override protected def withNewChildrenInternal( newChildren: IndexedSeq[Expression]): Expression = { if (format.isDefined) { diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out 
b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 41f1922f8bd5..125ab8cfc150 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -1609,7 +1609,23 @@ select to_binary('abc', 1) struct<> -- !query output org.apache.spark.sql.AnalysisException -The 'format' parameter of function 'to_binary' needs to be a string literal.; line 1 pos 7 +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ARG_VALUE", + "messageParameters" : { + "inputName" : "fmt", + "inputValue" : "'1'", + "requireType" : "case-insensitive \"STRING\"", + "sqlExpr" : "\"to_binary(abc, 1)\"", + "validValues" : "'hex', 'utf-8', 'utf8', or 'base64'" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 26, + "fragment" : "to_binary('abc', 1)" + } ] +} -- !query @@ -1619,11 +1635,19 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "_LEGACY_ERROR_TEMP_1101", + "errorClass" : "DATATYPE_MISMATCH.INVALID_ARG_VALUE", "messageParameters" : { - "argName" : "format", - "endingMsg" : " The value has to be a case-insensitive string literal of 'hex', 'utf-8', 'utf8', or 'base64'.", - "funcName" : "to_binary", - "invalidValue" : "invalidformat" - } + "inputName" : "fmt", + "inputValue" : "'invalidformat'", + "requireType" : "case-insensitive \"STRING\"", + "sqlExpr" : "\"to_binary(abc, invalidFormat)\"", + "validValues" : "'hex', 'utf-8', 'utf8', or 'base64'" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 40, + "fragment" : "to_binary('abc', 'invalidFormat')" + } ] } diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 4bcb69ed773a..1e492d7c62f2 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -1541,7 +1541,23 @@ select to_binary('abc', 1) struct<> -- !query output org.apache.spark.sql.AnalysisException -The 'format' parameter of function 'to_binary' needs to be a string literal.; line 1 pos 7 +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ARG_VALUE", + "messageParameters" : { + "inputName" : "fmt", + "inputValue" : "'1'", + "requireType" : "case-insensitive \"STRING\"", + "sqlExpr" : "\"to_binary(abc, 1)\"", + "validValues" : "'hex', 'utf-8', 'utf8', or 'base64'" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 26, + "fragment" : "to_binary('abc', 1)" + } ] +} -- !query @@ -1551,11 +1567,19 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "_LEGACY_ERROR_TEMP_1101", + "errorClass" : "DATATYPE_MISMATCH.INVALID_ARG_VALUE", "messageParameters" : { - "argName" : "format", - "endingMsg" : " The value has to be a case-insensitive string literal of 'hex', 'utf-8', 'utf8', or 'base64'.", - "funcName" : "to_binary", - "invalidValue" : "invalidformat" - } + "inputName" : "fmt", + "inputValue" : "'invalidformat'", + "requireType" : "case-insensitive \"STRING\"", + "sqlExpr" : "\"to_binary(abc, invalidFormat)\"", + "validValues" : "'hex', 'utf-8', 'utf8', or 'base64'" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 40, + "fragment" : "to_binary('abc', 'invalidFormat')" + } ] } From f76c6cc90001986bb39d608af2ec0725fd63c896 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Tue, 22 Nov 2022 15:36:17 +0800 Subject: [PATCH 2/5] update golden files --- .../results/ansi/string-functions.sql.out | 30 ++++++++++++------- .../results/string-functions.sql.out | 30 ++++++++++++------- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git 
a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 3ab49c14bef1..125ab8cfc150 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -1610,11 +1610,13 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "_LEGACY_ERROR_TEMP_1100", + "errorClass" : "DATATYPE_MISMATCH.INVALID_ARG_VALUE", "messageParameters" : { - "argName" : "format", - "funcName" : "to_binary", - "requiredType" : "string" + "inputName" : "fmt", + "inputValue" : "'1'", + "requireType" : "case-insensitive \"STRING\"", + "sqlExpr" : "\"to_binary(abc, 1)\"", + "validValues" : "'hex', 'utf-8', 'utf8', or 'base64'" }, "queryContext" : [ { "objectType" : "", @@ -1633,11 +1635,19 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "_LEGACY_ERROR_TEMP_1101", + "errorClass" : "DATATYPE_MISMATCH.INVALID_ARG_VALUE", "messageParameters" : { - "argName" : "format", - "endingMsg" : " The value has to be a case-insensitive string literal of 'hex', 'utf-8', 'utf8', or 'base64'.", - "funcName" : "to_binary", - "invalidValue" : "invalidformat" - } + "inputName" : "fmt", + "inputValue" : "'invalidformat'", + "requireType" : "case-insensitive \"STRING\"", + "sqlExpr" : "\"to_binary(abc, invalidFormat)\"", + "validValues" : "'hex', 'utf-8', 'utf8', or 'base64'" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 40, + "fragment" : "to_binary('abc', 'invalidFormat')" + } ] } diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 2ea5cefa38d1..1e492d7c62f2 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -1542,11 +1542,13 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "_LEGACY_ERROR_TEMP_1100", + "errorClass" : "DATATYPE_MISMATCH.INVALID_ARG_VALUE", "messageParameters" : { - "argName" : "format", - "funcName" : "to_binary", - "requiredType" : "string" + "inputName" : "fmt", + "inputValue" : "'1'", + "requireType" : "case-insensitive \"STRING\"", + "sqlExpr" : "\"to_binary(abc, 1)\"", + "validValues" : "'hex', 'utf-8', 'utf8', or 'base64'" }, "queryContext" : [ { "objectType" : "", @@ -1565,11 +1567,19 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "_LEGACY_ERROR_TEMP_1101", + "errorClass" : "DATATYPE_MISMATCH.INVALID_ARG_VALUE", "messageParameters" : { - "argName" : "format", - "endingMsg" : " The value has to be a case-insensitive string literal of 'hex', 'utf-8', 'utf8', or 'base64'.", - "funcName" : "to_binary", - "invalidValue" : "invalidformat" - } + "inputName" : "fmt", + "inputValue" : "'invalidformat'", + "requireType" : "case-insensitive \"STRING\"", + "sqlExpr" : "\"to_binary(abc, invalidFormat)\"", + "validValues" : "'hex', 'utf-8', 'utf8', or 'base64'" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 40, + "fragment" : "to_binary('abc', 'invalidFormat')" + } ] } From 36e1bda1427ae754e739c70d363e4733615bd7bd Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Wed, 23 Nov 2022 15:55:09 +0800 Subject: [PATCH 3/5] add case --- .../expressions/StringExpressionsSuite.scala | 16 ++++++++ .../sql-tests/inputs/string-functions.sql | 4 ++ .../results/ansi/string-functions.sql.out | 40 +++++++++++++++++++ .../results/string-functions.sql.out | 40 +++++++++++++++++++ 4 files changed, 100 insertions(+) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 8bdbcb26e83c..1debbee7fd19 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -432,6 +432,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { GenerateUnsafeProjection.generate(StringDecode(b, Literal("\"quote")) :: Nil) } + test("initcap unit test") { checkEvaluation(InitCap(Literal.create(null, StringType)), null) checkEvaluation(InitCap(Literal("a b")), "A B") @@ -1256,6 +1257,21 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { ) } + test("ToBinary: fails analysis if fmt is not foldable") { + val wrongFmt = AttributeReference("invalidFormat", StringType)() + val toCharacterExpr = ToBinary(Literal("abc"), Some(wrongFmt)) + assert(toCharacterExpr.checkInputDataTypes() == + DataTypeMismatch( + errorSubClass = "NON_FOLDABLE_INPUT", + messageParameters = Map( + "inputName" -> "fmt", + "inputType" -> toSQLType(wrongFmt.dataType), + "inputExpr" -> toSQLExpr(wrongFmt) + ) + ) + ) + } + test("ToNumber: negative tests (the input string does not match the format string)") { Seq( // The input contained more thousands separators than the format string. 
diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index cb18c547b612..39c57e6efa28 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -225,3 +225,7 @@ select to_binary(null, cast(null as string)); -- invalid format select to_binary('abc', 1); select to_binary('abc', 'invalidFormat'); +CREATE TEMPORARY VIEW fmtTable(fmtField) AS SELECT * FROM VALUES ('invalidFormat'); +SELECT to_binary('abc', fmtField) FROM fmtTable; +-- Clean up +DROP VIEW IF EXISTS fmtTable; diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 125ab8cfc150..5a0479996b95 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -1651,3 +1651,43 @@ org.apache.spark.sql.AnalysisException "fragment" : "to_binary('abc', 'invalidFormat')" } ] } + + +-- !query +CREATE TEMPORARY VIEW fmtTable(fmtField) AS SELECT * FROM VALUES ('invalidFormat') +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT to_binary('abc', fmtField) FROM fmtTable +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", + "messageParameters" : { + "inputExpr" : "\"fmtField\"", + "inputName" : "fmt", + "inputType" : "\"STRING\"", + "sqlExpr" : "\"to_binary(abc, fmtField)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 33, + "fragment" : "to_binary('abc', fmtField)" + } ] +} + + +-- !query +DROP VIEW IF EXISTS fmtTable +-- !query schema +struct<> +-- !query output + diff --git 
a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 1e492d7c62f2..36814275cd7d 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -1583,3 +1583,43 @@ org.apache.spark.sql.AnalysisException "fragment" : "to_binary('abc', 'invalidFormat')" } ] } + + +-- !query +CREATE TEMPORARY VIEW fmtTable(fmtField) AS SELECT * FROM VALUES ('invalidFormat') +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT to_binary('abc', fmtField) FROM fmtTable +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", + "messageParameters" : { + "inputExpr" : "\"fmtField\"", + "inputName" : "fmt", + "inputType" : "\"STRING\"", + "sqlExpr" : "\"to_binary(abc, fmtField)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 33, + "fragment" : "to_binary('abc', fmtField)" + } ] +} + + +-- !query +DROP VIEW IF EXISTS fmtTable +-- !query schema +struct<> +-- !query output + From f3621d69693b8f08dd76a9e7bb29561507efb95c Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Wed, 23 Nov 2022 15:59:34 +0800 Subject: [PATCH 4/5] rename expr --- .../sql/catalyst/expressions/StringExpressionsSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 1debbee7fd19..81f0712911bc 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -1259,8 +1259,8 @@ class 
StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test("ToBinary: fails analysis if fmt is not foldable") { val wrongFmt = AttributeReference("invalidFormat", StringType)() - val toCharacterExpr = ToBinary(Literal("abc"), Some(wrongFmt)) - assert(toCharacterExpr.checkInputDataTypes() == + val toBinaryExpr = ToBinary(Literal("abc"), Some(wrongFmt)) + assert(toBinaryExpr.checkInputDataTypes() == DataTypeMismatch( errorSubClass = "NON_FOLDABLE_INPUT", messageParameters = Map( From b554e8dac8563da8c44fbf9a07e51be392c7ac36 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Wed, 23 Nov 2022 17:09:11 +0800 Subject: [PATCH 5/5] remove empty line --- .../spark/sql/catalyst/expressions/StringExpressionsSuite.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 81f0712911bc..42b1b967fe7d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -432,7 +432,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { GenerateUnsafeProjection.generate(StringDecode(b, Literal("\"quote")) :: Nil) } - test("initcap unit test") { checkEvaluation(InitCap(Literal.create(null, StringType)), null) checkEvaluation(InitCap(Literal("a b")), "A B")