diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 9a89fa29858d..9280dc2c6c76 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -2988,6 +2988,12 @@ ], "sqlState" : "42710" }, + "MALFORMED_CHARACTER_CODING" : { + "message" : [ + "Invalid value found when performing <function> with <charset>" + ], + "sqlState" : "22000" + }, "MALFORMED_CSV_RECORD" : { "message" : [ "Malformed CSV record: <badRecord>" diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain index 165be9b9e12f..e1a445120c13 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain @@ -1,2 +1,2 @@ -Project [decode(cast(g#0 as binary), UTF-8, false) AS decode(g, UTF-8)#0] +Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.StringDecode, StringType, decode, cast(g#0 as binary), UTF-8, false, false, BinaryType, StringTypeAnyCollation, BooleanType, BooleanType, true, true, true) AS decode(g, UTF-8)#0] +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain index 2f6543605923..7ce8776d754d 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain @@ -1,2 +1,2 @@ -Project [encode(g#0, UTF-8, false) AS encode(g, UTF-8)#0] +Project [staticinvoke(class 
org.apache.spark.sql.catalyst.expressions.Encode, BinaryType, encode, g#0, UTF-8, false, false, StringTypeAnyCollation, StringTypeAnyCollation, BooleanType, BooleanType, true, true, true) AS encode(g, UTF-8)#0] +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain index b62ccccc0c15..d999697a4c9e 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain @@ -1,2 +1,2 @@ -Project [encode(g#0, UTF-8, false) AS to_binary(g, utf-8)#0] +Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.Encode, BinaryType, encode, g#0, UTF-8, false, false, StringTypeAnyCollation, StringTypeAnyCollation, BooleanType, BooleanType, true, true, true) AS to_binary(g, utf-8)#0] +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index ac23962f41ed..660c93de807a 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.catalyst.expressions -import java.io.UnsupportedEncodingException +import java.nio.{ByteBuffer, CharBuffer} +import java.nio.charset.{CharacterCodingException, Charset, CodingErrorAction, IllegalCharsetNameException, UnsupportedCharsetException} import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols} import java.util.{Base64 => JBase64} 
import java.util.{HashMap, Locale, Map => JMap} @@ -25,6 +26,7 @@ import java.util.{HashMap, Locale, Map => JMap} import scala.collection.mutable.ArrayBuffer import org.apache.spark.QueryContext +import org.apache.spark.network.util.JavaUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, FunctionRegistry, TypeCheckResult} import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch @@ -2708,62 +2710,69 @@ case class Decode(params: Seq[Expression], replacement: Expression) since = "1.5.0", group = "string_funcs") // scalastyle:on line.size.limit -case class StringDecode(bin: Expression, charset: Expression, legacyCharsets: Boolean) - extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { +case class StringDecode( + bin: Expression, + charset: Expression, + legacyCharsets: Boolean, + legacyErrorAction: Boolean) + extends RuntimeReplaceable with ImplicitCastInputTypes { def this(bin: Expression, charset: Expression) = - this(bin, charset, SQLConf.get.legacyJavaCharsets) + this(bin, charset, SQLConf.get.legacyJavaCharsets, SQLConf.get.legacyCodingErrorAction) - override def left: Expression = bin - override def right: Expression = charset override def dataType: DataType = SQLConf.get.defaultStringType override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, StringTypeAnyCollation) + override def prettyName: String = "decode" + override def toString: String = s"$prettyName($bin, $charset)" - private val supportedCharsets = Set( - "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32") - - protected override def nullSafeEval(input1: Any, input2: Any): Any = { - val fromCharset = input2.asInstanceOf[UTF8String].toString - try { - if (legacyCharsets || supportedCharsets.contains(fromCharset.toUpperCase(Locale.ROOT))) { - UTF8String.fromString(new String(input1.asInstanceOf[Array[Byte]], fromCharset)) - } else throw new 
UnsupportedEncodingException - } catch { - case _: UnsupportedEncodingException => - throw QueryExecutionErrors.invalidCharsetError(prettyName, fromCharset) - } - } - - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, (bytes, charset) => { - val fromCharset = ctx.freshName("fromCharset") - val sc = JavaCode.global( - ctx.addReferenceObj("supportedCharsets", supportedCharsets), - supportedCharsets.getClass) - s""" - String $fromCharset = $charset.toString(); - try { - if ($legacyCharsets || $sc.contains($fromCharset.toUpperCase(java.util.Locale.ROOT))) { - ${ev.value} = UTF8String.fromString(new String($bytes, $fromCharset)); - } else { - throw new java.io.UnsupportedEncodingException(); - } - } catch (java.io.UnsupportedEncodingException e) { - throw QueryExecutionErrors.invalidCharsetError("$prettyName", $fromCharset); - } - """ - }) - } - - override protected def withNewChildrenInternal( - newLeft: Expression, newRight: Expression): StringDecode = - copy(bin = newLeft, charset = newRight) + override def replacement: Expression = StaticInvoke( + classOf[StringDecode], + SQLConf.get.defaultStringType, + "decode", + Seq(bin, charset, Literal(legacyCharsets), Literal(legacyErrorAction)), + Seq(BinaryType, StringTypeAnyCollation, BooleanType, BooleanType)) - override def prettyName: String = "decode" + override def children: Seq[Expression] = Seq(bin, charset) + override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = + copy(bin = newChildren(0), charset = newChildren(1)) } object StringDecode { def apply(bin: Expression, charset: Expression): StringDecode = new StringDecode(bin, charset) + def decode( + input: Array[Byte], + charset: UTF8String, + legacyCharsets: Boolean, + legacyErrorAction: Boolean): UTF8String = { + val fromCharset = charset.toString + if (legacyCharsets || Encode.VALID_CHARSETS.contains(fromCharset.toUpperCase(Locale.ROOT))) { + val decoder = try { + val 
codingErrorAction = if (legacyErrorAction) { + CodingErrorAction.REPLACE + } else { + CodingErrorAction.REPORT + } + Charset.forName(fromCharset) + .newDecoder() + .onMalformedInput(codingErrorAction) + .onUnmappableCharacter(codingErrorAction) + } catch { + case _: IllegalCharsetNameException | + _: UnsupportedCharsetException | + _: IllegalArgumentException => + throw QueryExecutionErrors.invalidCharsetError("decode", fromCharset) + } + try { + val cb = decoder.decode(ByteBuffer.wrap(input)) + UTF8String.fromString(cb.toString) + } catch { + case _: CharacterCodingException => + throw QueryExecutionErrors.malformedCharacterCoding("decode", fromCharset) + } + } else { + throw QueryExecutionErrors.invalidCharsetError("decode", fromCharset) + } + } } /** @@ -2785,59 +2794,76 @@ object StringDecode { since = "1.5.0", group = "string_funcs") // scalastyle:on line.size.limit -case class Encode(str: Expression, charset: Expression, legacyCharsets: Boolean) - extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { +case class Encode( + str: Expression, + charset: Expression, + legacyCharsets: Boolean, + legacyErrorAction: Boolean) + extends RuntimeReplaceable with ImplicitCastInputTypes { def this(value: Expression, charset: Expression) = - this(value, charset, SQLConf.get.legacyJavaCharsets) + this(value, charset, SQLConf.get.legacyJavaCharsets, SQLConf.get.legacyCodingErrorAction) - override def left: Expression = str - override def right: Expression = charset override def dataType: DataType = BinaryType override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation, StringTypeAnyCollation) - private val supportedCharsets = Set( - "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32") - - protected override def nullSafeEval(input1: Any, input2: Any): Any = { - val toCharset = input2.asInstanceOf[UTF8String].toString - try { - if (legacyCharsets || supportedCharsets.contains(toCharset.toUpperCase(Locale.ROOT))) { - 
input1.asInstanceOf[UTF8String].toString.getBytes(toCharset) - } else throw new UnsupportedEncodingException - } catch { - case _: UnsupportedEncodingException => - throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset) - } - } + override val replacement: Expression = StaticInvoke( + classOf[Encode], + BinaryType, + "encode", + Seq( + str, charset, Literal(legacyCharsets, BooleanType), Literal(legacyErrorAction, BooleanType)), + Seq(StringTypeAnyCollation, StringTypeAnyCollation, BooleanType, BooleanType)) - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, (string, charset) => { - val toCharset = ctx.freshName("toCharset") - val sc = JavaCode.global( - ctx.addReferenceObj("supportedCharsets", supportedCharsets), - supportedCharsets.getClass) - s""" - String $toCharset = $charset.toString(); - try { - if ($legacyCharsets || $sc.contains($toCharset.toUpperCase(java.util.Locale.ROOT))) { - ${ev.value} = $string.toString().getBytes($toCharset); - } else { - throw new java.io.UnsupportedEncodingException(); - } - } catch (java.io.UnsupportedEncodingException e) { - throw QueryExecutionErrors.invalidCharsetError("$prettyName", $toCharset); - }""" - }) - } + override def toString: String = s"$prettyName($str, $charset)" - override protected def withNewChildrenInternal( - newLeft: Expression, newRight: Expression): Encode = copy(str = newLeft, charset = newRight) + override def children: Seq[Expression] = Seq(str, charset) + + override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = + copy(str = newChildren.head, charset = newChildren(1)) } object Encode { def apply(value: Expression, charset: Expression): Encode = new Encode(value, charset) + + private[expressions] final lazy val VALID_CHARSETS = + Set("US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32") + + def encode( + input: UTF8String, + charset: UTF8String, + legacyCharsets: Boolean, + 
legacyErrorAction: Boolean): Array[Byte] = { + val toCharset = charset.toString + if (legacyCharsets || VALID_CHARSETS.contains(toCharset.toUpperCase(Locale.ROOT))) { + val encoder = try { + val codingErrorAction = if (legacyErrorAction) { + CodingErrorAction.REPLACE + } else { + CodingErrorAction.REPORT + } + Charset.forName(toCharset) + .newEncoder() + .onMalformedInput(codingErrorAction) + .onUnmappableCharacter(codingErrorAction) + } catch { + case _: IllegalCharsetNameException | + _: UnsupportedCharsetException | + _: IllegalArgumentException => + throw QueryExecutionErrors.invalidCharsetError("encode", toCharset) + } + try { + val bb = encoder.encode(CharBuffer.wrap(input.toString)) + JavaUtils.bufferToArray(bb) + } catch { + case _: CharacterCodingException => + throw QueryExecutionErrors.malformedCharacterCoding("encode", toCharset) + } + } else { + throw QueryExecutionErrors.invalidCharsetError("encode", toCharset) + } + } } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 30e53f146982..8af931976b2e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -2741,6 +2741,14 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE "charset" -> charset)) } + def malformedCharacterCoding(functionName: String, charset: String): RuntimeException = { + new SparkRuntimeException( + errorClass = "MALFORMED_CHARACTER_CODING", + messageParameters = Map( + "function" -> toSQLId(functionName), + "charset" -> charset)) + } + def invalidWriterCommitMessageError(details: String): Throwable = { new SparkRuntimeException( errorClass = "INVALID_WRITER_COMMIT_MESSAGE", diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index fd804bc0e986..70eff641a91c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -5010,6 +5010,14 @@ object SQLConf { .booleanConf .createWithDefault(false) + val LEGACY_CODING_ERROR_ACTION = buildConf("spark.sql.legacy.codingErrorAction") + .internal() + .doc("When set to true, encode/decode functions replace unmappable characters with mojibake " + + "instead of reporting coding errors.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val LEGACY_EVAL_CURRENT_TIME = buildConf("spark.sql.legacy.earlyEvalCurrentTime") .internal() .doc("When set to true, evaluation and constant folding will happen for now() and " + @@ -5986,6 +5994,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def legacyJavaCharsets: Boolean = getConf(SQLConf.LEGACY_JAVA_CHARSETS) + def legacyCodingErrorAction: Boolean = getConf(SQLConf.LEGACY_CODING_ERROR_ACTION) + def legacyEvalCurrentTime: Boolean = getConf(SQLConf.LEGACY_EVAL_CURRENT_TIME) /** ********************** SQLConf functionality methods ************ */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 4df8d87074fc..4c045f9fda73 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -104,7 +104,7 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-22543: split large if expressions into blocks due to JVM code size limit") { var strExpr: Expression = Literal("abc") for (_ <- 1 to 150) { - strExpr = StringDecode(Encode(strExpr, "utf-8"), "utf-8") + strExpr = 
StringTrimRight(StringTrimLeft(strExpr)) } val expressions = Seq(If(EqualTo(strExpr, strExpr), strExpr, strExpr)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index 21e6b8692911..a063e53486ad 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -71,10 +71,15 @@ trait ExpressionEvalHelper extends ScalaCheckDrivenPropertyChecks with PlanTestB new ArrayBasedMapData(keyArray, valueArray) } + protected def replace(expr: Expression): Expression = expr match { + case r: RuntimeReplaceable => replace(r.replacement) + case _ => expr.mapChildren(replace) + } + private def prepareEvaluation(expression: Expression): Expression = { val serializer = new JavaSerializer(new SparkConf()).newInstance() val resolver = ResolveTimeZone - val expr = resolver.resolveTimeZones(expression) + val expr = resolver.resolveTimeZones(replace(expression)) assert(expr.resolved) serializer.deserialize(serializer.serialize(expr)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 51de44d8dfd9..ebd724543481 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -505,8 +505,8 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(StringDecode(b, Literal.create(null, StringType)), null, create_row(null)) // Test escaping of charset - GenerateUnsafeProjection.generate(Encode(a, Literal("\"quote")) :: Nil) - 
GenerateUnsafeProjection.generate(StringDecode(b, Literal("\"quote")) :: Nil) + GenerateUnsafeProjection.generate(Encode(a, Literal("\"quote")).replacement :: Nil) + GenerateUnsafeProjection.generate(StringDecode(b, Literal("\"quote")).replacement :: Nil) } test("initcap unit test") { diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index c9b451187356..c7675b16384f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x] -- !query SELECT btrim(encode(" xyz ", 'utf-8')) -- !query analysis -Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x] +Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8')) -- !query analysis -Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8')) -- !query analysis -Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +- OneRowRelation @@ -649,14 +649,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(true)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252) AS 
encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -670,14 +670,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(false)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -685,14 +685,56 @@ Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] -- !query select encode('hello', 'Windows-xxx') -- !query analysis -Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x] +Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) + + +-- !query +select encode('渭城朝雨浥轻尘', 'US-ASCII') +-- !query analysis +Project [encode(渭城朝雨浥轻尘, US-ASCII) AS encode(渭城朝雨浥轻尘, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x) AS encode(scol, 
ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) + + +-- !query +select encode('客舍青青柳色新', 'US-ASCII') +-- !query analysis +Project [encode(客舍青青柳色新, US-ASCII) AS encode(客舍青青柳色新, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -746,14 +788,14 @@ org.apache.spark.sql.AnalysisException -- !query select decode(encode('abc', 'utf-8'), 'utf-8') -- !query analysis -Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +- OneRowRelation -- !query select decode(encode('大千世界', 'utf-32'), 'utf-32') -- !query analysis -Project [decode(encode(大千世界, utf-32, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +Project [decode(encode(大千世界, utf-32), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +- OneRowRelation @@ -863,6 +905,48 @@ Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] +- LocalRelation [scol#x, ecol#x] +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) + + +-- !query +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +-- !query analysis +Project [decode(0xE58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII) AS decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', US-ASCII)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- 
!query +set spark.sql.legacy.codingErrorAction=false +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) + + +-- !query +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +-- !query analysis +Project [decode(0xE8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII) AS decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', US-ASCII)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query SELECT CONTAINS(null, 'Spark') -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index c9b451187356..c7675b16384f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x] -- !query SELECT btrim(encode(" xyz ", 'utf-8')) -- !query analysis -Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x] +Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8')) -- !query analysis -Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8')) -- !query analysis -Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS btrim(encode(xxxbarxxx, utf-8), 
encode(x, utf-8))#x] +Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +- OneRowRelation @@ -649,14 +649,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(true)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -670,14 +670,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(false)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -685,14 +685,56 @@ Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] -- !query select encode('hello', 'Windows-xxx') -- !query analysis -Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x] +Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set 
spark.sql.legacy.codingErrorAction=true +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) + + +-- !query +select encode('渭城朝雨浥轻尘', 'US-ASCII') +-- !query analysis +Project [encode(渭城朝雨浥轻尘, US-ASCII) AS encode(渭城朝雨浥轻尘, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) + + +-- !query +select encode('客舍青青柳色新', 'US-ASCII') +-- !query analysis +Project [encode(客舍青青柳色新, US-ASCII) AS encode(客舍青青柳色新, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -746,14 +788,14 @@ org.apache.spark.sql.AnalysisException -- !query select decode(encode('abc', 'utf-8'), 'utf-8') -- !query analysis -Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +- OneRowRelation -- !query select decode(encode('大千世界', 'utf-32'), 'utf-32') -- !query analysis -Project [decode(encode(大千世界, utf-32, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +Project [decode(encode(大千世界, utf-32), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +- OneRowRelation @@ -863,6 +905,48 @@ Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] +- LocalRelation [scol#x, ecol#x] +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) + + +-- !query +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +-- !query 
analysis +Project [decode(0xE58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII) AS decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', US-ASCII)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) + + +-- !query +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +-- !query analysis +Project [decode(0xE8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII) AS decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', US-ASCII)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query SELECT CONTAINS(null, 'Spark') -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out index b3c5034656e2..62e3a8747326 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out @@ -11,7 +11,7 @@ FROM ( -- !query analysis Project [concat(concat(cast(col1#xL as string), col2#x), cast(col3#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x] + +- Project [id#xL AS col1#xL, cast((id#xL + 
cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x] +- Range (0, 10, step=1) @@ -29,7 +29,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(col1#x, cast(col2#xL as string)), concat(col3#x, cast(col4#x as string))), cast(col5#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x] + +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x] +- Range (0, 10, step=1) @@ -46,7 +46,7 @@ FROM ( -- !query analysis Project [concat(concat(col1#x, col2#x), cast(concat(col3#x, col4#x) as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] + +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] +- Range (0, 10, step=1) @@ -67,7 +67,7 @@ FROM ( -- !query analysis Project [concat(cast(col1#x as string), cast(col2#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] +- Range (0, 10, step=1) @@ -84,7 
+84,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(cast(col1#x as string), cast(col2#x as string)), cast(col3#x as string)), cast(col4#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] +- Range (0, 10, step=1) @@ -101,7 +101,7 @@ FROM ( -- !query analysis Project [concat(concat(cast(col1#x as string), cast(col2#x as string)), concat(cast(col3#x as string), cast(col4#x as string))) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] +- Range (0, 10, step=1) @@ -122,7 +122,7 @@ FROM ( -- !query analysis Project [concat(col1#x, col2#x) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, 
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] +- Range (0, 10, step=1) @@ -139,7 +139,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(col1#x, col2#x), col3#x), col4#x) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] +- Range (0, 10, step=1) @@ -156,7 +156,7 @@ FROM ( -- !query analysis Project [concat(concat(col1#x, col2#x), concat(col3#x, col4#x)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] +- Range (0, 10, step=1) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out index 60b7fa711791..f4902012f0f9 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out @@ -13,7 +13,7 @@ FROM ( -- !query analysis Project [elt(2, col1#x, cast(col2#xL as string), col3#x, cast(col4#x as string), cast(col5#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x] + +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x] +- Range (0, 10, step=1) @@ -30,7 +30,7 @@ FROM ( -- !query analysis Project [elt(3, col1#x, col2#x, cast(col3#x as string), cast(col4#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] + +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] +- Range (0, 10, step=1) @@ -51,7 +51,7 @@ FROM ( -- !query analysis Project [elt(1, cast(col1#x as string), cast(col2#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] +- Range (0, 10, step=1) @@ -72,5 +72,5 @@ FROM ( -- !query analysis Project 
[elt(2, col1#x, col2#x, false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] +- Range (0, 10, step=1) diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 733720a7e21b..0d9c0f3a6a14 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -126,6 +126,12 @@ select encode('hello', 'WINDOWS-1252'); select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol); select encode('hello', 'Windows-xxx'); select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol); +set spark.sql.legacy.codingErrorAction=true; +select encode('渭城朝雨浥轻尘', 'US-ASCII'); +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol); +set spark.sql.legacy.codingErrorAction=false; +select encode('客舍青青柳色新', 'US-ASCII'); +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol); -- decode select decode(); @@ -147,6 +153,12 @@ select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, set spark.sql.legacy.javaCharsets=false; select decode(X'68656c6c6f', 'WINDOWS-1252'); select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol); +set spark.sql.legacy.codingErrorAction=true; +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII'); +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol); +set spark.sql.legacy.codingErrorAction=false; +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII'); +select decode(scol, 
ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol); -- contains SELECT CONTAINS(null, 'Spark'); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 09d4f8892fa4..9f72e215ea54 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -903,6 +903,70 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction true + + +-- !query +select encode('渭城朝雨浥轻尘', 'US-ASCII') +-- !query schema +struct +-- !query output +??????? + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +??????? + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction false + + +-- !query +select encode('客舍青青柳色新', 'US-ASCII') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`encode`" + } +} + + +-- !query +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`encode`" + } +} + + -- !query select decode() -- !query schema @@ -1125,6 +1189,70 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query schema +struct +-- !query output 
+spark.sql.legacy.codingErrorAction true + + +-- !query +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +-- !query schema +struct +-- !query output +��������������������� + + +-- !query +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +��������������������� + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction false + + +-- !query +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`decode`" + } +} + + +-- !query +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`decode`" + } +} + + -- !query SELECT CONTAINS(null, 'Spark') -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 506524840f10..e6778cb539bd 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -835,6 +835,70 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction true + + +-- !query +select encode('渭城朝雨浥轻尘', 'US-ASCII') +-- !query schema +struct +-- !query output +??????? 
+ + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +??????? + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction false + + +-- !query +select encode('客舍青青柳色新', 'US-ASCII') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`encode`" + } +} + + +-- !query +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`encode`" + } +} + + -- !query select decode() -- !query schema @@ -1057,6 +1121,70 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction true + + +-- !query +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +-- !query schema +struct +-- !query output +��������������������� + + +-- !query +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +��������������������� + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction false + + +-- !query +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + 
"function" : "`decode`" + } +} + + +-- !query +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`decode`" + } +} + + -- !query SELECT CONTAINS(null, 'Spark') -- !query schema diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala index b2aaaceb26ab..22fdd96ce6ba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala @@ -192,9 +192,11 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite |) """.stripMargin) checkKeywordsExistsInExplain(df2, - "Project [concat(cast(id#xL as string), cast((id#xL + 1) as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string)) AS col#x]") + "Project [concat(concat(col1#x, col2#x), cast(concat(col3#x, col4#x) as string)) AS col#x]", + "Project [cast(id#xL as string) AS col1#x, " + + "cast((id#xL + cast(1 as bigint)) as string) AS col2#x, " + + "encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, " + + "encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]") val df3 = sql( """ @@ -208,9 +210,10 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite |) """.stripMargin) checkKeywordsExistsInExplain(df3, - "Project [concat(cast(id#xL as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string)) AS col#x]") + "Project [concat(col1#x, cast(concat(col3#x, col4#x) as string)) AS col#x]", + 
"Project [cast(id#xL as string) AS col1#x, " + + "encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, " + + "encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]") } }