diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 7c9c0a726c881..ceabdac332173 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -32,6 +32,8 @@ license: | - In Spark 3.1, the Parquet, ORC, Avro and JSON datasources throw the exception `org.apache.spark.sql.AnalysisException: Found duplicate column(s) in the data schema` in read if they detect duplicate names in top-level columns as well in nested structures. The datasources take into account the SQL config `spark.sql.caseSensitive` while detecting column name duplicates. + - In Spark 3.1, structs and maps are wrapped in `{}` brackets when cast to strings. For instance, the `show()` action and the `CAST` expression use these brackets. In Spark 3.0 and earlier, `[]` brackets were used for the same purpose. To restore the behavior before Spark 3.1, set `spark.sql.legacy.castComplexTypesToString.enabled` to `true`. + ## Upgrading from Spark SQL 3.0 to 3.0.1 - In Spark 3.0, JSON datasource and JSON function `schema_of_json` infer TimestampType from string values if they match to the pattern defined by the JSON option `timestampFormat`. Since version 3.0.1, the timestamp type inference is disabled by default. Set the JSON option `inferTimestamp` to `true` to enable such type inference. diff --git a/python/pyspark/ml/stat.py b/python/pyspark/ml/stat.py index 70de8425613ec..517c984252768 100644 --- a/python/pyspark/ml/stat.py +++ b/python/pyspark/ml/stat.py @@ -222,14 +222,14 @@ class Summarizer(object): +-----------------------------------+ |aggregate_metrics(features, weight)| +-----------------------------------+ - |[[1.0,1.0,1.0], 1] | + |{[1.0,1.0,1.0], 1} | +-----------------------------------+ >>> df.select(summarizer.summary(df.features)).show(truncate=False) +--------------------------------+ |aggregate_metrics(features, 1.0)| +--------------------------------+ - |[[1.0,1.5,2.0], 2] | + |{[1.0,1.5,2.0], 2} | +--------------------------------+ >>> df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 02a6f0022e7ab..308642b136f75 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1996,7 +1996,7 @@ def map_from_arrays(col1, col2): +----------------+ | map| +----------------+ - |[2 -> a, 5 -> b]| + |{2 -> a, 5 -> b}| +----------------+ """ sc = SparkContext._active_spark_context @@ -2316,9 +2316,9 @@ def explode_outer(col): +---+----------+----+ | id| a_map| col| +---+----------+----+ - | 1|[x -> 1.0]| foo| - | 1|[x -> 1.0]| bar| - | 2| []|null| + | 1|{x -> 1.0}| foo| + | 1|{x -> 1.0}| bar| + | 2| {}|null| | 3| null|null| +---+----------+----+ """ @@ -2351,9 +2351,9 @@ def posexplode_outer(col): +---+----------+----+----+ | id| a_map| pos| col| +---+----------+----+----+ - | 1|[x -> 1.0]| 0| foo| - | 1|[x -> 1.0]| 1| bar| - | 2| []|null|null| + | 1|{x -> 1.0}| 0| foo| + | 1|{x -> 1.0}| 1| bar| + | 2| {}|null|null| | 3| null|null|null| +---+----------+----+----+ """ @@ -2750,7 +2750,7 @@ def map_entries(col): +----------------+ | entries| +----------------+ - |[[1, a], [2, b]]| + |[{1, a}, {2, b}]| +----------------+ """ sc = SparkContext._active_spark_context @@ -2770,7 +2770,7 @@ def map_from_entries(col): +----------------+ | map| +----------------+ - |[1 -> a, 2 -> b]| + |{1 -> a, 2 -> b}| +----------------+ """ sc = SparkContext._active_spark_context @@ -2822,7 +2822,7 @@ def map_concat(*cols): +------------------------+ |map3 |
+------------------------+ - |[1 -> a, 2 -> b, 3 -> c]| + |{1 -> a, 2 -> b, 3 -> c}| +------------------------+ """ sc = SparkContext._active_spark_context @@ -3241,7 +3241,7 @@ def transform_keys(col, f): +-------------------------+ |data_upper | +-------------------------+ - |[BAR -> 2.0, FOO -> -2.0]| + |{BAR -> 2.0, FOO -> -2.0}| +-------------------------+ """ return _invoke_higher_order_function("TransformKeys", [col], [f]) @@ -3268,7 +3268,7 @@ def transform_values(col, f): +---------------------------------------+ |new_data | +---------------------------------------+ - |[OPS -> 34.0, IT -> 20.0, SALES -> 2.0]| + |{OPS -> 34.0, IT -> 20.0, SALES -> 2.0}| +---------------------------------------+ """ return _invoke_higher_order_function("TransformValues", [col], [f]) @@ -3294,7 +3294,7 @@ def map_filter(col, f): +--------------------------+ |data_filtered | +--------------------------+ - |[baz -> 32.0, foo -> 42.0]| + |{baz -> 32.0, foo -> 42.0}| +--------------------------+ """ return _invoke_higher_order_function("MapFilter", [col], [f]) @@ -3324,7 +3324,7 @@ def map_zip_with(col1, col2, f): +---------------------------+ |updated_data | +---------------------------+ - |[SALES -> 16.8, IT -> 48.0]| + |{SALES -> 16.8, IT -> 48.0}| +---------------------------+ """ return _invoke_higher_order_function("MapZipWith", [col1, col2], [f]) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index e27c021556377..e93dc588819b0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -297,6 +297,10 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit private lazy val dateFormatter = DateFormatter(zoneId) private lazy val timestampFormatter = TimestampFormatter.getFractionFormatter(zoneId) + // The brackets that are used in casting structs and maps to strings + private val (leftBracket, rightBracket) = + if (SQLConf.get.getConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING)) ("[", "]") else ("{", "}") + // UDFToString private[this] def castToString(from: DataType): Any => Any = from match { case CalendarIntervalType => @@ -330,7 +334,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit case MapType(kt, vt, _) => buildCast[MapData](_, map => { val builder = new UTF8StringBuilder - builder.append("[") + builder.append(leftBracket) if (map.numElements > 0) { val keyArray = map.keyArray() val valueArray = map.valueArray() @@ -355,13 +359,13 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit i += 1 } } - builder.append("]") + builder.append(rightBracket) builder.build() }) case StructType(fields) => buildCast[InternalRow](_, row => { val builder = new UTF8StringBuilder - builder.append("[") + builder.append(leftBracket) if (row.numFields > 0) { val st = fields.map(_.dataType) val toUTF8StringFuncs = st.map(castToString) @@ -378,7 +382,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit i += 1 } } - builder.append("]") + builder.append(rightBracket) builder.build() }) case pudt: PythonUserDefinedType => castToString(pudt.sqlType) @@ -962,7 +966,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit val getMapKeyArray = CodeGenerator.getValue(mapKeyArray, kt, loopIndex) val getMapValueArray = 
CodeGenerator.getValue(mapValueArray, vt, loopIndex) code""" - |$buffer.append("["); + |$buffer.append("$leftBracket"); |if ($map.numElements() > 0) { | $buffer.append($keyToStringFunc($getMapFirstKey)); | $buffer.append(" ->"); @@ -980,7 +984,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit | } | } |} - |$buffer.append("]"); + |$buffer.append("$rightBracket"); """.stripMargin } @@ -1015,9 +1019,9 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit (classOf[UTF8StringBuilder].getName, buffer.code) :: Nil) code""" - |$buffer.append("["); + |$buffer.append("$leftBracket"); |$writeStructCode - |$buffer.append("]"); + |$buffer.append("$rightBracket"); """.stripMargin } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index bae41114caf1c..f54e0192b6df8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2690,6 +2690,15 @@ object SQLConf { .booleanConf .createWithDefault(true) + val LEGACY_COMPLEX_TYPES_TO_STRING = + buildConf("spark.sql.legacy.castComplexTypesToString.enabled") + .internal() + .doc("When true, maps and structs are wrapped in [] when cast to strings. " + + "When false (the default), maps and structs are wrapped in {}.") + .version("3.1.0") + .booleanConf + .createWithDefault(false) + /** * Holds information about keys that have been deprecated. * diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 4ab288a34cb08..b8b93d929d39d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -712,47 +712,59 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { } test("SPARK-22973 Cast map to string") { - val ret1 = cast(Literal.create(Map(1 -> "a", 2 -> "b", 3 -> "c")), StringType) - checkEvaluation(ret1, "[1 -> a, 2 -> b, 3 -> c]") - val ret2 = cast( - Literal.create(Map("1" -> "a".getBytes, "2" -> null, "3" -> "c".getBytes)), - StringType) - checkEvaluation(ret2, "[1 -> a, 2 ->, 3 -> c]") - val ret3 = cast( - Literal.create(Map( - 1 -> Date.valueOf("2014-12-03"), - 2 -> Date.valueOf("2014-12-04"), - 3 -> Date.valueOf("2014-12-05"))), - StringType) - checkEvaluation(ret3, "[1 -> 2014-12-03, 2 -> 2014-12-04, 3 -> 2014-12-05]") - val ret4 = cast( - Literal.create(Map( - 1 -> Timestamp.valueOf("2014-12-03 13:01:00"), - 2 -> Timestamp.valueOf("2014-12-04 15:05:00"))), - StringType) - checkEvaluation(ret4, "[1 -> 2014-12-03 13:01:00, 2 -> 2014-12-04 15:05:00]") - val ret5 = cast( - Literal.create(Map( - 1 -> Array(1, 2, 3), - 2 -> Array(4, 5, 6))), - StringType) - checkEvaluation(ret5, "[1 -> [1, 2, 3], 2 -> [4, 5, 6]]") + Seq( + "false" -> ("{", "}"), + "true" -> ("[", "]")).foreach { case (legacyBrackets, (lb, rb)) => + withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> legacyBrackets) { + val ret1 = cast(Literal.create(Map(1 -> "a", 2 -> "b", 3 -> "c")), StringType) + checkEvaluation(ret1, s"${lb}1 -> a, 2 -> b, 3 -> c$rb") + val ret2 = cast( + Literal.create(Map("1" -> "a".getBytes, "2" -> null, "3" -> "c".getBytes)), + StringType) + checkEvaluation(ret2, s"${lb}1 -> 
a, 2 ->, 3 -> c$rb") + val ret3 = cast( + Literal.create(Map( + 1 -> Date.valueOf("2014-12-03"), + 2 -> Date.valueOf("2014-12-04"), + 3 -> Date.valueOf("2014-12-05"))), + StringType) + checkEvaluation(ret3, s"${lb}1 -> 2014-12-03, 2 -> 2014-12-04, 3 -> 2014-12-05$rb") + val ret4 = cast( + Literal.create(Map( + 1 -> Timestamp.valueOf("2014-12-03 13:01:00"), + 2 -> Timestamp.valueOf("2014-12-04 15:05:00"))), + StringType) + checkEvaluation(ret4, s"${lb}1 -> 2014-12-03 13:01:00, 2 -> 2014-12-04 15:05:00$rb") + val ret5 = cast( + Literal.create(Map( + 1 -> Array(1, 2, 3), + 2 -> Array(4, 5, 6))), + StringType) + checkEvaluation(ret5, s"${lb}1 -> [1, 2, 3], 2 -> [4, 5, 6]$rb") + } + } } test("SPARK-22981 Cast struct to string") { - val ret1 = cast(Literal.create((1, "a", 0.1)), StringType) - checkEvaluation(ret1, "[1, a, 0.1]") - val ret2 = cast(Literal.create(Tuple3[Int, String, String](1, null, "a")), StringType) - checkEvaluation(ret2, "[1,, a]") - val ret3 = cast(Literal.create( - (Date.valueOf("2014-12-03"), Timestamp.valueOf("2014-12-03 15:05:00"))), StringType) - checkEvaluation(ret3, "[2014-12-03, 2014-12-03 15:05:00]") - val ret4 = cast(Literal.create(((1, "a"), 5, 0.1)), StringType) - checkEvaluation(ret4, "[[1, a], 5, 0.1]") - val ret5 = cast(Literal.create((Seq(1, 2, 3), "a", 0.1)), StringType) - checkEvaluation(ret5, "[[1, 2, 3], a, 0.1]") - val ret6 = cast(Literal.create((1, Map(1 -> "a", 2 -> "b", 3 -> "c"))), StringType) - checkEvaluation(ret6, "[1, [1 -> a, 2 -> b, 3 -> c]]") + Seq( + "false" -> ("{", "}"), + "true" -> ("[", "]")).foreach { case (legacyBrackets, (lb, rb)) => + withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> legacyBrackets) { + val ret1 = cast(Literal.create((1, "a", 0.1)), StringType) + checkEvaluation(ret1, s"${lb}1, a, 0.1$rb") + val ret2 = cast(Literal.create(Tuple3[Int, String, String](1, null, "a")), StringType) + checkEvaluation(ret2, s"${lb}1,, a$rb") + val ret3 = cast(Literal.create( + (Date.valueOf("2014-12-03"), Timestamp.valueOf("2014-12-03 15:05:00"))), StringType) + checkEvaluation(ret3, s"${lb}2014-12-03, 2014-12-03 15:05:00$rb") + val ret4 = cast(Literal.create(((1, "a"), 5, 0.1)), StringType) + checkEvaluation(ret4, s"$lb${lb}1, a$rb, 5, 0.1$rb") + val ret5 = cast(Literal.create((Seq(1, 2, 3), "a", 0.1)), StringType) + checkEvaluation(ret5, s"$lb[1, 2, 3], a, 0.1$rb") + val ret6 = cast(Literal.create((1, Map(1 -> "a", 2 -> "b", 3 -> "c"))), StringType) + checkEvaluation(ret6, s"${lb}1, ${lb}1 -> a, 2 -> b, 3 -> c$rb$rb") + } + } } test("up-cast") { diff --git a/sql/core/src/test/resources/sql-tests/results/pivot.sql.out b/sql/core/src/test/resources/sql-tests/results/pivot.sql.out index ac4e71e244bc0..bb0d452fa04a1 100644 --- a/sql/core/src/test/resources/sql-tests/results/pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/pivot.sql.out @@ -276,7 +276,7 @@ PIVOT ( FOR (course, year) IN (('dotNET', 2012), ('Java', 2013)) ) -- !query schema -struct +struct -- !query output 1 15000 NULL 2 NULL 30000 @@ -370,7 +370,7 @@ PIVOT ( FOR (y, course) IN ((2012, 'dotNET'), (2013, 'Java')) ) -- !query schema -struct,[2013, Java]:array> +struct,{2013, Java}:array> -- !query output 2012 [1,1] NULL 2013 NULL [2,2] @@ -404,7 +404,7 @@ PIVOT ( FOR (course, a) IN (('dotNET', array(1, 1)), ('Java', array(2, 2))) ) -- !query schema -struct +struct -- !query output 2012 15000 NULL 2013 NULL 30000 @@ -421,7 +421,7 @@ PIVOT ( FOR s IN ((1, 'a'), (2, 'b')) ) -- !query schema -struct +struct -- !query output 2012 35000 NULL 2013 NULL 78000 @@ 
-438,7 +438,7 @@ PIVOT ( FOR (course, s) IN (('dotNET', (1, 'a')), ('Java', (2, 'b'))) ) -- !query schema -struct +struct -- !query output 2012 15000 NULL 2013 NULL 30000 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out index 087b4ed9302d8..414435e6b781d 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out @@ -276,7 +276,7 @@ PIVOT ( FOR (course, year) IN (('dotNET', 2012), ('Java', 2013)) ) -- !query schema -struct +struct -- !query output 1 15000 NULL 2 NULL 30000 @@ -370,7 +370,7 @@ PIVOT ( FOR (course, a) IN (('dotNET', array(1, 1)), ('Java', array(2, 2))) ) -- !query schema -struct +struct -- !query output 2012 15000 NULL 2013 NULL 30000 @@ -387,7 +387,7 @@ PIVOT ( FOR s IN ((1, 'a'), (2, 'b')) ) -- !query schema -struct +struct -- !query output 2012 35000 NULL 2013 NULL 78000 @@ -404,7 +404,7 @@ PIVOT ( FOR (course, s) IN (('dotNET', (1, 'a')), ('Java', (2, 'b'))) ) -- !query schema -struct +struct -- !query output 2012 15000 NULL 2013 NULL 30000 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 0d0e91e2287e0..bcfc77545bbd2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1268,7 +1268,7 @@ class DataFrameSuite extends QueryTest s"""+----------------+ || a| |+----------------+ - ||[1 -> a, 2 -> b]| + ||{1 -> a, 2 -> b}| |+----------------+ |""".stripMargin) val df3 = Seq(((1, "a"), 0), ((2, "b"), 0)).toDF("a", "b") @@ -1276,8 +1276,8 @@ class DataFrameSuite extends QueryTest s"""+------+---+ || a| b| |+------+---+ - ||[1, a]| 0| - ||[2, b]| 0| + ||{1, a}| 0| + ||{2, b}| 0| |+------+---+ |""".stripMargin) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 06600c1e4b1d7..4923e8b556907 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1116,8 +1116,8 @@ class DatasetSuite extends QueryTest """+--------+ || f| |+--------+ - ||[foo, 1]| - ||[bar, 2]| + ||{foo, 1}| + ||{bar, 2}| |+--------+ |""".stripMargin diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index db85ae613eaa1..04af7d1a68682 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -550,7 +550,7 @@ class UDFSuite extends QueryTest with SharedSparkSession { udf((d: LocalDate, i: Instant) => LocalDateInstantType(d, i))) checkAnswer(df.selectExpr(s"buildLocalDateInstantType(d, i) as di") .select('di.cast(StringType)), - Row(s"[$expectedDate, $expectedInstant]") :: Nil) + Row(s"{$expectedDate, $expectedInstant}") :: Nil) // test null cases spark.udf.register("buildLocalDateInstantType", @@ -580,7 +580,7 @@ class UDFSuite extends QueryTest with SharedSparkSession { udf((t: Timestamp, i: Instant) => TimestampInstantType(t, i))) checkAnswer(df.selectExpr("buildTimestampInstantType(t, i) as ti") .select('ti.cast(StringType)), - Row(s"[$expectedTimestamp, $expectedInstant]")) + Row(s"{$expectedTimestamp, $expectedInstant}")) // test null cases 
spark.udf.register("buildTimestampInstantType",
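To make the behavior change concrete for reviewers, here is a minimal, self-contained Scala sketch of what the migration-guide entry and the new `spark.sql.legacy.castComplexTypesToString.enabled` conf mean for users. It is illustrative only: the sample data, the column names (`m`, `s`, `s_str`), and the rendered output in the comments are assumptions for this sketch, not output captured from this patch.

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical standalone demo; in spark-shell the session and implicits already exist.
object CastBracketsDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("cast-brackets-demo").getOrCreate()
    import spark.implicits._

    // One map column and one struct column (assumed example data).
    val df = Seq((Map(1 -> "a", 2 -> "b"), (1, "x"))).toDF("m", "s")

    // Spark 3.1 default: show() and CAST ... AS STRING render maps and structs with {}.
    df.show(truncate = false)                                // e.g. |{1 -> a, 2 -> b}|{1, x}|
    df.selectExpr("CAST(s AS STRING) AS s_str").show(false)  // e.g. |{1, x}|

    // The legacy conf added in SQLConf above restores the pre-3.1 [] rendering.
    spark.conf.set("spark.sql.legacy.castComplexTypesToString.enabled", "true")
    df.selectExpr("CAST(s AS STRING) AS s_str").show(false)  // e.g. |[1, x]|

    spark.stop()
  }
}
```

Note that the bracket pair is picked up from `SQLConf.get` when the `Cast` expression is constructed (the `leftBracket`/`rightBracket` vals added in `Cast.scala` above), so the legacy flag must be set before a query is analyzed; plans that were already built keep the brackets they were created with.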