diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index d6550c30b955..0c84db38afaf 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -31,7 +31,11 @@ license: | - In Spark 3.1, `from_unixtime`, `unix_timestamp`,`to_unix_timestamp`, `to_timestamp` and `to_date` will fail if the specified datetime pattern is invalid. In Spark 3.0 or earlier, they result `NULL`. - In Spark 3.1, casting numeric to timestamp will be forbidden by default. It's strongly recommended to use dedicated functions: TIMESTAMP_SECONDS, TIMESTAMP_MILLIS and TIMESTAMP_MICROS. Or you can set `spark.sql.legacy.allowCastNumericToTimestamp` to true to work around it. See more details in SPARK-31710. - + +## Upgrading from Spark SQL 3.0 to 3.0.1 + +- In Spark 3.0, the JSON datasource and the JSON function `schema_of_json` infer TimestampType from string values if they match the pattern defined by the JSON option `timestampFormat`. Since version 3.0.1, the timestamp type inference is disabled by default. Set the JSON option `inferTimestamp` to `true` to enable such type inference. + ## Upgrading from Spark SQL 2.4 to 3.0 ### Dataset/DataFrame APIs diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala index f9222f5af54d..70a673bb4245 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala @@ -133,7 +133,7 @@ private[sql] class JSONOptions( * Enables inferring of TimestampType from strings matched to the timestamp pattern * defined by the timestampFormat option. */ - val inferTimestamp: Boolean = parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(true) + val inferTimestamp: Boolean = parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(false) /** Build a Jackson [[JsonFactory]] using JSON options. 
*/ def buildJsonFactory(): JsonFactory = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala index bce917c80f93..8290b38e3393 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala @@ -35,22 +35,29 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper { assert(inferSchema.inferField(parser) === expectedType) } - def checkTimestampType(pattern: String, json: String): Unit = { - checkType(Map("timestampFormat" -> pattern), json, TimestampType) + def checkTimestampType(pattern: String, json: String, inferTimestamp: Boolean): Unit = { + checkType( + Map("timestampFormat" -> pattern, "inferTimestamp" -> inferTimestamp.toString), + json, + if (inferTimestamp) TimestampType else StringType) } test("inferring timestamp type") { - Seq("legacy", "corrected").foreach { legacyParserPolicy => - withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) { - checkTimestampType("yyyy", """{"a": "2018"}""") - checkTimestampType("yyyy=MM", """{"a": "2018=12"}""") - checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""") - checkTimestampType( - "yyyy-MM-dd'T'HH:mm:ss.SSS", - """{"a": "2018-12-02T21:04:00.123"}""") - checkTimestampType( - "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX", - """{"a": "2018-12-02T21:04:00.123567+01:00"}""") + Seq(true, false).foreach { inferTimestamp => + Seq("legacy", "corrected").foreach { legacyParserPolicy => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) { + checkTimestampType("yyyy", """{"a": "2018"}""", inferTimestamp) + checkTimestampType("yyyy=MM", """{"a": "2018=12"}""", inferTimestamp) + checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""", inferTimestamp) + checkTimestampType( + "yyyy-MM-dd'T'HH:mm:ss.SSS", + 
"""{"a": "2018-12-02T21:04:00.123"}""", + inferTimestamp) + checkTimestampType( + "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX", + """{"a": "2018-12-02T21:04:00.123567+01:00"}""", + inferTimestamp) + } } } } @@ -71,16 +78,19 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper { } test("skip decimal type inferring") { - Seq("legacy", "corrected").foreach { legacyParserPolicy => - withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) { - checkType( - options = Map( - "prefersDecimal" -> "false", - "timestampFormat" -> "yyyyMMdd.HHmmssSSS" - ), - json = """{"a": "20181202.210400123"}""", - dt = TimestampType - ) + Seq(true, false).foreach { inferTimestamp => + Seq("legacy", "corrected").foreach { legacyParserPolicy => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) { + checkType( + options = Map( + "prefersDecimal" -> "false", + "timestampFormat" -> "yyyyMMdd.HHmmssSSS", + "inferTimestamp" -> inferTimestamp.toString + ), + json = """{"a": "20181202.210400123"}""", + dt = if (inferTimestamp) TimestampType else StringType + ) + } } } } diff --git a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt index d0cd591da4c9..ff370847a2e3 100644 --- a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt @@ -7,106 +7,106 @@ OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-106 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 68879 68993 116 1.5 688.8 1.0X -UTF-8 is set 115270 115602 455 0.9 1152.7 0.6X +No encoding 69219 69342 116 1.4 692.2 1.0X +UTF-8 is set 143950 143986 55 0.7 1439.5 0.5X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 47452 47538 113 2.1 474.5 1.0X -UTF-8 is set 77330 77354 30 1.3 773.3 0.6X +No encoding 57828 57913 136 1.7 578.3 1.0X +UTF-8 is set 83649 83711 60 1.2 836.5 0.7X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 60470 60900 534 0.2 6047.0 1.0X -UTF-8 is set 104733 104931 189 0.1 10473.3 0.6X +No encoding 64560 65193 1023 0.2 6456.0 1.0X +UTF-8 is set 102925 103174 216 0.1 10292.5 0.6X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 130302 131072 976 0.0 260604.6 1.0X -UTF-8 is set 150860 151284 377 0.0 301720.1 0.9X +No encoding 131002 132316 1160 0.0 262003.1 1.0X +UTF-8 is set 152128 152371 332 0.0 304256.5 0.9X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns 18619 18684 99 0.5 1861.9 1.0X -Select 1 column 24227 24270 38 0.4 2422.7 0.8X +Select 10 columns 19376 19514 160 0.5 1937.6 1.0X +Select 1 column 24089 24156 58 0.4 2408.9 0.8X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Short column without encoding 7947 7971 21 1.3 794.7 1.0X -Short column with UTF-8 12700 12753 58 0.8 1270.0 0.6X -Wide column without encoding 92632 92955 463 0.1 9263.2 0.1X -Wide column with UTF-8 147013 147170 188 0.1 14701.3 0.1X +Short column without encoding 8131 8219 103 1.2 813.1 1.0X +Short column with UTF-8 13464 13508 44 0.7 1346.4 0.6X +Wide column without encoding 108012 108598 914 0.1 10801.2 0.1X +Wide column with UTF-8 150988 151369 412 0.1 15098.8 0.1X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 713 734 19 14.0 71.3 1.0X -from_json 22019 22429 456 0.5 2201.9 0.0X -json_tuple 27987 28047 74 0.4 2798.7 0.0X -get_json_object 21468 21870 350 0.5 2146.8 0.0X +Text read 753 765 18 13.3 75.3 1.0X +from_json 23182 23446 230 0.4 2318.2 0.0X +json_tuple 31129 31304 181 0.3 3112.9 0.0X +get_json_object 22821 23073 225 0.4 2282.1 0.0X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 2887 2910 24 17.3 57.7 1.0X -schema inferring 31793 31843 43 1.6 635.9 0.1X -parsing 36791 37104 294 1.4 735.8 0.1X +Text read 3078 3101 26 16.2 61.6 1.0X +schema inferring 30225 30434 333 1.7 604.5 0.1X +parsing 32237 32308 63 1.6 644.7 0.1X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 10570 10611 45 4.7 211.4 1.0X -Schema inferring 48729 48763 41 1.0 974.6 0.2X -Parsing without charset 35490 35648 141 1.4 709.8 0.3X -Parsing with UTF-8 63853 63994 163 0.8 1277.1 0.2X +Text read 10835 10900 86 4.6 216.7 1.0X +Schema inferring 37720 37805 110 1.3 754.4 0.3X +Parsing without charset 35464 35538 100 1.4 709.3 0.3X +Parsing with UTF-8 67311 67738 381 0.7 1346.2 0.2X OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 2187 2190 5 4.6 218.7 1.0X -to_json(timestamp) 16262 16503 323 0.6 1626.2 0.1X -write timestamps to files 11679 11692 12 0.9 1167.9 0.2X -Create a dataset of dates 2297 2310 12 4.4 229.7 1.0X -to_json(date) 10904 10956 46 0.9 1090.4 0.2X -write dates to files 6610 6645 35 1.5 661.0 0.3X +Create a dataset of timestamps 2208 2222 14 4.5 220.8 1.0X +to_json(timestamp) 14299 14570 285 0.7 1429.9 0.2X +write timestamps to files 12955 12969 13 0.8 1295.5 0.2X +Create a dataset of dates 2297 2323 30 4.4 229.7 1.0X +to_json(date) 8509 8561 74 1.2 850.9 0.3X +write dates to files 6786 6827 45 1.5 678.6 0.3X OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -read timestamp text from files 2524 2530 9 4.0 252.4 1.0X -read timestamps from files 41002 41052 59 0.2 4100.2 0.1X -infer timestamps from files 84621 84939 526 0.1 8462.1 0.0X -read date text from files 2292 2302 9 4.4 229.2 1.1X -read date from files 16954 16976 21 0.6 1695.4 0.1X -timestamp strings 3067 3077 13 3.3 306.7 0.8X -parse timestamps from Dataset[String] 48690 48971 243 0.2 4869.0 0.1X -infer timestamps from Dataset[String] 97463 97786 338 0.1 9746.3 0.0X -date strings 3952 3956 3 2.5 395.2 0.6X -parse dates from Dataset[String] 24210 24241 30 0.4 2421.0 0.1X -from_json(timestamp) 71710 72242 629 0.1 7171.0 0.0X -from_json(date) 42465 42481 13 0.2 4246.5 0.1X +read timestamp text from files 2598 2613 18 3.8 259.8 1.0X +read timestamps from files 42007 42028 19 0.2 4200.7 0.1X +infer timestamps from files 18102 18120 28 0.6 1810.2 0.1X +read date text from files 2355 2360 5 4.2 235.5 1.1X +read date from files 17420 17458 33 0.6 1742.0 0.1X +timestamp strings 3099 3101 3 3.2 309.9 0.8X +parse timestamps from Dataset[String] 48188 48215 25 0.2 4818.8 0.1X +infer timestamps from Dataset[String] 22929 22988 102 0.4 2292.9 0.1X +date strings 4090 4103 11 2.4 409.0 0.6X +parse dates from Dataset[String] 24952 25068 139 0.4 2495.2 0.1X +from_json(timestamp) 66038 66352 413 0.2 6603.8 0.0X +from_json(date) 43755 43782 27 0.2 4375.5 0.1X diff --git a/sql/core/benchmarks/JsonBenchmark-results.txt b/sql/core/benchmarks/JsonBenchmark-results.txt index 46d2410fb47c..0e4ce9003f70 100644 --- a/sql/core/benchmarks/JsonBenchmark-results.txt +++ b/sql/core/benchmarks/JsonBenchmark-results.txt @@ -7,106 +7,106 @@ OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aw Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -No encoding 63981 64044 56 1.6 639.8 1.0X -UTF-8 is set 112672 113350 962 0.9 1126.7 0.6X +No encoding 64950 65182 306 1.5 649.5 1.0X +UTF-8 is set 129566 129796 229 0.8 1295.7 0.5X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 51256 51449 180 2.0 512.6 1.0X -UTF-8 is set 83694 83859 148 1.2 836.9 0.6X +No encoding 50896 51277 372 2.0 509.0 1.0X +UTF-8 is set 89712 89763 49 1.1 897.1 0.6X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 58440 59097 569 0.2 5844.0 1.0X -UTF-8 is set 102746 102883 198 0.1 10274.6 0.6X +No encoding 59415 59785 372 0.2 5941.5 1.0X +UTF-8 is set 103059 103165 156 0.1 10305.9 0.6X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 128982 129304 356 0.0 257965.0 1.0X -UTF-8 is set 147247 147415 231 0.0 294494.1 0.9X +No encoding 132951 133122 288 0.0 265901.9 1.0X +UTF-8 is set 149318 149441 107 0.0 298635.3 0.9X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns 18837 19048 331 0.5 1883.7 1.0X -Select 1 column 24707 24723 14 0.4 2470.7 0.8X +Select 10 columns 18491 18552 85 0.5 1849.1 1.0X +Select 1 column 25908 25946 65 0.4 2590.8 0.7X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Short column without encoding 8218 8234 17 1.2 821.8 1.0X -Short column with UTF-8 12374 12438 107 0.8 1237.4 0.7X -Wide column without encoding 136918 137298 345 0.1 13691.8 0.1X -Wide column with UTF-8 176961 177142 257 0.1 17696.1 0.0X +Short column without encoding 9264 9307 49 1.1 926.4 1.0X +Short column with UTF-8 14707 14727 17 0.7 1470.7 0.6X +Wide column without encoding 141138 141347 276 0.1 14113.8 0.1X +Wide column with UTF-8 179601 180035 664 0.1 17960.1 0.1X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 1268 1278 12 7.9 126.8 1.0X -from_json 23348 23479 176 0.4 2334.8 0.1X -json_tuple 29606 30221 1024 0.3 2960.6 0.0X -get_json_object 21898 22148 226 0.5 2189.8 0.1X +Text read 1173 1184 9 8.5 117.3 1.0X +from_json 23432 23738 338 0.4 2343.2 0.1X +json_tuple 32573 32851 358 0.3 3257.3 0.0X +get_json_object 22442 22489 47 0.4 2244.2 0.1X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 5887 5944 49 8.5 117.7 1.0X -schema inferring 46696 47054 312 1.1 933.9 0.1X -parsing 32336 32450 129 1.5 646.7 0.2X +Text read 5656 5680 31 8.8 113.1 1.0X +schema inferring 33283 33337 64 1.5 665.7 0.2X +parsing 41771 41929 178 1.2 835.4 0.1X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 9756 9769 11 5.1 195.1 1.0X -Schema inferring 51318 51433 108 1.0 1026.4 0.2X -Parsing without charset 43609 43743 118 1.1 872.2 0.2X -Parsing with UTF-8 60775 60844 106 0.8 1215.5 0.2X +Text read 9626 9668 39 5.2 192.5 1.0X +Schema inferring 39489 39579 91 1.3 789.8 0.2X +Parsing without charset 38096 38232 125 1.3 761.9 0.3X +Parsing with UTF-8 64565 64725 165 0.8 1291.3 0.1X OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 1998 2015 17 5.0 199.8 1.0X -to_json(timestamp) 18156 18317 263 0.6 1815.6 0.1X -write timestamps to files 12912 12917 5 0.8 1291.2 0.2X -Create a dataset of dates 2209 2270 53 4.5 220.9 0.9X -to_json(date) 9433 9489 90 1.1 943.3 0.2X -write dates to files 
6915 6923 8 1.4 691.5 0.3X +Create a dataset of timestamps 1898 1912 13 5.3 189.8 1.0X +to_json(timestamp) 20011 20092 119 0.5 2001.1 0.1X +write timestamps to files 13388 13427 35 0.7 1338.8 0.1X +Create a dataset of dates 2351 2368 18 4.3 235.1 0.8X +to_json(date) 11884 11913 40 0.8 1188.4 0.2X +write dates to files 7317 7326 9 1.4 731.7 0.3X OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -read timestamp text from files 2395 2412 17 4.2 239.5 1.0X -read timestamps from files 47269 47334 89 0.2 4726.9 0.1X -infer timestamps from files 91806 91851 67 0.1 9180.6 0.0X -read date text from files 2118 2133 13 4.7 211.8 1.1X -read date from files 17267 17340 115 0.6 1726.7 0.1X -timestamp strings 3906 3935 26 2.6 390.6 0.6X -parse timestamps from Dataset[String] 52244 52534 279 0.2 5224.4 0.0X -infer timestamps from Dataset[String] 100488 100714 198 0.1 10048.8 0.0X -date strings 4572 4584 12 2.2 457.2 0.5X -parse dates from Dataset[String] 26749 26768 17 0.4 2674.9 0.1X -from_json(timestamp) 71414 71867 556 0.1 7141.4 0.0X -from_json(date) 45322 45549 250 0.2 4532.2 0.1X +read timestamp text from files 2316 2324 13 4.3 231.6 1.0X +read timestamps from files 43712 43900 165 0.2 4371.2 0.1X +infer timestamps from files 19302 19328 38 0.5 1930.2 0.1X +read date text from files 2090 2099 11 4.8 209.0 1.1X +read date from files 18914 18940 44 0.5 1891.4 0.1X +timestamp strings 3785 3793 11 2.6 378.5 0.6X +parse timestamps from Dataset[String] 51177 51353 160 0.2 5117.7 0.0X +infer timestamps from Dataset[String] 27907 28119 186 0.4 2790.7 0.1X +date strings 4446 4452 6 2.2 444.6 0.5X +parse dates from Dataset[String] 28124 28172 55 0.4 2812.4 0.1X +from_json(timestamp) 71432 71827 354 
0.1 7143.2 0.0X +from_json(date) 46497 46651 163 0.2 4649.7 0.0X diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 6344ec6be487..c7448b12626b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -2610,7 +2610,9 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson } test("inferring timestamp type") { - def schemaOf(jsons: String*): StructType = spark.read.json(jsons.toDS).schema + def schemaOf(jsons: String*): StructType = { + spark.read.option("inferTimestamp", true).json(jsons.toDS).schema + } assert(schemaOf( """{"a":"2018-12-17T10:11:12.123-01:00"}""", @@ -2633,6 +2635,7 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson val timestampsWithFormatPath = s"${dir.getCanonicalPath}/timestampsWithFormat.json" val timestampsWithFormat = spark.read .option("timestampFormat", "dd/MM/yyyy HH:mm") + .option("inferTimestamp", true) .json(datesRecords) assert(timestampsWithFormat.schema === customSchema) @@ -2645,6 +2648,7 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson val readBack = spark.read .option("timestampFormat", "yyyy-MM-dd HH:mm:ss") .option(DateTimeUtils.TIMEZONE_OPTION, "UTC") + .option("inferTimestamp", true) .json(timestampsWithFormatPath) assert(readBack.schema === customSchema)