Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ class JacksonParser(
options.dateFormatInRead.isEmpty
}

// Value of `SQLConf.get.jsonEnablePartialResults`, read once when this val is initialized.
// It is used as a guard on the catch clauses that record the first bad-record exception
// and keep the partially converted values instead of failing the conversion.
private val enablePartialResults = SQLConf.get.jsonEnablePartialResults

/**
* Create a converter which converts the JSON documents held by the `JsonParser`
* to a value according to a desired schema. This is a wrapper for the method
Expand Down Expand Up @@ -456,7 +458,7 @@ class JacksonParser(
schema.existenceDefaultsBitmask(index) = false
} catch {
case e: SparkUpgradeException => throw e
case NonFatal(e) =>
case NonFatal(e) if isRoot || enablePartialResults =>
badRecordException = badRecordException.orElse(Some(e))
parser.skipChildren()
}
Expand Down Expand Up @@ -489,10 +491,10 @@ class JacksonParser(
try {
values += fieldConverter.apply(parser)
} catch {
case PartialResultException(row, cause) =>
case PartialResultException(row, cause) if enablePartialResults =>
badRecordException = badRecordException.orElse(Some(cause))
values += row
case NonFatal(e) =>
case NonFatal(e) if enablePartialResults =>
badRecordException = badRecordException.orElse(Some(e))
parser.skipChildren()
}
Expand Down Expand Up @@ -525,7 +527,7 @@ class JacksonParser(
if (isRoot && v == null) throw QueryExecutionErrors.rootConverterReturnNullError()
values += v
} catch {
case PartialResultException(row, cause) =>
case PartialResultException(row, cause) if enablePartialResults =>
badRecordException = badRecordException.orElse(Some(cause))
values += row
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3629,6 +3629,15 @@ object SQLConf {
.booleanConf
.createWithDefault(true)

// Internal boolean config `spark.sql.json.enablePartialResults` (version 3.4.0, default: true).
// Per its doc string, it enables partial results for structs, maps, and arrays in JSON
// when one or more fields do not match the schema.
val JSON_ENABLE_PARTIAL_RESULTS =
buildConf("spark.sql.json.enablePartialResults")
.internal()
.doc("When set to true, enables partial results for structs, maps, and arrays in JSON " +
"when one or more fields do not match the schema")
.version("3.4.0")
.booleanConf
.createWithDefault(true)

val LEGACY_CSV_ENABLE_DATE_TIME_PARSING_FALLBACK =
buildConf("spark.sql.legacy.csv.enableDateTimeParsingFallback")
.internal()
Expand Down Expand Up @@ -4772,6 +4781,8 @@ class SQLConf extends Serializable with Logging {

def avroFilterPushDown: Boolean = getConf(AVRO_FILTER_PUSHDOWN_ENABLED)

// Current value of JSON_ENABLE_PARTIAL_RESULTS (`spark.sql.json.enablePartialResults`).
def jsonEnablePartialResults: Boolean = getConf(JSON_ENABLE_PARTIAL_RESULTS)

def jsonEnableDateTimeParsingFallback: Option[Boolean] =
getConf(LEGACY_JSON_ENABLE_DATE_TIME_PARSING_FALLBACK)

Expand Down
155 changes: 70 additions & 85 deletions sql/core/benchmarks/JsonBenchmark-results.txt
Original file line number Diff line number Diff line change
@@ -1,120 +1,105 @@
================================================================================================
Benchmark for performance of JSON parsing
================================================================================================

Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
OpenJDK 64-Bit Server VM 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10 on Linux 5.4.0-1045-aws
Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
No encoding 3363 3446 79 1.5 672.7 1.0X
UTF-8 is set 4894 4976 72 1.0 978.7 0.7X
No encoding 2545 2616 65 2.0 509.0 1.0X
UTF-8 is set 3845 3854 8 1.3 768.9 0.7X

Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
OpenJDK 64-Bit Server VM 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10 on Linux 5.4.0-1045-aws
Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
No encoding 3088 3123 32 1.6 617.6 1.0X
UTF-8 is set 4854 4938 87 1.0 970.9 0.6X
No encoding 2130 2176 41 2.3 426.0 1.0X
UTF-8 is set 3907 3911 4 1.3 781.3 0.5X

Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
OpenJDK 64-Bit Server VM 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10 on Linux 5.4.0-1045-aws
Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
No encoding 6411 7338 1497 0.2 6411.2 1.0X
UTF-8 is set 10589 10644 58 0.1 10589.1 0.6X
No encoding 5032 5068 50 0.2 5032.3 1.0X
UTF-8 is set 8304 8349 40 0.1 8304.3 0.6X

Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
OpenJDK 64-Bit Server VM 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10 on Linux 5.4.0-1045-aws
Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
No encoding 12862 13165 263 0.0 257239.1 1.0X
UTF-8 is set 14792 15110 371 0.0 295834.1 0.9X
No encoding 10782 10872 78 0.0 215647.2 1.0X
UTF-8 is set 12514 12560 41 0.0 250277.3 0.9X

Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
OpenJDK 64-Bit Server VM 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10 on Linux 5.4.0-1045-aws
Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Select 10 columns 2352 2369 17 0.4 2351.8 1.0X
Select 1 column 2680 2683 5 0.4 2680.0 0.9X
Select 10 columns 1901 1903 2 0.5 1901.0 1.0X
Select 1 column 1493 1501 8 0.7 1493.3 1.3X

Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
OpenJDK 64-Bit Server VM 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10 on Linux 5.4.0-1045-aws
Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Short column without encoding 884 887 2 1.1 884.1 1.0X
Short column with UTF-8 1193 1202 8 0.8 1192.6 0.7X
Wide column without encoding 12289 12448 170 0.1 12289.3 0.1X
Wide column with UTF-8 16609 16663 79 0.1 16608.6 0.1X
Short column without encoding 697 700 3 1.4 697.2 1.0X
Short column with UTF-8 979 979 0 1.0 978.7 0.7X
Wide column without encoding 10365 10403 51 0.1 10364.5 0.1X
Wide column with UTF-8 15209 15226 15 0.1 15208.7 0.0X

Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
OpenJDK 64-Bit Server VM 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10 on Linux 5.4.0-1045-aws
Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Text read 147 148 0 6.8 147.2 1.0X
from_json 2201 2202 1 0.5 2200.7 0.1X
json_tuple 2452 2473 20 0.4 2452.5 0.1X
get_json_object 2248 2263 22 0.4 2248.2 0.1X
Text read 120 123 4 8.3 120.2 1.0X
from_json 1944 1957 21 0.5 1944.4 0.1X
json_tuple 2142 2146 4 0.5 2141.6 0.1X
get_json_object 1967 1969 2 0.5 1966.7 0.1X

Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
OpenJDK 64-Bit Server VM 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10 on Linux 5.4.0-1045-aws
Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Text read 647 654 7 7.7 129.4 1.0X
schema inferring 2842 2862 25 1.8 568.4 0.2X
parsing 3213 3239 33 1.6 642.6 0.2X
Text read 537 542 4 9.3 107.5 1.0X
schema inferring 2319 2323 4 2.2 463.7 0.2X
parsing 2828 2854 29 1.8 565.6 0.2X

Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
OpenJDK 64-Bit Server VM 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10 on Linux 5.4.0-1045-aws
Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Text read 1046 1058 12 4.8 209.3 1.0X
Schema inferring 3321 3378 58 1.5 664.2 0.3X
Parsing without charset 3751 3791 36 1.3 750.2 0.3X
Parsing with UTF-8 5361 5403 37 0.9 1072.1 0.2X
Text read 798 811 16 6.3 159.6 1.0X
Schema inferring 2774 2781 10 1.8 554.9 0.3X
Parsing without charset 3213 3218 7 1.6 642.7 0.2X
Parsing with UTF-8 4574 4588 13 1.1 914.7 0.2X

OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
OpenJDK 64-Bit Server VM 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10 on Linux 5.4.0-1045-aws
Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Create a dataset of timestamps 171 173 2 5.8 171.3 1.0X
to_json(timestamp) 1414 1427 12 0.7 1414.0 0.1X
write timestamps to files 1183 1211 40 0.8 1183.2 0.1X
Create a dataset of dates 191 198 7 5.2 191.5 0.9X
to_json(date) 934 945 16 1.1 934.1 0.2X
write dates to files 727 748 22 1.4 726.9 0.2X
Create a dataset of timestamps 143 144 2 7.0 142.7 1.0X
to_json(timestamp) 1075 1079 7 0.9 1074.9 0.1X
write timestamps to files 928 932 4 1.1 928.1 0.2X
Create a dataset of dates 165 170 4 6.1 165.2 0.9X
to_json(date) 739 742 3 1.4 739.0 0.2X
write dates to files 573 576 4 1.7 573.4 0.2X

OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
OpenJDK 64-Bit Server VM 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10 on Linux 5.4.0-1045-aws
Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
read timestamp text from files 263 264 1 3.8 262.8 1.0X
read timestamps from files 2743 2807 59 0.4 2742.9 0.1X
infer timestamps from files 14799 15093 383 0.1 14799.3 0.0X
read date text from files 245 253 8 4.1 245.5 1.1X
read date from files 998 1008 9 1.0 998.4 0.3X
timestamp strings 383 403 17 2.6 382.8 0.7X
parse timestamps from Dataset[String] 3165 3185 17 0.3 3165.4 0.1X
infer timestamps from Dataset[String] 15717 15830 147 0.1 15717.2 0.0X
date strings 434 450 19 2.3 433.5 0.6X
parse dates from Dataset[String] 1466 1472 7 0.7 1465.6 0.2X
from_json(timestamp) 4682 4736 50 0.2 4681.9 0.1X
from_json(date) 2823 2848 22 0.4 2822.6 0.1X
read timestamp text from files 215 220 5 4.6 215.2 1.0X
read timestamps from files 2389 2424 31 0.4 2388.8 0.1X
infer timestamps from files 6115 6122 11 0.2 6115.4 0.0X
read date text from files 191 193 2 5.2 191.4 1.1X
read date from files 840 841 2 1.2 839.7 0.3X
timestamp strings 301 306 4 3.3 300.8 0.7X
parse timestamps from Dataset[String] 2706 2713 6 0.4 2706.1 0.1X
infer timestamps from Dataset[String] 6476 6482 5 0.2 6475.9 0.0X
date strings 343 343 0 2.9 342.5 0.6X
parse dates from Dataset[String] 1169 1172 5 0.9 1168.6 0.2X
from_json(timestamp) 4067 4074 7 0.2 4066.5 0.1X
from_json(date) 2470 2472 3 0.4 2469.9 0.1X

OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
OpenJDK 64-Bit Server VM 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10 on Linux 5.4.0-1045-aws
Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
w/o filters 21058 21148 143 0.0 210582.1 1.0X
pushdown disabled 20208 20464 226 0.0 202080.3 1.0X
w/ filters 750 756 6 0.1 7499.1 28.1X


w/o filters 18219 18230 18 0.0 182188.8 1.0X
pushdown disabled 17180 17183 4 0.0 171798.7 1.1X
w/ filters 1197 1219 22 0.1 11974.0 15.2X
Loading