Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions sql/core/benchmarks/JSONBenchmark-results.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
================================================================================================
Benchmark for performance of JSON parsing
================================================================================================

Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON schema inferring: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
No encoding 62946 / 63310 1.6 629.5 1.0X
UTF-8 is set 112814 / 112866 0.9 1128.1 0.6X

Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON per-line parsing: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
No encoding 16468 / 16553 6.1 164.7 1.0X
UTF-8 is set 16420 / 16441 6.1 164.2 1.0X

Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON parsing of wide lines: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
No encoding 39789 / 40053 0.3 3978.9 1.0X
UTF-8 is set 39505 / 39584 0.3 3950.5 1.0X
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The numbers for the currently used Jackson parser should be slightly different. PR #22920 triggers creation of the Jackson parser.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I commented on the PR. Please add new benchmark cases instead of changing the existing numbers.


OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Count a dataset with 10 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
Select 10 columns + count() 15997 / 16015 0.6 1599.7 1.0X
Select 1 column + count() 13280 / 13326 0.8 1328.0 1.2X
count() 3006 / 3021 3.3 300.6 5.3X


Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,31 @@
*/
package org.apache.spark.sql.execution.datasources.json

import java.io.File

import org.apache.spark.SparkConf
import org.apache.spark.benchmark.Benchmark
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.types._

/**
* The benchmark aims to measure the performance of JSON parsing when encoding is set and when it isn't.
* To run this:
* spark-submit --class <this class> --jars <spark sql test jar>
* To run this benchmark:
* {{{
* 1. without sbt:
* bin/spark-submit --class <this class> --jars <spark core test jar>,
* <spark catalyst test jar> <spark sql test jar>
* 2. build/sbt "sql/test:runMain <this class>"
* 3. generate result:
* SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
* Results will be written to "benchmarks/JSONBenchmark-results.txt".
* }}}
*/
object JSONBenchmarks extends SQLHelper {
val conf = new SparkConf()

val spark = SparkSession.builder
.master("local[1]")
.appName("benchmark-json-datasource")
.config(conf)
.getOrCreate()

object JSONBenchmark extends SqlBasedBenchmark {
import spark.implicits._

def schemaInferring(rowsNum: Int): Unit = {
val benchmark = new Benchmark("JSON schema inferring", rowsNum)
val benchmark = new Benchmark("JSON schema inferring", rowsNum, output = output)

withTempPath { path =>
// scalastyle:off println
Expand All @@ -65,21 +64,12 @@ object JSONBenchmarks extends SQLHelper {
.json(path.getAbsolutePath)
}

/*
Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5
Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz

JSON schema inferring: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------
No encoding 45908 / 46480 2.2 459.1 1.0X
UTF-8 is set 68469 / 69762 1.5 684.7 0.7X
*/
benchmark.run()
}
}

def perlineParsing(rowsNum: Int): Unit = {
val benchmark = new Benchmark("JSON per-line parsing", rowsNum)
val benchmark = new Benchmark("JSON per-line parsing", rowsNum, output = output)

withTempPath { path =>
// scalastyle:off println
Expand Down Expand Up @@ -107,21 +97,12 @@ object JSONBenchmarks extends SQLHelper {
.count()
}

/*
Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5
Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz

JSON per-line parsing: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------
No encoding 9982 / 10237 10.0 99.8 1.0X
UTF-8 is set 16373 / 16806 6.1 163.7 0.6X
*/
benchmark.run()
}
}

def perlineParsingOfWideColumn(rowsNum: Int): Unit = {
val benchmark = new Benchmark("JSON parsing of wide lines", rowsNum)
val benchmark = new Benchmark("JSON parsing of wide lines", rowsNum, output = output)

withTempPath { path =>
// scalastyle:off println
Expand Down Expand Up @@ -156,22 +137,14 @@ object JSONBenchmarks extends SQLHelper {
.count()
}

/*
Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5
Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz

JSON parsing of wide lines: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------
No encoding 26038 / 26386 0.4 2603.8 1.0X
UTF-8 is set 28343 / 28557 0.4 2834.3 0.9X
*/
benchmark.run()
}
}

def countBenchmark(rowsNum: Int): Unit = {
val colsNum = 10
val benchmark = new Benchmark(s"Count a dataset with $colsNum columns", rowsNum)
val benchmark =
new Benchmark(s"Count a dataset with $colsNum columns", rowsNum, output = output)

withTempPath { path =>
val fields = Seq.tabulate(colsNum)(i => StructField(s"col$i", IntegerType))
Expand All @@ -195,23 +168,16 @@ object JSONBenchmarks extends SQLHelper {
ds.count()
}

/*
Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz

Count a dataset with 10 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------
Select 10 columns + count() 9961 / 10006 1.0 996.1 1.0X
Select 1 column + count() 8355 / 8470 1.2 835.5 1.2X
count() 2104 / 2156 4.8 210.4 4.7X
*/
benchmark.run()
}
}

def main(args: Array[String]): Unit = {
schemaInferring(100 * 1000 * 1000)
perlineParsing(100 * 1000 * 1000)
perlineParsingOfWideColumn(10 * 1000 * 1000)
countBenchmark(10 * 1000 * 1000)
/**
* Entry point of the suite: runs the four JSON benchmarks inside a single
* `runBenchmark` section named "Benchmark for performance of JSON parsing".
*
* @param mainArgs command-line arguments; not referenced by this suite.
*/
override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
runBenchmark("Benchmark for performance of JSON parsing") {
// Schema inference and per-line parsing are each given rowsNum = 100 million.
schemaInferring(100 * 1000 * 1000)
perlineParsing(100 * 1000 * 1000)
// The wide-line and count benchmarks are each given rowsNum = 10 million.
perlineParsingOfWideColumn(10 * 1000 * 1000)
countBenchmark(10 * 1000 * 1000)
}
}
}