From 21b623aad6a84cca2ab5f89f1c29d3b3b1b82d80 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Thu, 20 Sep 2018 17:46:19 +0800 Subject: [PATCH 1/6] Refactor DatasetBenchmark --- .../benchmarks/DatasetBenchmark-results.txt | 51 +++++++++ .../apache/spark/sql/DatasetBenchmark.scala | 101 +++++------------- 2 files changed, 76 insertions(+), 76 deletions(-) create mode 100644 sql/core/benchmarks/DatasetBenchmark-results.txt diff --git a/sql/core/benchmarks/DatasetBenchmark-results.txt b/sql/core/benchmarks/DatasetBenchmark-results.txt new file mode 100644 index 0000000000000..597a3b0986192 --- /dev/null +++ b/sql/core/benchmarks/DatasetBenchmark-results.txt @@ -0,0 +1,51 @@ +================================================================================================ +Dataset Benchmark +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +back-to-back map long: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +RDD 2932 / 3036 34.1 29.3 1.0X +DataFrame 666 / 736 150.1 6.7 4.4X +Dataset 832 / 939 120.1 8.3 3.5X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +back-to-back map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +RDD 4788 / 4804 20.9 47.9 1.0X +DataFrame 3278 / 3314 30.5 32.8 1.5X +Dataset 8333 / 8825 12.0 83.3 0.6X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +back-to-back filter Long: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +RDD 975 / 1027 102.6 9.7 1.0X +DataFrame 432 / 538 231.6 4.3 2.3X +Dataset 1192 / 1202 83.9 11.9 0.8X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +back-to-back filter: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +RDD 1668 / 1696 60.0 16.7 1.0X +DataFrame 73 / 80 1370.2 0.7 22.9X +Dataset 3691 / 3733 27.1 36.9 0.5X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +aggregate: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +RDD sum 1621 / 1660 61.7 16.2 1.0X +DataFrame sum 31 / 37 3228.6 0.3 52.3X +Dataset sum using Aggregator 3567 / 3604 28.0 35.7 0.5X +Dataset complex Aggregator 5109 / 5134 19.6 51.1 0.3X + + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala index 1a0672b8876da..6093bd2012c00 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala @@ -17,17 +17,26 @@ package org.apache.spark.sql -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StringType -import org.apache.spark.util.Benchmark +import org.apache.spark.util.{Benchmark, BenchmarkBase => FileBenchmarkBase, Utils} /** * Benchmark for Dataset typed operations comparing with DataFrame and RDD versions. + * To run this benchmark: + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/DatasetBenchmark-results.txt". */ -object DatasetBenchmark { +object DatasetBenchmark extends FileBenchmarkBase { + + val spark = SparkSession.builder + .master("local[*]") + .appName("Dataset benchmark") + .getOrCreate() case class Data(l: Long, s: String) @@ -39,7 +48,7 @@ object DatasetBenchmark { val df = ds.toDF("l") val func = (l: Long) => l + 1 - val benchmark = new Benchmark("back-to-back map long", numRows) + val benchmark = new Benchmark("back-to-back map long", numRows, output = output) benchmark.addCase("RDD") { iter => var res = rdd @@ -78,7 +87,7 @@ object DatasetBenchmark { import spark.implicits._ val df = spark.range(1, numRows).select($"id".as("l"), $"id".cast(StringType).as("s")) - val benchmark = new Benchmark("back-to-back map", numRows) + val benchmark = new Benchmark("back-to-back map", numRows, output = output) val func = (d: Data) => Data(d.l + 1, d.s) val rdd = spark.sparkContext.range(1, numRows).map(l => Data(l, l.toString)) @@ -123,7 +132,7 @@ object DatasetBenchmark { val df = ds.toDF("l") val func = (l: Long) => l % 2L == 0L - val benchmark = new Benchmark("back-to-back filter Long", numRows) + val benchmark = new Benchmark("back-to-back filter Long", numRows, output = output) benchmark.addCase("RDD") { iter => var res = rdd @@ -162,7 +171,7 @@ object DatasetBenchmark { import spark.implicits._ val df = spark.range(1, numRows).select($"id".as("l"), $"id".cast(StringType).as("s")) - val benchmark = new Benchmark("back-to-back filter", numRows) + val benchmark = new Benchmark("back-to-back filter", numRows, output = output) val func = (d: Data, i: Int) => d.l % (100L + i) == 0L val funcs = 0.until(numChains).map { i => (d: Data) => func(d, i) @@ -220,7 +229,7 @@ object DatasetBenchmark { import spark.implicits._ val df = spark.range(1, numRows).select($"id".as("l"), $"id".cast(StringType).as("s")) - val benchmark = new Benchmark("aggregate", numRows) + val benchmark = new Benchmark("aggregate", numRows, output = output) val rdd = spark.sparkContext.range(1, numRows).map(l => Data(l, l.toString)) benchmark.addCase("RDD sum") { iter => @@ -242,75 +251,15 @@ object DatasetBenchmark { benchmark } - def main(args: Array[String]): Unit = { - val spark = SparkSession.builder - .master("local[*]") - .appName("Dataset benchmark") - .getOrCreate() - + override def benchmark(): Unit = { val numRows = 100000000 val numChains = 10 - - val benchmark0 = backToBackMapLong(spark, numRows, numChains) - val benchmark1 = backToBackMap(spark, numRows, numChains) - val benchmark2 = backToBackFilterLong(spark, numRows, numChains) - val benchmark3 = backToBackFilter(spark, numRows, numChains) - val benchmark4 = aggregate(spark, numRows) - - /* - OpenJDK 64-Bit Server VM 1.8.0_111-8u111-b14-2ubuntu0.16.04.2-b14 on Linux 4.4.0-47-generic - Intel(R) Xeon(R) CPU E5-2667 v3 @ 3.20GHz - back-to-back map long: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - RDD 1883 / 1892 53.1 18.8 1.0X - DataFrame 502 / 642 199.1 5.0 3.7X - Dataset 657 / 784 152.2 6.6 2.9X - */ - benchmark0.run() - - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 3.10.0-327.18.2.el7.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - back-to-back map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - RDD 3448 / 3646 29.0 34.5 1.0X - DataFrame 2647 / 3116 37.8 26.5 1.3X - Dataset 4781 / 5155 20.9 47.8 0.7X - */ - benchmark1.run() - - /* - OpenJDK 64-Bit Server VM 1.8.0_121-8u121-b13-0ubuntu1.16.04.2-b13 on Linux 4.4.0-47-generic - Intel(R) Xeon(R) CPU E5-2667 v3 @ 3.20GHz - back-to-back filter Long: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - RDD 846 / 1120 118.1 8.5 1.0X - DataFrame 270 / 329 370.9 2.7 3.1X - Dataset 545 / 789 183.5 5.4 1.6X - */ - benchmark2.run() - - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 3.10.0-327.18.2.el7.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - back-to-back filter: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - RDD 1346 / 1618 74.3 13.5 1.0X - DataFrame 59 / 72 1695.4 0.6 22.8X - Dataset 2777 / 2805 36.0 27.8 0.5X - */ - benchmark3.run() - - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.12.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - aggregate: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - RDD sum 1913 / 1942 52.3 19.1 1.0X - DataFrame sum 46 / 61 2157.7 0.5 41.3X - Dataset sum using Aggregator 4656 / 4758 21.5 46.6 0.4X - Dataset complex Aggregator 6636 / 7039 15.1 66.4 0.3X - */ - benchmark4.run() + runBenchmark("Dataset Benchmark") { + backToBackMapLong(spark, numRows, numChains).run() + backToBackMap(spark, numRows, numChains).run() + backToBackFilterLong(spark, numRows, numChains).run() + backToBackFilter(spark, numRows, numChains).run() + aggregate(spark, numRows).run() + } } } From 152c549051b967caf58ef83c927e930bc3c70ccc Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Thu, 20 Sep 2018 20:49:23 +0800 Subject: [PATCH 2/6] Remove useless import --- .../src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala index 6093bd2012c00..bc369868ab7a4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StringType -import org.apache.spark.util.{Benchmark, BenchmarkBase => FileBenchmarkBase, Utils} +import org.apache.spark.util.{Benchmark, BenchmarkBase => FileBenchmarkBase} /** * Benchmark for Dataset typed operations comparing with DataFrame and RDD versions. From 51ae87ecdfa46505231f06ff88ed88c8f7a01c85 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Fri, 21 Sep 2018 01:49:39 +0800 Subject: [PATCH 3/6] Fix scala doc issue --- .../test/scala/org/apache/spark/sql/DatasetBenchmark.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala index bc369868ab7a4..7d67ce3816436 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala @@ -26,9 +26,9 @@ import org.apache.spark.util.{Benchmark, BenchmarkBase => FileBenchmarkBase} /** * Benchmark for Dataset typed operations comparing with DataFrame and RDD versions. * To run this benchmark: - * 1. without sbt: bin/spark-submit --class - * 2. build/sbt "sql/test:runMain " - * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * 1. without sbt: bin/spark-submit --class [this class] [spark sql test jar] + * 2. build/sbt "sql/test:runMain [this class]" + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain [this class]" * Results will be written to "benchmarks/DatasetBenchmark-results.txt". */ object DatasetBenchmark extends FileBenchmarkBase { From 7990e139cf307c631eb95a1f7b410a85120b07c5 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Mon, 1 Oct 2018 23:07:47 +0800 Subject: [PATCH 4/6] merge master --- .../org/apache/spark/sql/DatasetBenchmark.scala | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala index b2eee7d725c69..be06ab769941b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql -import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions._ @@ -33,7 +34,7 @@ import org.apache.spark.sql.types.StringType * Results will be written to "benchmarks/DatasetBenchmark-results.txt". * }}} */ -object DatasetBenchmark extends BenchmarkBase { +object DatasetBenchmark extends SqlBasedBenchmark { case class Data(l: Long, s: String) @@ -248,10 +249,12 @@ object DatasetBenchmark extends BenchmarkBase { benchmark } - val spark = SparkSession.builder - .master("local[*]") - .appName("Dataset benchmark") - .getOrCreate() + override def getSparkSession: SparkSession = { + SparkSession.builder + .master("local[*]") + .appName("Dataset benchmark") + .getOrCreate() + } override def benchmark(): Unit = { val numRows = 100000000 From d2d0a3e9fbb27b6a531241bbe6641d5950a7a703 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Wed, 3 Oct 2018 06:59:05 +0800 Subject: [PATCH 5/6] benchmark -> runBenchmarkSuite --- .../src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala index be06ab769941b..e3df449b41f0a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala @@ -256,7 +256,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { .getOrCreate() } - override def benchmark(): Unit = { + override def runBenchmarkSuite(): Unit = { val numRows = 100000000 val numChains = 10 runBenchmark("Dataset Benchmark") { From 27c649337e326aa8081afcd9de1a7097ea2788e2 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 2 Oct 2018 16:52:25 -0700 Subject: [PATCH 6/6] Update result (#13) --- .../benchmarks/DatasetBenchmark-results.txt | 57 +++++++++---------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/sql/core/benchmarks/DatasetBenchmark-results.txt b/sql/core/benchmarks/DatasetBenchmark-results.txt index 597a3b0986192..dcc190eb45c03 100644 --- a/sql/core/benchmarks/DatasetBenchmark-results.txt +++ b/sql/core/benchmarks/DatasetBenchmark-results.txt @@ -2,50 +2,45 @@ Dataset Benchmark ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz back-to-back map long: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -RDD 2932 / 3036 34.1 29.3 1.0X -DataFrame 666 / 736 150.1 6.7 4.4X -Dataset 832 / 939 120.1 8.3 3.5X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +RDD 11800 / 12042 8.5 118.0 1.0X +DataFrame 1927 / 2189 51.9 19.3 6.1X +Dataset 2483 / 2605 40.3 24.8 4.8X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz back-to-back map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -RDD 4788 / 4804 20.9 47.9 1.0X -DataFrame 3278 / 3314 30.5 32.8 1.5X -Dataset 8333 / 8825 12.0 83.3 0.6X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +RDD 16286 / 16301 6.1 162.9 1.0X +DataFrame 8101 / 8104 12.3 81.0 2.0X +Dataset 17445 / 17811 5.7 174.4 0.9X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz back-to-back filter Long: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -RDD 975 / 1027 102.6 9.7 1.0X -DataFrame 432 / 538 231.6 4.3 2.3X -Dataset 1192 / 1202 83.9 11.9 0.8X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +RDD 2971 / 3184 33.7 29.7 1.0X +DataFrame 1243 / 1296 80.5 12.4 2.4X +Dataset 3062 / 3091 32.7 30.6 1.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz back-to-back filter: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -RDD 1668 / 1696 60.0 16.7 1.0X -DataFrame 73 / 80 1370.2 0.7 22.9X -Dataset 3691 / 3733 27.1 36.9 0.5X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +RDD 5253 / 5269 19.0 52.5 1.0X +DataFrame 211 / 234 473.4 2.1 24.9X +Dataset 9550 / 9552 10.5 95.5 0.6X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz aggregate: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -RDD sum 1621 / 1660 61.7 16.2 1.0X -DataFrame sum 31 / 37 3228.6 0.3 52.3X -Dataset sum using Aggregator 3567 / 3604 28.0 35.7 0.5X -Dataset complex Aggregator 5109 / 5134 19.6 51.1 0.3X +RDD sum 5086 / 5108 19.7 50.9 1.0X +DataFrame sum 65 / 73 1548.9 0.6 78.8X +Dataset sum using Aggregator 9024 / 9320 11.1 90.2 0.6X +Dataset complex Aggregator 15079 / 15171 6.6 150.8 0.3X