From 53e3b7bf68ebc4dc66c5a6f1a8321ce39683cff1 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sat, 6 Oct 2018 08:29:28 +0100 Subject: [PATCH 1/4] refactor HashBenchmark --- .../benchmarks/HashBenchmark-results.txt | 75 +++++++++ .../org/apache/spark/sql/HashBenchmark.scala | 150 +++++++----------- 2 files changed, 132 insertions(+), 93 deletions(-) create mode 100644 sql/catalyst/benchmarks/HashBenchmark-results.txt diff --git a/sql/catalyst/benchmarks/HashBenchmark-results.txt b/sql/catalyst/benchmarks/HashBenchmark-results.txt new file mode 100644 index 000000000000..023a45ad0a00 --- /dev/null +++ b/sql/catalyst/benchmarks/HashBenchmark-results.txt @@ -0,0 +1,75 @@ +================================================================================================ +single ints +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Hash For single ints: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 2668 / 2669 201.2 5.0 1.0X +codegen version 6216 / 6413 86.4 11.6 0.4X +codegen version 64-bit 10357 / 13068 51.8 19.3 0.3X +codegen HiveHash version 5814 / 5818 92.3 10.8 0.5X + + +================================================================================================ +single longs +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Hash For single longs: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 3066 / 3067 175.1 5.7 1.0X +codegen version 7890 / 13428 68.0 14.7 0.4X +codegen version 64-bit 7639 / 8836 
70.3 14.2 0.4X +codegen HiveHash version 7191 / 7258 74.7 13.4 0.4X + + +================================================================================================ +normal +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Hash For normal: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 3086 / 3108 0.7 1471.5 1.0X +codegen version 2206 / 2903 1.0 1051.7 1.4X +codegen version 64-bit 910 / 1089 2.3 434.1 3.4X +codegen HiveHash version 4801 / 5631 0.4 2289.1 0.6X + + +================================================================================================ +array +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Hash For array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 2721 / 2979 0.0 20759.1 1.0X +codegen version 5444 / 5725 0.0 41530.8 0.5X +codegen version 64-bit 3411 / 3430 0.0 26025.3 0.8X +codegen HiveHash version 2186 / 2190 0.1 16676.6 1.2X + + +================================================================================================ +map +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Hash For map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 0 / 0 60.3 16.6 1.0X +codegen version 198 / 212 
0.0 48317.0 0.0X +codegen version 64-bit 167 / 174 0.0 40690.8 0.0X +codegen HiveHash version 58 / 65 0.1 14096.7 0.0X + + diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala index 7a2a66c9b1d3..329f2950e65a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.benchmark.Benchmark +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection @@ -26,94 +26,85 @@ import org.apache.spark.sql.types._ /** * Benchmark for the previous interpreted hash function(InternalRow.hashCode) vs codegened * hash expressions (Murmur3Hash/xxHash64). + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class <this class> <spark sql test jar> + * 2. build/sbt "sql/test:runMain <this class>" + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" + * Results will be written to "benchmarks/HashBenchmark-results.txt".
+ * }}} */ -object HashBenchmark { +object HashBenchmark extends BenchmarkBase { def test(name: String, schema: StructType, numRows: Int, iters: Int): Unit = { - val generator = RandomDataGenerator.forType(schema, nullable = false).get - val encoder = RowEncoder(schema) - val attrs = schema.toAttributes - val safeProjection = GenerateSafeProjection.generate(attrs, attrs) + runBenchmark(name) { + val generator = RandomDataGenerator.forType(schema, nullable = false).get + val encoder = RowEncoder(schema) + val attrs = schema.toAttributes + val safeProjection = GenerateSafeProjection.generate(attrs, attrs) - val rows = (1 to numRows).map(_ => - // The output of encoder is UnsafeRow, use safeProjection to turn in into safe format. - safeProjection(encoder.toRow(generator().asInstanceOf[Row])).copy() - ).toArray + val rows = (1 to numRows).map(_ => + // The output of encoder is UnsafeRow, use safeProjection to turn in into safe format. + safeProjection(encoder.toRow(generator().asInstanceOf[Row])).copy() + ).toArray - val benchmark = new Benchmark("Hash For " + name, iters * numRows.toLong) - benchmark.addCase("interpreted version") { _: Int => - var sum = 0 - for (_ <- 0L until iters) { - var i = 0 - while (i < numRows) { - sum += rows(i).hashCode() - i += 1 + val benchmark = new Benchmark("Hash For " + name, iters * numRows.toLong, output = output) + benchmark.addCase("interpreted version") { _: Int => + var sum = 0 + for (_ <- 0L until iters) { + var i = 0 + while (i < numRows) { + sum += rows(i).hashCode() + i += 1 + } } } - } - val getHashCode = UnsafeProjection.create(new Murmur3Hash(attrs) :: Nil, attrs) - benchmark.addCase("codegen version") { _: Int => - var sum = 0 - for (_ <- 0L until iters) { - var i = 0 - while (i < numRows) { - sum += getHashCode(rows(i)).getInt(0) - i += 1 + val getHashCode = UnsafeProjection.create(new Murmur3Hash(attrs) :: Nil, attrs) + benchmark.addCase("codegen version") { _: Int => + var sum = 0 + for (_ <- 0L until iters) { + var i 
= 0 + while (i < numRows) { + sum += getHashCode(rows(i)).getInt(0) + i += 1 + } } } - } - val getHashCode64b = UnsafeProjection.create(new XxHash64(attrs) :: Nil, attrs) - benchmark.addCase("codegen version 64-bit") { _: Int => - var sum = 0 - for (_ <- 0L until iters) { - var i = 0 - while (i < numRows) { - sum += getHashCode64b(rows(i)).getInt(0) - i += 1 + val getHashCode64b = UnsafeProjection.create(new XxHash64(attrs) :: Nil, attrs) + benchmark.addCase("codegen version 64-bit") { _: Int => + var sum = 0 + for (_ <- 0L until iters) { + var i = 0 + while (i < numRows) { + sum += getHashCode64b(rows(i)).getInt(0) + i += 1 + } } } - } - val getHiveHashCode = UnsafeProjection.create(new HiveHash(attrs) :: Nil, attrs) - benchmark.addCase("codegen HiveHash version") { _: Int => - var sum = 0 - for (_ <- 0L until iters) { - var i = 0 - while (i < numRows) { - sum += getHiveHashCode(rows(i)).getInt(0) - i += 1 + val getHiveHashCode = UnsafeProjection.create(new HiveHash(attrs) :: Nil, attrs) + benchmark.addCase("codegen HiveHash version") { _: Int => + var sum = 0 + for (_ <- 0L until iters) { + var i = 0 + while (i < numRows) { + sum += getHiveHashCode(rows(i)).getInt(0) + i += 1 + } } } - } - benchmark.run() + benchmark.run() + } } - def main(args: Array[String]): Unit = { + override def runBenchmarkSuite(): Unit = { val singleInt = new StructType().add("i", IntegerType) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For single ints: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 3262 / 3267 164.6 6.1 1.0X - codegen version 6448 / 6718 83.3 12.0 0.5X - codegen version 64-bit 6088 / 6154 88.2 11.3 0.5X - codegen HiveHash version 4732 / 4745 113.5 8.8 0.7X - */ test("single ints", singleInt, 1 << 15, 1 << 14) val singleLong = new StructType().add("i", LongType) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For single longs: 
Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 3716 / 3726 144.5 6.9 1.0X - codegen version 7706 / 7732 69.7 14.4 0.5X - codegen version 64-bit 6370 / 6399 84.3 11.9 0.6X - codegen HiveHash version 4924 / 5026 109.0 9.2 0.8X - */ test("single longs", singleLong, 1 << 15, 1 << 14) val normal = new StructType() @@ -131,45 +122,18 @@ object HashBenchmark { .add("binary", BinaryType) .add("date", DateType) .add("timestamp", TimestampType) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For normal: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 2985 / 3013 0.7 1423.4 1.0X - codegen version 2422 / 2434 0.9 1155.1 1.2X - codegen version 64-bit 856 / 920 2.5 408.0 3.5X - codegen HiveHash version 4501 / 4979 0.5 2146.4 0.7X - */ test("normal", normal, 1 << 10, 1 << 11) val arrayOfInt = ArrayType(IntegerType) val array = new StructType() .add("array", arrayOfInt) .add("arrayOfArray", ArrayType(arrayOfInt)) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 3100 / 3555 0.0 23651.8 1.0X - codegen version 5779 / 5865 0.0 44088.4 0.5X - codegen version 64-bit 4738 / 4821 0.0 36151.7 0.7X - codegen HiveHash version 2200 / 2246 0.1 16785.9 1.4X - */ test("array", array, 1 << 8, 1 << 9) val mapOfInt = MapType(IntegerType, IntegerType) val map = new StructType() .add("map", mapOfInt) .add("mapOfMap", MapType(IntegerType, mapOfInt)) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted 
version 0 / 0 48.1 20.8 1.0X - codegen version 257 / 275 0.0 62768.7 0.0X - codegen version 64-bit 226 / 240 0.0 55224.5 0.0X - codegen HiveHash version 89 / 96 0.0 21708.8 0.0X - */ test("map", map, 1 << 6, 1 << 6) } } From 871b1a927f5687658f8a040488a656ad68c46523 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sat, 6 Oct 2018 09:28:44 -0700 Subject: [PATCH 2/4] Update result (#16) --- .../benchmarks/HashBenchmark-results.txt | 65 +++++++++---------- 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/sql/catalyst/benchmarks/HashBenchmark-results.txt b/sql/catalyst/benchmarks/HashBenchmark-results.txt index 023a45ad0a00..2459b35c75bb 100644 --- a/sql/catalyst/benchmarks/HashBenchmark-results.txt +++ b/sql/catalyst/benchmarks/HashBenchmark-results.txt @@ -2,74 +2,69 @@ single ints ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Hash For single ints: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -interpreted version 2668 / 2669 201.2 5.0 1.0X -codegen version 6216 / 6413 86.4 11.6 0.4X -codegen version 64-bit 10357 / 13068 51.8 19.3 0.3X -codegen HiveHash version 5814 / 5818 92.3 10.8 0.5X +interpreted version 5615 / 5616 95.6 10.5 1.0X +codegen version 8400 / 8407 63.9 15.6 0.7X +codegen version 64-bit 8139 / 8145 66.0 15.2 0.7X +codegen HiveHash version 7213 / 7348 74.4 13.4 0.8X ================================================================================================ single longs ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) 
i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Hash For single longs: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -interpreted version 3066 / 3067 175.1 5.7 1.0X -codegen version 7890 / 13428 68.0 14.7 0.4X -codegen version 64-bit 7639 / 8836 70.3 14.2 0.4X -codegen HiveHash version 7191 / 7258 74.7 13.4 0.4X +interpreted version 6053 / 6054 88.7 11.3 1.0X +codegen version 9367 / 9369 57.3 17.4 0.6X +codegen version 64-bit 8041 / 8051 66.8 15.0 0.8X +codegen HiveHash version 7546 / 7575 71.1 14.1 0.8X ================================================================================================ normal ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Hash For normal: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -interpreted version 3086 / 3108 0.7 1471.5 1.0X -codegen version 2206 / 2903 1.0 1051.7 1.4X -codegen version 64-bit 910 / 1089 2.3 434.1 3.4X -codegen HiveHash version 4801 / 5631 0.4 2289.1 0.6X +interpreted version 3181 / 3182 0.7 1517.0 1.0X +codegen version 2403 / 2403 0.9 1145.7 1.3X +codegen version 64-bit 915 / 916 2.3 436.2 3.5X +codegen HiveHash version 4505 / 4527 0.5 2148.3 0.7X ================================================================================================ array ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU 
@ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Hash For array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -interpreted version 2721 / 2979 0.0 20759.1 1.0X -codegen version 5444 / 5725 0.0 41530.8 0.5X -codegen version 64-bit 3411 / 3430 0.0 26025.3 0.8X -codegen HiveHash version 2186 / 2190 0.1 16676.6 1.2X +interpreted version 1828 / 1844 0.1 13946.1 1.0X +codegen version 3678 / 3804 0.0 28058.2 0.5X +codegen version 64-bit 2925 / 2931 0.0 22317.8 0.6X +codegen HiveHash version 1216 / 1217 0.1 9280.0 1.5X ================================================================================================ map ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Hash For map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -interpreted version 0 / 0 60.3 16.6 1.0X -codegen version 198 / 212 0.0 48317.0 0.0X -codegen version 64-bit 167 / 174 0.0 40690.8 0.0X -codegen HiveHash version 58 / 65 0.1 14096.7 0.0X +interpreted version 0 / 0 44.3 22.6 1.0X +codegen version 176 / 176 0.0 42978.8 0.0X +codegen version 64-bit 173 / 175 0.0 42214.3 0.0X +codegen HiveHash version 44 / 44 0.1 10659.9 0.0X From 2a512ce82560014469ce5c35e164b7c074b429a6 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sat, 6 Oct 2018 19:11:24 +0100 Subject: [PATCH 3/4] Fix scala doc --- .../test/scala/org/apache/spark/sql/HashBenchmark.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala index 329f2950e65a..18a48c452bba 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala @@ -28,9 +28,10 @@ import org.apache.spark.sql.types._ * hash expressions (Murmur3Hash/xxHash64). * To run this benchmark: * {{{ - * 1. without sbt: bin/spark-submit --class <this class> <spark sql test jar> - * 2. build/sbt "sql/test:runMain <this class>" - * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" + * 1. without sbt: bin/spark-submit --class <this class> <spark catalyst test jar> + * 2. build/sbt "catalyst/test:runMain <this class>" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain <this class>" * Results will be written to "benchmarks/HashBenchmark-results.txt". * }}} */ From 7ec7237bc9af983d4f21aacfffc3ef0d836d7f62 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sun, 7 Oct 2018 07:48:58 +0100 Subject: [PATCH 4/4] Fix scala doc --- .../src/test/scala/org/apache/spark/sql/HashBenchmark.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala index 18a48c452bba..4226ab3773fe 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala @@ -28,7 +28,8 @@ import org.apache.spark.sql.types._ * hash expressions (Murmur3Hash/xxHash64). * To run this benchmark: * {{{ - * 1. without sbt: bin/spark-submit --class <this class> <spark catalyst test jar> + * 1. without sbt: + * bin/spark-submit --class <this class> --jars <spark core test jar> <spark catalyst test jar> * 2. build/sbt "catalyst/test:runMain <this class>" * 3. generate result: * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain <this class>"