From 3688e50f4bca48d5ef83843a146fc35e02bc7a24 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Sun, 14 Nov 2021 15:37:06 -0800 Subject: [PATCH 1/3] [SPARK-35345][SQL] Add Parquet tests to BloomFilterBenchmark --- .../benchmark/BloomFilterBenchmark.scala | 60 ++++++++++++++++--- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index f78ccf9569a0f..03526ff6db8ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -19,14 +19,13 @@ package org.apache.spark.sql.execution.benchmark import scala.util.Random +import org.apache.parquet.hadoop.{ParquetInputFormat, ParquetOutputFormat} + import org.apache.spark.benchmark.Benchmark /** * Benchmark to measure read performance with Bloom filters. * - * Currently, only ORC supports bloom filters, we will add Parquet BM as soon as it becomes - * available. - * * To run this benchmark: * {{{ * 1. without sbt: bin/spark-submit --class @@ -43,7 +42,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { private val N = scaleFactor * 1000 * 1000 private val df = spark.range(N).map(_ => Random.nextInt) - private def writeBenchmark(): Unit = { + private def writeORCBenchmark(): Unit = { withTempPath { dir => val path = dir.getCanonicalPath @@ -61,7 +60,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { } } - private def readBenchmark(): Unit = { + private def readORCBenchmark(): Unit = { withTempPath { dir => val path = dir.getCanonicalPath @@ -81,8 +80,55 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { } } + private def writeParquetBenchmark(): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + runBenchmark(s"Parquet Write") { + val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output) + benchmark.addCase("Without bloom filter") { _ => + df.write.mode("overwrite").parquet(path + "/withoutBF") + } + benchmark.addCase("With bloom filter") { _ => + df.write.mode("overwrite") + .option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) + .parquet(path + "/withBF") + } + benchmark.run() + } + } + } + + private def readParquetBenchmark(): Unit = { + val blockSizes = Seq(2 * 1024 * 1024, 4 * 1024 * 1024, 6 * 1024 * 1024, 8 * 1024 * 1024, + 12 * 1024 * 1024, 16 * 1024 * 1024, 32 * 1024 * 1024) + for (blocksize <- blockSizes) { + withTempPath { dir => + val path = dir.getCanonicalPath + df.write.option("parquet.block.size", blocksize).parquet(path + "/withoutBF") + df.write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) + .option("parquet.block.size", blocksize) + .parquet(path + "/withBF") + + runBenchmark(s"Parquet Read") { + val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) + benchmark.addCase("Without bloom filter, blocksize: " + blocksize) { _ => + spark.read.parquet(path + "/withoutBF").where("value = 0").noop() + } + benchmark.addCase("With bloom filter, blocksize: " + blocksize) { _ => + spark.read.option(ParquetInputFormat.BLOOM_FILTERING_ENABLED, true) + .parquet(path + "/withBF").where("value = 0").noop() + } + benchmark.run() + } + } + } + } + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { - writeBenchmark() - readBenchmark() + writeORCBenchmark() + readORCBenchmark() + writeParquetBenchmark() + readParquetBenchmark() } } From dd9cbb4e2b1b2c8230442e10ef351154df40a86f Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 16 Nov 2021 16:29:41 -0800 Subject: [PATCH 2/3] update bench mark results --- .../BloomFilterBenchmark-jdk11-results.txt | 112 ++++++++++++++++-- .../BloomFilterBenchmark-jdk17-results.txt | 112 ++++++++++++++++-- .../BloomFilterBenchmark-results.txt | 112 ++++++++++++++++-- 3 files changed, 312 insertions(+), 24 deletions(-) diff --git a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt index 1e3abebef021f..9524ab3b60500 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt @@ -2,23 +2,119 @@ ORC Write ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 19503 19621 166 5.1 195.0 1.0X -With bloom filter 22472 22710 335 4.4 224.7 0.9X +Without bloom filter 22533 23149 871 4.4 225.3 1.0X +With bloom filter 25897 26118 313 3.9 259.0 0.9X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 1981 2040 82 50.5 19.8 1.0X -With bloom filter 1428 1467 54 70.0 14.3 1.4X +Without bloom filter 2010 2093 118 49.8 20.1 1.0X +With bloom filter 1325 1388 90 75.5 13.2 1.5X + + +================================================================================================ +Parquet Write +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter 20190 20281 129 5.0 201.9 1.0X +With bloom filter 25112 25419 433 4.0 251.1 0.8X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 2097152 1198 1348 213 83.5 12.0 1.0X +With bloom filter, blocksize: 2097152 387 485 80 258.1 3.9 3.1X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 4194304 1087 1140 75 92.0 10.9 1.0X +With bloom filter, blocksize: 4194304 304 364 46 328.7 3.0 3.6X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 6291456 998 1016 25 100.2 10.0 1.0X +With bloom filter, blocksize: 6291456 363 411 45 275.7 3.6 2.8X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 8388608 1289 1356 95 77.6 12.9 1.0X +With bloom filter, blocksize: 8388608 632 668 40 158.1 6.3 2.0X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 12582912 1357 1402 62 73.7 13.6 1.0X +With bloom filter, blocksize: 12582912 1009 1041 45 99.1 10.1 1.3X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 16777216 1359 1394 49 73.6 13.6 1.0X +With bloom filter, blocksize: 16777216 1217 1249 45 82.2 12.2 1.1X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 33554432 1362 1362 1 73.4 13.6 1.0X +With bloom filter, blocksize: 33554432 1345 1395 71 74.3 13.5 1.0X diff --git a/sql/core/benchmarks/BloomFilterBenchmark-jdk17-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-jdk17-results.txt index f9a4d63d08b33..8e8bb2e839ee2 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-jdk17-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-jdk17-results.txt @@ -2,23 +2,119 @@ ORC Write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.8.0-1042-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 23519 23980 652 4.3 235.2 1.0X -With bloom filter 26703 26898 275 3.7 267.0 0.9X +Without bloom filter 18752 18847 134 5.3 187.5 1.0X +With bloom filter 20964 21131 236 4.8 209.6 0.9X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.8.0-1042-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 1503 1514 16 66.6 15.0 1.0X -With bloom filter 1142 1159 23 87.5 11.4 1.3X +Without bloom filter 1445 1474 41 69.2 14.4 1.0X +With bloom filter 1085 1093 11 92.2 10.8 1.3X + + +================================================================================================ +Parquet Write +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter 20162 20282 170 5.0 201.6 1.0X +With bloom filter 24370 24524 218 4.1 243.7 0.8X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 2097152 770 798 28 130.0 7.7 1.0X +With bloom filter, blocksize: 2097152 251 276 16 398.5 2.5 3.1X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 4194304 716 733 17 139.7 7.2 1.0X +With bloom filter, blocksize: 4194304 199 209 7 501.8 2.0 3.6X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 6291456 735 761 24 136.1 7.3 1.0X +With bloom filter, blocksize: 6291456 225 241 15 444.5 2.2 3.3X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 8388608 741 755 19 135.0 7.4 1.0X +With bloom filter, blocksize: 8388608 335 340 5 298.9 3.3 2.2X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 12582912 760 788 44 131.6 7.6 1.0X +With bloom filter, blocksize: 12582912 581 589 7 172.0 5.8 1.3X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 16777216 732 739 8 136.6 7.3 1.0X +With bloom filter, blocksize: 16777216 642 698 96 155.7 6.4 1.1X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 33554432 1047 1049 3 95.5 10.5 1.0X +With bloom filter, blocksize: 33554432 1252 1265 19 79.9 12.5 0.8X diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt index 149d3bf76f770..8c1cc7e19d89d 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -2,23 +2,119 @@ ORC Write ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 14922 15194 384 6.7 149.2 1.0X -With bloom filter 17270 17665 559 5.8 172.7 0.9X +Without bloom filter 14854 14886 45 6.7 148.5 1.0X +With bloom filter 17327 17328 2 5.8 173.3 0.9X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 1463 1486 32 68.4 14.6 1.0X -With bloom filter 1232 1239 9 81.2 12.3 1.2X +Without bloom filter 1445 1490 64 69.2 14.4 1.0X +With bloom filter 1118 1165 67 89.4 11.2 1.3X + + +================================================================================================ +Parquet Write +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter 14530 14538 11 6.9 145.3 1.0X +With bloom filter 22584 22597 18 4.4 225.8 0.6X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 2097152 863 888 34 115.9 8.6 1.0X +With bloom filter, blocksize: 2097152 266 283 20 376.4 2.7 3.2X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 4194304 802 818 17 124.7 8.0 1.0X +With bloom filter, blocksize: 4194304 191 199 11 524.4 1.9 4.2X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 6291456 817 820 4 122.4 8.2 1.0X +With bloom filter, blocksize: 6291456 250 256 12 400.6 2.5 3.3X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 8388608 802 811 8 124.6 8.0 1.0X +With bloom filter, blocksize: 8388608 354 361 9 282.2 3.5 2.3X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 12582912 797 807 9 125.5 8.0 1.0X +With bloom filter, blocksize: 12582912 737 745 8 135.7 7.4 1.1X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 16777216 795 804 9 125.8 7.9 1.0X +With bloom filter, blocksize: 16777216 732 737 5 136.6 7.3 1.1X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 33554432 801 814 11 124.8 8.0 1.0X +With bloom filter, blocksize: 33554432 943 954 9 106.0 9.4 0.8X From bac368c6c344edc93e08029ebc7c90f14ef9ed4e Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 16 Nov 2021 18:43:51 -0800 Subject: [PATCH 3/3] address comments --- .../spark/sql/execution/benchmark/BloomFilterBenchmark.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index 03526ff6db8ad..ccb65c7d3acca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -84,7 +84,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { withTempPath { dir => val path = dir.getCanonicalPath - runBenchmark(s"Parquet Write") { + runBenchmark("Parquet Write") { val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output) benchmark.addCase("Without bloom filter") { _ => df.write.mode("overwrite").parquet(path + "/withoutBF") @@ -110,7 +110,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { .option("parquet.block.size", blocksize) .parquet(path + "/withBF") - runBenchmark(s"Parquet Read") { + runBenchmark("Parquet Read") { val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) benchmark.addCase("Without bloom filter, blocksize: " + blocksize) { _ => spark.read.parquet(path + "/withoutBF").where("value = 0").noop()