diff --git a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt index 1e3abebef021f..ad462da6db245 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt @@ -2,23 +2,179 @@ ORC Write ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 19503 19621 166 5.1 195.0 1.0X -With bloom filter 22472 22710 335 4.4 224.7 0.9X +Without bloom filter 13568 13645 109 7.4 135.7 1.0X +With bloom filter 16116 16238 172 6.2 161.2 0.8X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 1981 2040 82 50.5 19.8 1.0X -With bloom filter 1428 1467 54 70.0 14.3 1.4X +Without bloom filter 1572 1605 47 63.6 15.7 1.0X +With bloom filter 1343 1359 23 74.5 13.4 1.2X + + 
+================================================================================================ +ORC Read for IN set +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 1M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter 51 63 15 19.6 51.1 1.0X +With bloom filter 54 88 23 18.5 54.0 0.9X + + +================================================================================================ +Parquet Write +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter 13679 13954 389 7.3 136.8 1.0X +With bloom filter 18260 18284 33 5.5 182.6 0.7X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 2097152 954 984 49 104.8 9.5 1.0X +With bloom filter, blocksize: 2097152 285 307 21 350.4 2.9 3.3X + + 
+================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 3145728 788 831 40 126.9 7.9 1.0X +With bloom filter, blocksize: 3145728 192 262 47 521.4 1.9 4.1X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 4194304 787 847 75 127.0 7.9 1.0X +With bloom filter, blocksize: 4194304 201 224 18 496.4 2.0 3.9X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 5242880 854 872 18 117.1 8.5 1.0X +With bloom filter, 
blocksize: 5242880 172 222 37 582.7 1.7 5.0X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 6291456 785 813 27 127.4 7.9 1.0X +With bloom filter, blocksize: 6291456 167 188 14 598.0 1.7 4.7X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 8388608 806 834 42 124.1 8.1 1.0X +With bloom filter, blocksize: 8388608 360 383 29 277.8 3.6 2.2X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 16777216 
812 846 42 123.2 8.1 1.0X +With bloom filter, blocksize: 16777216 780 807 27 128.2 7.8 1.0X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 33554432 852 862 10 117.4 8.5 1.0X +With bloom filter, blocksize: 33554432 820 865 59 121.9 8.2 1.0X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 67108864 844 911 58 118.5 8.4 1.0X +With bloom filter, blocksize: 67108864 851 853 2 117.5 8.5 1.0X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+-------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 134217728 839 887 53 119.3 8.4 1.0X +With bloom filter, blocksize: 134217728 872 881 9 114.6 8.7 1.0X + + +================================================================================================ +Parquet Read for IN set +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Read a row from 1M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter 70 76 6 14.2 70.2 1.0X +With bloom filter 73 103 22 13.8 72.6 1.0X diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt index 149d3bf76f770..5faf31841866c 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -2,23 +2,179 @@ ORC Write ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 14922 15194 384 6.7 149.2 1.0X -With bloom filter 17270 17665 559 5.8 172.7 0.9X +Without bloom filter 15800 15864 90 6.3 158.0 1.0X +With bloom filter 18447 18451 6 5.4 184.5 0.9X 
================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 1463 1486 32 68.4 14.6 1.0X -With bloom filter 1232 1239 9 81.2 12.3 1.2X +Without bloom filter 1543 1562 26 64.8 15.4 1.0X +With bloom filter 1145 1163 25 87.4 11.4 1.3X + + +================================================================================================ +ORC Read for IN set +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 1M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter 57 71 16 17.6 56.8 1.0X +With bloom filter 54 63 12 18.4 54.2 1.0X + + +================================================================================================ +Parquet Write +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +Without bloom filter 14024 14315 412 7.1 140.2 1.0X +With bloom filter 22622 22681 84 4.4 226.2 0.6X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 2097152 802 826 23 124.7 8.0 1.0X +With bloom filter, blocksize: 2097152 241 257 12 414.7 2.4 3.3X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 3145728 786 794 8 127.3 7.9 1.0X +With bloom filter, blocksize: 3145728 191 203 11 523.1 1.9 4.1X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 4194304 786 790 5 127.3 7.9 1.0X +With bloom filter, blocksize: 4194304 179 191 10 559.5 1.8 4.4X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 5242880 775 787 18 129.0 7.8 1.0X +With bloom filter, blocksize: 5242880 171 181 13 584.9 1.7 4.5X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 6291456 767 777 9 130.3 7.7 1.0X +With bloom filter, blocksize: 6291456 220 233 12 454.3 2.2 3.5X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 8388608 782 799 17 127.9 7.8 1.0X +With bloom filter, blocksize: 8388608 344 356 17 291.1 3.4 2.3X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 16777216 800 811 14 125.0 8.0 1.0X +With bloom filter, blocksize: 16777216 628 646 12 159.2 6.3 1.3X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 33554432 802 813 10 124.7 8.0 1.0X +With bloom filter, blocksize: 33554432 817 833 15 122.4 8.2 1.0X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: 
Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 67108864 798 816 23 125.3 8.0 1.0X +With bloom filter, blocksize: 67108864 810 823 12 123.4 8.1 1.0X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 134217728 831 845 13 120.4 8.3 1.0X +With bloom filter, blocksize: 134217728 799 821 20 125.1 8.0 1.0X + + +================================================================================================ +Parquet Read for IN set +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 1M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter 86 99 18 11.6 86.5 1.0X +With bloom filter 85 95 14 11.8 84.9 1.0X diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index f78ccf9569a0f..1beb766dd3d20 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
@@ -17,16 +17,18 @@
 
 package org.apache.spark.sql.execution.benchmark
 
+import java.util.UUID
+
 import scala.util.Random
 
+import org.apache.parquet.hadoop.{ParquetInputFormat, ParquetOutputFormat}
+
 import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.sql.functions.col
 
 /**
  * Benchmark to measure read performance with Bloom filters.
  *
- * Currently, only ORC supports bloom filters, we will add Parquet BM as soon as it becomes
- * available.
- *
  * To run this benchmark:
  * {{{
  *   1. without sbt: bin/spark-submit --class
@@ -41,19 +43,28 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
 
   private val scaleFactor = 100
   private val N = scaleFactor * 1000 * 1000
-  private val df = spark.range(N).map(_ => Random.nextInt)
+  // Random integers used by the write and point-lookup read benchmarks.
+  private val df1 = spark.range(N).map(_ => Random.nextInt)
+
+  // The IN-set benchmarks run at a smaller scale: the UUID strings below are materialised in
+  // a local Seq before `toDF`, so sizing them by N would hold 100M strings in memory at once.
+  private val inSetScaleFactor = 1
+  private val inSetN = inSetScaleFactor * 1000 * 1000
+  // Random 32-character hex strings (UUIDs without dashes) used by the IN-set read benchmarks.
+  private val df2 = Seq.fill(inSetN) {UUID.randomUUID().toString.replace("-", "")}.toDF
 
-  private def writeBenchmark(): Unit = {
+  /** Compares the cost of writing `df1` as ORC with and without a bloom filter on `value`. */
+  private def writeORCBenchmark(): Unit = {
     withTempPath { dir =>
       val path = dir.getCanonicalPath
 
-      runBenchmark(s"ORC Write") {
+      runBenchmark("ORC Write") {
         val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output)
         benchmark.addCase("Without bloom filter") { _ =>
-          df.write.mode("overwrite").orc(path + "/withoutBF")
+          df1.write.mode("overwrite").orc(path + "/withoutBF")
         }
         benchmark.addCase("With bloom filter") { _ =>
-          df.write.mode("overwrite")
+          df1.write.mode("overwrite")
             .option("orc.bloom.filter.columns", "value").orc(path + "/withBF")
         }
         benchmark.run()
@@ -61,14 +72,15 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
     }
   }
 
-  private def readBenchmark(): Unit = {
+  /** Times a `value = 0` lookup over ORC files written with and without a bloom filter. */
+  private def readORCBenchmark(): Unit = {
     withTempPath { dir =>
       val path = dir.getCanonicalPath
 
-      df.write.orc(path + "/withoutBF")
-      df.write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF")
+      df1.write.orc(path + "/withoutBF")
+      df1.write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF")
 
-      runBenchmark(s"ORC Read") {
+      runBenchmark("ORC Read") {
         val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
         benchmark.addCase("Without bloom filter") { _ =>
           spark.read.orc(path + "/withoutBF").where("value = 0").noop()
@@ -81,8 +93,138 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
     }
   }
 
+  /**
+   * Builds a `value IN (...)` predicate from a seeded sample of `df2`, so that the IN-set
+   * benchmarks filter on values that are present in the written data.
+   */
+  private def inSetFilter(): String = {
+    val samples = df2.sample(0.000003, 128).select("value").as[String].collect()
+    // An empty sample would render as `value IN ()`, which is not a valid predicate.
+    require(samples.nonEmpty, "Sampling df2 returned no rows; cannot build an IN filter")
+    samples.map(x => s"'$x'").mkString("value IN (", ", ", ")")
+  }
+
+  /**
+   * Times an IN-set lookup over ORC files written with and without a bloom filter. The data
+   * is repartitioned and sorted by `value` before being written.
+   */
+  private def readORCBenchmarkForInSet(): Unit = {
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+      val filter = inSetFilter()
+      val sortedDf = df2.repartition(col("value")).sort(col("value"))
+
+      sortedDf.write.orc(path + "/withoutBF")
+      sortedDf.write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF")
+
+      runBenchmark("ORC Read for IN set") {
+        val benchmark =
+          new Benchmark(s"Read a row from ${inSetScaleFactor}M rows", inSetN, output = output)
+        benchmark.addCase("Without bloom filter") { _ =>
+          spark.read.orc(path + "/withoutBF").where(filter).noop()
+        }
+        benchmark.addCase("With bloom filter") { _ =>
+          spark.read.orc(path + "/withBF").where(filter).noop()
+        }
+        benchmark.run()
+      }
+    }
+  }
+
+  /** Compares the cost of writing `df1` as Parquet with and without a bloom filter on `value`. */
+  private def writeParquetBenchmark(): Unit = {
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+
+      runBenchmark("Parquet Write") {
+        val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output)
+        benchmark.addCase("Without bloom filter") { _ =>
+          df1.write.mode("overwrite").parquet(path + "/withoutBF")
+        }
+        benchmark.addCase("With bloom filter") { _ =>
+          df1.write.mode("overwrite")
+            .option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true)
+            .option("parquet.bloom.filter.expected.ndv#value", N.toString)
+            .parquet(path + "/withBF")
+        }
+        benchmark.run()
+      }
+    }
+  }
+
+  /**
+   * Times a `value = 0` lookup over Parquet files written with and without a bloom filter,
+   * once for each `parquet.block.size` in the list below.
+   */
+  private def readParquetBenchmark(): Unit = {
+    // Block sizes in bytes, from 2 MiB up to 128 MiB.
+    val blockSizes = Seq(2, 3, 4, 5, 6, 8, 16, 32, 64, 128).map(_ * 1024 * 1024)
+    for (blocksize <- blockSizes) {
+      withTempPath { dir =>
+        val path = dir.getCanonicalPath
+
+        df1.write.option("parquet.block.size", blocksize).parquet(path + "/withoutBF")
+        df1.write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true)
+          .option("parquet.bloom.filter.expected.ndv#value", N.toString)
+          .option("parquet.block.size", blocksize)
+          .parquet(path + "/withBF")
+
+        runBenchmark("Parquet Read") {
+          val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
+          benchmark.addCase("Without bloom filter, blocksize: " + blocksize) { _ =>
+            spark.read.parquet(path + "/withoutBF").where("value = 0").noop()
+          }
+          benchmark.addCase("With bloom filter, blocksize: " + blocksize) { _ =>
+            spark.read.option(ParquetInputFormat.BLOOM_FILTERING_ENABLED, true)
+              .parquet(path + "/withBF").where("value = 0").noop()
+          }
+          benchmark.run()
+        }
+      }
+    }
+  }
+
+  /**
+   * Times an IN-set lookup over Parquet files written with and without a bloom filter. The
+   * data is repartitioned and sorted by `value` before being written.
+   */
+  private def readParquetBenchmarkForInSet(): Unit = {
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+      val filter = inSetFilter()
+      val sortedDf = df2.repartition(col("value")).sort(col("value"))
+
+      sortedDf.write.parquet(path + "/withoutBF")
+      sortedDf.write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true)
+        .option("parquet.bloom.filter.expected.ndv#value", inSetN.toString)
+        .parquet(path + "/withBF")
+
+      runBenchmark("Parquet Read for IN set") {
+        val benchmark =
+          new Benchmark(s"Read a row from ${inSetScaleFactor}M rows", inSetN, output = output)
+        // NOTE(review): `spark.sql.parquet.pushdown.inFilterThreshold` looks like a session SQL
+        // conf rather than a reader option; confirm that setting it via `option` takes effect.
+        benchmark.addCase("Without bloom filter") { _ =>
+          spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 50)
+            .parquet(path + "/withoutBF").where(filter).noop()
+        }
+        benchmark.addCase("With bloom filter") { _ =>
+          spark.read.option(ParquetInputFormat.BLOOM_FILTERING_ENABLED, true)
+            .option("spark.sql.parquet.pushdown.inFilterThreshold", 50)
+            .parquet(path + "/withBF").where(filter).noop()
+        }
+        benchmark.run()
+      }
+    }
+  }
+
+  /** Runs every ORC and Parquet bloom filter benchmark in sequence. */
   override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
-    writeBenchmark()
-    readBenchmark()
+    writeORCBenchmark()
+    readORCBenchmark()
+    readORCBenchmarkForInSet()
+    writeParquetBenchmark()
+    readParquetBenchmark()
+    readParquetBenchmarkForInSet()
   }
 }