diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt new file mode 100644 index 000000000000..2d3bae442cc5 --- /dev/null +++ b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt @@ -0,0 +1,269 @@ +================================================================================================ +SQL Single Numeric Column Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +SQL CSV 21508 / 22112 0.7 1367.5 1.0X +SQL Json 8705 / 8825 1.8 553.4 2.5X +SQL Parquet Vectorized 157 / 186 100.0 10.0 136.7X +SQL Parquet MR 1789 / 1794 8.8 113.8 12.0X +SQL ORC Vectorized 156 / 166 100.9 9.9 138.0X +SQL ORC Vectorized with copy 218 / 225 72.1 13.9 98.6X +SQL ORC MR 1448 / 1492 10.9 92.0 14.9X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Parquet Reader Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +ParquetReader Vectorized 202 / 211 77.7 12.9 1.0X +ParquetReader Vectorized -> Row 118 / 120 133.5 7.5 1.7X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +SQL CSV 23282 / 23312 0.7 1480.2 1.0X +SQL Json 9187 / 9189 1.7 584.1 2.5X +SQL Parquet Vectorized 204 / 218 77.0 13.0 114.0X +SQL Parquet MR 1941 / 1953 8.1 
123.4 12.0X +SQL ORC Vectorized 217 / 225 72.6 13.8 107.5X +SQL ORC Vectorized with copy 279 / 289 56.3 17.8 83.4X +SQL ORC MR 1541 / 1549 10.2 98.0 15.1X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Parquet Reader Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +ParquetReader Vectorized 288 / 297 54.6 18.3 1.0X +ParquetReader Vectorized -> Row 255 / 257 61.7 16.2 1.1X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +SQL CSV 24990 / 25012 0.6 1588.8 1.0X +SQL Json 9837 / 9865 1.6 625.4 2.5X +SQL Parquet Vectorized 170 / 180 92.3 10.8 146.6X +SQL Parquet MR 2319 / 2328 6.8 147.4 10.8X +SQL ORC Vectorized 293 / 301 53.7 18.6 85.3X +SQL ORC Vectorized with copy 297 / 309 52.9 18.9 84.0X +SQL ORC MR 1667 / 1674 9.4 106.0 15.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Parquet Reader Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +ParquetReader Vectorized 257 / 274 61.3 16.3 1.0X +ParquetReader Vectorized -> Row 259 / 264 60.8 16.4 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +SQL CSV 32537 / 32554 0.5 2068.7 1.0X +SQL Json 12610 / 12668 1.2 801.7 2.6X +SQL Parquet 
Vectorized 258 / 276 61.0 16.4 126.2X +SQL Parquet MR 2422 / 2435 6.5 154.0 13.4X +SQL ORC Vectorized 378 / 385 41.6 24.0 86.2X +SQL ORC Vectorized with copy 381 / 389 41.3 24.2 85.4X +SQL ORC MR 1797 / 1819 8.8 114.3 18.1X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Parquet Reader Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +ParquetReader Vectorized 352 / 368 44.7 22.4 1.0X +ParquetReader Vectorized -> Row 351 / 359 44.8 22.3 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +SQL CSV 27179 / 27184 0.6 1728.0 1.0X +SQL Json 12578 / 12585 1.3 799.7 2.2X +SQL Parquet Vectorized 161 / 171 97.5 10.3 168.5X +SQL Parquet MR 2361 / 2395 6.7 150.1 11.5X +SQL ORC Vectorized 473 / 480 33.3 30.0 57.5X +SQL ORC Vectorized with copy 478 / 483 32.9 30.4 56.8X +SQL ORC MR 1858 / 1859 8.5 118.2 14.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Parquet Reader Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +ParquetReader Vectorized 251 / 255 62.7 15.9 1.0X +ParquetReader Vectorized -> Row 255 / 259 61.8 16.2 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +SQL CSV 34797 / 34830 0.5 
2212.3 1.0X +SQL Json 17806 / 17828 0.9 1132.1 2.0X +SQL Parquet Vectorized 260 / 269 60.6 16.5 134.0X +SQL Parquet MR 2512 / 2534 6.3 159.7 13.9X +SQL ORC Vectorized 582 / 593 27.0 37.0 59.8X +SQL ORC Vectorized with copy 576 / 584 27.3 36.6 60.4X +SQL ORC MR 2309 / 2313 6.8 146.8 15.1X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Parquet Reader Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +ParquetReader Vectorized 350 / 363 44.9 22.3 1.0X +ParquetReader Vectorized -> Row 350 / 366 44.9 22.3 1.0X + + +================================================================================================ +Int and String Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +SQL CSV 22486 / 22590 0.5 2144.5 1.0X +SQL Json 14124 / 14195 0.7 1347.0 1.6X +SQL Parquet Vectorized 2342 / 2347 4.5 223.4 9.6X +SQL Parquet MR 4660 / 4664 2.2 444.4 4.8X +SQL ORC Vectorized 2378 / 2379 4.4 226.8 9.5X +SQL ORC Vectorized with copy 2548 / 2571 4.1 243.0 8.8X +SQL ORC MR 4206 / 4211 2.5 401.1 5.3X + + +================================================================================================ +Repeated String Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------ +SQL CSV 12150 / 12178 0.9 1158.7 1.0X +SQL Json 7012 / 7014 1.5 668.7 1.7X +SQL Parquet Vectorized 792 / 796 13.2 75.5 15.3X +SQL Parquet MR 1961 / 1975 5.3 187.0 6.2X +SQL ORC Vectorized 482 / 485 21.8 46.0 25.2X +SQL ORC Vectorized with copy 710 / 715 14.8 67.7 17.1X +SQL ORC MR 2081 / 2083 5.0 198.5 5.8X + + +================================================================================================ +Partitioned Table Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Data column - CSV 31789 / 31791 0.5 2021.1 1.0X +Data column - Json 12873 / 12918 1.2 818.4 2.5X +Data column - Parquet Vectorized 267 / 280 58.9 17.0 119.1X +Data column - Parquet MR 3387 / 3402 4.6 215.3 9.4X +Data column - ORC Vectorized 391 / 453 40.2 24.9 81.2X +Data column - ORC Vectorized with copy 392 / 398 40.2 24.9 81.2X +Data column - ORC MR 2508 / 2512 6.3 159.4 12.7X +Partition column - CSV 6965 / 6977 2.3 442.8 4.6X +Partition column - Json 5563 / 5576 2.8 353.7 5.7X +Partition column - Parquet Vectorized 65 / 78 241.1 4.1 487.2X +Partition column - Parquet MR 1811 / 1811 8.7 115.1 17.6X +Partition column - ORC Vectorized 66 / 73 239.0 4.2 483.0X +Partition column - ORC Vectorized with copy 65 / 70 241.1 4.1 487.3X +Partition column - ORC MR 1775 / 1778 8.9 112.8 17.9X +Both columns - CSV 30032 / 30113 0.5 1909.4 1.1X +Both columns - Json 13941 / 13959 1.1 886.3 2.3X +Both columns - Parquet Vectorized 312 / 330 50.3 19.9 101.7X +Both columns - Parquet MR 3858 / 3862 4.1 245.3 8.2X +Both columns - ORC Vectorized 431 / 437 36.5 27.4 73.8X +Both column 
- ORC Vectorized with copy 523 / 529 30.1 33.3 60.7X +Both columns - ORC MR 2712 / 2805 5.8 172.4 11.7X + + +================================================================================================ +String with Nulls Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String with Nulls Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +SQL CSV 13525 / 13823 0.8 1289.9 1.0X +SQL Json 9913 / 9921 1.1 945.3 1.4X +SQL Parquet Vectorized 1517 / 1517 6.9 144.7 8.9X +SQL Parquet MR 3996 / 4008 2.6 381.1 3.4X +ParquetReader Vectorized 1120 / 1128 9.4 106.8 12.1X +SQL ORC Vectorized 1203 / 1224 8.7 114.7 11.2X +SQL ORC Vectorized with copy 1639 / 1646 6.4 156.3 8.3X +SQL ORC MR 3720 / 3780 2.8 354.7 3.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String with Nulls Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +SQL CSV 15860 / 15877 0.7 1512.5 1.0X +SQL Json 7676 / 7688 1.4 732.0 2.1X +SQL Parquet Vectorized 1072 / 1084 9.8 102.2 14.8X +SQL Parquet MR 2890 / 2897 3.6 275.6 5.5X +ParquetReader Vectorized 1052 / 1053 10.0 100.4 15.1X +SQL ORC Vectorized 1248 / 1248 8.4 119.0 12.7X +SQL ORC Vectorized with copy 1627 / 1637 6.4 155.2 9.7X +SQL ORC MR 3365 / 3369 3.1 320.9 4.7X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String with Nulls Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +SQL CSV 13401 / 13561 0.8 1278.1 1.0X +SQL Json 5253 / 
5303 2.0 500.9 2.6X +SQL Parquet Vectorized 233 / 242 45.0 22.2 57.6X +SQL Parquet MR 1791 / 1796 5.9 170.8 7.5X +ParquetReader Vectorized 236 / 238 44.4 22.5 56.7X +SQL ORC Vectorized 453 / 473 23.2 43.2 29.6X +SQL ORC Vectorized with copy 573 / 577 18.3 54.7 23.4X +SQL ORC MR 1846 / 1850 5.7 176.0 7.3X + + +================================================================================================ +Single Column Scan From Wide Columns +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Single Column Scan from 10 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +SQL CSV 3147 / 3148 0.3 3001.1 1.0X +SQL Json 2666 / 2693 0.4 2542.9 1.2X +SQL Parquet Vectorized 54 / 58 19.5 51.3 58.5X +SQL Parquet MR 220 / 353 4.8 209.9 14.3X +SQL ORC Vectorized 63 / 77 16.8 59.7 50.3X +SQL ORC Vectorized with copy 63 / 66 16.7 59.8 50.2X +SQL ORC MR 317 / 321 3.3 302.2 9.9X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Single Column Scan from 50 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +SQL CSV 7902 / 7921 0.1 7536.2 1.0X +SQL Json 9467 / 9491 0.1 9028.6 0.8X +SQL Parquet Vectorized 73 / 79 14.3 69.8 108.0X +SQL Parquet MR 239 / 247 4.4 228.0 33.1X +SQL ORC Vectorized 78 / 84 13.4 74.6 101.0X +SQL ORC Vectorized with copy 78 / 88 13.4 74.4 101.3X +SQL ORC MR 910 / 918 1.2 867.6 8.7X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------ +SQL CSV 13539 / 13543 0.1 12912.0 1.0X +SQL Json 17420 / 17446 0.1 16613.1 0.8X +SQL Parquet Vectorized 103 / 120 10.2 98.1 131.6X +SQL Parquet MR 250 / 258 4.2 238.9 54.1X +SQL ORC Vectorized 99 / 104 10.6 94.6 136.5X +SQL ORC Vectorized with copy 100 / 106 10.5 95.6 135.1X +SQL ORC MR 1653 / 1659 0.6 1576.3 8.2X + + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala index 51a7f9f1ef09..a1e7f9e36f4b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala @@ -22,7 +22,7 @@ import scala.collection.JavaConverters._ import scala.util.Random import org.apache.spark.SparkConf -import org.apache.spark.benchmark.Benchmark +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.SQLHelper @@ -34,10 +34,16 @@ import org.apache.spark.sql.vectorized.ColumnVector /** * Benchmark to measure data source read performance. - * To run this: - * spark-submit --class <this class> <spark sql test jar> + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class <this class> + * --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar> + * 2. build/sbt "sql/test:runMain <this class>" + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" + * Results will be written to "benchmarks/DataSourceReadBenchmark-results.txt". 
+ * }}} */ -object DataSourceReadBenchmark extends SQLHelper { +object DataSourceReadBenchmark extends BenchmarkBase with SQLHelper { val conf = new SparkConf() .setAppName("DataSourceReadBenchmark") // Since `spark.master` always exists, overrides this value @@ -93,11 +99,16 @@ object DataSourceReadBenchmark extends SQLHelper { def numericScanBenchmark(values: Int, dataType: DataType): Unit = { // Benchmarks running through spark sql. - val sqlBenchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values) + val sqlBenchmark = new Benchmark( + s"SQL Single ${dataType.sql} Column Scan", + values, + output = output) // Benchmarks driving reader component directly. val parquetReaderBenchmark = new Benchmark( - s"Parquet Reader Single ${dataType.sql} Column Scan", values) + s"Parquet Reader Single ${dataType.sql} Column Scan", + values, + output = output) withTempPath { dir => withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") { @@ -140,74 +151,6 @@ object DataSourceReadBenchmark extends SQLHelper { } } - /* - OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 - Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz - SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 22964 / 23096 0.7 1460.0 1.0X - SQL Json 8469 / 8593 1.9 538.4 2.7X - SQL Parquet Vectorized 164 / 177 95.8 10.4 139.9X - SQL Parquet MR 1687 / 1706 9.3 107.2 13.6X - SQL ORC Vectorized 191 / 197 82.3 12.2 120.2X - SQL ORC Vectorized with copy 215 / 219 73.2 13.7 106.9X - SQL ORC MR 1392 / 1412 11.3 88.5 16.5X - - - SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 24090 / 24097 0.7 1531.6 1.0X - SQL Json 8791 / 8813 1.8 558.9 2.7X - SQL Parquet Vectorized 204 / 212 77.0 13.0 117.9X - SQL 
Parquet MR 1813 / 1850 8.7 115.3 13.3X - SQL ORC Vectorized 226 / 230 69.7 14.4 106.7X - SQL ORC Vectorized with copy 295 / 298 53.3 18.8 81.6X - SQL ORC MR 1526 / 1549 10.3 97.1 15.8X - - - SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 25637 / 25791 0.6 1629.9 1.0X - SQL Json 9532 / 9570 1.7 606.0 2.7X - SQL Parquet Vectorized 181 / 191 86.8 11.5 141.5X - SQL Parquet MR 2210 / 2227 7.1 140.5 11.6X - SQL ORC Vectorized 309 / 317 50.9 19.6 83.0X - SQL ORC Vectorized with copy 316 / 322 49.8 20.1 81.2X - SQL ORC MR 1650 / 1680 9.5 104.9 15.5X - - - SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 31617 / 31764 0.5 2010.1 1.0X - SQL Json 12440 / 12451 1.3 790.9 2.5X - SQL Parquet Vectorized 284 / 315 55.4 18.0 111.4X - SQL Parquet MR 2382 / 2390 6.6 151.5 13.3X - SQL ORC Vectorized 398 / 403 39.5 25.3 79.5X - SQL ORC Vectorized with copy 410 / 413 38.3 26.1 77.1X - SQL ORC MR 1783 / 1813 8.8 113.4 17.7X - - - SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 26679 / 26742 0.6 1696.2 1.0X - SQL Json 12490 / 12541 1.3 794.1 2.1X - SQL Parquet Vectorized 174 / 183 90.4 11.1 153.3X - SQL Parquet MR 2201 / 2223 7.1 140.0 12.1X - SQL ORC Vectorized 415 / 429 37.9 26.4 64.3X - SQL ORC Vectorized with copy 422 / 428 37.2 26.9 63.2X - SQL ORC MR 1767 / 1773 8.9 112.3 15.1X - - - SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 34223 / 34324 0.5 2175.8 1.0X - SQL Json 17784 / 17785 0.9 1130.7 1.9X - SQL Parquet Vectorized 277 / 283 56.7 17.6 
123.4X - SQL Parquet MR 2356 / 2386 6.7 149.8 14.5X - SQL ORC Vectorized 533 / 536 29.5 33.9 64.2X - SQL ORC Vectorized with copy 541 / 546 29.1 34.4 63.3X - SQL ORC MR 2166 / 2177 7.3 137.7 15.8X - */ sqlBenchmark.run() // Driving the parquet reader in batch mode directly. @@ -279,51 +222,13 @@ object DataSourceReadBenchmark extends SQLHelper { } } - /* - OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 - Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz - Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - ParquetReader Vectorized 198 / 202 79.4 12.6 1.0X - ParquetReader Vectorized -> Row 119 / 121 132.3 7.6 1.7X - - - Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - ParquetReader Vectorized 282 / 287 55.8 17.9 1.0X - ParquetReader Vectorized -> Row 246 / 247 64.0 15.6 1.1X - - - Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - ParquetReader Vectorized 258 / 262 60.9 16.4 1.0X - ParquetReader Vectorized -> Row 259 / 260 60.8 16.5 1.0X - - - Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - ParquetReader Vectorized 361 / 369 43.6 23.0 1.0X - ParquetReader Vectorized -> Row 361 / 371 43.6 22.9 1.0X - - - Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - ParquetReader Vectorized 253 / 261 62.2 16.1 1.0X - ParquetReader Vectorized -> Row 254 / 256 61.9 16.2 1.0X - - - Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - 
-------------------------------------------------------------------------------------------- - ParquetReader Vectorized 357 / 364 44.0 22.7 1.0X - ParquetReader Vectorized -> Row 358 / 366 44.0 22.7 1.0X - */ parquetReaderBenchmark.run() } } } def intStringScanBenchmark(values: Int): Unit = { - val benchmark = new Benchmark("Int and String Scan", values) + val benchmark = new Benchmark("Int and String Scan", values, output = output) withTempPath { dir => withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") { @@ -368,26 +273,13 @@ object DataSourceReadBenchmark extends SQLHelper { } } - /* - OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 - Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz - Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 27145 / 27158 0.4 2588.7 1.0X - SQL Json 12969 / 13337 0.8 1236.8 2.1X - SQL Parquet Vectorized 2419 / 2448 4.3 230.7 11.2X - SQL Parquet MR 4631 / 4633 2.3 441.7 5.9X - SQL ORC Vectorized 2412 / 2465 4.3 230.0 11.3X - SQL ORC Vectorized with copy 2633 / 2675 4.0 251.1 10.3X - SQL ORC MR 4280 / 4350 2.4 408.2 6.3X - */ benchmark.run() } } } def repeatedStringScanBenchmark(values: Int): Unit = { - val benchmark = new Benchmark("Repeated String", values) + val benchmark = new Benchmark("Repeated String", values, output = output) withTempPath { dir => withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") { @@ -432,26 +324,13 @@ object DataSourceReadBenchmark extends SQLHelper { } } - /* - OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 - Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz - Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 17345 / 17424 0.6 1654.1 1.0X - SQL Json 8639 / 8664 1.2 823.9 2.0X - SQL 
Parquet Vectorized 839 / 854 12.5 80.0 20.7X - SQL Parquet MR 1771 / 1775 5.9 168.9 9.8X - SQL ORC Vectorized 550 / 569 19.1 52.4 31.6X - SQL ORC Vectorized with copy 785 / 849 13.4 74.9 22.1X - SQL ORC MR 2168 / 2202 4.8 206.7 8.0X - */ benchmark.run() } } } def partitionTableScanBenchmark(values: Int): Unit = { - val benchmark = new Benchmark("Partitioned Table", values) + val benchmark = new Benchmark("Partitioned Table", values, output = output) withTempPath { dir => withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") { @@ -562,40 +441,13 @@ object DataSourceReadBenchmark extends SQLHelper { } } - /* - OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 - Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz - Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - Data column - CSV 32613 / 32841 0.5 2073.4 1.0X - Data column - Json 13343 / 13469 1.2 848.3 2.4X - Data column - Parquet Vectorized 302 / 318 52.1 19.2 108.0X - Data column - Parquet MR 2908 / 2924 5.4 184.9 11.2X - Data column - ORC Vectorized 412 / 425 38.1 26.2 79.1X - Data column - ORC Vectorized with copy 442 / 446 35.6 28.1 73.8X - Data column - ORC MR 2390 / 2396 6.6 152.0 13.6X - Partition column - CSV 9626 / 9683 1.6 612.0 3.4X - Partition column - Json 10909 / 10923 1.4 693.6 3.0X - Partition column - Parquet Vectorized 69 / 76 228.4 4.4 473.6X - Partition column - Parquet MR 1898 / 1933 8.3 120.7 17.2X - Partition column - ORC Vectorized 67 / 74 236.0 4.2 489.4X - Partition column - ORC Vectorized with copy 65 / 72 241.9 4.1 501.6X - Partition column - ORC MR 1743 / 1749 9.0 110.8 18.7X - Both columns - CSV 35523 / 35552 0.4 2258.5 0.9X - Both columns - Json 13676 / 13681 1.2 869.5 2.4X - Both columns - Parquet Vectorized 317 / 326 49.5 20.2 102.7X - Both columns - Parquet MR 3333 / 3336 4.7 211.9 9.8X - Both columns - ORC Vectorized 441 / 446 35.6 
28.1 73.9X - Both column - ORC Vectorized with copy 517 / 524 30.4 32.9 63.1X - Both columns - ORC MR 2574 / 2577 6.1 163.6 12.7X - */ benchmark.run() } } } def stringWithNullsScanBenchmark(values: Int, fractionOfNulls: Double): Unit = { - val benchmark = new Benchmark("String with Nulls Scan", values) + val benchmark = new Benchmark("String with Nulls Scan", values, output = output) withTempPath { dir => withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") { @@ -673,51 +525,16 @@ object DataSourceReadBenchmark extends SQLHelper { } } - /* - OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 - Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz - String with Nulls Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 14875 / 14920 0.7 1418.6 1.0X - SQL Json 10974 / 10992 1.0 1046.5 1.4X - SQL Parquet Vectorized 1711 / 1750 6.1 163.2 8.7X - SQL Parquet MR 3838 / 3884 2.7 366.0 3.9X - ParquetReader Vectorized 1155 / 1168 9.1 110.2 12.9X - SQL ORC Vectorized 1341 / 1380 7.8 127.9 11.1X - SQL ORC Vectorized with copy 1659 / 1716 6.3 158.2 9.0X - SQL ORC MR 3594 / 3634 2.9 342.7 4.1X - - - String with Nulls Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 17219 / 17264 0.6 1642.1 1.0X - SQL Json 8843 / 8864 1.2 843.3 1.9X - SQL Parquet Vectorized 1169 / 1178 9.0 111.4 14.7X - SQL Parquet MR 2676 / 2697 3.9 255.2 6.4X - ParquetReader Vectorized 1068 / 1071 9.8 101.8 16.1X - SQL ORC Vectorized 1319 / 1319 7.9 125.8 13.1X - SQL ORC Vectorized with copy 1638 / 1639 6.4 156.2 10.5X - SQL ORC MR 3230 / 3257 3.2 308.1 5.3X - - - String with Nulls Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 13976 / 14053 0.8 1332.8 
1.0X - SQL Json 5166 / 5176 2.0 492.6 2.7X - SQL Parquet Vectorized 274 / 282 38.2 26.2 50.9X - SQL Parquet MR 1553 / 1555 6.8 148.1 9.0X - ParquetReader Vectorized 241 / 246 43.5 23.0 57.9X - SQL ORC Vectorized 476 / 479 22.0 45.4 29.3X - SQL ORC Vectorized with copy 584 / 588 17.9 55.7 23.9X - SQL ORC MR 1720 / 1734 6.1 164.1 8.1X - */ benchmark.run() } } } def columnsBenchmark(values: Int, width: Int): Unit = { - val benchmark = new Benchmark(s"Single Column Scan from $width columns", values) + val benchmark = new Benchmark( + s"Single Column Scan from $width columns", + values, + output = output) withTempPath { dir => withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") { @@ -763,58 +580,35 @@ object DataSourceReadBenchmark extends SQLHelper { } } - /* - OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 - Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz - Single Column Scan from 10 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 3478 / 3481 0.3 3316.4 1.0X - SQL Json 2646 / 2654 0.4 2523.6 1.3X - SQL Parquet Vectorized 67 / 72 15.8 63.5 52.2X - SQL Parquet MR 207 / 214 5.1 197.6 16.8X - SQL ORC Vectorized 69 / 76 15.2 66.0 50.3X - SQL ORC Vectorized with copy 70 / 76 15.0 66.5 49.9X - SQL ORC MR 299 / 303 3.5 285.1 11.6X - - - Single Column Scan from 50 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - SQL CSV 9214 / 9236 0.1 8786.7 1.0X - SQL Json 9943 / 9978 0.1 9482.7 0.9X - SQL Parquet Vectorized 77 / 86 13.6 73.3 119.8X - SQL Parquet MR 229 / 235 4.6 218.6 40.2X - SQL ORC Vectorized 84 / 96 12.5 80.0 109.9X - SQL ORC Vectorized with copy 83 / 91 12.6 79.4 110.7X - SQL ORC MR 843 / 854 1.2 804.0 10.9X - - - Single Column Scan from 100 columns Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - 
-------------------------------------------------------------------------------------------- - SQL CSV 16503 / 16622 0.1 15738.9 1.0X - SQL Json 19109 / 19184 0.1 18224.2 0.9X - SQL Parquet Vectorized 99 / 108 10.6 94.3 166.8X - SQL Parquet MR 253 / 264 4.1 241.6 65.1X - SQL ORC Vectorized 107 / 114 9.8 101.6 154.8X - SQL ORC Vectorized with copy 107 / 118 9.8 102.1 154.1X - SQL ORC MR 1526 / 1529 0.7 1455.3 10.8X - */ benchmark.run() } } } - def main(args: Array[String]): Unit = { - Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { dataType => - numericScanBenchmark(1024 * 1024 * 15, dataType) + override def runBenchmarkSuite(): Unit = { + runBenchmark("SQL Single Numeric Column Scan") { + Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { + dataType => numericScanBenchmark(1024 * 1024 * 15, dataType) + } + } + runBenchmark("Int and String Scan") { + intStringScanBenchmark(1024 * 1024 * 10) } - intStringScanBenchmark(1024 * 1024 * 10) - repeatedStringScanBenchmark(1024 * 1024 * 10) - partitionTableScanBenchmark(1024 * 1024 * 15) - for (fractionOfNulls <- List(0.0, 0.50, 0.95)) { - stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls) + runBenchmark("Repeated String Scan") { + repeatedStringScanBenchmark(1024 * 1024 * 10) } - for (columnWidth <- List(10, 50, 100)) { - columnsBenchmark(1024 * 1024 * 1, columnWidth) + runBenchmark("Partitioned Table Scan") { + partitionTableScanBenchmark(1024 * 1024 * 15) + } + runBenchmark("String with Nulls Scan") { + for (fractionOfNulls <- List(0.0, 0.50, 0.95)) { + stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls) + } + } + runBenchmark("Single Column Scan From Wide Columns") { + for (columnWidth <- List(10, 50, 100)) { + columnsBenchmark(1024 * 1024 * 1, columnWidth) + } } } }