From 63bd0e0c3f3cdf7edca348fd69f58a949c1ebff3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 Jan 2025 09:54:45 -0700 Subject: [PATCH 1/3] fix links and provide complete scripts --- .../benchmark-results/tpc-ds.md | 61 ++++++++++++++++ .../benchmark-results/tpc-h.md | 71 +++++++++++++++++-- docs/source/contributor-guide/benchmarking.md | 60 ---------------- 3 files changed, 128 insertions(+), 64 deletions(-) diff --git a/docs/source/contributor-guide/benchmark-results/tpc-ds.md b/docs/source/contributor-guide/benchmark-results/tpc-ds.md index a6650f7e74..1554cf2434 100644 --- a/docs/source/contributor-guide/benchmark-results/tpc-ds.md +++ b/docs/source/contributor-guide/benchmark-results/tpc-ds.md @@ -43,3 +43,64 @@ The raw results of these benchmarks in JSON format is available here: - [Spark](0.5.0/spark-tpcds.json) - [Comet](0.5.0/comet-tpcds.json) + +# Scripts + +Here are the scripts that were used to generate these results. + +## Apache Spark + +```shell +#!/bin/bash +$SPARK_HOME/bin/spark-submit \ + --master $SPARK_MASTER \ + --conf spark.driver.memory=8G \ + --conf spark.executor.memory=32G \ + --conf spark.executor.instances=2 \ + --conf spark.executor.cores=8 \ + --conf spark.cores.max=16 \ + --conf spark.eventLog.enabled=true \ + tpcbench.py \ + --benchmark tpcds \ + --name spark \ + --data /mnt/bigdata/tpcds/sf100/ \ + --queries ../../tpcds/ \ + --output . \ + --iterations 5 +``` + +## Apache Spark + Comet + +```shell +#!/bin/bash +$SPARK_HOME/bin/spark-submit \ + --master $SPARK_MASTER \ + --conf spark.driver.memory=8G \ + --conf spark.executor.instances=2 \ + --conf spark.executor.memory=16G \ + --conf spark.executor.cores=8 \ + --total-executor-cores=16 \ + --conf spark.eventLog.enabled=true \ + --conf spark.driver.maxResultSize=2G \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=24g \ + --jars $COMET_JAR \ + --conf spark.driver.extraClassPath=$COMET_JAR \ + --conf spark.executor.extraClassPath=$COMET_JAR \ + --conf spark.plugins=org.apache.spark.CometPlugin \ + --conf spark.comet.enabled=true \ + --conf spark.comet.cast.allowIncompatible=true \ + --conf spark.comet.exec.replaceSortMergeJoin=false \ + --conf spark.comet.exec.shuffle.enabled=true \ + --conf spark.comet.exec.shuffle.mode=auto \ + --conf spark.comet.exec.shuffle.fallbackToColumnar=true \ + --conf spark.comet.exec.shuffle.compression.codec=lz4 \ + --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ + tpcbench.py \ + --name comet \ + --benchmark tpcds \ + --data /mnt/bigdata/tpcds/sf100/ \ + --queries ../../tpcds/ \ + --output . \ + --iterations 5 +``` \ No newline at end of file diff --git a/docs/source/contributor-guide/benchmark-results/tpc-h.md b/docs/source/contributor-guide/benchmark-results/tpc-h.md index 336deb7a7c..02acd0bdb6 100644 --- a/docs/source/contributor-guide/benchmark-results/tpc-h.md +++ b/docs/source/contributor-guide/benchmark-results/tpc-h.md @@ -25,21 +25,84 @@ and we encourage you to run these benchmarks in your own environments. The tracking issue for improving TPC-H performance is [#391](https://github.com/apache/datafusion-comet/issues/391). -![](../../_static/images/benchmark-results/0.5.0-SNAPSHOT-2025-01-09/tpch_allqueries.png) +![](../../_static/images/benchmark-results/0.5.0/tpch_allqueries.png) Here is a breakdown showing relative performance of Spark and Comet for each query. -![](../../_static/images/benchmark-results/0.5.0-SNAPSHOT-2025-01-09/tpch_queries_compare.png) +![](../../_static/images/benchmark-results/0.5.0/tpch_queries_compare.png) The following chart shows how much Comet currently accelerates each query from the benchmark in relative terms. -![](../../_static/images/benchmark-results/0.5.0-SNAPSHOT-2025-01-09/tpch_queries_speedup_rel.png) +![](../../_static/images/benchmark-results/0.5.0/tpch_queries_speedup_rel.png) The following chart shows how much Comet currently accelerates each query from the benchmark in absolute terms. -![](../../_static/images/benchmark-results/0.5.0-SNAPSHOT-2025-01-09/tpch_queries_speedup_abs.png) +![](../../_static/images/benchmark-results/0.5.0/tpch_queries_speedup_abs.png) The raw results of these benchmarks in JSON format is available here: - [Spark](0.5.0/spark-tpch.json) - [Comet](0.5.0/comet-tpch.json) + +# Scripts + +Here are the scripts that were used to generate these results. + +## Apache Spark + +```shell +#!/bin/bash +$SPARK_HOME/bin/spark-submit \ + --master $SPARK_MASTER \ + --conf spark.driver.memory=8G \ + --conf spark.executor.instances=1 \ + --conf spark.executor.cores=8 \ + --conf spark.cores.max=8 \ + --conf spark.executor.memory=16g \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=16g \ + --conf spark.eventLog.enabled=true \ + tpcbench.py \ + --name spark \ + --benchmark tpch \ + --data /mnt/bigdata/tpch/sf100/ \ + --queries /home/andy/git/apache/datafusion-benchmarks/tpch/queries \ + --output . \ + --iterations 5 + +``` + +## Apache Spark + Comet + +```shell +#!/bin/bash +$SPARK_HOME/bin/spark-submit \ + --master $SPARK_MASTER \ + --conf spark.driver.memory=8G \ + --conf spark.executor.instances=1 \ + --conf spark.executor.cores=8 \ + --conf spark.cores.max=8 \ + --conf spark.executor.memory=16g \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=16g \ + --conf spark.comet.exec.replaceSortMergeJoin=true \ + --conf spark.eventLog.enabled=true \ + --jars $COMET_JAR \ + --driver-class-path $COMET_JAR \ + --conf spark.driver.extraClassPath=$COMET_JAR \ + --conf spark.executor.extraClassPath=$COMET_JAR \ + --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \ + --conf spark.comet.enabled=true \ + --conf spark.comet.exec.shuffle.enabled=true \ + --conf spark.comet.exec.shuffle.mode=auto \ + --conf spark.comet.exec.shuffle.fallbackToColumnar=true \ + --conf spark.comet.exec.shuffle.compression.codec=lz4 \ + --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ + tpcbench.py \ + --name comet \ + --benchmark tpch \ + --data /mnt/bigdata/tpch/sf100/ \ + --queries /home/andy/git/apache/datafusion-benchmarks/tpch/queries \ + --output . \ + --iterations 5 +``` \ No newline at end of file diff --git a/docs/source/contributor-guide/benchmarking.md b/docs/source/contributor-guide/benchmarking.md index 173d598ac2..1193ada625 100644 --- a/docs/source/contributor-guide/benchmarking.md +++ b/docs/source/contributor-guide/benchmarking.md @@ -24,66 +24,6 @@ benchmarking documentation and scripts are available in the [DataFusion Benchmar We also have many micro benchmarks that can be run from an IDE located [here](https://github.com/apache/datafusion-comet/tree/main/spark/src/test/scala/org/apache/spark/sql/benchmark). -Here are example commands for running the benchmarks against a Spark cluster. This command will need to be -adapted based on the Spark environment and location of data files. - -These commands are intended to be run from the `runners/datafusion-comet` directory in the `datafusion-benchmarks` -repository. - -## Running Benchmarks Against Apache Spark - -```shell -$SPARK_HOME/bin/spark-submit \ - --master $SPARK_MASTER \ - --conf spark.driver.memory=8G \ - --conf spark.executor.instances=1 \ - --conf spark.executor.memory=32G \ - --conf spark.executor.cores=8 \ - --conf spark.cores.max=8 \ - tpcbench.py \ - --benchmark tpch \ - --data /mnt/bigdata/tpch/sf100/ \ - --queries ../../tpch/queries \ - --iterations 3 -``` - -## Running Benchmarks Against Apache Spark with Apache DataFusion Comet Enabled - -### TPC-H - -```shell -$SPARK_HOME/bin/spark-submit \ - --master $SPARK_MASTER \ - --conf spark.driver.memory=8G \ - --conf spark.executor.instances=1 \ - --conf spark.executor.memory=16G \ - --conf spark.executor.cores=8 \ - --conf spark.cores.max=8 \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=16g \ - --jars $COMET_JAR \ - --conf spark.driver.extraClassPath=$COMET_JAR \ - --conf spark.executor.extraClassPath=$COMET_JAR \ - --conf spark.plugins=org.apache.spark.CometPlugin \ - --conf spark.comet.cast.allowIncompatible=true \ - --conf spark.comet.exec.replaceSortMergeJoin=true \ - --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ - --conf spark.comet.exec.shuffle.enabled=true \ - --conf spark.comet.exec.shuffle.mode=auto \ - --conf spark.comet.exec.shuffle.enableFastEncoding=true \ - --conf spark.comet.exec.shuffle.fallbackToColumnar=true \ - --conf spark.comet.exec.shuffle.compression.codec=lz4 \ - tpcbench.py \ - --benchmark tpch \ - --data /mnt/bigdata/tpch/sf100/ \ - --queries ../../tpch/queries \ - --iterations 3 -``` - -### TPC-DS - -For TPC-DS, use `spark.comet.exec.replaceSortMergeJoin=false`. - ## Current Benchmark Results - [Benchmarks derived from TPC-H](benchmark-results/tpc-h) From 7e756edbd2740bc05700eb51f10e3959c475d3e8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 Jan 2025 09:56:37 -0700 Subject: [PATCH 2/3] fix path --- docs/source/contributor-guide/benchmark-results/tpc-h.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/contributor-guide/benchmark-results/tpc-h.md b/docs/source/contributor-guide/benchmark-results/tpc-h.md index 02acd0bdb6..d383cae852 100644 --- a/docs/source/contributor-guide/benchmark-results/tpc-h.md +++ b/docs/source/contributor-guide/benchmark-results/tpc-h.md @@ -66,7 +66,7 @@ $SPARK_HOME/bin/spark-submit \ --name spark \ --benchmark tpch \ --data /mnt/bigdata/tpch/sf100/ \ - --queries /home/andy/git/apache/datafusion-benchmarks/tpch/queries \ + --queries ../../tpch/queries \ --output . \ --iterations 5 @@ -102,7 +102,7 @@ $SPARK_HOME/bin/spark-submit \ --name comet \ --benchmark tpch \ --data /mnt/bigdata/tpch/sf100/ \ - --queries /home/andy/git/apache/datafusion-benchmarks/tpch/queries \ + --queries ../../tpch/queries \ --output . \ --iterations 5 ``` \ No newline at end of file From ac0dcbc95ae4859379a161b6a7b841afca221432 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 Jan 2025 10:21:19 -0700 Subject: [PATCH 3/3] fix incorrect text --- docs/source/contributor-guide/benchmark-results/tpc-ds.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/contributor-guide/benchmark-results/tpc-ds.md b/docs/source/contributor-guide/benchmark-results/tpc-ds.md index 1554cf2434..012913189a 100644 --- a/docs/source/contributor-guide/benchmark-results/tpc-ds.md +++ b/docs/source/contributor-guide/benchmark-results/tpc-ds.md @@ -19,8 +19,8 @@ under the License. # Apache DataFusion Comet: Benchmarks Derived From TPC-DS -The following benchmarks were performed on a two node Kubernetes cluster with -data stored locally in Parquet format on NVMe storage. Performance characteristics will vary in different environments +The following benchmarks were performed on a Linux workstation with PCIe 5, AMD 7950X CPU (16 cores), 128 GB RAM, and +data stored locally in Parquet format on NVMe storage. Performance characteristics will vary in different environments and we encourage you to run these benchmarks in your own environments. The tracking issue for improving TPC-DS performance is [#858](https://github.com/apache/datafusion-comet/issues/858).