rapidsai · paul-aiyedun · Sep 26, 2025 · Sep 23, 2025 · Sep 26, 2025
@@ -12,3 +12,6 @@ __pycache__/
 
 # Maven cache for Java builds in docker
 .mvn_cache/
+
+# Default benchmark output directory
+benchmark_output
@@ -113,5 +113,9 @@ Note that `velox-testing/presto/testing/integration_tests` and `velox-testing/be
 ### Setting Up Benchmark Tables
 A couple of utility scripts have been added to facilitate the process of setting up benchmark tables either from scratch or on top of existing benchmark data (Parquet) files. Specifically, the `setup_benchmark_tables.sh` script can be used to set up a new schema and tables on top of already generated benchmark data files. Execute `./setup_benchmark_tables.sh --help` to get more details about script options. The `setup_benchmark_data_and_tables.sh` script can be used to generate benchmark data at a specified scale factor and set up a schema and tables on top of the generated data files. Execute `./setup_benchmark_data_and_tables.sh --help` to get more details about script options. Both scripts should be executed from within the `velox-testing/presto/scripts` directory.
 
+> [!TIP]
+> Add `export PRESTO_DATA_DIR={path to directory that will contain datasets}` to your `~/.bashrc` file. This avoids having to always set the `PRESTO_DATA_DIR` environment variable when executing the `start_*` scripts and/or the schema/table setup scripts.
+
+
 ## Presto Benchmarking
-TODO: Add details when related infrastructure is added.
+The Presto benchmarks are implemented using the [pytest](https://docs.pytest.org/en/stable/) framework and build on top of infrastructure that was implemented for general Presto testing. Specifically, the `start_*` scripts mentioned in the "Presto Testing" section can be used to start up a Presto variant (make sure the `PRESTO_DATA_DIR` environment variable is set appropriately before running the script), and the benchmark can be run by executing the `run_benchmark.sh` script from within the `velox-testing/presto/scripts` directory. Execute `./run_benchmark.sh --help` to get more details about the benchmark script options.
@@ -0,0 +1,213 @@
+#!/bin/bash
+
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+# Print the usage text for this script to stdout.
+# The heredoc delimiter is unquoted, so $0 and $(pwd) are expanded when printed.
+print_help() {
+  cat << EOF
+
+Usage: $0 [OPTIONS]
+
+This script runs the specified type of benchmark.
+
+OPTIONS:
+    -h, --help              Show this help message.
+    -b, --benchmark-type    Type of benchmark to run. Only "tpch" and "tpcds" are currently supported.
+    -q, --queries           Set of benchmark queries to run. This should be a comma separate list of query numbers.
+                            By default, all benchmark queries are run.
+    -H, --hostname          Hostname of the Presto coordinator.
+    -p, --port              Port number of the Presto coordinator.
+    -u, --user              User who queries will be executed as.
+    -s, --schema-name       Name of the schema containing the tables that will be queried. This must be an existing
+                            schema that contains the benchmark tables.
+    -o, --output-dir        Directory path that will contain the output files from the benchmark run.
+                            By default, output files are written to "$(pwd)/benchmark_output".
+    -i, --iterations        Number of query run iterations. By default, 5 iterations are run.
+    -t, --tag               Tag associated with the benchmark run. When a tag is specified, benchmark output will be
+                            stored inside a directory under the --output-dir path with a name matching the tag name.
+                            Tags must contain only alphanumeric and underscore characters.
+
+EXAMPLES:
+    $0 -b tpch -s bench_sf100
+    $0 -b tpch -q "1,2" -s bench_sf100
+    $0 -b tpch -s bench_sf100 -i 10 -o ~/tpch_benchmark_output
+    $0 -b tpch -s bench_sf100 -t gh200_cpu_sf100
+    $0 -h
+
+EOF
+}
+
+# Parse the command line options into global variables:
+#   BENCHMARK_TYPE, QUERIES, HOST_NAME, PORT, USER_NAME, SCHEMA_NAME,
+#   OUTPUT_DIR, ITERATIONS, TAG
+# Exits with status 0 after printing help, and with status 1 when an option is
+# unknown or is missing its value.
+parse_args() {
+  while [[ $# -gt 0 ]]; do
+    case $1 in
+      -h|--help)
+        print_help
+        exit 0
+        ;;
+      -b|--benchmark-type)
+        if [[ -n $2 ]]; then
+          BENCHMARK_TYPE=$2
+          shift 2
+        else
+          echo "Error: --benchmark-type requires a value"
+          exit 1
+        fi
+        ;;
+      -q|--queries)
+        if [[ -n $2 ]]; then
+          QUERIES=$2
+          shift 2
+        else
+          echo "Error: --queries requires a value"
+          exit 1
+        fi
+        ;;
+      # The short form is -H because -h is already taken by --help; a second
+      # "-h" pattern here could never match since the help arm comes first.
+      -H|--hostname)
+        if [[ -n $2 ]]; then
+          HOST_NAME=$2
+          shift 2
+        else
+          echo "Error: --hostname requires a value"
+          exit 1
+        fi
+        ;;
+      -p|--port)
+        if [[ -n $2 ]]; then
+          PORT=$2
+          shift 2
+        else
+          echo "Error: --port requires a value"
+          exit 1
+        fi
+        ;;
+      -u|--user)
+        if [[ -n $2 ]]; then
+          USER_NAME=$2
+          shift 2
+        else
+          echo "Error: --user requires a value"
+          exit 1
+        fi
+        ;;
+      -s|--schema-name)
+        if [[ -n $2 ]]; then
+          SCHEMA_NAME=$2
+          shift 2
+        else
+          echo "Error: --schema-name requires a value"
+          exit 1
+        fi
+        ;;
+      -o|--output-dir)
+        if [[ -n $2 ]]; then
+          OUTPUT_DIR=$2
+          shift 2
+        else
+          echo "Error: --output-dir requires a value"
+          exit 1
+        fi
+        ;;
+      -i|--iterations)
+        if [[ -n $2 ]]; then
+          ITERATIONS=$2
+          shift 2
+        else
+          echo "Error: --iterations requires a value"
+          exit 1
+        fi
+        ;;
+      -t|--tag)
+        if [[ -n $2 ]]; then
+          TAG=$2
+          shift 2
+        else
+          echo "Error: --tag requires a value"
+          exit 1
+        fi
+        ;;
+      *)
+        echo "Error: Unknown argument $1"
+        print_help
+        exit 1
+        ;;
+    esac
+  done
+}
+
+parse_args "$@"
+
+# Validate the required arguments before doing any environment setup.
+if [[ -z ${BENCHMARK_TYPE} || ! ${BENCHMARK_TYPE} =~ ^tpc(h|ds)$ ]]; then
+  echo "Error: A valid benchmark type (tpch or tpcds) is required. Use the -b or --benchmark-type argument."
+  print_help
+  exit 1
+fi
+
+if [[ -z ${SCHEMA_NAME} ]]; then
+  echo "Error: A schema name must be set. Use the -s or --schema-name argument."
+  print_help
+  exit 1
+fi
+
+# Build the pytest argument list. Each flag and its value are stored as separate
+# array elements and the array is later expanded with "${PYTEST_ARGS[@]}", so
+# values containing whitespace or glob characters (e.g. an --output-dir path
+# with spaces) reach pytest intact instead of being re-split by the shell.
+PYTEST_ARGS=(--schema-name "${SCHEMA_NAME}")
+
+if [[ -n ${QUERIES} ]]; then
+  PYTEST_ARGS+=(--queries "${QUERIES}")
+fi
+
+if [[ -n ${HOST_NAME} ]]; then
+  PYTEST_ARGS+=(--hostname "${HOST_NAME}")
+fi
+
+if [[ -n ${PORT} ]]; then
+  PYTEST_ARGS+=(--port "${PORT}")
+fi
+
+if [[ -n ${USER_NAME} ]]; then
+  PYTEST_ARGS+=(--user "${USER_NAME}")
+fi
+
+if [[ -n ${OUTPUT_DIR} ]]; then
+  PYTEST_ARGS+=(--output-dir "${OUTPUT_DIR}")
+fi
+
+if [[ -n ${ITERATIONS} ]]; then
+  PYTEST_ARGS+=(--iterations "${ITERATIONS}")
+fi
+
+if [[ -n ${TAG} ]]; then
+  if [[ ! ${TAG} =~ ^[a-zA-Z0-9_]+$ ]]; then
+    echo "Error: Invalid --tag value. Tags must contain only alphanumeric and underscore characters."
+    print_help
+    exit 1
+  fi
+  PYTEST_ARGS+=(--tag "${TAG}")
+fi
+
+# Relative paths below are resolved against the current working directory.
+# NOTE(review): init_python_virtual_env / delete_python_virtual_env are presumably
+# defined by py_env_functions.sh - confirm.
+source ../../scripts/py_env_functions.sh
+
+# Register the cleanup before creating the environment so it runs on any exit.
+trap delete_python_virtual_env EXIT
+
+init_python_virtual_env
+
+TEST_DIR=$(readlink -f ../testing)
+pip install -q -r "${TEST_DIR}/requirements.txt"
+
+# NOTE(review): wait_for_worker_node_registration is presumably defined by
+# common_functions.sh - confirm.
+source ./common_functions.sh
+
+wait_for_worker_node_registration "$HOST_NAME" "$PORT"
+
+BENCHMARK_TEST_DIR=${TEST_DIR}/performance_benchmarks
+pytest -q "${BENCHMARK_TEST_DIR}/${BENCHMARK_TYPE}_test.py" "${PYTEST_ARGS[@]}"
@@ -98,7 +98,7 @@ parse_args() {
         ;;
       -u|--user)
         if [[ -n $2 ]]; then
-          USER=$2
+          USER_NAME=$2
           shift 2
         else
           echo "Error: --user requires a value"
@@ -140,8 +140,6 @@ if [[ -z ${BENCHMARK_TYPE} || ! ${BENCHMARK_TYPE} =~ ^tpc(h|ds)$ ]]; then
   exit 1
 fi
 
-INTEGRATION_TEST_DIR=$(readlink -f ../testing/integration_tests)
-
 PYTEST_ARGS=()
 
 if [[ "${KEEP_TABLES}" == "true" ]]; then
@@ -160,8 +158,8 @@ if [[ -n ${PORT} ]]; then
   PYTEST_ARGS+=("--port ${PORT}")
 fi
 
-if [[ -n ${USER} ]]; then
-  PYTEST_ARGS+=("--user ${USER}")
+if [[ -n ${USER_NAME} ]]; then
+  PYTEST_ARGS+=("--user ${USER_NAME}")
 fi
 
 if [[ -n ${SCHEMA_NAME} ]]; then
@@ -188,10 +186,12 @@ trap delete_python_virtual_env EXIT
 
 init_python_virtual_env
 
-pip install -q -r ${INTEGRATION_TEST_DIR}/requirements.txt
+TEST_DIR=$(readlink -f ../testing)
+pip install -q -r ${TEST_DIR}/requirements.txt
 
 source ./common_functions.sh
 
 wait_for_worker_node_registration "$HOST_NAME" "$PORT"
 
+INTEGRATION_TEST_DIR=${TEST_DIR}/integration_tests
 pytest -v ${INTEGRATION_TEST_DIR}/${BENCHMARK_TYPE}_test.py ${PYTEST_ARGS[*]}
@@ -31,6 +31,7 @@ fi
 
 SCHEMA_GEN_SCRIPT_PATH=$(readlink -f ../../benchmark_data_tools/generate_table_schemas.py)
 CREATE_TABLES_SCRIPT_PATH=$(readlink -f ../../presto/testing/integration_tests/create_hive_tables.py)
+CREATE_TABLES_REQUIREMENTS_PATH=$(readlink -f ../../presto/testing/requirements.txt)
 TEMP_SCHEMA_DIR=$(readlink -f temp-schema-dir)
 
 function cleanup() {
@@ -42,5 +43,5 @@ trap cleanup EXIT
 ../../scripts/run_py_script.sh -p $SCHEMA_GEN_SCRIPT_PATH --benchmark-type $BENCHMARK_TYPE \
 --schema-name $SCHEMA_NAME --schemas-dir-path $TEMP_SCHEMA_DIR $CONVERT_DECIMALS_TO_FLOATS_ARG
 
-../../scripts/run_py_script.sh -p $CREATE_TABLES_SCRIPT_PATH --schema-name $SCHEMA_NAME \
---schemas-dir-path $TEMP_SCHEMA_DIR --data-dir-name $DATA_DIR_NAME
+../../scripts/run_py_script.sh -p $CREATE_TABLES_SCRIPT_PATH -r $CREATE_TABLES_REQUIREMENTS_PATH \
+--schema-name $SCHEMA_NAME --schemas-dir-path $TEMP_SCHEMA_DIR --data-dir-name $DATA_DIR_NAME
@@ -0,0 +1,70 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+def pytest_generate_tests(metafunc):
+    TPCH_FIXTURE_NAME = "tpch_query_id"
+    if TPCH_FIXTURE_NAME in metafunc.fixturenames:
+        TPCH_NUM_QUERIES = 22
+        set_query_id_param(metafunc, TPCH_FIXTURE_NAME, TPCH_NUM_QUERIES, [])
+
+    TPCDS_FIXTURE_NAME = "tpcds_query_id"
+    if TPCDS_FIXTURE_NAME in metafunc.fixturenames:
+        TPCDS_NUM_QUERIES = 99
+        TPCDS_DISABLED_QUERIES = [
+            16, # PrestoUserError(type=USER_ERROR, name=SYNTAX_ERROR, message="line 1:224: Cannot check if date is BETWEEN varchar(10) and date", query_id=20250815_182910_01441_uy5t2)
+            32, # PrestoUserError(type=USER_ERROR, name=SYNTAX_ERROR, message="line 1:162: Cannot check if date is BETWEEN varchar(10) and date", query_id=20250815_182915_01457_uy5t2)
+            58, # PrestoUserError(type=USER_ERROR, name=SYNTAX_ERROR, message="line 1:251: '=' cannot be applied to date, varchar(10)", query_id=20250815_182921_01483_uy5t2)
+            70, # PrestoUserError(type=USER_ERROR, name=SYNTAX_ERROR, message="Invalid reference to output of SELECT clause from grouping() expression in ORDER BY", query_id=20250815_182928_01495_uy5t2)
+            72, # PrestoUserError(type=USER_ERROR, name=SYNTAX_ERROR, message="line 1:886: '+' cannot be applied to date, integer", query_id=20250815_182928_01497_uy5t2)
+            83, # PrestoUserError(type=USER_ERROR, name=SYNTAX_ERROR, message="line 1:258: IN value and list items must be the same type: date", query_id=20250815_182930_01508_uy5t2)
+            86, # PrestoUserError(type=USER_ERROR, name=SYNTAX_ERROR, message="Invalid reference to output of SELECT clause from grouping() expression in ORDER BY", query_id=20250815_182935_01511_uy5t2)
+            92, # PrestoUserError(type=USER_ERROR, name=SYNTAX_ERROR, message="line 1:156: Cannot check if date is BETWEEN varchar(10) and date", query_id=20250815_182936_01517_uy5t2)
+            94, # PrestoUserError(type=USER_ERROR, name=SYNTAX_ERROR, message="line 1:222: Cannot check if date is BETWEEN varchar(10) and date", query_id=20250815_182936_01519_uy5t2)
+            95, # PrestoUserError(type=USER_ERROR, name=SYNTAX_ERROR, message="line 1:444: Cannot check if date is BETWEEN varchar(10) and date", query_id=20250815_182936_01520_uy5t2)
+
+            # The following queries fail on presto native CPU with PrestoQueryError(type=INTERNAL_ERROR, name=GENERIC_INTERNAL_ERROR, message="Internal error", query_id=...)
+            14,
+            31,
+            64,
+            74,
+            88,
+        ]
+        set_query_id_param(metafunc, TPCDS_FIXTURE_NAME, TPCDS_NUM_QUERIES, TPCDS_DISABLED_QUERIES)
+
+
+def set_query_id_param(metafunc, param_name, num_queries, disabled_queries):
+    queries = metafunc.config.getoption("--queries")
+    metafunc.parametrize(param_name, get_query_ids(num_queries, queries, disabled_queries))
+
+
+def get_query_ids(num_queries, selected_query_ids, disabled_queries):
+    query_ids = parse_selected_query_ids(selected_query_ids, num_queries)
+    if len(query_ids) == 0:
+        query_ids = [id for id in range(1, num_queries + 1) if id not in disabled_queries]
+    return format_query_ids(query_ids)
+
+
+def parse_selected_query_ids(selected_query_ids, num_queries):
+    query_ids = []
+    if selected_query_ids and selected_query_ids.strip():
+        for id_str in selected_query_ids.split(","):
+            id_int = int(id_str)
+            if id_int < 1 or id_int > num_queries:
+                raise ValueError(f"Invalid Query ID: {id_str}. Query ID must be between 1 and {num_queries}.")
+            query_ids.append(id_int)
+    return query_ids
+
+
+def format_query_ids(query_ids):
+    return [f"Q{query_id}" for query_id in query_ids]
@@ -0,0 +1,30 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from . import test_utils
+
+
+@pytest.fixture(scope="module")
+def tpch_queries(request):
+    queries = test_utils.get_queries(request.node.obj.BENCHMARK_TYPE)
+    # Referencing the CTE defined "supplier_no" alias in the parent query causes issues on presto.
+    queries["Q15"] = queries["Q15"].replace(" AS supplier_no", "").replace("supplier_no", "l_suppkey")
+    return queries
+
+
+@pytest.fixture(scope="module")
+def tpcds_queries(request):
+    return test_utils.get_queries(request.node.obj.BENCHMARK_TYPE)