vllm-project · jcyang43 · Oct 7, 2025 · Oct 7, 2025
@@ -11,6 +11,12 @@ if [ "$#" -eq 0 ]; then
   exit 1
 fi
 
+# Settings taken from the caller's environment and forwarded into the
+# container; the array is expanded into the `docker run` command as
+# "${ENV_VARS[@]}". Each value is quoted so it stays a single argument.
+ENV_VARS=(
+  -e TEST_MODEL="$TEST_MODEL"
+  -e MINIMUM_ACCURACY_THRESHOLD="$MINIMUM_ACCURACY_THRESHOLD"
+  -e TENSOR_PARALLEL_SIZE="$TENSOR_PARALLEL_SIZE"
+)
+
 if ! grep -q "^HF_TOKEN=" /etc/environment; then
   gcloud secrets versions access latest --secret=bm-agent-hf-token --quiet | \
   sudo tee -a /etc/environment > /dev/null <<< "HF_TOKEN=$(cat)"
@@ -76,14 +82,16 @@ docker builder prune -f
 
 echo "Cleanup complete."
 
-docker build --no-cache -f docker/Dockerfile -t "vllm-tpu:${BUILDKITE_COMMIT}" .
+IMAGE_NAME="vllm-tpu"
+docker build --no-cache -f docker/Dockerfile -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
 
 exec docker run \
   --privileged \
   --net host \
   --shm-size=16G \
   --rm \
   -v "$LOCAL_HF_HOME":"$DOCKER_HF_HOME" \
+  "${ENV_VARS[@]}" \
   -e HF_HOME="$DOCKER_HF_HOME" \
   -e MODEL_IMPL_TYPE="$MODEL_IMPL_TYPE" \
   -e HF_TOKEN="$HF_TOKEN" \
@@ -96,5 +104,5 @@ exec docker run \
   ${JAX_RANDOM_WEIGHTS:+-e JAX_RANDOM_WEIGHTS="$JAX_RANDOM_WEIGHTS"} \
   ${SKIP_ACCURACY_TESTS:+-e SKIP_ACCURACY_TESTS="$SKIP_ACCURACY_TESTS"} \
   ${VLLM_MLA_DISABLE:+-e VLLM_MLA_DISABLE="$VLLM_MLA_DISABLE"} \
-  "vllm-tpu:${BUILDKITE_COMMIT}" \
+  "${IMAGE_NAME}:${BUILDKITE_COMMIT}" \
   "$@" # Pass all script arguments as the command to run in the container
@@ -20,9 +20,13 @@ RUN VLLM_TARGET_DEVICE="tpu" pip install -e .
 
 # Install test dependencies
 RUN python3 -m pip install -e tests/vllm_test_utils
-RUN python3 -m pip install --no-cache-dir git+https://github.com/thuml/depyf.git pytest pytest-asyncio tpu-info datasets 'lm_eval[api]==0.4.4'
-RUN python3 -m pip install pytest-cov
-RUN python3 -m pip install numba
+# Test-only Python dependencies. lm-evaluation-harness is pinned to an exact
+# commit and requested with the PEP 508 direct-reference form
+# ("name[extra] @ url") instead of a "#egg=name[extra]" URL fragment, which pip
+# has deprecated. The requirement is quoted so the shell never treats "[api]"
+# as a glob pattern.
+RUN python3 -m pip install --no-cache-dir \
+    git+https://github.com/thuml/depyf.git \
+    pytest-asyncio \
+    "lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    pytest-cov \
+    tblib \
+    numba
 
 # Install tpu_commons
 WORKDIR /workspace/tpu_commons

@@ -0,0 +1,21 @@
+def pytest_addoption(parser):
+    """Adds custom command-line options to pytest."""
+    parser.addoption("--tensor-parallel-size",
+                     type=int,
+                     default=1,
+                     help="The tensor parallel size to use for the test.")
+    parser.addoption(
+        "--expected-value",
+        type=float,
+        default=None,
+        help=
+        "This value will be used to compare the measure value and determine if the test passes or fails."
+    )
+    parser.addoption("--model-name",
+                     type=str,
+                     default=None,
+                     help="Model name to test (e.g., 'model1')")
+    parser.addoption("--fp8-kv-model-name",
+                     type=str,
+                     default=None,
+                     help="Model name to test fp8-kv (e.g., 'model1')")
@@ -0,0 +1,114 @@
+# Copied from vLLM: https://github.com/vllm-project/vllm/blob/839ab00/tests/entrypoints/llm/test_accuracy.py
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This file tests the accuracy of the vLLM server via LMEval.
+It uses local-completions, which interacts with vLLM
+through the OAI API with N concurrent connections.
+This simulates real-world usage of the API and makes
+sure that the zmq frontend mp RPC message passing and
+AsyncLLMEngine are working correctly.
+"""
+
+import threading
+
+import lm_eval
+import pytest
+from vllm.platforms import current_platform
+
+# MODEL_NAMES, FP8_KV_MODEL_NAMES and NUM_CONCURRENT are not referenced in the
+# visible part of this file; presumably left over from the upstream vLLM test
+# this file was copied from -- TODO confirm before removing.
+MODEL_NAMES = []
+FP8_KV_MODEL_NAMES = []
+NUM_CONCURRENT = 500
+# lm_eval task whose score run_test() reads back from the results.
+TASK = "gsm8k"
+# Metric key looked up under results["results"][TASK] in run_test().
+FILTER = "exact_match,strict-match"
+# Absolute tolerance: run_test() requires the expected value to lie strictly
+# within measured_value +/- RTOL.
+RTOL = 0.03
+# Not referenced in the visible part of this file.
+_JSON_WRITE_LOCK = threading.Lock()
+
+
+def run_test(model_name, expected_value, more_args=None):
+    """Run the end to end accuracy test."""
+    print(f"Running test for model: {model_name}")
+
+    model_args = f"pretrained={model_name},max_model_len=4096"
+    if more_args is not None:
+        model_args = "{},{}".format(model_args, more_args)
+
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks="gsm8k",
+        batch_size="auto",
+    )
+
+    measured_value = results["results"][TASK][FILTER]
+    assert (measured_value - RTOL < expected_value < measured_value +
+            RTOL), f"Expected: {expected_value} |  Measured: {measured_value}"
+
+
+@pytest.mark.skipif(not current_platform.is_cuda()
+                    and not current_platform.is_tpu(),
+                    reason="V1 is currently only supported on CUDA and TPU")
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch,
+                                    request: pytest.FixtureRequest):
+    """Run with the V1 Engine."""
+    model = request.config.getoption("--model-name")
+    print(f"Testing model: {model}...")
+
+    tp_size = request.config.getoption("--tensor-parallel-size")
+    expected_value = request.config.getoption("--expected-value")
+
+    if expected_value is None:
+        raise ValueError
+
+    if tp_size is None:
+        tp_size = 1
+    elif tp_size < 1 or tp_size > 8:
+        raise ValueError
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        more_args = None
+        if current_platform.is_tpu():
+            more_args = "max_model_len=2048,max_num_seqs=64"
+            tp_size_str = f"tensor_parallel_size={tp_size}"
+            more_args += ",{}".format(tp_size_str)
+
+        print(f"common args: {more_args}")
+
+        run_test(model, expected_value, more_args)
+
+
+@pytest.mark.skipif(not current_platform.is_cuda()
+                    and not current_platform.is_tpu(),
+                    reason="V1 is currently only supported on CUDA and TPU")
+def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
+        monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest):
+    """Run with the V1 Engine."""
+    fp8_kv_model = request.config.getoption("--fp8-kv-model-name")
+    print(f"Testing fp8_kv_model: {fp8_kv_model}...")
+
+    tp_size = request.config.getoption("--tensor-parallel-size")
+    expected_value = request.config.getoption("--expected-value")
+
+    if expected_value is None:
+        raise ValueError
+
+    if tp_size is None:
+        tp_size = 1
+    elif tp_size < 1 or tp_size > 8:
+        raise ValueError
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        more_args = None
+        if current_platform.is_tpu():
+            more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
+            tp_size_str = f"tensor_parallel_size={tp_size}"
+            more_args += ",{}".format(tp_size_str)
+
+        print(f"common args: {more_args}")
+
+        run_test(fp8_kv_model, expected_value, more_args)
@@ -1,4 +1,91 @@
-#!/bin/sh
+#!/bin/bash
 
-# TODO : to be added by https://github.com/vllm-project/tpu_commons/pull/639
-echo "[placeholder] accuracy test passed"
+# Defaults. Further down, the TEST_MODEL, TENSOR_PARALLEL_SIZE and
+# MINIMUM_ACCURACY_THRESHOLD environment variables replace these values when
+# they are non-empty.
+test_model=""
+tensor_parallel_size=1
+minimum_accuracy_threshold=0
+
+# Never populated or used elsewhere in the visible script; only echoed here.
+extra_serve_args=()
+echo extra_serve_args: "${extra_serve_args[@]}"
+
+# Directory holding the 'vllm' and 'tpu_commons' checkouts; -r overrides it.
+root_dir=/workspace
+# Exit status returned by the final `exit` of the script.
+exit_code=0
+
+# Print the usage text and terminate the script with status 1.
+# The model, accuracy threshold and tensor parallel size are read from
+# environment variables rather than flags, so they are listed as such; the
+# parser below accepts only -r/--root-dir-path and -h/--help.
+helpFunction()
+{
+   echo ""
+   echo "Usage: $0 [-r full_path_to_root_dir]"
+   echo -e "\t-r The path to your root directory containing both 'vllm' and 'tpu_commons' (default: /workspace/, which is used in the Dockerfile)"
+   echo -e "\tRequired environment variables: TEST_MODEL, MINIMUM_ACCURACY_THRESHOLD (number greater than 0)"
+   echo -e "\tOptional environment variables: TENSOR_PARALLEL_SIZE (integer greater than 0, default: 1)"
+   exit 1
+}
+
+# Parse command-line flags. Only -r/--root-dir-path and -h/--help are
+# accepted; anything else prints the usage text and exits via helpFunction.
+while [[ "$#" -gt 0 ]]; do
+    case "$1" in
+        -r|--root-dir-path)
+            # Reject a flag given without a value: otherwise root_dir becomes
+            # "" and the later `cd` resolves against "/".
+            if [[ "$#" -lt 2 || -z "$2" ]]; then
+                echo "Error: $1 requires a directory path." >&2
+                helpFunction
+            fi
+            root_dir="$2"
+            shift 2
+            ;;
+        -h|--help)
+            helpFunction
+            ;;
+        *) # unknown option
+            echo "Unknown option: $1"
+            helpFunction
+            ;;
+    esac
+done
+
+# Non-empty environment variables override the defaults set earlier.
+if [[ -n "$TEST_MODEL" ]]; then
+  test_model="$TEST_MODEL"
+fi
+
+if [[ -n "$MINIMUM_ACCURACY_THRESHOLD" ]]; then
+  minimum_accuracy_threshold="$MINIMUM_ACCURACY_THRESHOLD"
+fi
+
+if [[ -n "$TENSOR_PARALLEL_SIZE" ]]; then
+  tensor_parallel_size="$TENSOR_PARALLEL_SIZE"
+fi
+
+# Set to 1 by any failed check so that every problem is reported before the
+# script exits. Initialised explicitly rather than relying on an unset
+# variable comparing equal to 0.
+has_error=0
+
+# Check if test_model is provided and not empty
+if [[ -z "$test_model" ]]; then
+    echo "Error: Test model name (TEST_MODEL) is required." >&2
+    has_error=1
+fi
+
+# Check if tensor_parallel_size is an integer and greater than 0
+if ! [[ "$tensor_parallel_size" =~ ^[1-9][0-9]*$ ]]; then
+    echo "Error: Tensor parallel size (TENSOR_PARALLEL_SIZE) must be an integer greater than 0. Got: '$tensor_parallel_size'" >&2
+    has_error=1
+fi
+
+# Check if minimum_accuracy_threshold is a number and greater than 0.
+# The regex is needed because awk compares a non-numeric string against 0 as
+# a string, which let values such as "abc" pass a bare `num > 0` test.
+if ! awk -v num="$minimum_accuracy_threshold" \
+    'BEGIN { exit !(num ~ /^[+]?([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][-+]?[0-9]+)?$/ && num + 0 > 0) }'; then
+    echo "Error: Minimum accuracy threshold (MINIMUM_ACCURACY_THRESHOLD) must be a number greater than 0. Got: '$minimum_accuracy_threshold'" >&2
+    has_error=1
+fi
+
+# If any validation failed, print help and exit
+if [[ "$has_error" -ne 0 ]]; then
+    helpFunction
+fi
+
+
+echo "Using the root directory at $root_dir"
+
+cd "$root_dir"/vllm/tests/entrypoints/llm || exit
+
+# Overwrite a few of the vLLM benchmarking scripts with the TPU Commons ones
+cp "$root_dir"/tpu_commons/scripts/vllm/integration/*.py "$root_dir"/vllm/tests/entrypoints/llm/
+
+echo "--------------------------------------------------"
+echo "Running integration for model: $test_model"
+echo "--------------------------------------------------"
+
+# Default action
+python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine \
+    --tensor-parallel-size="$tensor_parallel_size" \
+    --model-name="$test_model" \
+    --expected-value="$minimum_accuracy_threshold"
+
+exit $exit_code