diff --git a/.buildkite/scripts/run_in_docker.sh b/.buildkite/scripts/run_in_docker.sh index d6e8bcc0ce..4721ae4d18 100755 --- a/.buildkite/scripts/run_in_docker.sh +++ b/.buildkite/scripts/run_in_docker.sh @@ -11,6 +11,12 @@ if [ "$#" -eq 0 ]; then exit 1 fi +ENV_VARS=( + -e TEST_MODEL="$TEST_MODEL" + -e MINIMUM_ACCURACY_THRESHOLD="$MINIMUM_ACCURACY_THRESHOLD" + -e TENSOR_PARALLEL_SIZE="$TENSOR_PARALLEL_SIZE" +) + if ! grep -q "^HF_TOKEN=" /etc/environment; then gcloud secrets versions access latest --secret=bm-agent-hf-token --quiet | \ sudo tee -a /etc/environment > /dev/null <<< "HF_TOKEN=$(cat)" @@ -76,7 +82,8 @@ docker builder prune -f echo "Cleanup complete." -docker build --no-cache -f docker/Dockerfile -t "vllm-tpu:${BUILDKITE_COMMIT}" . +IMAGE_NAME="vllm-tpu" +docker build --no-cache -f docker/Dockerfile -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" . exec docker run \ --privileged \ @@ -84,6 +91,7 @@ exec docker run \ --shm-size=16G \ --rm \ -v "$LOCAL_HF_HOME":"$DOCKER_HF_HOME" \ + "${ENV_VARS[@]}" \ -e HF_HOME="$DOCKER_HF_HOME" \ -e MODEL_IMPL_TYPE="$MODEL_IMPL_TYPE" \ -e HF_TOKEN="$HF_TOKEN" \ @@ -96,5 +104,5 @@ exec docker run \ ${JAX_RANDOM_WEIGHTS:+-e JAX_RANDOM_WEIGHTS="$JAX_RANDOM_WEIGHTS"} \ ${SKIP_ACCURACY_TESTS:+-e SKIP_ACCURACY_TESTS="$SKIP_ACCURACY_TESTS"} \ ${VLLM_MLA_DISABLE:+-e VLLM_MLA_DISABLE="$VLLM_MLA_DISABLE"} \ - "vllm-tpu:${BUILDKITE_COMMIT}" \ + "${IMAGE_NAME}:${BUILDKITE_COMMIT}" \ "$@" # Pass all script arguments as the command to run in the container diff --git a/docker/Dockerfile b/docker/Dockerfile index 168a22d337..f5b64fd80a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -20,9 +20,13 @@ RUN VLLM_TARGET_DEVICE="tpu" pip install -e . 
# Install test dependencies RUN python3 -m pip install -e tests/vllm_test_utils -RUN python3 -m pip install --no-cache-dir git+https://github.com/thuml/depyf.git pytest pytest-asyncio tpu-info datasets 'lm_eval[api]==0.4.4' -RUN python3 -m pip install pytest-cov -RUN python3 -m pip install numba +RUN python3 -m pip install --no-cache-dir \ + git+https://github.com/thuml/depyf.git \ + pytest-asyncio \ + git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \ + pytest-cov \ + tblib \ + numba # Install tpu_commons WORKDIR /workspace/tpu_commons diff --git a/scripts/vllm/integration/conftest.py b/scripts/vllm/integration/conftest.py new file mode 100644 index 0000000000..86c0e45343 --- /dev/null +++ b/scripts/vllm/integration/conftest.py @@ -0,0 +1,21 @@ +def pytest_addoption(parser): + """Adds custom command-line options to pytest.""" + parser.addoption("--tensor-parallel-size", + type=int, + default=1, + help="The tensor parallel size to use for the test.") + parser.addoption( + "--expected-value", + type=float, + default=None, + help= + "This value will be used to compare the measure value and determine if the test passes or fails." + ) + parser.addoption("--model-name", + type=str, + default=None, + help="Model name to test (e.g., 'model1')") + parser.addoption("--fp8-kv-model-name", + type=str, + default=None, + help="Model name to test fp8-kv (e.g., 'model1')") diff --git a/scripts/vllm/integration/test_accuracy.py b/scripts/vllm/integration/test_accuracy.py new file mode 100644 index 0000000000..154d65d526 --- /dev/null +++ b/scripts/vllm/integration/test_accuracy.py @@ -0,0 +1,114 @@ +# Copied from vLLM: https://github.com/vllm-project/vllm/blob/839ab00/tests/entrypoints/llm/test_accuracy.py + +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This file test accuracy of the vLLM server via LMEval. 
+It uses local-completions, which interacts with vLLM +through the OAI API with N concurrent connections. +This simulates real world usage of the API and makes +sure that the zmq frontend mp RPC message passing and +AsyncLLMEngine are working correctly. +""" + +import threading + +import lm_eval +import pytest +from vllm.platforms import current_platform + +MODEL_NAMES = [] +FP8_KV_MODEL_NAMES = [] +NUM_CONCURRENT = 500 +TASK = "gsm8k" +FILTER = "exact_match,strict-match" +RTOL = 0.03 +_JSON_WRITE_LOCK = threading.Lock() + + +def run_test(model_name, expected_value, more_args=None): +    """Run the end to end accuracy test.""" +    print(f"Running test for model: {model_name}") + +    model_args = f"pretrained={model_name},max_model_len=4096" +    if more_args is not None: +        model_args = "{},{}".format(model_args, more_args) + +    results = lm_eval.simple_evaluate( +        model="vllm", +        model_args=model_args, +        tasks=TASK, +        batch_size="auto", +    ) + +    measured_value = results["results"][TASK][FILTER] +    assert (measured_value - RTOL < expected_value < measured_value + +            RTOL), f"Expected: {expected_value} | Measured: {measured_value}" + + +@pytest.mark.skipif(not current_platform.is_cuda() +                    and not current_platform.is_tpu(), +                    reason="V1 is currently only supported on CUDA and TPU") +def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch, +                                    request: pytest.FixtureRequest): +    """Run with the V1 Engine.""" +    model = request.config.getoption("--model-name") +    print(f"Testing model: {model}...") + +    tp_size = request.config.getoption("--tensor-parallel-size") +    expected_value = request.config.getoption("--expected-value") + +    if expected_value is None: +        raise ValueError("Option --expected-value is required but was not provided.") + +    if tp_size is None: +        tp_size = 1 +    elif tp_size < 1 or tp_size > 8: +        raise ValueError(f"Option --tensor-parallel-size must be between 1 and 8, got {tp_size}.") + +    with monkeypatch.context() as m: +        m.setenv("VLLM_USE_V1", "1") + +        more_args = None +        if current_platform.is_tpu(): +            more_args = "max_model_len=2048,max_num_seqs=64" +            tp_size_str = 
f"tensor_parallel_size={tp_size}" + more_args += ",{}".format(tp_size_str) + + print(f"common args: {more_args}") + + run_test(model, expected_value, more_args) + + +@pytest.mark.skipif(not current_platform.is_cuda() + and not current_platform.is_tpu(), + reason="V1 is currently only supported on CUDA and TPU") +def test_lm_eval_accuracy_v1_engine_fp8_kv_cache( + monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest): + """Run with the V1 Engine.""" + fp8_kv_model = request.config.getoption("--fp8-kv-model-name") + print(f"Testing fp8_kv_model: {fp8_kv_model}...") + + tp_size = request.config.getoption("--tensor-parallel-size") + expected_value = request.config.getoption("--expected-value") + + if expected_value is None: + raise ValueError + + if tp_size is None: + tp_size = 1 + elif tp_size < 1 or tp_size > 8: + raise ValueError + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + more_args = None + if current_platform.is_tpu(): + more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8" + tp_size_str = f"tensor_parallel_size={tp_size}" + more_args += ",{}".format(tp_size_str) + + print(f"common args: {more_args}") + + run_test(fp8_kv_model, expected_value, more_args) diff --git a/tests/e2e/benchmarking/test_accuracy.sh b/tests/e2e/benchmarking/test_accuracy.sh index 2ef94372ac..2d776cf5ea 100644 --- a/tests/e2e/benchmarking/test_accuracy.sh +++ b/tests/e2e/benchmarking/test_accuracy.sh @@ -1,4 +1,91 @@ -#!/bin/sh +#!/bin/bash -# TODO : to be added by https://github.com/vllm-project/tpu_commons/pull/639 -echo "[placeholder] accuracy test passed" +test_model="" +tensor_parallel_size=1 +minimum_accuracy_threshold=0 + +extra_serve_args=() +echo extra_serve_args: "${extra_serve_args[@]}" + +root_dir=/workspace +exit_code=0 + +helpFunction() +{ + echo "" + echo "Usage: $0 [-r full_path_to_root_dir -m model_id]" + echo -e "\t-r The path your root directory containing both 'vllm' and 'tpu_commons' (default: /workspace/, which is 
used in the Dockerfile)" +    exit 1 +} + +while [[ "$#" -gt 0 ]]; do +    case "$1" in +        -r|--root-dir-path) +            root_dir="$2" +            shift +            shift +            ;; +        -h|--help) +            helpFunction +            ;; +        *) # unknown option +            echo "Unknown option: $1" +            helpFunction +            ;; +    esac +done + +if [ -n "$TEST_MODEL" ]; then +    test_model="$TEST_MODEL" +fi + +if [ -n "$MINIMUM_ACCURACY_THRESHOLD" ]; then +    minimum_accuracy_threshold="$MINIMUM_ACCURACY_THRESHOLD" +fi + +if [ -n "$TENSOR_PARALLEL_SIZE" ]; then +    tensor_parallel_size="$TENSOR_PARALLEL_SIZE" +fi + +# Check if test_model is provided and not empty +if [[ -z "$test_model" ]]; then +    echo "Error: Test model name (TEST_MODEL env var) is a required argument." >&2 +    has_error=1 +fi + +# Check if tensor_parallel_size is an integer and greater than 0 +if ! [[ "$tensor_parallel_size" =~ ^[1-9][0-9]*$ ]]; then +    echo "Error: Tensor parallel size (TENSOR_PARALLEL_SIZE env var) must be an integer greater than 0. Got: '$tensor_parallel_size'" >&2 +    has_error=1 +fi + +# Check if minimum_accuracy_threshold is a float and greater than 0 +if ! awk -v num="$minimum_accuracy_threshold" 'BEGIN { exit !(num > 0) }'; then +    echo "Error: Minimum accuracy threshold (MINIMUM_ACCURACY_THRESHOLD env var) must be a number greater than 0. 
Got: '$minimum_accuracy_threshold'" >&2 +    has_error=1 +fi + +# If any validation failed, print help and exit +if [[ "$has_error" -ne 0 ]]; then +    helpFunction +fi + + +echo "Using the root directory at $root_dir" + +cd "$root_dir"/vllm/tests/entrypoints/llm || exit + +# Overwrite a few of the vLLM benchmarking scripts with the TPU Commons ones +cp "$root_dir"/tpu_commons/scripts/vllm/integration/*.py "$root_dir"/vllm/tests/entrypoints/llm/ + +echo "--------------------------------------------------" +echo "Running integration for model: $test_model" +echo "--------------------------------------------------" + +# Default action +python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine \ +    --tensor-parallel-size="$tensor_parallel_size" \ +    --model-name="$test_model" \ +    --expected-value="$minimum_accuracy_threshold" || exit_code=$? + +exit $exit_code