From d73e3d1e2704930d1a9dd444e23f0375a5276295 Mon Sep 17 00:00:00 2001 From: StingLin Date: Thu, 21 Aug 2025 12:08:41 +0800 Subject: [PATCH 01/38] accomplish dev intergration test, this change adds a new test accuracy on TPU to the CI pipeline. The test covers the Llama-3.1-8B-Instruct and Llama-3.1-70B-Instruct models, modifying the test to support comparing `EXPECTED_VALUE`. It also allows users to input `tensor-parallel-size` and `model-names` parameters for greater flexibility during execution --- .buildkite/pipeline_jax.yml | 42 ++++- .buildkite/scripts/bootstrap.sh | 39 ++-- .buildkite/scripts/run_in_docker.sh | 44 ++++- docker/Dockerfile | 4 +- requirements_benchmarking.txt | 3 + scripts/vllm/integration/conftest.py | 30 ++++ scripts/vllm/integration/test_accuracy.py | 209 ++++++++++++++++++++++ tests/e2e/benchmarking/test_accuracy.sh | 66 +++++++ 8 files changed, 413 insertions(+), 24 deletions(-) create mode 100644 scripts/vllm/integration/conftest.py create mode 100644 scripts/vllm/integration/test_accuracy.py create mode 100644 tests/e2e/benchmarking/test_accuracy.sh diff --git a/.buildkite/pipeline_jax.yml b/.buildkite/pipeline_jax.yml index 060178fabd..385113eb2e 100644 --- a/.buildkite/pipeline_jax.yml +++ b/.buildkite/pipeline_jax.yml @@ -110,6 +110,43 @@ steps: .buildkite/scripts/run_in_docker.sh \ bash -c 'SKIP_JAX_PRECOMPILE=1 MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_commons/tests/lora/test_lora.py' + - label: "Integration Test llama-3.1-8B on TPU" + key: integration_test_llama_3_1_8B_tpu + depends_on: + - test_0 + - test_1 + - test_2 + - test_3 + - test_4 + - test_5 + - test_6 + - test_7 + - test_8 + - test_9 + soft_fail: true + agents: + queue: tpu_v6e_queue + commands: + - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-8B-Instruct" + + - label: "Integration Test llama-3.1-70B on TPU" + key: 
integration_test_llama_3_1_70B_tpu + depends_on: + - test_0 + - test_1 + - test_2 + - test_3 + - test_4 + - test_5 + - test_6 + - test_7 + - test_8 + - test_9 + soft_fail: true + agents: + queue: tpu_v6e_8_queue + commands: + - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 8 -m "meta-llama/Llama-3.1-70B-Instruct" # ----------------------------------------------------------------- # NOTIFICATION STEP @@ -126,9 +163,12 @@ steps: - test_7 - test_8 - test_9 + - integration_test_llama_3_1_8B_tpu + - integration_test_llama_3_1_70B_tpu agents: queue: tpu_v6e_queue commands: - | .buildkite/scripts/check_results.sh \ - "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 + "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 \ + integration_test_llama_3_1_8B_tpu integration_test_llama_3_1_70B_tpu diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh index e5c892d31d..d4113bda16 100755 --- a/.buildkite/scripts/bootstrap.sh +++ b/.buildkite/scripts/bootstrap.sh @@ -2,25 +2,26 @@ echo "--- Starting Buildkite Bootstrap ---" -# Check if the current build is a pull request -if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then - echo "This is a Pull Request build." - PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/tpu_commons/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') +# # Check if the current build is a pull request +# if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then +# echo "This is a Pull Request build." +# PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/tpu_commons/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') - # If it's a PR, check for the specific label - if [[ $PR_LABELS == *"ready"* ]]; then - echo "Found 'ready' label on PR. Uploading main pipeline..." 
- buildkite-agent pipeline upload .buildkite/pipeline_jax.yml - # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml - else - echo "No 'ready' label found on PR. Skipping main pipeline upload." - exit 0 # Exit with 0 to indicate success (no error, just skipped) - fi -else - # If it's NOT a Pull Request (e.g., branch push, tag, manual build) - echo "This is not a Pull Request build. Uploading main pipeline." - buildkite-agent pipeline upload .buildkite/pipeline_jax.yml - # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml -fi +# # If it's a PR, check for the specific label +# if [[ $PR_LABELS == *"ready"* ]]; then +# echo "Found 'ready' label on PR. Uploading main pipeline..." +# buildkite-agent pipeline upload .buildkite/pipeline_jax.yml +# # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml +# else +# echo "No 'ready' label found on PR. Skipping main pipeline upload." +# exit 0 # Exit with 0 to indicate success (no error, just skipped) +# fi +# else +# # If it's NOT a Pull Request (e.g., branch push, tag, manual build) +# echo "This is not a Pull Request build. Uploading main pipeline." 
+# buildkite-agent pipeline upload .buildkite/pipeline_jax.yml +# # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml +# fi +buildkite-agent pipeline upload .buildkite/pipeline_jax.yml echo "--- Buildkite Bootstrap Finished ---" diff --git a/.buildkite/scripts/run_in_docker.sh b/.buildkite/scripts/run_in_docker.sh index 13292e206b..7f51f86c06 100755 --- a/.buildkite/scripts/run_in_docker.sh +++ b/.buildkite/scripts/run_in_docker.sh @@ -11,6 +11,36 @@ if [ "$#" -eq 0 ]; then exit 1 fi +MOUNT_EXPECT_RESULT="False" +OTHER_ARGS=() + +while [[ $# -gt 0 ]]; do + case "$1" in + --mount-expect-result) + MOUNT_EXPECT_RESULT="True" + shift 1 + ;; + *) + OTHER_ARGS+=("$@") + break + ;; + esac +done + +# TBD: To support the functionality of connecting GPU and TPU expected values in the future +EXPECT_VOLUME=() +EXPECT_ENV=() +if [ "$MOUNT_EXPECT_RESULT" = "True" ]; then + touch "$EXPECT_VALUES_FILENAME" + echo "[DEBUG] Path: $EXPECT_VALUES_PATH, Filename: $EXPECT_VALUES_FILENAME, " + + EXPECT_VOLUME=(-v "$(pwd)/$EXPECT_VALUES_FILENAME":"$EXPECT_VALUES_PATH$EXPECT_VALUES_FILENAME") + echo "docker -v cmd: ${EXPECT_VOLUME[@]}" + + EXPECT_ENV=(-e EXPECT_VALUES_PATH="$EXPECT_VALUES_PATH" -e EXPECT_VALUES_FILENAME="$EXPECT_VALUES_FILENAME") + echo "docker -e cmd: ${EXPECT_ENV[@]}" +fi + if ! grep -q "^HF_TOKEN=" /etc/environment; then gcloud secrets versions access latest --secret=bm-agent-hf-token --quiet | \ sudo tee -a /etc/environment > /dev/null <<< "HF_TOKEN=$(cat)" @@ -46,6 +76,9 @@ else fi DOCKER_HF_HOME="/tmp/hf_home" +# Prune older images on the host to save space. +docker system prune -a -f --filter "until=3h" + # (TODO): Consider creating a remote registry to cache and share between agents. # Subsequent builds on the same host should be cached. @@ -76,7 +109,10 @@ fi echo "Cleanup complete." -docker build --no-cache -f docker/Dockerfile -t "vllm-tpu:${BUILDKITE_COMMIT}" . 
+IMAGE_NAME="vllm-tpu" +docker build --no-cache -f docker/Dockerfile -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" . + +echo "Execute Cmd: $@ on Image: ${IMAGE_NAME}:${BUILDKITE_COMMIT}" exec docker run \ --privileged \ @@ -84,6 +120,8 @@ exec docker run \ --shm-size=16G \ --rm \ -v "$LOCAL_HF_HOME":"$DOCKER_HF_HOME" \ + "${EXPECT_VOLUME[@]}" \ + "${EXPECT_ENV[@]}" \ -e HF_HOME="$DOCKER_HF_HOME" \ -e MODEL_IMPL_TYPE="$MODEL_IMPL_TYPE" \ -e HF_TOKEN="$HF_TOKEN" \ @@ -93,5 +131,5 @@ exec docker run \ ${QUANTIZATION:+-e QUANTIZATION="$QUANTIZATION"} \ ${NEW_MODEL_DESIGN:+-e NEW_MODEL_DESIGN="$NEW_MODEL_DESIGN"} \ ${USE_V6E8_QUEUE:+-e USE_V6E8_QUEUE="$USE_V6E8_QUEUE"} \ - "vllm-tpu:${BUILDKITE_COMMIT}" \ - "$@" # Pass all script arguments as the command to run in the container + "${IMAGE_NAME}:${BUILDKITE_COMMIT}" \ + "$@" # Pass all script arguments as the command to run in the container \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index 168a22d337..a11b596c62 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -15,12 +15,14 @@ RUN apt-get update && apt-get install -y \ WORKDIR /workspace/vllm ARG VLLM_REPO=https://github.com/vllm-project/vllm.git RUN git clone $VLLM_REPO /workspace/vllm + RUN pip install -r requirements/tpu.txt RUN VLLM_TARGET_DEVICE="tpu" pip install -e . 
# Install test dependencies RUN python3 -m pip install -e tests/vllm_test_utils -RUN python3 -m pip install --no-cache-dir git+https://github.com/thuml/depyf.git pytest pytest-asyncio tpu-info datasets 'lm_eval[api]==0.4.4' +RUN python3 -m pip install --no-cache-dir git+https://github.com/thuml/depyf.git pytest pytest-asyncio tpu-info datasets +RUN python3 -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] RUN python3 -m pip install pytest-cov RUN python3 -m pip install numba diff --git a/requirements_benchmarking.txt b/requirements_benchmarking.txt index 04350c2cae..e9fa110218 100644 --- a/requirements_benchmarking.txt +++ b/requirements_benchmarking.txt @@ -5,3 +5,6 @@ evaluate datasets rouge-score scikit-learn +openai +lm_eval +pytest \ No newline at end of file diff --git a/scripts/vllm/integration/conftest.py b/scripts/vllm/integration/conftest.py new file mode 100644 index 0000000000..9e16362777 --- /dev/null +++ b/scripts/vllm/integration/conftest.py @@ -0,0 +1,30 @@ +import pytest +import json + +def pytest_addoption(parser): + """Adds custom command-line options to pytest.""" + parser.addoption( + "--tensor-parallel-size", + type=int, + default=1, + help="The tensor parallel size to use for the test." + ) + parser.addoption( + "--expected-values-file", + type=str, + default=None, + help="Path to a JSON file with expected accuracy values." 
+ ) + parser.addoption( + "--model-names", + action="store", + # default="meta-llama/Llama-3.1-8B-Instruct", + default=None, + help="Comma-separated list of model names to test (e.g., 'model1,model2')" + ) + parser.addoption( + "--fp8-kv-model-names", + action="store", + default=None, + help="Comma-separated list of model names to test fp8-kv (e.g., 'model1,model2')" + ) \ No newline at end of file diff --git a/scripts/vllm/integration/test_accuracy.py b/scripts/vllm/integration/test_accuracy.py new file mode 100644 index 0000000000..381c3eabf1 --- /dev/null +++ b/scripts/vllm/integration/test_accuracy.py @@ -0,0 +1,209 @@ +# Copied from vLLM: https://github.com/vllm-project/vllm/blob/839ab00/tests/entrypoints/llm/test_accuracy.py + +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This file test accuracy of the vLLM server via LMEval. +It uses local-completions, which interacts with vLLM +through the OAI API with N concurrent connections. +This simulates real work usage of the API and makes +sure that the zmq frontend mp RPC message passing and +AsyncLLMEngine are working correctly. 
+""" + +import lm_eval +import pytest +import json +import threading +import os + +from pathlib import Path +from vllm.platforms import current_platform + +MODEL_NAMES = [ + "Qwen/Qwen3-1.7B", + "google/gemma-3-1b-it", + # "meta-llama/Llama-3.1-8B-Instruct", +] +FP8_KV_MODEL_NAMES = [ + "Qwen/Qwen3-1.7B", +] +NUM_CONCURRENT = 500 +TASK = "gsm8k" +FILTER = "exact_match,strict-match" +RTOL = 0.03 +_JSON_WRITE_LOCK = threading.Lock() + +EXPECTED_VALUES = { + "Qwen/Qwen3-1.7B": 0.68, + "google/gemma-3-1b-it": 0.25, + "meta-llama/Llama-3.1-8B-Instruct": 0.76, + "meta-llama/Llama-3.1-70B-Instruct": 0.876, +} + +# Parametrize test cases based on CLI arguments or default values +def parametrize_by_cli_or_default(metafunc, fixture_name, cli_parameter, default_list): + if fixture_name in metafunc.fixturenames: + print(f"Checking CLI parameter '{cli_parameter}' for '{fixture_name}'") + names_str = metafunc.config.getoption(cli_parameter) + if names_str: + print(f"Using '{cli_parameter}' parameter for '{fixture_name}'") + param_list = [name.strip() for name in names_str.split(',') if name.strip()] + metafunc.parametrize(fixture_name, param_list) + else: + print(f"Using default list for '{fixture_name}'") + metafunc.parametrize(fixture_name, default_list) + +def pytest_generate_tests(metafunc): + parametrize_by_cli_or_default(metafunc, fixture_name="model", cli_parameter="--model-names", default_list=MODEL_NAMES) + parametrize_by_cli_or_default(metafunc, fixture_name="fp8_kv_model", cli_parameter="--fp8-kv-model-names", default_list=FP8_KV_MODEL_NAMES) + +# Write expected values to json file +# TBD: To support the functionality of connecting GPU and TPU expected values in the future +def write_expected_value_to_json(model_name, measured_value, json_filepath): + with _JSON_WRITE_LOCK: + data = {} + try: + with open(json_filepath, 'r', encoding='utf-8') as f: + data = json.load(f) + except (FileNotFoundError, json.JSONDecodeError): + print(f"'{json_filepath}' not found or is 
empty/invalid. A new one will be created.") + data = {} + + data[model_name] = measured_value + + try: + with open(json_filepath, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=4) + print(f"Successfully updated '{json_filepath}' with the result for {model_name}.") + except IOError as e: + print(f"Error: Failed to write to file '{json_filepath}'. Reason: {e}") + +# Read expected values from json file if exist +# TBD: To support the functionality of connecting GPU and TPU expected values in the future +def read_expected_value(expected_json_filepath=None): + expected_values_data = {} + if expected_json_filepath is None: + expected_values_data = EXPECTED_VALUES + else: + path_obj = Path(expected_json_filepath) + # Read expected values from json file if exist + if path_obj.is_file() and os.path.getsize(expected_json_filepath) > 0: + print(f"\n[Fixture] Loading from: {expected_json_filepath}") + with open(expected_json_filepath, 'r', encoding='utf-8') as f: + expected_values_data = json.load(f) + else: + raise FileNotFoundError(f"Expected values file not found: {expected_json_filepath}") + return expected_values_data + + +def run_test(model_name, expected_values_data, expected_json_filepath, more_args=None): + """Run the end to end accuracy test.""" + print(f"Running test for model: {model_name}") + + model_args = f"pretrained={model_name},max_model_len=4096" + + download_path = "/mnt/disks/persist" + # download_path = "/tmp/hf_model" + if os.path.isdir(download_path) and os.access(download_path, os.R_OK) and os.access(download_path, os.W_OK): + model_args = f"{model_args},download_dir={download_path}" + + if more_args is not None: + model_args = "{},{}".format(model_args, more_args) + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks="gsm8k", + batch_size="auto", + ) + + # Execute default behavior when `expected_json_filepath` is not set. 
+ if expected_json_filepath is None: + print(f"Execute default behavior") + measured_value = results["results"][TASK][FILTER] + assert model_name in EXPECTED_VALUES, ( + f"Cannot find the expected value for the model {model_name=}") + expected_value = EXPECTED_VALUES[model_name] + assert (measured_value - RTOL < expected_value + and measured_value + RTOL > expected_value + ), f"Expected: {expected_value} | Measured: {measured_value}" + else: + print(f"Execute specific models behavior") + measured_value = results["results"][TASK][FILTER] + expected_value = expected_values_data.get(model_name) + + # Model expected value not exist, write in file + if model_name not in expected_values_data: + print(f"Warning: No expected value found for {model_name}. " + "Skipping accuracy check.") + print(f"Measured value: {measured_value}") + write_expected_value_to_json(model_name, measured_value, expected_json_filepath) + + else: + print(f"Found expected value! {model_name=}, {measured_value=}, {expected_value=}") + assert (measured_value - RTOL < expected_value + and measured_value + RTOL > expected_value + ), f"Expected: {expected_value} | Measured: {measured_value}" + +@pytest.mark.skipif(not current_platform.is_cuda() + and not current_platform.is_tpu(), + reason="V1 is currently only supported on CUDA and TPU") +def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest): + """Run with the V1 Engine.""" + print(f"Testing model: {model}...") + + tp_size = request.config.getoption("--tensor-parallel-size") + expected_json_filepath = request.config.getoption("--expected-values-file") + + expected_values_data = read_expected_value(expected_json_filepath) + + if tp_size is None: + tp_size = 1 + elif tp_size < 1 or tp_size > 8: + raise ValueError + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + more_args = None + if current_platform.is_tpu(): + more_args = "max_model_len=2048,max_num_seqs=64" + tp_size_str = 
f"tensor_parallel_size={tp_size}" + more_args += ",{}".format(tp_size_str) + + print(f"common args: {more_args}") + + run_test(model, expected_values_data, expected_json_filepath, more_args) + + +@pytest.mark.skipif(not current_platform.is_cuda() + and not current_platform.is_tpu(), + reason="V1 is currently only supported on CUDA and TPU") +def test_lm_eval_accuracy_v1_engine_fp8_kv_cache( + fp8_kv_model, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest): + """Run with the V1 Engine.""" + print(f"Testing fp8_kv_model: {fp8_kv_model}...") + + tp_size = request.config.getoption("--tensor-parallel-size") + expected_json_filepath = request.config.getoption("--expected-values-file") + + expected_values_data = read_expected_value(expected_json_filepath) + + if tp_size is None: + tp_size = 1 + elif tp_size < 1 or tp_size > 8: + raise ValueError + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + more_args = None + if current_platform.is_tpu(): + more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8" + tp_size_str = f"tensor_parallel_size={tp_size}" + more_args += ",{}".format(tp_size_str) + + print(f"common args: {more_args}") + + run_test(fp8_kv_model, expected_values_data, expected_json_filepath, more_args) \ No newline at end of file diff --git a/tests/e2e/benchmarking/test_accuracy.sh b/tests/e2e/benchmarking/test_accuracy.sh new file mode 100644 index 0000000000..cf65860b2d --- /dev/null +++ b/tests/e2e/benchmarking/test_accuracy.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.1-70B-Instruct" +tensor_parallel_size=1 + +extra_serve_args=() +echo extra_serve_args: "${extra_serve_args[@]}" + +root_dir=/workspace +exit_code=0 + +helpFunction() +{ + echo "" + echo "Usage: $0 [-r full_path_to_root_dir -m model_id]" + echo -e "\t-r The path your root directory containing both 'vllm' and 'tpu_commons' (default: /workspace/, which is used in the Dockerfile)" + echo -e "\t-m A 
space-separated list of HuggingFace model ids to use (default: meta-llama/Llama-3.1-8B-Instruct and meta-llama/Llama-3.1-70B-Instruct)" + echo -e "\t-t Tensor parallel size (default: 1)" + exit 1 +} + +while [[ "$#" -gt 0 ]]; do + case "$1" in + -r|--root-dir-path) + root_dir="$2" + shift + shift + ;; + -m|--model) + model_list="$2" + shift + shift + ;; + -t|--tensor-parallel-size) + tensor_parallel_size="$2" + shift + shift + ;; + -h|--help) + helpFunction + ;; + *) # unknown option + echo "Unknown option: $1" + helpFunction + ;; + esac +done + +echo "Using the root directory at $root_dir" +echo "Testing $model_list prompts" + +cd "$root_dir"/vllm/tests/entrypoints/llm || exit + +# Overwrite a few of the vLLM benchmarking scripts with the TPU Commons ones +cp "$root_dir"/tpu_commons/scripts/vllm/integration/*.py "$root_dir"/vllm/tests/entrypoints/llm/ + +comma_model_list=${model_list// /,} + +echo "--------------------------------------------------" +echo "Running integration for models: $comma_model_list" +echo "--------------------------------------------------" + +# Default action +python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list" + +exit $exit_code \ No newline at end of file From ecd93ada2ef6647a45a55d2e8334114bfa93544e Mon Sep 17 00:00:00 2001 From: StingLin Date: Thu, 4 Sep 2025 11:24:04 +0800 Subject: [PATCH 02/38] squash 32 commit for next dev --- .buildkite/README_generate.md | 101 ++++++ .buildkite/buildkite_ci_feature_template.yml | 70 ++++ .buildkite/buildkite_ci_model_template.yml | 70 ++++ .buildkite/features/Feat-A.yml | 70 ++++ .buildkite/generate_feature_buildkite.py | 97 ++++++ .buildkite/generate_model_buildkite.py | 97 ++++++ .../meta-llama_Llama-3_1-70B-Instruct.yml | 70 ++++ .../meta-llama_Llama-3_1-8B-Instruct.yml | 71 ++++ .buildkite/pipeline_jax.yml | 321 +++++++++--------- .buildkite/scripts/bootstrap.sh | 26 +- 
.buildkite/scripts/dynamic_upload.sh | 81 +++++ .buildkite/scripts/run_in_docker.sh | 18 +- docker/Dockerfile | 13 +- docker/Dockerfile.cuda | 72 ++++ requirements_benchmarking.txt | 5 +- scripts/vllm/integration/conftest.py | 5 +- scripts/vllm/integration/test_accuracy.py | 32 +- tests/e2e/benchmarking/test_accuracy.sh | 11 +- 18 files changed, 1024 insertions(+), 206 deletions(-) create mode 100644 .buildkite/README_generate.md create mode 100644 .buildkite/buildkite_ci_feature_template.yml create mode 100644 .buildkite/buildkite_ci_model_template.yml create mode 100644 .buildkite/features/Feat-A.yml create mode 100644 .buildkite/generate_feature_buildkite.py create mode 100644 .buildkite/generate_model_buildkite.py create mode 100644 .buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml create mode 100644 .buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml create mode 100644 .buildkite/scripts/dynamic_upload.sh create mode 100644 docker/Dockerfile.cuda diff --git a/.buildkite/README_generate.md b/.buildkite/README_generate.md new file mode 100644 index 0000000000..4f243068f0 --- /dev/null +++ b/.buildkite/README_generate.md @@ -0,0 +1,101 @@ +# Buildkite Pipeline Generator + +This document outlines the process for using Python scripts to automatically generate Buildkite CI/CD pipeline configuration files. These scripts leverage templates to create consistent testing pipelines for both models and features. + +## Overview + +The primary goal of these tools is to streamline the creation of Buildkite pipelines. Instead of manually creating and editing YAML files for each new model or feature, you can run a simple command to generate a standardized pipeline file. + +There are two main generators: + +1. **Model Pipeline Generator** (`generate_model_buildkite.py`): Creates a pipeline file for testing a specific machine learning model. +2. **Feature Pipeline Generator** (`generate_feature_buildkite.py`): Creates a pipeline file for testing a new feature. 
+ +Both scripts work by reading a corresponding template file (`.yml`), replacing placeholder variables with your command-line arguments, and saving the result as a new YAML file in a designated output directory. + +## Directory Structure + +To use the scripts, your files should be arranged as follows. The output directories (`models/` and `features/`) will be created automatically if they do not exist. + +``` +. +├── generate_model_buildkite.py +├── buildkite_ci_model_template.yml +├── generate_feature_buildkite.py +├── buildkite_ci_feature_template.yml +└── README_generate.md +``` + +----- + +## How to Use + +### 1\. Generating a Model Pipeline + +Use the `generate_model_buildkite.py` script to create a CI pipeline for a new model. + +**Command:** + +```bash +python generate_model_buildkite.py --model-name <MODEL_NAME> --queue <QUEUE> +``` + +**Arguments:** + + * `--model-name` (required): The name of the model to be tested. If the name contains special characters like `/` or `.`, they will be replaced with `_` in the output filename and for Buildkite step keys. + * `--queue` (required): The name of the Buildkite agent queue where the jobs will run (e.g., `tpu_v6e_queue`). + +**Example:** + +```bash +python generate_model_buildkite.py --model-name meta-llama/Llama-3.1-8B-Instruct --queue tpu_v6e_queue +``` + +**Output:** + +This command will generate a new file located at `models/meta-llama_Llama-3_1-8B-Instruct.yml`. + +----- + +### 2\. Generating a Feature Pipeline + +Use the `generate_feature_buildkite.py` script to create a CI pipeline for a new feature. + +**Command:** + +```bash +python generate_feature_buildkite.py --feature-name <FEATURE_NAME> --queue <QUEUE> +``` + +**Arguments:** + + * `--feature-name` (required): The name of the feature to be tested. + * `--queue` (required): The name of the Buildkite agent queue. 
+ +**Example:** + +```bash +python generate_feature_buildkite.py --feature-name Feat-A --queue tpu_v6e_queue +``` + +**Output:** + +This command will generate a new file located at `features/Feat-A.yml`. + +----- + +## Important Notes: Placeholders & Customization + +The scripts work by performing a find-and-replace on specific placeholders within the template files. You can customize the `buildkite_ci_*_template.yml` files to change the structure of the generated pipelines. + +#### **Model Template Placeholders (`buildkite_ci_model_template.yml`)** + + * `{MODEL_NAME}`: Replaced with the exact string provided to `--model-name`. This is typically used in human-readable fields like step `label`. + * `{SAFE_MODEL_NAME}`: A sanitized version of the model name, automatically generated by replacing characters like `/` and `.` with `_`. This is used for machine-readable fields like the step `key` and the output filename to ensure validity. + * `{QUEUE}`: Replaced with the string provided to `--queue`. + +#### **Feature Template Placeholders (`buildkite_ci_feature_template.yml`)** + + * `{FEATURE_NAME}`: Replaced with the exact string provided to `--feature-name`. + * `{SAFE_FEATURE_NAME}`: A sanitized version of the feature name. + * `{QUEUE}`: Replaced with the string provided to `--queue`. 
\ No newline at end of file diff --git a/.buildkite/buildkite_ci_feature_template.yml b/.buildkite/buildkite_ci_feature_template.yml new file mode 100644 index 0000000000..52b6a350af --- /dev/null +++ b/.buildkite/buildkite_ci_feature_template.yml @@ -0,0 +1,70 @@ +# {FEATURE_NAME} +agents: + queue: {QUEUE} +steps: + - label: "Unit tests for {FEATURE_NAME}" + key: "ut_{SAFE_FEATURE_NAME}" + commands: + # - replace_with_test_commands # TODO: Replaced to actual test commands + - echo "[DEBUG], unit testing for {FEATURE_NAME}" # TODO: Replace to actual test commands + - label: "Notifications: Unit tests for {FEATURE_NAME}" + key: "notifications_ut_{SAFE_FEATURE_NAME}" + depends_on: "ut_{SAFE_FEATURE_NAME}" + soft_fail: true + agents: + queue: {QUEUE} + commands: + - | + .buildkite/scripts/check_results.sh \ + "Unit tests for {FEATURE_NAME}" ut_{SAFE_FEATURE_NAME} + + - label: "Integration tests for {FEATURE_NAME}" + key: "it_{SAFE_FEATURE_NAME}" + depends_on: "notifications_ut_{SAFE_FEATURE_NAME}" + commands: + # TODO: expected_accuracy need parameterized + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "{FEATURE_NAME}" + - echo "[DEBUG], integration testing for {FEATURE_NAME}" # TODO: Replace to actual test commands + - label: "Notifications: Integration tests for {FEATURE_NAME}" + key: "notifications_it_{SAFE_FEATURE_NAME}" + depends_on: "it_{SAFE_FEATURE_NAME}" + soft_fail: true + agents: + queue: {QUEUE} + commands: + - | + .buildkite/scripts/check_results.sh \ + "Integration tests for {FEATURE_NAME}" it_{SAFE_FEATURE_NAME} + + - label: "Performance benchmarks for {FEATURE_NAME}" + key: "pb_{SAFE_FEATURE_NAME}" + depends_on: "notifications_it_{SAFE_FEATURE_NAME}" + commands: + # - replace_with_test_command # TODO + - echo "[DEBUG], performance benchmarking for {FEATURE_NAME}" # TODO: Replace to actual test commands + - label: "Notifications: Performance benchmarks for {FEATURE_NAME}" + key: 
"notifications_pb_{SAFE_FEATURE_NAME}" + depends_on: "pb_{SAFE_FEATURE_NAME}" + soft_fail: true + agents: + queue: {QUEUE} + commands: + - | + .buildkite/scripts/check_results.sh \ + "Performance benchmarks for {FEATURE_NAME}" pb_{SAFE_FEATURE_NAME} + + - label: "Stress tests for {FEATURE_NAME}" + key: "st_{SAFE_FEATURE_NAME}" + depends_on: "notifications_pb_{SAFE_FEATURE_NAME}" + commands: + # - our_stress_tests_script {FEATURE_NAME} expected_throughput # TODO: expected_throughput need parameterized + - echo "[DEBUG], stress testing for {FEATURE_NAME}" # TODO: Replace to actual test commands + - label: "Notifications: Stress tests for {FEATURE_NAME}" + key: "notifications_st_{SAFE_FEATURE_NAME}" + depends_on: "st_{SAFE_FEATURE_NAME}" + agents: + queue: {QUEUE} + commands: + - | + .buildkite/scripts/check_results.sh \ + "Stress tests for {FEATURE_NAME}" st_{SAFE_FEATURE_NAME} diff --git a/.buildkite/buildkite_ci_model_template.yml b/.buildkite/buildkite_ci_model_template.yml new file mode 100644 index 0000000000..aa5425b11f --- /dev/null +++ b/.buildkite/buildkite_ci_model_template.yml @@ -0,0 +1,70 @@ +# {MODEL_NAME} +agents: + queue: {QUEUE} +steps: + - label: "Unit tests for {MODEL_NAME}" + key: "ut_{SAFE_MODEL_NAME}" + commands: + # - replace_with_test_commands # TODO: Replaced to actual test commands + - echo "[DEBUG], unit testing for {MODEL_NAME}" # TODO: Replace to actual test commands + - label: "Notifications: Unit tests for {MODEL_NAME}" + key: "notifications_ut_{SAFE_MODEL_NAME}" + depends_on: "ut_{SAFE_MODEL_NAME}" + soft_fail: true + agents: + queue: {QUEUE} + commands: + - | + .buildkite/scripts/check_results.sh \ + "Unit tests for {MODEL_NAME}" ut_{SAFE_MODEL_NAME} + + - label: "Integration tests for {MODEL_NAME}" + key: "it_{SAFE_MODEL_NAME}" + depends_on: "notifications_ut_{SAFE_MODEL_NAME}" + commands: + # TODO: expected_accuracy need parameterized + # - .buildkite/scripts/run_in_docker.sh bash 
/workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "{MODEL_NAME}" + - echo "[DEBUG], integration testing for {MODEL_NAME}" # TODO: Replace to actual test commands + - label: "Notifications: Integration tests for {MODEL_NAME}" + key: "notifications_it_{SAFE_MODEL_NAME}" + depends_on: "it_{SAFE_MODEL_NAME}" + soft_fail: true + agents: + queue: {QUEUE} + commands: + - | + .buildkite/scripts/check_results.sh \ + "Integration tests for {MODEL_NAME}" it_{SAFE_MODEL_NAME} + + - label: "Performance benchmarks for {MODEL_NAME}" + key: "pb_{SAFE_MODEL_NAME}" + depends_on: "notifications_it_{SAFE_MODEL_NAME}" + commands: + # - replace_with_test_command # TODO + - echo "[DEBUG], performance benchmarking for {MODEL_NAME}" # TODO: Replace to actual test commands + - label: "Notifications: Performance benchmarks for {MODEL_NAME}" + key: "notifications_pb_{SAFE_MODEL_NAME}" + depends_on: "pb_{SAFE_MODEL_NAME}" + soft_fail: true + agents: + queue: {QUEUE} + commands: + - | + .buildkite/scripts/check_results.sh \ + "Performance benchmarks for {MODEL_NAME}" pb_{SAFE_MODEL_NAME} + + - label: "Stress tests for {MODEL_NAME}" + key: "st_{SAFE_MODEL_NAME}" + depends_on: "notifications_pb_{SAFE_MODEL_NAME}" + commands: + # - our_stress_tests_script {MODEL_NAME} expected_throughput # TODO: expected_throughput need parameterized + - echo "[DEBUG], stress testing for {MODEL_NAME}" # TODO: Replace to actual test commands + - label: "Notifications: Stress tests for {MODEL_NAME}" + key: "notifications_st_{SAFE_MODEL_NAME}" + depends_on: "st_{SAFE_MODEL_NAME}" + agents: + queue: {QUEUE} + commands: + - | + .buildkite/scripts/check_results.sh \ + "Stress tests for {MODEL_NAME}" st_{SAFE_MODEL_NAME} diff --git a/.buildkite/features/Feat-A.yml b/.buildkite/features/Feat-A.yml new file mode 100644 index 0000000000..20ee8caeb8 --- /dev/null +++ b/.buildkite/features/Feat-A.yml @@ -0,0 +1,70 @@ +# Feat-A +agents: + queue: tpu_v6e_queue +steps: + - label: "Unit tests for Feat-A" + 
key: "ut_Feat-A" + commands: + # - replace_with_test_commands # TODO: Replaced to actual test commands + - echo "[DEBUG], unit testing for Feat-A" # TODO: Replace to actual test commands + - label: "Notifications: Unit tests for Feat-A" + key: "notifications_ut_Feat-A" + depends_on: "ut_Feat-A" + soft_fail: true + agents: + queue: tpu_v6e_queue + commands: + - | + .buildkite/scripts/check_results.sh \ + "Unit tests for Feat-A" ut_Feat-A + + - label: "Integration tests for Feat-A" + key: "it_Feat-A" + depends_on: "notifications_ut_Feat-A" + commands: + # TODO: expected_accuracy need parameterized + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "Feat-A" + - echo "[DEBUG], integration testing for Feat-A" # TODO: Replace to actual test commands + - label: "Notifications: Integration tests for Feat-A" + key: "notifications_it_Feat-A" + depends_on: "it_Feat-A" + soft_fail: true + agents: + queue: tpu_v6e_queue + commands: + - | + .buildkite/scripts/check_results.sh \ + "Integration tests for Feat-A" it_Feat-A + + - label: "Performance benchmarks for Feat-A" + key: "pb_Feat-A" + depends_on: "notifications_it_Feat-A" + commands: + # - replace_with_test_command # TODO + - echo "[DEBUG], performance benchmarking for Feat-A" # TODO: Replace to actual test commands + - label: "Notifications: Performance benchmarks for Feat-A" + key: "notifications_pb_Feat-A" + depends_on: "pb_Feat-A" + soft_fail: true + agents: + queue: tpu_v6e_queue + commands: + - | + .buildkite/scripts/check_results.sh \ + "Performance benchmarks for Feat-A" pb_Feat-A + + - label: "Stress tests for Feat-A" + key: "st_Feat-A" + depends_on: "notifications_pb_Feat-A" + commands: + # - our_stress_tests_script Feat-A expected_throughput # TODO: expected_throughput need parameterized + - echo "[DEBUG], stress testing for Feat-A" # TODO: Replace to actual test commands + - label: "Notifications: Stress tests for Feat-A" + key: 
"notifications_st_Feat-A" + depends_on: "st_Feat-A" + agents: + queue: tpu_v6e_queue + commands: + - | + .buildkite/scripts/check_results.sh \ + "Stress tests for Feat-A" st_Feat-A diff --git a/.buildkite/generate_feature_buildkite.py b/.buildkite/generate_feature_buildkite.py new file mode 100644 index 0000000000..4403469425 --- /dev/null +++ b/.buildkite/generate_feature_buildkite.py @@ -0,0 +1,97 @@ +import argparse +from pathlib import Path + +# Define the template filename and output directory as constants for easy modification. +TEMPLATE_FILENAME = "buildkite_ci_feature_template.yml" +OUTPUT_DIR = Path("features") + +def generate_from_template(feature_name: str, queue: str) -> None: + """ + Generates a buildkite yml file from a template. + + Args: + feature_name (str): The feature_name parameter. + queue (str): The Queue parameter. + """ + print(f"--- Starting to generate for Feature '{feature_name}' ---") + + # Check if the template file exists. + template_path = Path(TEMPLATE_FILENAME) + if not template_path.is_file(): + print(f"Error: Template file '{TEMPLATE_FILENAME}' not found!") + return + + # Ensure the output directory exists. If not, create it. + OUTPUT_DIR.mkdir(exist_ok=True) + print(f"Output directory '{OUTPUT_DIR}' is ready.") + + # Read the content of the template file. + try: + with open(template_path, 'r', encoding='utf-8') as f: + template_content = f.read() + print("Template file read successfully.") + except Exception as e: + print(f"Error reading template file: {e}") + return + + # Replace '/' and "." with an underscore for valid filenames and buildkite's key. + safe_feature_name = feature_name.replace("/", "_").replace(".", "_") + + # Substitute the placeholders with the provided arguments. 
+ try: + generated_content = template_content.format( + FEATURE_NAME=feature_name, + SAFE_FEATURE_NAME=safe_feature_name, + QUEUE=queue, + ) + print("Parameter substitution complete.") + except KeyError as e: + print(f"Error: A placeholder key {e} was not found in the provided arguments.") + print("Please check for mismatches between your template file and script.") + return + + # Define the output filename and path. + # The filename is based on the feature_name with a .yml extension. + output_filename = f"{safe_feature_name}.yml" + output_path = OUTPUT_DIR / output_filename + + # Write the generated content to the file. + try: + with open(output_path, 'w', encoding='utf-8') as f: + f.write(generated_content) + print(f"✅ Success! Config file generated at: '{output_path}'") + except Exception as e: + print(f"Error writing output file: {e}") + + print("-" * 40 + "\n") + +def main(): + """ + Main function to parse command-line arguments and run the generator. + """ + parser = argparse.ArgumentParser( + description="Generate a Buildkite CI config file from a template." + ) + + # Add the command-line arguments. Both are required. + parser.add_argument( + "--feature-name", + type=str, + required=True, + help=""" + The name of the feature to use in the template (e.g., 'Feature-A'). + If have '/' or '.' in the feature name, it will be replaced with '_' in the generated file name. + """ + ) + parser.add_argument( + "--queue", + type=str, + required=True, + help="The name of the agent queue to use (e.g., 'tpu_v6e_queue' or 'tpu_v6e_8_queue')." 
+ ) + + args = parser.parse_args() + generate_from_template(feature_name=args.feature_name, queue=args.queue) + +if __name__ == "__main__": + main() diff --git a/.buildkite/generate_model_buildkite.py b/.buildkite/generate_model_buildkite.py new file mode 100644 index 0000000000..8928345fbe --- /dev/null +++ b/.buildkite/generate_model_buildkite.py @@ -0,0 +1,97 @@ +import argparse +from pathlib import Path + +# Define the template filename and output directory as constants for easy modification. +TEMPLATE_FILENAME = "buildkite_ci_model_template.yml" +OUTPUT_DIR = Path("models") + +def generate_from_template(model_name: str, queue: str) -> None: + """ + Generates a buildkite yml file from a template. + + Args: + model_name (str): The model_name parameter. + queue (str): The Queue parameter. + """ + print(f"--- Starting to generate for model '{model_name}' ---") + + # Check if the template file exists. + template_path = Path(TEMPLATE_FILENAME) + if not template_path.is_file(): + print(f"Error: Template file '{TEMPLATE_FILENAME}' not found!") + return + + # Ensure the output directory exists. If not, create it. + OUTPUT_DIR.mkdir(exist_ok=True) + print(f"Output directory '{OUTPUT_DIR}' is ready.") + + # Read the content of the template file. + try: + with open(template_path, 'r', encoding='utf-8') as f: + template_content = f.read() + print("Template file read successfully.") + except Exception as e: + print(f"Error reading template file: {e}") + return + + # Replace '/' and "." with an underscore for valid filenames and buildkite's key. + safe_model_name = model_name.replace("/", "_").replace(".", "_") + + # Substitute the placeholders with the provided arguments. 
+ try: + generated_content = template_content.format( + MODEL_NAME=model_name, + SAFE_MODEL_NAME=safe_model_name, + QUEUE=queue, + ) + print("Parameter substitution complete.") + except KeyError as e: + print(f"Error: A placeholder key {e} was not found in the provided arguments.") + print("Please check for mismatches between your template file and script.") + return + + # Define the output filename and path. + # The filename is based on the model_name with a .yml extension. + output_filename = f"{safe_model_name}.yml" + output_path = OUTPUT_DIR / output_filename + + # Write the generated content to the file. + try: + with open(output_path, 'w', encoding='utf-8') as f: + f.write(generated_content) + print(f"✅ Success! Config file generated at: '{output_path}'") + except Exception as e: + print(f"Error writing output file: {e}") + + print("-" * 40 + "\n") + +def main(): + """ + Main function to parse command-line arguments and run the generator. + """ + parser = argparse.ArgumentParser( + description="Generate a Buildkite CI config file from a template." + ) + + # Add the command-line arguments. Both are required. + parser.add_argument( + "--model-name", + type=str, + required=True, + help=""" + The name of the model to use in the template (e.g., 'meta-llama/Llama-3.1-8B-Instruct'). + If have '/' or '.' in the model name, it will be replaced with '_' in the generated file name. + """ + ) + parser.add_argument( + "--queue", + type=str, + required=True, + help="The name of the agent queue to use (e.g., 'tpu_v6e_queue' or 'tpu_v6e_8_queue')." 
+ ) + + args = parser.parse_args() + generate_from_template(model_name=args.model_name, queue=args.queue) + +if __name__ == "__main__": + main() diff --git a/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml new file mode 100644 index 0000000000..da93c2dc2d --- /dev/null +++ b/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml @@ -0,0 +1,70 @@ +# meta-llama/Llama-3.1-70B-Instruct +agents: + queue: tpu_v6e_8_queue +steps: + - label: "Unit tests for meta-llama/Llama-3.1-70B-Instruct" + key: "ut_meta-llama_Llama-3_1-70B-Instruct" + commands: + # - replace_with_test_commands # TODO: Replaced to actual test commands + - echo "[DEBUG], unit testing for meta-llama/Llama-3.1-70B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Unit tests for meta-llama/Llama-3.1-70B-Instruct" + key: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "ut_meta-llama_Llama-3_1-70B-Instruct" + soft_fail: true + agents: + queue: tpu_v6e_8_queue + commands: + - | + .buildkite/scripts/check_results.sh \ + "Unit tests for meta-llama/Llama-3.1-70B-Instruct" ut_meta-llama_Llama-3_1-70B-Instruct + + - label: "Integration tests for meta-llama/Llama-3.1-70B-Instruct" + key: "it_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct" + commands: + # TODO: expected_accuracy need parameterized + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-70B-Instruct" + - echo "[DEBUG], integration testing for meta-llama/Llama-3.1-70B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Integration tests for meta-llama/Llama-3.1-70B-Instruct" + key: "notifications_it_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "it_meta-llama_Llama-3_1-70B-Instruct" + soft_fail: true + agents: + queue: tpu_v6e_8_queue + commands: + - | + 
.buildkite/scripts/check_results.sh \ + "Integration tests for meta-llama/Llama-3.1-70B-Instruct" it_meta-llama_Llama-3_1-70B-Instruct + + - label: "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" + key: "pb_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "notifications_it_meta-llama_Llama-3_1-70B-Instruct" + commands: + # - replace_with_test_command # TODO + - echo "[DEBUG], performance benchmarking for meta-llama/Llama-3.1-70B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" + key: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "pb_meta-llama_Llama-3_1-70B-Instruct" + soft_fail: true + agents: + queue: tpu_v6e_8_queue + commands: + - | + .buildkite/scripts/check_results.sh \ + "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" pb_meta-llama_Llama-3_1-70B-Instruct + + - label: "Stress tests for meta-llama/Llama-3.1-70B-Instruct" + key: "st_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct" + commands: + # - our_stress_tests_script meta-llama/Llama-3.1-70B-Instruct expected_throughput # TODO: expected_throughput need parameterized + - echo "[DEBUG], stress testing for meta-llama/Llama-3.1-70B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Stress tests for meta-llama/Llama-3.1-70B-Instruct" + key: "notifications_st_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "st_meta-llama_Llama-3_1-70B-Instruct" + agents: + queue: tpu_v6e_8_queue + commands: + - | + .buildkite/scripts/check_results.sh \ + "Stress tests for meta-llama/Llama-3.1-70B-Instruct" st_meta-llama_Llama-3_1-70B-Instruct diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml new file mode 100644 index 0000000000..6d4e4288ab --- /dev/null +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -0,0 +1,71 @@ +# 
meta-llama/Llama-3.1-8B-Instruct +agents: + queue: tpu_v6e_queue +steps: + - label: "Unit tests for meta-llama/Llama-3.1-8B-Instruct" + key: "ut_meta-llama_Llama-3_1-8B-Instruct" + commands: + # - replace_with_test_commands # TODO: Replaced to actual test commands + # - echo "[DEBUG], unit testing for meta-llama/Llama-3.1-8B-Instruct" # TODO: Replace to actual test commands + - echo "Running..."; sleep 20;echo "End" + - label: "Notifications: Unit tests for meta-llama/Llama-3.1-8B-Instruct" + key: "notifications_ut_meta-llama_Llama-3_1-8B-Instruct" + depends_on: "ut_meta-llama_Llama-3_1-8B-Instruct" + soft_fail: true + agents: + queue: tpu_v6e_queue + commands: + - | + .buildkite/scripts/check_results.sh \ + "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct + + - label: "Integration tests for meta-llama/Llama-3.1-8B-Instruct" + key: "it_meta-llama_Llama-3_1-8B-Instruct" + depends_on: "notifications_ut_meta-llama_Llama-3_1-8B-Instruct" + commands: + # TODO: expected_accuracy need parameterized + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-8B-Instruct" + - echo "[DEBUG], integration testing for meta-llama/Llama-3.1-8B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Integration tests for meta-llama/Llama-3.1-8B-Instruct" + key: "notifications_it_meta-llama_Llama-3_1-8B-Instruct" + depends_on: "it_meta-llama_Llama-3_1-8B-Instruct" + soft_fail: true + agents: + queue: tpu_v6e_queue + commands: + - | + .buildkite/scripts/check_results.sh \ + "Integration tests for meta-llama/Llama-3.1-8B-Instruct" it_meta-llama_Llama-3_1-8B-Instruct + + - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" + key: "pb_meta-llama_Llama-3_1-8B-Instruct" + depends_on: "notifications_it_meta-llama_Llama-3_1-8B-Instruct" + commands: + # - replace_with_test_command # TODO + - echo "[DEBUG], performance benchmarking for 
meta-llama/Llama-3.1-8B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" + key: "notifications_pb_meta-llama_Llama-3_1-8B-Instruct" + depends_on: "pb_meta-llama_Llama-3_1-8B-Instruct" + soft_fail: true + agents: + queue: tpu_v6e_queue + commands: + - | + .buildkite/scripts/check_results.sh \ + "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" pb_meta-llama_Llama-3_1-8B-Instruct + + - label: "Stress tests for meta-llama/Llama-3.1-8B-Instruct" + key: "st_meta-llama_Llama-3_1-8B-Instruct" + depends_on: "notifications_pb_meta-llama_Llama-3_1-8B-Instruct" + commands: + # - our_stress_tests_script meta-llama/Llama-3.1-8B-Instruct expected_throughput # TODO: expected_throughput need parameterized + - echo "[DEBUG], stress testing for meta-llama/Llama-3.1-8B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Stress tests for meta-llama/Llama-3.1-8B-Instruct" + key: "notifications_st_meta-llama_Llama-3_1-8B-Instruct" + depends_on: "st_meta-llama_Llama-3_1-8B-Instruct" + agents: + queue: tpu_v6e_queue + commands: + - | + .buildkite/scripts/check_results.sh \ + "Stress tests for meta-llama/Llama-3.1-8B-Instruct" st_meta-llama_Llama-3_1-8B-Instruct diff --git a/.buildkite/pipeline_jax.yml b/.buildkite/pipeline_jax.yml index 385113eb2e..5170f89d57 100644 --- a/.buildkite/pipeline_jax.yml +++ b/.buildkite/pipeline_jax.yml @@ -1,174 +1,191 @@ steps: # ----------------------------------------------------------------- # TEST STEPS - Calling wrapper - # ----------------------------------------------------------------- - - label: "E2E MLPerf tests for JAX models" - key: test_0 - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh + # -----------------------------------------------------------------" + # - label: "E2E MLPerf tests for JAX models" + # 
key: test_0 + # soft_fail: true + # agents: + # queue: tpu_v6e_queue + # commands: + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - label: "E2E MLPerf tests for JAX models with quantization" - key: test_1 - soft_fail: true - env: - QUANTIZATION: "True" - agents: - queue: tpu_v6e_queue - commands: - - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh + # - label: "E2E MLPerf tests for JAX models with quantization" + # key: test_1 + # soft_fail: true + # env: + # QUANTIZATION: "True" + # agents: + # queue: tpu_v6e_queue + # commands: + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - label: "E2E MLPerf tests for JAX new models" - key: test_2 - soft_fail: true - env: - NEW_MODEL_DESIGN: "True" - agents: - queue: tpu_v6e_queue - commands: - - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh + # - label: "E2E MLPerf tests for JAX new models" + # key: test_2 + # soft_fail: true + # env: + # NEW_MODEL_DESIGN: "True" + # agents: + # queue: tpu_v6e_queue + # commands: + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - label: "E2E MLPerf tests for JAX + vLLM models" - key: test_3 - soft_fail: true - env: - MODEL_IMPL_TYPE: "vllm" - agents: - queue: tpu_v6e_queue - commands: - - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh + # - label: "E2E MLPerf tests for JAX + vLLM models" + # key: test_3 + # soft_fail: true + # env: + # MODEL_IMPL_TYPE: "vllm" + # agents: + # queue: tpu_v6e_queue + # commands: + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - label: "E2E MLperf tests for Llama4 models" - key: test_4 - soft_fail: true - env: - NEW_MODEL_DESIGN: "True" - USE_V6E8_QUEUE: "True" - agents: - queue: 
tpu_v6e_8_queue - commands: - - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh + # - label: "E2E MLperf tests for Llama4 models" + # key: test_4 + # soft_fail: true + # env: + # NEW_MODEL_DESIGN: "True" + # USE_V6E8_QUEUE: "True" + # agents: + # queue: tpu_v6e_8_queue + # commands: + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh + # - label: "E2E multi modality test" + # key: test_5 + # soft_fail: true + # agents: + # queue: tpu_v6e_queue + # commands: + # - | + # .buildkite/scripts/run_in_docker.sh \ + # bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_multi_modal_inference.py && \ + # bash /workspace/tpu_commons/tests/e2e/benchmarking/mm_bench.sh' - - label: "E2E multi modality test" - key: test_5 - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/run_in_docker.sh \ - bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_multi_modal_inference.py && \ - bash /workspace/tpu_commons/tests/e2e/benchmarking/mm_bench.sh' + # - label: "E2E speculative decoding test" + # key: test_6 + # soft_fail: true + # agents: + # queue: tpu_v6e_queue + # commands: + # - | + # .buildkite/scripts/run_in_docker.sh \ + # bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_speculative_decoding.py' - - label: "E2E speculative decoding test" - key: test_6 - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/run_in_docker.sh \ - bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_speculative_decoding.py' + # - label: "JAX unit tests" + # key: test_7 + # soft_fail: true + # agents: + # queue: tpu_v6e_queue + # commands: + # - | + # .buildkite/scripts/run_in_docker.sh \ + # python3 -m pytest -s -v -x /workspace/tpu_commons/tests/ \ + # --ignore=/workspace/tpu_commons/tests/kernels \ + # 
--ignore=/workspace/tpu_commons/tests/lora \ + # --ignore=/workspace/tpu_commons/tests/e2e \ + # --ignore=/workspace/tpu_commons/tpu_commons/mock \ + # --cov-config=/workspace/tpu_commons/.coveragerc --cov tpu_commons --cov-report term-missing --cov-fail-under=69 - - label: "JAX unit tests" - key: test_7 - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/run_in_docker.sh \ - python3 -m pytest -s -v -x /workspace/tpu_commons/tests/ \ - --ignore=/workspace/tpu_commons/tests/kernels \ - --ignore=/workspace/tpu_commons/tests/lora \ - --ignore=/workspace/tpu_commons/tests/e2e \ - --ignore=/workspace/tpu_commons/tpu_commons/mock \ - --cov-config=/workspace/tpu_commons/.coveragerc --cov tpu_commons --cov-report term-missing --cov-fail-under=69 + # - label: "JAX unit tests - kernels" + # key: test_8 + # soft_fail: true + # agents: + # queue: tpu_v6e_queue + # commands: + # - | + # .buildkite/scripts/run_in_docker.sh \ + # python3 -m pytest -s -v -x /workspace/tpu_commons/tests/kernels \ + # --ignore=/workspace/tpu_commons/tests/kernels/ragged_paged_attention_kernel_v2_test.py \ + # --ignore=/workspace/tpu_commons/tests/kernels/ragged_kv_cache_update_v2_test.py - - label: "JAX unit tests - kernels" - key: test_8 - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/run_in_docker.sh \ - python3 -m pytest -s -v -x /workspace/tpu_commons/tests/kernels \ - --ignore=/workspace/tpu_commons/tests/kernels/ragged_paged_attention_kernel_v2_test.py \ - --ignore=/workspace/tpu_commons/tests/kernels/ragged_kv_cache_update_v2_test.py + # - label: "lora tests for JAX + vLLM models" + # key: test_9 + # soft_fail: true + # agents: + # queue: tpu_v6e_queue + # commands: + # - | + # .buildkite/scripts/run_in_docker.sh \ + # bash -c 'SKIP_JAX_PRECOMPILE=1 MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_commons/tests/lora/test_lora.py' - - label: "lora tests for JAX + vLLM models" - 
key: test_9 - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/run_in_docker.sh \ - bash -c 'SKIP_JAX_PRECOMPILE=1 MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_commons/tests/lora/test_lora.py' + # - label: "Integration Test llama-3.1-8B on TPU" + # key: integration_test_llama_3_1_8B_tpu + # depends_on: + # - test_0 + # - test_1 + # - test_2 + # - test_3 + # - test_4 + # - test_5 + # - test_6 + # - test_7 + # - test_8 + # - test_9 + # soft_fail: true + # agents: + # # Need check agent + # queue: tpu_v6e_queue + # env: + # EXPECT_VALUES_PATH: "/workspace/" + # EXPECT_VALUES_FILENAME: "expect_values.json" + # commands: + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-8B-Instruct" + + # - label: "Integration Test llama-3.1-70B on TPU" + # key: integration_test_llama_3_1_70B_tpu + # depends_on: + # - test_0 + # - test_1 + # - test_2 + # - test_3 + # - test_4 + # - test_5 + # - test_6 + # - test_7 + # - test_8 + # - test_9 + # soft_fail: true + # agents: + # queue: tpu_v6e_8_queue + # commands: + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 8 -m "meta-llama/Llama-3.1-70B-Instruct" + + # # ----------------------------------------------------------------- + # # NOTIFICATION STEP + # # ----------------------------------------------------------------- + # - label: "TPU Test Notification" + # depends_on: + # - test_0 + # - test_1 + # - test_2 + # - test_3 + # - test_4 + # - test_5 + # - test_6 + # - test_7 + # - test_8 + # - test_9 + # - integration_test_llama_3_1_8B_tpu + # - integration_test_llama_3_1_70B_tpu + # agents: + # queue: tpu_v6e_queue + # commands: + # - | + # .buildkite/scripts/check_results.sh \ + # "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 \ + # integration_test_llama_3_1_8B_tpu 
integration_test_llama_3_1_70B_tpu - - label: "Integration Test llama-3.1-8B on TPU" - key: integration_test_llama_3_1_8B_tpu - depends_on: - - test_0 - - test_1 - - test_2 - - test_3 - - test_4 - - test_5 - - test_6 - - test_7 - - test_8 - - test_9 - soft_fail: true + - label: "Upload Dynamic Pipeline Test" agents: queue: tpu_v6e_queue commands: - - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-8B-Instruct" + - .buildkite/scripts/dynamic_upload.sh - - label: "Integration Test llama-3.1-70B on TPU" - key: integration_test_llama_3_1_70B_tpu - depends_on: - - test_0 - - test_1 - - test_2 - - test_3 - - test_4 - - test_5 - - test_6 - - test_7 - - test_8 - - test_9 - soft_fail: true - agents: - queue: tpu_v6e_8_queue - commands: - - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 8 -m "meta-llama/Llama-3.1-70B-Instruct" + - wait: ~ - # ----------------------------------------------------------------- - # NOTIFICATION STEP - # ----------------------------------------------------------------- - - label: "TPU Test Notification" - depends_on: - - test_0 - - test_1 - - test_2 - - test_3 - - test_4 - - test_5 - - test_6 - - test_7 - - test_8 - - test_9 - - integration_test_llama_3_1_8B_tpu - - integration_test_llama_3_1_70B_tpu + - label: "Generate support matrices" agents: queue: tpu_v6e_queue commands: - - | - .buildkite/scripts/check_results.sh \ - "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 \ - integration_test_llama_3_1_8B_tpu integration_test_llama_3_1_70B_tpu + - echo "Generate support matrices..." 
\ No newline at end of file diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh index d4113bda16..d7a901f4ce 100755 --- a/.buildkite/scripts/bootstrap.sh +++ b/.buildkite/scripts/bootstrap.sh @@ -1,27 +1,7 @@ #!/bin/bash -echo "--- Starting Buildkite Bootstrap ---" - -# # Check if the current build is a pull request -# if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then -# echo "This is a Pull Request build." -# PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/tpu_commons/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') - -# # If it's a PR, check for the specific label -# if [[ $PR_LABELS == *"ready"* ]]; then -# echo "Found 'ready' label on PR. Uploading main pipeline..." -# buildkite-agent pipeline upload .buildkite/pipeline_jax.yml -# # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml -# else -# echo "No 'ready' label found on PR. Skipping main pipeline upload." -# exit 0 # Exit with 0 to indicate success (no error, just skipped) -# fi -# else -# # If it's NOT a Pull Request (e.g., branch push, tag, manual build) -# echo "This is not a Pull Request build. Uploading main pipeline." 
-# buildkite-agent pipeline upload .buildkite/pipeline_jax.yml -# # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml -# fi +echo "--- Starting Special Buildkite Bootstrap ---" buildkite-agent pipeline upload .buildkite/pipeline_jax.yml -echo "--- Buildkite Bootstrap Finished ---" + +echo "--- Buildkite Special Bootstrap Finished ---" diff --git a/.buildkite/scripts/dynamic_upload.sh b/.buildkite/scripts/dynamic_upload.sh new file mode 100644 index 0000000000..d2376176f2 --- /dev/null +++ b/.buildkite/scripts/dynamic_upload.sh @@ -0,0 +1,81 @@ +#!/bin/bash + +echo "--- Starting Special Buildkite Bootstrap ---" + +# for loop features and models upload to buildkite +BUILDKITE_DIR=".buildkite" +TARGET_FOLDERS="models features" +MODEL_LIST_METADATA_KEY="model-names-list" + +declare -a model_names +declare -a pipeline_steps + +echo "--- Scanning: ${TARGET_FOLDERS}" + +for folder in $TARGET_FOLDERS; do + folder=$BUILDKITE_DIR/$folder + # Check if the folder exists + if [[ ! -d "$folder" ]]; then + echo "Warning: Folder '$folder' not found. Skipping." + continue + fi + + # Use find command to locate all .yml or .yaml files + # -print0 and read -r -d '' are a safe way to handle filenames with special characters (like spaces) + while IFS= read -r -d '' yml_file; do + echo "--- handling yml file: ${yml_file}" + + # Read the first line for getting model name + first_line=$(awk 'NR==1{print $0; exit}' "${yml_file}") + + # Check if the first line contains the '# ' comment marker + if [[ "$first_line" == "# "* ]]; then + model_name=${first_line#\# } + echo "Model Name: ${model_name}" + model_names+=("${model_name}") + else + echo "Warning: The first line of ${yml_file} is not in the expected comment format (ex: '# model-name')." 
+ fi + + # --- Dynamic Buildkite Pipeline Step --- + # For each found .yml file, generate a command step + # Here we assume the .yml file itself is an executable buildkite pipeline step script + pipeline_yaml=$(cat < Date: Tue, 23 Sep 2025 16:58:49 +0800 Subject: [PATCH 03/38] clean for test --- .buildkite/buildkite_ci_feature_template.yml | 51 +++- .buildkite/buildkite_ci_model_template.yml | 51 +++- .buildkite/features/Feat-A.yml | 70 ----- .../meta-llama_Llama-3_1-70B-Instruct.yml | 51 +++- .../meta-llama_Llama-3_1-8B-Instruct.yml | 54 +++- .buildkite/pipeline_dynamic.yml | 17 ++ .buildkite/pipeline_jax.yml | 276 +++++++----------- .buildkite/scripts/bootstrap.sh | 25 +- .buildkite/scripts/dynamic_bootstrap.sh | 145 +++++++++ .buildkite/scripts/dynamic_upload.sh | 81 ----- .buildkite/scripts/run_in_docker.sh | 59 +--- buildkite-script-dynamic.gz | Bin 0 -> 10583 bytes 12 files changed, 494 insertions(+), 386 deletions(-) delete mode 100644 .buildkite/features/Feat-A.yml rename .buildkite/models/{ => informational}/meta-llama_Llama-3_1-70B-Instruct.yml (58%) create mode 100644 .buildkite/pipeline_dynamic.yml create mode 100644 .buildkite/scripts/dynamic_bootstrap.sh delete mode 100644 .buildkite/scripts/dynamic_upload.sh create mode 100644 buildkite-script-dynamic.gz diff --git a/.buildkite/buildkite_ci_feature_template.yml b/.buildkite/buildkite_ci_feature_template.yml index 52b6a350af..e907815416 100644 --- a/.buildkite/buildkite_ci_feature_template.yml +++ b/.buildkite/buildkite_ci_feature_template.yml @@ -10,13 +10,24 @@ steps: - label: "Notifications: Unit tests for {FEATURE_NAME}" key: "notifications_ut_{SAFE_FEATURE_NAME}" depends_on: "ut_{SAFE_FEATURE_NAME}" - soft_fail: true agents: queue: {QUEUE} commands: - | .buildkite/scripts/check_results.sh \ "Unit tests for {FEATURE_NAME}" ut_{SAFE_FEATURE_NAME} + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + 
if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "{FEATURE_NAME}:UnitTest" "passed" + else + echo "The step failed. Uploading result..." + buildkite-agent meta-data set "{FEATURE_NAME}:UnitTest" "failed" + fi - label: "Integration tests for {FEATURE_NAME}" key: "it_{SAFE_FEATURE_NAME}" @@ -28,13 +39,24 @@ steps: - label: "Notifications: Integration tests for {FEATURE_NAME}" key: "notifications_it_{SAFE_FEATURE_NAME}" depends_on: "it_{SAFE_FEATURE_NAME}" - soft_fail: true agents: queue: {QUEUE} commands: - | .buildkite/scripts/check_results.sh \ "Integration tests for {FEATURE_NAME}" it_{SAFE_FEATURE_NAME} + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "{FEATURE_NAME}:IntTest" "passed" + else + echo "The step failed. Uploading result..." + buildkite-agent meta-data set "{FEATURE_NAME}:IntTest" "failed" + fi - label: "Performance benchmarks for {FEATURE_NAME}" key: "pb_{SAFE_FEATURE_NAME}" @@ -45,13 +67,24 @@ steps: - label: "Notifications: Performance benchmarks for {FEATURE_NAME}" key: "notifications_pb_{SAFE_FEATURE_NAME}" depends_on: "pb_{SAFE_FEATURE_NAME}" - soft_fail: true agents: queue: {QUEUE} commands: - | .buildkite/scripts/check_results.sh \ "Performance benchmarks for {FEATURE_NAME}" pb_{SAFE_FEATURE_NAME} + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "{FEATURE_NAME}:Benchmark" "passed" + else + echo "The step failed. Uploading result..." 
+ buildkite-agent meta-data set "{FEATURE_NAME}:Benchmark" "failed" + fi - label: "Stress tests for {FEATURE_NAME}" key: "st_{SAFE_FEATURE_NAME}" @@ -68,3 +101,15 @@ steps: - | .buildkite/scripts/check_results.sh \ "Stress tests for {FEATURE_NAME}" st_{SAFE_FEATURE_NAME} + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "{FEATURE_NAME}:StressTest" "passed" + else + echo "The step failed. Uploading result..." + buildkite-agent meta-data set "{FEATURE_NAME}:StressTest" "failed" + fi diff --git a/.buildkite/buildkite_ci_model_template.yml b/.buildkite/buildkite_ci_model_template.yml index aa5425b11f..f58b557557 100644 --- a/.buildkite/buildkite_ci_model_template.yml +++ b/.buildkite/buildkite_ci_model_template.yml @@ -10,13 +10,24 @@ steps: - label: "Notifications: Unit tests for {MODEL_NAME}" key: "notifications_ut_{SAFE_MODEL_NAME}" depends_on: "ut_{SAFE_MODEL_NAME}" - soft_fail: true agents: queue: {QUEUE} commands: - | .buildkite/scripts/check_results.sh \ "Unit tests for {MODEL_NAME}" ut_{SAFE_MODEL_NAME} + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "{MODEL_NAME}:UnitTest" "passed" + else + echo "The step failed. Uploading result..." 
+ buildkite-agent meta-data set "{MODEL_NAME}:UnitTest" "failed" + fi - label: "Integration tests for {MODEL_NAME}" key: "it_{SAFE_MODEL_NAME}" @@ -28,13 +39,24 @@ steps: - label: "Notifications: Integration tests for {MODEL_NAME}" key: "notifications_it_{SAFE_MODEL_NAME}" depends_on: "it_{SAFE_MODEL_NAME}" - soft_fail: true agents: queue: {QUEUE} commands: - | .buildkite/scripts/check_results.sh \ "Integration tests for {MODEL_NAME}" it_{SAFE_MODEL_NAME} + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "{MODEL_NAME}:IntTest" "passed" + else + echo "The step failed. Uploading result..." + buildkite-agent meta-data set "{MODEL_NAME}:IntTest" "failed" + fi - label: "Performance benchmarks for {MODEL_NAME}" key: "pb_{SAFE_MODEL_NAME}" @@ -45,13 +67,24 @@ steps: - label: "Notifications: Performance benchmarks for {MODEL_NAME}" key: "notifications_pb_{SAFE_MODEL_NAME}" depends_on: "pb_{SAFE_MODEL_NAME}" - soft_fail: true agents: queue: {QUEUE} commands: - | .buildkite/scripts/check_results.sh \ "Performance benchmarks for {MODEL_NAME}" pb_{SAFE_MODEL_NAME} + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "{MODEL_NAME}:Benchmark" "passed" + else + echo "The step failed. Uploading result..." 
+ buildkite-agent meta-data set "{MODEL_NAME}:Benchmark" "failed" + fi - label: "Stress tests for {MODEL_NAME}" key: "st_{SAFE_MODEL_NAME}" @@ -68,3 +101,15 @@ steps: - | .buildkite/scripts/check_results.sh \ "Stress tests for {MODEL_NAME}" st_{SAFE_MODEL_NAME} + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "{MODEL_NAME}:StressTest" "passed" + else + echo "The step failed. Uploading result..." + buildkite-agent meta-data set "{MODEL_NAME}:StressTest" "failed" + fi diff --git a/.buildkite/features/Feat-A.yml b/.buildkite/features/Feat-A.yml deleted file mode 100644 index 20ee8caeb8..0000000000 --- a/.buildkite/features/Feat-A.yml +++ /dev/null @@ -1,70 +0,0 @@ -# Feat-A -agents: - queue: tpu_v6e_queue -steps: - - label: "Unit tests for Feat-A" - key: "ut_Feat-A" - commands: - # - replace_with_test_commands # TODO: Replaced to actual test commands - - echo "[DEBUG], unit testing for Feat-A" # TODO: Replace to actual test commands - - label: "Notifications: Unit tests for Feat-A" - key: "notifications_ut_Feat-A" - depends_on: "ut_Feat-A" - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/check_results.sh \ - "Unit tests for Feat-A" ut_Feat-A - - - label: "Integration tests for Feat-A" - key: "it_Feat-A" - depends_on: "notifications_ut_Feat-A" - commands: - # TODO: expected_accuracy need parameterized - # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "Feat-A" - - echo "[DEBUG], integration testing for Feat-A" # TODO: Replace to actual test commands - - label: "Notifications: Integration tests for Feat-A" - key: "notifications_it_Feat-A" - depends_on: "it_Feat-A" - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - | 
- .buildkite/scripts/check_results.sh \ - "Integration tests for Feat-A" it_Feat-A - - - label: "Performance benchmarks for Feat-A" - key: "pb_Feat-A" - depends_on: "notifications_it_Feat-A" - commands: - # - replace_with_test_command # TODO - - echo "[DEBUG], performance benchmarking for Feat-A" # TODO: Replace to actual test commands - - label: "Notifications: Performance benchmarks for Feat-A" - key: "notifications_pb_Feat-A" - depends_on: "pb_Feat-A" - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/check_results.sh \ - "Performance benchmarks for Feat-A" pb_Feat-A - - - label: "Stress tests for Feat-A" - key: "st_Feat-A" - depends_on: "notifications_pb_Feat-A" - commands: - # - our_stress_tests_script Feat-A expected_throughput # TODO: expected_throughput need parameterized - - echo "[DEBUG], stress testing for Feat-A" # TODO: Replace to actual test commands - - label: "Notifications: Stress tests for Feat-A" - key: "notifications_st_Feat-A" - depends_on: "st_Feat-A" - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/check_results.sh \ - "Stress tests for Feat-A" st_Feat-A diff --git a/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml b/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml similarity index 58% rename from .buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml rename to .buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml index da93c2dc2d..9539111d5b 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml +++ b/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml @@ -10,13 +10,24 @@ steps: - label: "Notifications: Unit tests for meta-llama/Llama-3.1-70B-Instruct" key: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct" depends_on: "ut_meta-llama_Llama-3_1-70B-Instruct" - soft_fail: true agents: queue: tpu_v6e_8_queue commands: - | .buildkite/scripts/check_results.sh \ "Unit tests for 
meta-llama/Llama-3.1-70B-Instruct" ut_meta-llama_Llama-3_1-70B-Instruct + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:UnitTest" "passed" + else + echo "The step failed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:UnitTest" "failed" + fi - label: "Integration tests for meta-llama/Llama-3.1-70B-Instruct" key: "it_meta-llama_Llama-3_1-70B-Instruct" @@ -28,13 +39,24 @@ steps: - label: "Notifications: Integration tests for meta-llama/Llama-3.1-70B-Instruct" key: "notifications_it_meta-llama_Llama-3_1-70B-Instruct" depends_on: "it_meta-llama_Llama-3_1-70B-Instruct" - soft_fail: true agents: queue: tpu_v6e_8_queue commands: - | .buildkite/scripts/check_results.sh \ "Integration tests for meta-llama/Llama-3.1-70B-Instruct" it_meta-llama_Llama-3_1-70B-Instruct + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:IntTest" "passed" + else + echo "The step failed. Uploading result..." 
+ buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:IntTest" "failed" + fi - label: "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" key: "pb_meta-llama_Llama-3_1-70B-Instruct" @@ -45,13 +67,24 @@ steps: - label: "Notifications: Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" key: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct" depends_on: "pb_meta-llama_Llama-3_1-70B-Instruct" - soft_fail: true agents: queue: tpu_v6e_8_queue commands: - | .buildkite/scripts/check_results.sh \ "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" pb_meta-llama_Llama-3_1-70B-Instruct + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:Benchmark" "passed" + else + echo "The step failed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:Benchmark" "failed" + fi - label: "Stress tests for meta-llama/Llama-3.1-70B-Instruct" key: "st_meta-llama_Llama-3_1-70B-Instruct" @@ -68,3 +101,15 @@ steps: - | .buildkite/scripts/check_results.sh \ "Stress tests for meta-llama/Llama-3.1-70B-Instruct" st_meta-llama_Llama-3_1-70B-Instruct + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:StressTest" "passed" + else + echo "The step failed. Uploading result..." 
+ buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:StressTest" "failed" + fi diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml index 6d4e4288ab..eff2e3c815 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -6,18 +6,28 @@ steps: key: "ut_meta-llama_Llama-3_1-8B-Instruct" commands: # - replace_with_test_commands # TODO: Replaced to actual test commands - # - echo "[DEBUG], unit testing for meta-llama/Llama-3.1-8B-Instruct" # TODO: Replace to actual test commands - - echo "Running..."; sleep 20;echo "End" + - echo "[DEBUG], unit testing for meta-llama/Llama-3.1-8B-Instruct" # TODO: Replace to actual test commands - label: "Notifications: Unit tests for meta-llama/Llama-3.1-8B-Instruct" key: "notifications_ut_meta-llama_Llama-3_1-8B-Instruct" depends_on: "ut_meta-llama_Llama-3_1-8B-Instruct" - soft_fail: true agents: queue: tpu_v6e_queue commands: - | .buildkite/scripts/check_results.sh \ "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "passed" + else + echo "The step failed. Uploading result..." 
+ buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "failed" + fi - label: "Integration tests for meta-llama/Llama-3.1-8B-Instruct" key: "it_meta-llama_Llama-3_1-8B-Instruct" @@ -29,13 +39,24 @@ steps: - label: "Notifications: Integration tests for meta-llama/Llama-3.1-8B-Instruct" key: "notifications_it_meta-llama_Llama-3_1-8B-Instruct" depends_on: "it_meta-llama_Llama-3_1-8B-Instruct" - soft_fail: true agents: queue: tpu_v6e_queue commands: - | .buildkite/scripts/check_results.sh \ "Integration tests for meta-llama/Llama-3.1-8B-Instruct" it_meta-llama_Llama-3_1-8B-Instruct + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:IntTest" "passed" + else + echo "The step failed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:IntTest" "failed" + fi - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" key: "pb_meta-llama_Llama-3_1-8B-Instruct" @@ -46,13 +67,24 @@ steps: - label: "Notifications: Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" key: "notifications_pb_meta-llama_Llama-3_1-8B-Instruct" depends_on: "pb_meta-llama_Llama-3_1-8B-Instruct" - soft_fail: true agents: queue: tpu_v6e_queue commands: - | .buildkite/scripts/check_results.sh \ "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" pb_meta-llama_Llama-3_1-8B-Instruct + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:Benchmark" "passed" + else + echo "The step failed. 
Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:Benchmark" "failed" + fi - label: "Stress tests for meta-llama/Llama-3.1-8B-Instruct" key: "st_meta-llama_Llama-3_1-8B-Instruct" @@ -69,3 +101,15 @@ steps: - | .buildkite/scripts/check_results.sh \ "Stress tests for meta-llama/Llama-3.1-8B-Instruct" st_meta-llama_Llama-3_1-8B-Instruct + plugins: + - hooks#v1: + post-command: | + echo "--- Post-command hook triggered ---" + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:StressTest" "passed" + else + echo "The step failed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:StressTest" "failed" + fi diff --git a/.buildkite/pipeline_dynamic.yml b/.buildkite/pipeline_dynamic.yml new file mode 100644 index 0000000000..fa1355330e --- /dev/null +++ b/.buildkite/pipeline_dynamic.yml @@ -0,0 +1,17 @@ +steps: + # ----------------------------------------------------------------- + # TEST STEPS - Calling wrapper + # -----------------------------------------------------------------" + - label: "Upload Dynamic Pipeline Test" + agents: + queue: tpu_v6e_queue + commands: + - .buildkite/scripts/dynamic_upload.sh + + - wait: ~ + + - label: "Generate support matrices" + agents: + queue: tpu_v6e_queue + commands: + - echo "Generate support matrices..." 
\ No newline at end of file diff --git a/.buildkite/pipeline_jax.yml b/.buildkite/pipeline_jax.yml index 5170f89d57..fc42dbde93 100644 --- a/.buildkite/pipeline_jax.yml +++ b/.buildkite/pipeline_jax.yml @@ -1,191 +1,123 @@ steps: # ----------------------------------------------------------------- # TEST STEPS - Calling wrapper - # -----------------------------------------------------------------" - # - label: "E2E MLPerf tests for JAX models" - # key: test_0 - # soft_fail: true - # agents: - # queue: tpu_v6e_queue - # commands: - # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - # - label: "E2E MLPerf tests for JAX models with quantization" - # key: test_1 - # soft_fail: true - # env: - # QUANTIZATION: "True" - # agents: - # queue: tpu_v6e_queue - # commands: - # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - # - label: "E2E MLPerf tests for JAX new models" - # key: test_2 - # soft_fail: true - # env: - # NEW_MODEL_DESIGN: "True" - # agents: - # queue: tpu_v6e_queue - # commands: - # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - # - label: "E2E MLPerf tests for JAX + vLLM models" - # key: test_3 - # soft_fail: true - # env: - # MODEL_IMPL_TYPE: "vllm" - # agents: - # queue: tpu_v6e_queue - # commands: - # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - # - label: "E2E MLperf tests for Llama4 models" - # key: test_4 - # soft_fail: true - # env: - # NEW_MODEL_DESIGN: "True" - # USE_V6E8_QUEUE: "True" - # agents: - # queue: tpu_v6e_8_queue - # commands: - # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - # - label: "E2E multi modality test" - # key: test_5 - # soft_fail: true - # agents: - # queue: tpu_v6e_queue - # commands: - # - | - # .buildkite/scripts/run_in_docker.sh \ - # bash -c 'python3 
-m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_multi_modal_inference.py && \ - # bash /workspace/tpu_commons/tests/e2e/benchmarking/mm_bench.sh' + # ----------------------------------------------------------------- + - label: "E2E MLPerf tests for JAX models" + key: test_0 + soft_fail: true + agents: + queue: tpu_v6e_queue + commands: + - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - # - label: "E2E speculative decoding test" - # key: test_6 - # soft_fail: true - # agents: - # queue: tpu_v6e_queue - # commands: - # - | - # .buildkite/scripts/run_in_docker.sh \ - # bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_speculative_decoding.py' + - label: "E2E MLPerf tests for JAX models with quantization" + key: test_1 + soft_fail: true + env: + QUANTIZATION: "True" + agents: + queue: tpu_v6e_queue + commands: + - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - # - label: "JAX unit tests" - # key: test_7 - # soft_fail: true - # agents: - # queue: tpu_v6e_queue - # commands: - # - | - # .buildkite/scripts/run_in_docker.sh \ - # python3 -m pytest -s -v -x /workspace/tpu_commons/tests/ \ - # --ignore=/workspace/tpu_commons/tests/kernels \ - # --ignore=/workspace/tpu_commons/tests/lora \ - # --ignore=/workspace/tpu_commons/tests/e2e \ - # --ignore=/workspace/tpu_commons/tpu_commons/mock \ - # --cov-config=/workspace/tpu_commons/.coveragerc --cov tpu_commons --cov-report term-missing --cov-fail-under=69 + - label: "E2E MLPerf tests for JAX new models" + key: test_2 + soft_fail: true + env: + NEW_MODEL_DESIGN: "True" + agents: + queue: tpu_v6e_queue + commands: + - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - # - label: "JAX unit tests - kernels" - # key: test_8 - # soft_fail: true - # agents: - # queue: tpu_v6e_queue - # commands: - # - | - # .buildkite/scripts/run_in_docker.sh \ - # 
python3 -m pytest -s -v -x /workspace/tpu_commons/tests/kernels \ - # --ignore=/workspace/tpu_commons/tests/kernels/ragged_paged_attention_kernel_v2_test.py \ - # --ignore=/workspace/tpu_commons/tests/kernels/ragged_kv_cache_update_v2_test.py + - label: "E2E MLPerf tests for JAX + vLLM models" + key: test_3 + soft_fail: true + env: + MODEL_IMPL_TYPE: "vllm" + agents: + queue: tpu_v6e_queue + commands: + - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - # - label: "lora tests for JAX + vLLM models" - # key: test_9 - # soft_fail: true - # agents: - # queue: tpu_v6e_queue - # commands: - # - | - # .buildkite/scripts/run_in_docker.sh \ - # bash -c 'SKIP_JAX_PRECOMPILE=1 MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_commons/tests/lora/test_lora.py' + - label: "E2E MLperf tests for Llama4 models" + key: test_4 + soft_fail: true + env: + NEW_MODEL_DESIGN: "True" + USE_V6E8_QUEUE: "True" + agents: + queue: tpu_v6e_8_queue + commands: + - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - # - label: "Integration Test llama-3.1-8B on TPU" - # key: integration_test_llama_3_1_8B_tpu - # depends_on: - # - test_0 - # - test_1 - # - test_2 - # - test_3 - # - test_4 - # - test_5 - # - test_6 - # - test_7 - # - test_8 - # - test_9 - # soft_fail: true - # agents: - # # Need check agent - # queue: tpu_v6e_queue - # env: - # EXPECT_VALUES_PATH: "/workspace/" - # EXPECT_VALUES_FILENAME: "expect_values.json" - # commands: - # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-8B-Instruct" - # - label: "Integration Test llama-3.1-70B on TPU" - # key: integration_test_llama_3_1_70B_tpu - # depends_on: - # - test_0 - # - test_1 - # - test_2 - # - test_3 - # - test_4 - # - test_5 - # - test_6 - # - test_7 - # - test_8 - # - test_9 - # soft_fail: true - # agents: - # queue: 
tpu_v6e_8_queue - # commands: - # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 8 -m "meta-llama/Llama-3.1-70B-Instruct" + - label: "E2E multi modality test" + key: test_5 + soft_fail: true + agents: + queue: tpu_v6e_queue + commands: + - | + .buildkite/scripts/run_in_docker.sh \ + bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_multi_modal_inference.py && \ + bash /workspace/tpu_commons/tests/e2e/benchmarking/mm_bench.sh' - # # ----------------------------------------------------------------- - # # NOTIFICATION STEP - # # ----------------------------------------------------------------- - # - label: "TPU Test Notification" - # depends_on: - # - test_0 - # - test_1 - # - test_2 - # - test_3 - # - test_4 - # - test_5 - # - test_6 - # - test_7 - # - test_8 - # - test_9 - # - integration_test_llama_3_1_8B_tpu - # - integration_test_llama_3_1_70B_tpu - # agents: - # queue: tpu_v6e_queue - # commands: - # - | - # .buildkite/scripts/check_results.sh \ - # "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 \ - # integration_test_llama_3_1_8B_tpu integration_test_llama_3_1_70B_tpu + - label: "E2E speculative decoding test" + key: test_6 + soft_fail: true + agents: + queue: tpu_v6e_queue + commands: + - | + .buildkite/scripts/run_in_docker.sh \ + bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_speculative_decoding.py' - - label: "Upload Dynamic Pipeline Test" + - label: "JAX unit tests" + key: test_7 + soft_fail: true agents: queue: tpu_v6e_queue commands: - - .buildkite/scripts/dynamic_upload.sh + - | + .buildkite/scripts/run_in_docker.sh \ + python3 -m pytest -s -v -x /workspace/tpu_commons/tests/ \ + --ignore=/workspace/tpu_commons/tests/kernels \ + --ignore=/workspace/tpu_commons/tests/e2e \ + --ignore=/workspace/tpu_commons/tpu_commons/mock \ + --cov-config=/workspace/tpu_commons/.coveragerc --cov tpu_commons 
--cov-report term-missing --cov-fail-under=69 - - wait: ~ + - label: "JAX unit tests - kernels" + key: test_8 + soft_fail: true + agents: + queue: tpu_v6e_queue + commands: + - | + .buildkite/scripts/run_in_docker.sh \ + python3 -m pytest -s -v -x /workspace/tpu_commons/tests/kernels \ + --ignore=/workspace/tpu_commons/tests/kernels/ragged_paged_attention_kernel_v2_test.py \ + --ignore=/workspace/tpu_commons/tests/kernels/ragged_kv_cache_update_v2_test.py - - label: "Generate support matrices" + # ----------------------------------------------------------------- + # NOTIFICATION STEP + # ----------------------------------------------------------------- + - label: "TPU Test Notification" + depends_on: + - test_0 + - test_1 + - test_2 + - test_3 + - test_4 + - test_5 + - test_6 + - test_7 + - test_8 + - integration_test_llama_3_1_8B_tpu + - integration_test_llama_3_1_70B_tpu agents: queue: tpu_v6e_queue commands: - - echo "Generate support matrices..." \ No newline at end of file + - | + .buildkite/scripts/check_results.sh \ + "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh index d7a901f4ce..e5c892d31d 100755 --- a/.buildkite/scripts/bootstrap.sh +++ b/.buildkite/scripts/bootstrap.sh @@ -1,7 +1,26 @@ #!/bin/bash -echo "--- Starting Special Buildkite Bootstrap ---" +echo "--- Starting Buildkite Bootstrap ---" -buildkite-agent pipeline upload .buildkite/pipeline_jax.yml +# Check if the current build is a pull request +if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then + echo "This is a Pull Request build." + PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/tpu_commons/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') -echo "--- Buildkite Special Bootstrap Finished ---" + # If it's a PR, check for the specific label + if [[ $PR_LABELS == *"ready"* ]]; then + echo "Found 'ready' label on PR. Uploading main pipeline..." 
+ buildkite-agent pipeline upload .buildkite/pipeline_jax.yml + # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml + else + echo "No 'ready' label found on PR. Skipping main pipeline upload." + exit 0 # Exit with 0 to indicate success (no error, just skipped) + fi +else + # If it's NOT a Pull Request (e.g., branch push, tag, manual build) + echo "This is not a Pull Request build. Uploading main pipeline." + buildkite-agent pipeline upload .buildkite/pipeline_jax.yml + # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml +fi + +echo "--- Buildkite Bootstrap Finished ---" diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh new file mode 100644 index 0000000000..9736eaa08b --- /dev/null +++ b/.buildkite/scripts/dynamic_bootstrap.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +echo "--- Starting Special Buildkite Bootstrap ---" + +# for loop features and models upload to buildkite +BUILDKITE_DIR=".buildkite" +TARGET_FOLDERS="models features models/informational" + +MODEL_LIST_KEY="tpu-model-list" +INFORMATIONAL_MODEL_LIST_KEY="vllm-model-list" +POPURLAR_MODEL_LIST_KEY="popular-model-list" + +FEATURE_LIST_METADATA_KEY="feature-list" + +declare -a model_names +declare -a pipeline_steps + +# Declare separate arrays for each list +declare -a tpu_model_list +declare -a vllm_model_list +declare -a popular_model_list +declare -a feature_list + +echo "--- Scanning: ${TARGET_FOLDERS}" + +for folder_path in $TARGET_FOLDERS; do + folder=$BUILDKITE_DIR/$folder_path + # Check if the folder exists + if [[ ! -d "$folder" ]]; then + echo "Warning: Folder '$folder' not found. Skipping." 
+ continue + fi + + # Use find command to locate all .yml or .yaml files + # -print0 and read -r -d '' are a safe way to handle filenames with special characters (like spaces) + while IFS= read -r -d '' yml_file; do + echo "--- handling yml file: ${yml_file}" + + # Read the first line for getting model name + first_line=$(awk 'NR==1{print $0; exit}' "${yml_file}") + + # Check if the first line contains the '# ' comment marker + if [[ "$first_line" == "# "* ]]; then + model_name=${first_line#\# } + echo "Model Name: ${model_name}" + + # folder_name=$(basename "$folder_path") + + # Based on the folder name, add the model to the correct list + case "$folder_path" in + "models") + tpu_model_list+=("${model_name}") + ;; + "models/informational") + vllm_model_list+=("${model_name}") + ;; + "models/popular") + popular_model_list+=("${model_name}") + ;; + "features") + feature_list+=("${model_name}") + ;; + *) + echo "Warning: No specific list for folder '${folder_path}'. Ignoring model '${model_name}'." + ;; + esac + + + model_names+=("${model_name}") + else + echo "Warning: The first line of ${yml_file} is not in the expected comment format (ex: '# model-name')." 
+ fi + + # --- Dynamic Buildkite Pipeline Step --- + # For each found .yml file, generate a command step + # Here we assume the .yml file itself is an executable buildkite pipeline step script + pipeline_yaml=$(cat <Q_bi|wh6D%xWfsFGk1D*+QU|_*yHz&yv z!j`duEjhZtuw>rnd^!JMKfb?if61xpZb_}1Ex?#d)X(k&ORetes;=tls_v>I?eh=1 z*4OQI`P?%`{Y%5uznXKYR4T1TgM8JZg?v?Qv>MeK!RM^NOQlh1*6a0Vt5zo!rKL1m zUy;xA1|+R`j;q-K%8hPLbR)ao#5Ts7eRE-3-X!fk8wm;i`$CG+AfC~0D^*3Q)HX}C z3K_e`SZ^z>#zs@AZZ?{7wOZM%Ha9k#rKbkAD043%*RMZXA3p!r>&W-5B>w}yMESo_ zZ8g6ljYoq>D1UMO7wCT!ECs;BQ-2VNva|$mUSD(=5Wiryv z_0blQPEErluI{)F8CZ6t9SJHDa;e`yHP2OVPqsgF)WrLqH6CkbpEiqS2^zO`m~cH^ zy*AvTijAv&EqvcS+&$bP$E;4DxE9fRuBVMC7(siyxiZmvLyJhi>~`Lte*A5Xcs{^} zd65J;(f*X?LnM5%Tw`GLG}o|9XN%;-i79@iW@D(YPju)}walrA45@}CM!7Uhm&u>} zFG2(f;Ph-`;yUZSq29YxZQb!kt|L1`@@r6=Os6|Z1Co9`8F?3mDL^o^tV?I-1s=D=YrhdeS%Yq0N1-75bVwS z)4l!OANRT)b?5NlVEfaq+WF^RS3T)&cTZ2EO&J673z24gLCX3cq(XlCj<`eJjC>ye zGt>$46Paj^qxWTUIvH77A7>@Qxh%_4?8|^iWlC%^)?KaK*IbP_x=SRH2wO-X9I!+r z)VNOjnTZ5tNw(&2$FNm`g=CVCgHB7Z{^)7WAnd@AGf7Y)p`_h?!*jaZc z9u;~3;5r@mx?Y9VSvPw_Sb;8q9Q1Jj0k%}rdZWz$DV(!eYvScaf@Bniey^cSILmD~T*s8uR8h1!3stw!PhSxA}U|7j|fdUdlQ*OcmJrM0nH`-`#0U!n;7 zKWDBF%Pjv-wN{JqzXD$r{+~sZz^}7}s>{O)ZKc~e>5`Lf=ja4xeFYaky=v+z{mI_N_1v8H-vl;D*&*dC(Ik8|8jZOv_)0+o@wzyo zQpUg)K5&3*VvMM|pEZ8v#f(1~d~{S`%oAE)m#?_gZc?Jl8KIIuCyi(@=PCOX^z|QPlw&T4|_Y?-Mzz4 zbUXf7TkM|{MnlmV#i=Fuk3!tWx7?9uO3Jfp%ClO^vwF(2M#{5h%ClC=vyGHzn`tno z4kT?PO4?BTNqqPDN+f!6ERP3mS)i0dK2XQUvOo#}90BnHz6CT3*cFf};8H-LfH@Ii zBA!I_h**&dYx5#Xs{hAdwbX&Gxt^`7K_!q;a^H2Y$r*^XWs z*cK?XnmZgB=Y+pJf}f?*64})UhNGmaJe`TLcfhM4ku7$>iWW3Y+-B-|2%` zO&?I0r>46ypa+#sU~1C?XRa0Cf|B6_>yj+rik*L1rpK378SqPF2h~VCb(Eku9%TYe zlJ)GrQE}S%v`A2OmDYvEOv{aaA{&m1jjpVQxbjEQv2DwSQAavozD=|apkuoO%QO35 zORGUUwhp5+{Q-4M2S<&=NlaBfBC$2U2E$32jYP@fqH(=Z8DG%yp`1$HKZsNunuv&UQc(Nfj1h_;b3k3 zb5EbRq(lFMw;`ZYJ%hBg6*@!3wztTw{w0%&{t?OL^<^q`Bw1p3&`bj4*KMa~*=(L) zX(OW_$S8+a54d;PL3;v2s-q3Ub&oP)JHnSTKia+`txIb}Ql-`KiOiq)@SS+)j%&D{ 
z%Vrv_HMB+u7=x2OwXG|oulG^%dSl%BrBK)B*}g!WIK5wJzI1SKYJA%itdGe!v!=}QNX|hGO zMLZ(iaEg#?_$J_cVwhQ|m69-yU?~E2qaqHc%GhuKjquwc=em2X>n6FeJUb@$a00Pi zwwyUXg=LaUnaCnZMINt8#lQa&!~$&S0d9+pBQhP?1Sh!k0pfu04qZI48E$F}+JTuX z-6jb6g{_*x2V7hW>*#vCEq_S=unj(&gDDwEDp*_GVGaQ$^a|qXI;VIV$k*8SN`e*Q z6dg$a`Jev>yhr8H*JQ^s2gU^>S_re|Zhx(R5A&|-vc}waQ z@-2~mHGQ3AtQn@i()nxl0X!oE&+M_wV-}&hm~ZJRqn|RI%ybgjo+*;UiYo#5y$!_m z^*VW*e||*Ygwmp~v*`>Swi$_0gp5V<#xc zew1bSYRh{;3V48jE>8jpu%gQsGR$4BJ(ahYS7j1y5wwS)cE#2vm?+Egvfql|HWjL% zRsd&=Mj&eZOH=$@__7*&5l&l*c)ARQIFWqB=RqHp*!Ra<98c_^HPcHuu6*1?43-QB z{-X{4a5Xw-0AL4ZDjx#<4-UfN`bQ2OzKzx8$g0JFV=w5R&>85SDt)V}qz!uvRYhJ@ zRf#bpW6Zyrjbpm~kDUQ{s2$+$_P=_wS}*K>izxZ*f14Gh(JJhJudt-q|K`K>VVPX;C)A6LzOeE=nUznNg7>oV3BtUv@D&!m7i;08 z^Z1eNdrSeEVcnzg14biltgY|kzvY^&lsDd&_e^%Q_bx_1)?s1tn~TXWlyuClCoEru zx$nMZdi;m}4It3yJ@790UF?6&Ruj+v)GF1&{&hOVaRESD6-2YEji+|nBH~Z`rw@J z0&)IOJ=(ME4EepLYt4Y&$A8CeIRz@W<4veJ6*i!@0ehG^0LP+cW%hv4vS!dfb^Z6x zQO;?ORdVNlY5?CW!2gw2t5&T4izqYlf7B`sxn9}agqO9#|Mv=u!2dqi&pi=TC3pTO z&i_hFLH)nT|Md7}`u^{C?eiS8z}fr1dYJ#Kh4Q~zuNCrtAtfgNtDCX>e;W z)oaD}?=_aR{oj1JJ}fi#f30S{6660$qf#mMe~TzDaQ_$GUUjM+aDw-xyG*MS!JqtzCq6FrjN`#!Abkv`koeh;9(YVV*+jw#uCXq-3 zX%QMp%W!WfrV^NYtQdkqZ2vq%7gG>0Jd}H6IV%QG`SWZvg`vb$ih(c?A;#(a=9_4j z_rm>HrN06Gf-@O5Jwrx!61QTlEWV> zzTrb~bYIj&TPc@~3)8ao_SA2ne-rhNdwdF%xZ`KS-(whh{Og`|Rb~fTvRaU#if#j4 z>>jI6gtxo}Tc@^kSGULIvEk6b)$9$@w@l9+wws$%rwP2^h)GPC$BF~_6cgqF#OI%8 zMMh(<)vKx>BW(_STwbX?tvA$FZ_>w8CSl_*G637%qOZjxuf?OT#Url8qpigwt;M6P z#Urf6qpQUts|lGXS>P{v@Jg+zin{S0!_elc+Nub3ACGyaCMIIUK=v1dtVzTVu*QR{ z@xW?4s2UHb#)GNxKx#aQ8pG6M@BdTnzf6mrTOW7u|3Rst{ij~!e_2eK?*D7nt1Wr6 zxw+AT3Y&!i`1KWm{~6cM-TQYZ{~zOjrP*j!3jg0CN~-*y8GY++ea!a%h37x2)hfvU zCd`;Z{x71;IRBw+H08Qdt<`JQ&3YmKUtbaUf6iPVmg)XKMQJpeG5)Vt8uh~ew}`St zZp8p#UrOQjDVE7K@DvJ1FGE8I&BPjMNz6Fw`B_<0YTPN2xRc0~+n1|xp)KX zOpu8;3e;S3@#e-%G#BqQJVmncPOG@se7w^~O40$P4HOx9$zx{ZjUapr$wW^?KZA_C znULklD!vz*fpRgJk(Z_Nb7E3r#d3uENwu;wgfBgw%Z-Z5AgvdANnE0!NmoikQi^~y zViCvPnQ0sXL%@D9WG82ziO~|OH57D2KAV->RwokA}2Rar&V06z{)>rk`8nswK9amqe-pz 
zKAE1M+!8^{Be_mo=-A1n8aVe;^rmrIn!K_+8*{t?R8(3yDr2JALF4r(FUGQ`KDNj65P=c$rKV9Stg zPi^f>mQ?@Gac6t?pd*j_^XVg}|EE!_RBB4S#qNI;{{MxPDgK|PQmI!r8*&X^Hk-|A zaRKZV7J>ig%=KZJasR7QZ?>ZRuQVI=M&bWiL|F>sz?Div*PS?9>!Bx525kgwj%99h zER}E_{b;y9c<1C=Kc`BH;qdTrp3S^5Tx+3m!@?+Y{`EqchGpw{wvB5lU6cLKnOTfS z9P{gMl}cx4XKC)It)L2D&4b_{QGZdop<-P!~+M~Oo7)MT4mNnQW> zxaZYJ{_}s$N>!ojf3w~!*8fG6>E}Q2-nU$@*EU4IGhhv3G<)-3-Uj}ABFs1M9G!^1>SDd8%6x5S6CAJzqxUJSZ3`1 zTCnkp@qY#WD)xViD8l|P44wGVk2#L}J=os$tsV_9jp5L-g5yWv$4KndVD8cDkw^6Y zAP=4IP_KjMT4UJO^|aCGCd@`6#vWha*$wu3v7I0W!FK3=(b0oeM>@N@8AR@;9r5HH zm~F;zSk!v%Ao1H~fSzM?avmcC&Fp^AGr3e+S|W$g^p&AsvwbcnIk~X35#N!i#z}G_Q5_IK2Mv| z8-}4OFvK>8dkxbUke+7Jc<~x>49vdZZ)qbLU^Mg%n0?LeqjxDpIScEL1GaSYNq={uF z)0+T9c!2tBb&a2Qpc7L_b#bNHhIWqRrd}MHlOjF9{u;|ppZ|BVZi#*~j|z+rQKH%MA@LgU-RSYbA>5bxMNAMhFjip@!9XnCVPhO*aeTQe^( z@dD1&(a5@H3>)vvfUj20_!XlwI)<|#;NaP65E`HHaX%(FSVp275JWT)2+hPJGrN#n z`p>`r=Rg1czwkdYWdaJdX5^MiYcFjUWwn@gavtjbo7WAjtQ7hZyVQ$x_`$kHem!+` z{JBJwUuAz%v-uU}y;A9v6KE>maqpv7s}v;^!&gd9YFwI-Kj2 zYtgZ{$6nEDm`MaAnZu+CNb%lKv$dX!$$3V`CEyO`B4FT5K3iimEImcTnR@1@Xh=); zf=CSIJvC`w>NhmU4;>C2{q&K9pNp9muBlMQ_rF``4g(GaJx{%G7BOa?ty1cBK5MuJ zJcxR;a$p1$japMVNm({53IJ6f{MCqpFRVvYx}u;OSv{V-h4>;Ogh1sGs1XRLIF$wd zSt8Xq!^GE!G?qzSAM!IzxH#lvo3!poG`%r-C;FA|!8~GP(JjRTS8TSn`zyXr(J%o` zsI^k=S_+iEASYY|5i4gimesS>V2a*`Yr~f_;UZ6vE=~UoG8%!SYLcJm3qX%94X9`s zOclSGa81F()LMYGp$BuIjLI3RRWNvc)aNpv%I7ocL^|WM{Wg{#4Z+$5CCdbD9F=9* z7C3@yN=)mMzQgG=)zDY<(v}^n0?B8=0uzpgj)7(?V=8d^FkJsc%NeXSthu!H39zQK zB#1OMp5fm3V`itRQVEcbdL|LV`QXdfTr(m~EfVKM6zva1 z(o^s2^Oh9*e`p8x6CH^D&wKQd-~QXIRH}8n|JiC4=YJMbX88Y_)rzcCtDBp(YNd$( z@e0e7^d=AF`moHp|6QxpWBgyMREzUJ3n@!qub&&{I#tk1;Zfuuhx7^GIESkKdw*K6 zGs&eTs;Z1EYZ4e!sWQTitSm_m-C_g>jlz@0yL-oNDV&2+sk?prvC~yQ9PaOSj!)V= zsdv~i-yRw!ON0xnw>APGnCiB_chXgV?EKu8@XQj^6h{U|wA}mj;qdr?24maaSL3xY z4sE3R(c#hQ@&5L4qUOY!cq7e@R4(}{2Ca6`>2B|Cceh!6UoYV``+9GLCf2eh*oCoeu?gHHvDV{naGV*h_`ovW`pO%9E>08JgrIG9 zi@dpwQtk_k79ku^ZIET61-1N5wAOc|Z-Ju4Dz)E4=EeFO;oDLoJ2`vFlX#cdA@*O$ 
z*QDGh68};nzlq1y8QA||*5obnfivL4y9cy0WOzTZJ(AR~kHp04 zLe}zuVLL7ftRWY=>jpM@sz8_6k#`#d^+B!nn-%T)k}Q8ZZnu?NiUsnf@*Taq@MRfD zCUlmbj$Y_7Fzey-WuAK8$C>N9c>?&Ex3HZg{Y9qn#6>}x9t@vIvla?ci+XEipgKGa!khDxn(Yh zBGr<@DSxy6$`Pp`%Cp=}y%nFwgvlo>G~!^RB$Rz1VQvM=d|8&s9tJH9*=0G>{c_|U zO3=UVXuVP?F<+f5@P(T&F<84%W`fPHze#soQj znS|uQPJ^_3DPHCiG+ywKrcnEXzn-x)@3Jq5)Xn=##Oc;#b^IU33IZt?JnJtCs!0cj|zY?Pe2Sx_jPTsxi9DXRpa}UNuW#mM0 z>9{gLb}Y2}y1Gn07$6$RJ}%E&fvUJ; znOC~)G988<5J0mm0$uj?n?5(J@Rl*E>H$*%SKiz%Mdj8nfBWsr>XuZZAPtE0znou9 z2^W)OHK`PRhL<>js(HR#^+)*=WK1L zuL>0t&$EEbXaV4hehcJG`&JQD_}5Xip@k}$~SQwPTNeyFq+a@K9uI}BR_&?HF{r+XKWTlLZoxlOj@LW7AHqW z|1nCWx0R0=d3v}D9WvXy_oOc|Z6hleFm>jM^oXpUGwG0M?}UCvw}Y_AsZmuyVpo-< zH_N}8%jtE#O%&iA#R;$mN+NZr zMo|)>3xX5xiz-)jhrmsO2tn=A$(U7Uzwq>tT+b;g}Ow=R&BhG6y1PeY# z#e>y#z+lQVhyISfg-b%j6K7&gwFBt;qewZbwIZOt*AoYal?yB|8HM7s%xpy zVaW3LsF_gE6smC?9G-sas-1rxb#}Vyap&Z8zuT5RXrKg1rNizIonv+T_~S`?1vaP5 zuntP(8$iW{iy-|L*wVQXR*9Hi@w}2}x@BfZE3>oOq8$;_BBt~o$@)i&Gc*PWviK~^ zI7vtdI5iFZO^Uugw{`6@tto0{WQLt3(tW?XMY@lk^rtGr@Xp1*t zXs9NCJUXS04K&#VhQljuW97k4j%iF1YZXaqeYH1cL;MP`oYn5_lGr5E}6q6@!D^1 zjc{v!Y#8M$0AXcv-CteLf@CWWBX*fpc~yX$O5f%JM_sn!7f<=)DTY+SU z?KCKm%SbvTiTYDP)RQ0rC=5Psp!k|x*l7IygGm49hYxD^@W;-lwnWx-x3{jFSB7nw zXl)KCe9;?OULUrjJzIAj|J;^Cv>v^IL~jA%p=4H}eLiN}jq-3%cCAa@qz6b1kT8GJ zz+aBnw}`7#*Jiv=@;zDa>sRZhHyV+5@7@t90Fc^%SEWGyZ1?flm;V~~KJmun*nT#s ztBuezcf;M6$Q>!4BC?i%@%7Pm%!$Aq!Rw|oH$+0}UrzgJSVzc}&M1M4-*Isw4 z{6)$Vqw%^#M2auGefI!JYgr=Se^U)wp@$dwxoO7yPO?dKVQK8z$}XENu&7?>VN;=4 z7Df?I92*@(>}`86(mt!E`I)1yYg0Qj&G-hi;s*#y6P!XAY?Nk#>l=1kS|6M4I-vN{ zS?6!p{dBHFYpjb^-~4cR&jZPt)oV% zl-82WdJHI|SHJw);uzR1Ouun9oE2x4hqmKz*<(^3WORoVwMKoFAtD?L`yIAi{$sTy zKpuWF1z@N&8#IH#CNzj3v6g%S_Iwu;%&)zJ?T_qiFF(bDNGen^u`kD`=a& zT4&OSx(Lh6G#oVaWh*H+LEyj~LjPd^s%O%rq>S#cD`*Grz_AA=tTS{PvY*ayTy~6q zfz)n(F z2LOinQ~JU`N@UN*591Hv;Gqr*R4lQ@|WF4N6`kuT)ho5?>->8cCqxwAw;w=k%KJ1nrq~3nt*>9sQtwhaU zqFa!j2}OO9hQ2=ZTv#hnO{Axdf9(8>izP0zfnDzKw7Y|PP|`0=osrZfZLww~>XX!Q z!*xsj(d5rR`4AmE5v=zk1`m3GjnyQ)1$55x4t;A3M_>4ueBBOwp5!<3%{OW9{r;dr 
zEaJ96pCDsAAdeaC{LaEkU#Q8w7Ez}AfAJocyiusO>ql+>u^p0CfBpEq6-QQP_ zJAXgzoOGGaL)Fcs1 zq{nN7`D|!FF|dh&6Z)>vlEpy~VQSWIR@ z1GfdC)_om&)-?n{3Iyi`f7d?K&{g<-@*rQH7F^5j4S5?eb?uWC8wfkr$%o@#zl)AW zJXy%qzzX4q4(Qdm3fjvIvv1IAc#g-MgDa-xA4vG!L+1cj0qMz1UyTfD_vx@3pJ_bW z(z&gfy&=qMXSjxLtTi6PkKtxDF<;S7mO5uM$%P|(7^xmgsH`KH#nH=GT7>DqFW`mh zf8>(i1Nz9X|2Lr2Q2(zfwL<@2M47JtH|y1wyxH8`XepJAdcp0luO#_D!u4}W1~5ba zr~DG-f2Cfn7V-ZVQKrfNJTVya^ijzj|EW={HPQaxg8vore-UN6{BKk@K>lwus;z3R zQJnvOg(b=V^WyrjRHny&swoX6#{cl6IRC$pvNSW+Zz*)+q=qC&Nz8U?8m^E{cUBK# zE@97AXHCF+ zZ$4?8?}{6eV>B%EN0HO{5pz1vqSF@A^_k>&z6-2*^4ag1u71I@o7|79jeGI&J!L^s z$i_0*YrURlYrUK7{Pif8b6hecH23B@hRmjc%pz8Kim7$Is;{5xW^af=zJRFoaR33f zQFbw!&3HE1q%0|K#OSkFVgK;i5vB9}=1T6boKB z<`MiyEC&GI03m@8yE}hR&L1xQbMyC|RBT@WW=Ymb8Uhw8Ugt!EC^qj$k=$QBg^Ya& zo=A?76@oTHdM!}UzW3?+5=5;CY#w2I;@ar>zn==_%t67<&E9Du*79r&%>^T%0@hLa zEuoo7nF6ic9om+6F~mF|35{KNm#MQSgtrqSbg?4)L6nTrbqlyv|<>`PIFBhY3D=%+Sx7bUgK^8e592Dd6X97A{T|2FE)2C3APR-?%Ovyd|5 z{76XYqNKGwn|DHb0*~*jX$#`3$_p^B3R`BWFH;@eJbT|LkeR_B~LP l=`=2c^)nOJ&);pF4Q}B#E=o~~Qj|rO{|^8;-8%qO0RW1;3nBmj literal 0 HcmV?d00001 From 86f4667d1786298d2e3a8901eac6e5dfbcfa5016 Mon Sep 17 00:00:00 2001 From: StingLin Date: Tue, 23 Sep 2025 17:04:31 +0800 Subject: [PATCH 04/38] test new dynamic --- .buildkite/pipeline_dynamic.yml | 2 +- .buildkite/pipeline_jax.yml | 2 -- .buildkite/scripts/bootstrap.sh | 38 +++++++++++++++++---------------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.buildkite/pipeline_dynamic.yml b/.buildkite/pipeline_dynamic.yml index fa1355330e..dbc315abfa 100644 --- a/.buildkite/pipeline_dynamic.yml +++ b/.buildkite/pipeline_dynamic.yml @@ -6,7 +6,7 @@ steps: agents: queue: tpu_v6e_queue commands: - - .buildkite/scripts/dynamic_upload.sh + - .buildkite/scripts/dynamic_bootstrap.sh - wait: ~ diff --git a/.buildkite/pipeline_jax.yml b/.buildkite/pipeline_jax.yml index fc42dbde93..3184e36e60 100644 --- a/.buildkite/pipeline_jax.yml +++ b/.buildkite/pipeline_jax.yml @@ -113,8 +113,6 @@ steps: - test_6 - 
test_7 - test_8 - - integration_test_llama_3_1_8B_tpu - - integration_test_llama_3_1_70B_tpu agents: queue: tpu_v6e_queue commands: diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh index e5c892d31d..44fa7bf64b 100755 --- a/.buildkite/scripts/bootstrap.sh +++ b/.buildkite/scripts/bootstrap.sh @@ -3,24 +3,26 @@ echo "--- Starting Buildkite Bootstrap ---" # Check if the current build is a pull request -if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then - echo "This is a Pull Request build." - PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/tpu_commons/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') +# if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then +# echo "This is a Pull Request build." +# PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/tpu_commons/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') - # If it's a PR, check for the specific label - if [[ $PR_LABELS == *"ready"* ]]; then - echo "Found 'ready' label on PR. Uploading main pipeline..." - buildkite-agent pipeline upload .buildkite/pipeline_jax.yml - # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml - else - echo "No 'ready' label found on PR. Skipping main pipeline upload." - exit 0 # Exit with 0 to indicate success (no error, just skipped) - fi -else - # If it's NOT a Pull Request (e.g., branch push, tag, manual build) - echo "This is not a Pull Request build. Uploading main pipeline." - buildkite-agent pipeline upload .buildkite/pipeline_jax.yml - # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml -fi +# # If it's a PR, check for the specific label +# if [[ $PR_LABELS == *"ready"* ]]; then +# echo "Found 'ready' label on PR. Uploading main pipeline..." +# buildkite-agent pipeline upload .buildkite/pipeline_jax.yml +# # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml +# else +# echo "No 'ready' label found on PR. Skipping main pipeline upload." 
+# exit 0 # Exit with 0 to indicate success (no error, just skipped) +# fi +# else +# # If it's NOT a Pull Request (e.g., branch push, tag, manual build) +# echo "This is not a Pull Request build. Uploading main pipeline." +# buildkite-agent pipeline upload .buildkite/pipeline_jax.yml +# # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml +# fi + +buildkite-agent pipeline upload .buildkite/pipeline_dynamic.yml echo "--- Buildkite Bootstrap Finished ---" From 6603df2477739269cd9d1be6f60d85bfbd8607f6 Mon Sep 17 00:00:00 2001 From: StingLin Date: Tue, 23 Sep 2025 17:08:10 +0800 Subject: [PATCH 05/38] test --- .buildkite/scripts/dynamic_bootstrap.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh index 9736eaa08b..5b9fd47294 100644 --- a/.buildkite/scripts/dynamic_bootstrap.sh +++ b/.buildkite/scripts/dynamic_bootstrap.sh @@ -106,24 +106,24 @@ model_list_string=$(printf "%s\n" "${model_names[@]}") if [[ -n "$tpu_model_list_str" ]]; then echo "--- Uploading tpu_model_list_str to Meta-data:${MODEL_LIST_KEY}" - # echo "${tpu_model_list_str}" | buildkite-agent meta-data set "${MODEL_LIST_KEY}" - # echo "Testing: $(buildkite-agent meta-data get "MODEL_LIST_KEY")" + echo "${tpu_model_list_str}" | buildkite-agent meta-data set "${MODEL_LIST_KEY}" + echo "Testing: $(buildkite-agent meta-data get "MODEL_LIST_KEY")" else echo "--- No Model Names found to upload." 
fi if [[ -n "$vllm_model_list_str" ]]; then echo "--- Uploading vllm_model_list_str to Meta-data:${INFORMATIONAL_MODEL_LIST_KEY}" - # echo "${vllm_model_list_str}" | buildkite-agent meta-data set "${INFORMATIONAL_MODEL_LIST_KEY}" - # echo "Testing: $(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}")" + echo "${vllm_model_list_str}" | buildkite-agent meta-data set "${INFORMATIONAL_MODEL_LIST_KEY}" + echo "Testing: $(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}")" else echo "--- No Model Names found to upload." fi POPURLAR_MODEL_LIST_KEY if [[ -n "$popular_model_list_str" ]]; then echo "--- Uploading popular_model_list_str to Meta-data:${POPURLAR_MODEL_LIST_KEY}" - # echo "${popular_model_list_str}" | buildkite-agent meta-data set "${POPURLAR_MODEL_LIST_KEY}" - # echo "Testing: $(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}")" + echo "${popular_model_list_str}" | buildkite-agent meta-data set "${POPURLAR_MODEL_LIST_KEY}" + echo "Testing: $(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}")" else echo "--- No Model Names found to upload." fi @@ -136,10 +136,10 @@ if [[ -n "$pipeline_steps" ]]; then final_pipeline_yaml="steps:"$'\n' final_pipeline_yaml+=$(printf "%s\n" "${pipeline_steps[@]}") echo "Upload YML: ${final_pipeline_yaml}" - # echo -e "${final_pipeline_yaml}" | buildkite-agent pipeline upload + echo -e "${final_pipeline_yaml}" | buildkite-agent pipeline upload else echo "--- No .yml files found, no new Pipeline Steps to upload." 
- # buildkite-agent step update --state "passed" + buildkite-agent step update --state "passed" fi echo "--- Buildkite Special Bootstrap Finished ---" From 89ef6c4e561a020cb368b4ee44b586d00c6c3e30 Mon Sep 17 00:00:00 2001 From: StingLin Date: Tue, 23 Sep 2025 17:18:17 +0800 Subject: [PATCH 06/38] test ssh --- .buildkite/buildkite_ci_model_template.yml | 2 +- .buildkite/scripts/dynamic_bootstrap.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/buildkite_ci_model_template.yml b/.buildkite/buildkite_ci_model_template.yml index f58b557557..48e451bc58 100644 --- a/.buildkite/buildkite_ci_model_template.yml +++ b/.buildkite/buildkite_ci_model_template.yml @@ -17,7 +17,7 @@ steps: .buildkite/scripts/check_results.sh \ "Unit tests for {MODEL_NAME}" ut_{SAFE_MODEL_NAME} plugins: - - hooks#v1: + - git@github.com:buildkite-plugins/hooks-buildkite-plugin.git#v1:: post-command: | echo "--- Post-command hook triggered ---" echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh index 5b9fd47294..2b76261d3d 100644 --- a/.buildkite/scripts/dynamic_bootstrap.sh +++ b/.buildkite/scripts/dynamic_bootstrap.sh @@ -119,7 +119,7 @@ if [[ -n "$vllm_model_list_str" ]]; then else echo "--- No Model Names found to upload." 
fi -POPURLAR_MODEL_LIST_KEY + if [[ -n "$popular_model_list_str" ]]; then echo "--- Uploading popular_model_list_str to Meta-data:${POPURLAR_MODEL_LIST_KEY}" echo "${popular_model_list_str}" | buildkite-agent meta-data set "${POPURLAR_MODEL_LIST_KEY}" From f0720f332ddd48afc0d75f67bd0fac63ade9eac1 Mon Sep 17 00:00:00 2001 From: StingLin Date: Tue, 23 Sep 2025 17:19:02 +0800 Subject: [PATCH 07/38] test git ssh --- .../models/informational/meta-llama_Llama-3_1-70B-Instruct.yml | 2 +- .buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml b/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml index 9539111d5b..5adbb2c503 100644 --- a/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml +++ b/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml @@ -17,7 +17,7 @@ steps: .buildkite/scripts/check_results.sh \ "Unit tests for meta-llama/Llama-3.1-70B-Instruct" ut_meta-llama_Llama-3_1-70B-Instruct plugins: - - hooks#v1: + - git@github.com:buildkite-plugins/hooks-buildkite-plugin.git#v1:: post-command: | echo "--- Post-command hook triggered ---" echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml index eff2e3c815..47bd3be519 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -17,7 +17,7 @@ steps: .buildkite/scripts/check_results.sh \ "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct plugins: - - hooks#v1: + - git@github.com:buildkite-plugins/hooks-buildkite-plugin.git#v1:: post-command: | echo "--- Post-command hook triggered ---" echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" From 7060e48de9d385bcf8028084521ec5b3d8b1afd0 
Mon Sep 17 00:00:00 2001 From: StingLin Date: Tue, 23 Sep 2025 17:41:47 +0800 Subject: [PATCH 08/38] test --- .buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml index 47bd3be519..5aa88e108b 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -17,7 +17,7 @@ steps: .buildkite/scripts/check_results.sh \ "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct plugins: - - git@github.com:buildkite-plugins/hooks-buildkite-plugin.git#v1:: + - buildkite-plugins/hooks-plugin#v1.1.0: post-command: | echo "--- Post-command hook triggered ---" echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" From a337edba393f983b9837c7e18b773204fb8b2a41 Mon Sep 17 00:00:00 2001 From: StingLin Date: Tue, 23 Sep 2025 17:51:14 +0800 Subject: [PATCH 09/38] test --- .buildkite/hooks/post-command | 13 +++++++++++++ .../models/meta-llama_Llama-3_1-8B-Instruct.yml | 12 +----------- 2 files changed, 14 insertions(+), 11 deletions(-) create mode 100644 .buildkite/hooks/post-command diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command new file mode 100644 index 0000000000..84b464ad45 --- /dev/null +++ b/.buildkite/hooks/post-command @@ -0,0 +1,13 @@ +#!/bin/bash +set -euo pipefail + +echo "--- Post-command hook triggered ---" +echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + +if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading result..." + buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "passed" +else + echo "The step failed. Uploading result..." 
+ buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "failed" +fi diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml index 5aa88e108b..f75ac13f81 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -17,17 +17,7 @@ steps: .buildkite/scripts/check_results.sh \ "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct plugins: - - buildkite-plugins/hooks-plugin#v1.1.0: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "passed" - else - echo "The step failed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "failed" - fi + - ".buildkite": ~ - label: "Integration tests for meta-llama/Llama-3.1-8B-Instruct" key: "it_meta-llama_Llama-3_1-8B-Instruct" From 6d96a0a1a7a17e877ceadf76d173122b3ab9cbb2 Mon Sep 17 00:00:00 2001 From: StingLin Date: Tue, 23 Sep 2025 17:53:33 +0800 Subject: [PATCH 10/38] test --- .../meta-llama_Llama-3_1-8B-Instruct.yml | 36 ------------------- 1 file changed, 36 deletions(-) diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml index f75ac13f81..8d66a393f3 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -35,18 +35,6 @@ steps: - | .buildkite/scripts/check_results.sh \ "Integration tests for meta-llama/Llama-3.1-8B-Instruct" it_meta-llama_Llama-3_1-8B-Instruct - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: 
$BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:IntTest" "passed" - else - echo "The step failed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:IntTest" "failed" - fi - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" key: "pb_meta-llama_Llama-3_1-8B-Instruct" @@ -63,18 +51,6 @@ steps: - | .buildkite/scripts/check_results.sh \ "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" pb_meta-llama_Llama-3_1-8B-Instruct - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:Benchmark" "passed" - else - echo "The step failed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:Benchmark" "failed" - fi - label: "Stress tests for meta-llama/Llama-3.1-8B-Instruct" key: "st_meta-llama_Llama-3_1-8B-Instruct" @@ -91,15 +67,3 @@ steps: - | .buildkite/scripts/check_results.sh \ "Stress tests for meta-llama/Llama-3.1-8B-Instruct" st_meta-llama_Llama-3_1-8B-Instruct - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:StressTest" "passed" - else - echo "The step failed. Uploading result..." 
- buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:StressTest" "failed" - fi From 3d7dcd8c0e924addf051abe695f49f37ee23fd56 Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 10:41:12 +0800 Subject: [PATCH 11/38] test --- .buildkite/hooks/post-command | 8 +- .../meta-llama_Llama-3_1-70B-Instruct.yml | 115 ------------------ .../meta-llama_Llama-3_1-8B-Instruct.yml | 3 + 3 files changed, 7 insertions(+), 119 deletions(-) delete mode 100644 .buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command index 84b464ad45..3a7a0f3a11 100644 --- a/.buildkite/hooks/post-command +++ b/.buildkite/hooks/post-command @@ -5,9 +5,9 @@ echo "--- Post-command hook triggered ---" echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "passed" + echo "The step passed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..." + buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "passed" else - echo "The step failed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "failed" + echo "The step failed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..." 
+ buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "failed" fi diff --git a/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml b/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml deleted file mode 100644 index 5adbb2c503..0000000000 --- a/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml +++ /dev/null @@ -1,115 +0,0 @@ -# meta-llama/Llama-3.1-70B-Instruct -agents: - queue: tpu_v6e_8_queue -steps: - - label: "Unit tests for meta-llama/Llama-3.1-70B-Instruct" - key: "ut_meta-llama_Llama-3_1-70B-Instruct" - commands: - # - replace_with_test_commands # TODO: Replaced to actual test commands - - echo "[DEBUG], unit testing for meta-llama/Llama-3.1-70B-Instruct" # TODO: Replace to actual test commands - - label: "Notifications: Unit tests for meta-llama/Llama-3.1-70B-Instruct" - key: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct" - depends_on: "ut_meta-llama_Llama-3_1-70B-Instruct" - agents: - queue: tpu_v6e_8_queue - commands: - - | - .buildkite/scripts/check_results.sh \ - "Unit tests for meta-llama/Llama-3.1-70B-Instruct" ut_meta-llama_Llama-3_1-70B-Instruct - plugins: - - git@github.com:buildkite-plugins/hooks-buildkite-plugin.git#v1:: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:UnitTest" "passed" - else - echo "The step failed. Uploading result..." 
- buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:UnitTest" "failed" - fi - - - label: "Integration tests for meta-llama/Llama-3.1-70B-Instruct" - key: "it_meta-llama_Llama-3_1-70B-Instruct" - depends_on: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct" - commands: - # TODO: expected_accuracy need parameterized - # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-70B-Instruct" - - echo "[DEBUG], integration testing for meta-llama/Llama-3.1-70B-Instruct" # TODO: Replace to actual test commands - - label: "Notifications: Integration tests for meta-llama/Llama-3.1-70B-Instruct" - key: "notifications_it_meta-llama_Llama-3_1-70B-Instruct" - depends_on: "it_meta-llama_Llama-3_1-70B-Instruct" - agents: - queue: tpu_v6e_8_queue - commands: - - | - .buildkite/scripts/check_results.sh \ - "Integration tests for meta-llama/Llama-3.1-70B-Instruct" it_meta-llama_Llama-3_1-70B-Instruct - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:IntTest" "passed" - else - echo "The step failed. Uploading result..." 
- buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:IntTest" "failed" - fi - - - label: "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" - key: "pb_meta-llama_Llama-3_1-70B-Instruct" - depends_on: "notifications_it_meta-llama_Llama-3_1-70B-Instruct" - commands: - # - replace_with_test_command # TODO - - echo "[DEBUG], performance benchmarking for meta-llama/Llama-3.1-70B-Instruct" # TODO: Replace to actual test commands - - label: "Notifications: Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" - key: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct" - depends_on: "pb_meta-llama_Llama-3_1-70B-Instruct" - agents: - queue: tpu_v6e_8_queue - commands: - - | - .buildkite/scripts/check_results.sh \ - "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" pb_meta-llama_Llama-3_1-70B-Instruct - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:Benchmark" "passed" - else - echo "The step failed. Uploading result..." 
- buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:Benchmark" "failed" - fi - - - label: "Stress tests for meta-llama/Llama-3.1-70B-Instruct" - key: "st_meta-llama_Llama-3_1-70B-Instruct" - depends_on: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct" - commands: - # - our_stress_tests_script meta-llama/Llama-3.1-70B-Instruct expected_throughput # TODO: expected_throughput need parameterized - - echo "[DEBUG], stress testing for meta-llama/Llama-3.1-70B-Instruct" # TODO: Replace to actual test commands - - label: "Notifications: Stress tests for meta-llama/Llama-3.1-70B-Instruct" - key: "notifications_st_meta-llama_Llama-3_1-70B-Instruct" - depends_on: "st_meta-llama_Llama-3_1-70B-Instruct" - agents: - queue: tpu_v6e_8_queue - commands: - - | - .buildkite/scripts/check_results.sh \ - "Stress tests for meta-llama/Llama-3.1-70B-Instruct" st_meta-llama_Llama-3_1-70B-Instruct - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:StressTest" "passed" - else - echo "The step failed. Uploading result..." 
- buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:StressTest" "failed" - fi diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml index 8d66a393f3..fbd6e7ae2e 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -12,6 +12,9 @@ steps: depends_on: "ut_meta-llama_Llama-3_1-8B-Instruct" agents: queue: tpu_v6e_queue + env: + EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct" + EXECUTE_STAGE: "UnitTest" commands: - | .buildkite/scripts/check_results.sh \ From 6adad371fda1c7495db18caf15ff97b6403fdc2f Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 10:50:50 +0800 Subject: [PATCH 12/38] test --- .buildkite/hooks/post-command | 14 +++++++------- .../models/meta-llama_Llama-3_1-8B-Instruct.yml | 6 ++++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command index 3a7a0f3a11..a0da4e5802 100644 --- a/.buildkite/hooks/post-command +++ b/.buildkite/hooks/post-command @@ -4,10 +4,10 @@ set -euo pipefail echo "--- Post-command hook triggered ---" echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" -if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..." - buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "passed" -else - echo "The step failed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..." - buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "failed" -fi +# if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then +# echo "The step passed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..." +# buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "passed" +# else +# echo "The step failed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..." 
+# buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "failed" +# fi diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml index fbd6e7ae2e..f6e2868ec3 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -19,8 +19,10 @@ steps: - | .buildkite/scripts/check_results.sh \ "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct - plugins: - - ".buildkite": ~ + post-command: | + echo "Test post-command?" + # plugins: + # - ".buildkite": ~ - label: "Integration tests for meta-llama/Llama-3.1-8B-Instruct" key: "it_meta-llama_Llama-3_1-8B-Instruct" From 7615b6e90dfded480397fdf890f713a2852a69f8 Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 10:54:50 +0800 Subject: [PATCH 13/38] test --- .buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml index f6e2868ec3..58cf2f9b92 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -19,8 +19,9 @@ steps: - | .buildkite/scripts/check_results.sh \ "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct - post-command: | - echo "Test post-command?" 
+ plugins: + - buildkite-plugins/hooks-plugin#v1.1.0: + directory: ".buildkite" # plugins: # - ".buildkite": ~ From 942b25f3052025e7957ba7a748687cf6457fafd4 Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 11:27:48 +0800 Subject: [PATCH 14/38] test post command --- .buildkite/hooks/post-command | 33 ++++++++++++++----- .../meta-llama_Llama-3_1-8B-Instruct.yml | 6 ++-- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command index a0da4e5802..1f7b660299 100644 --- a/.buildkite/hooks/post-command +++ b/.buildkite/hooks/post-command @@ -2,12 +2,27 @@ set -euo pipefail echo "--- Post-command hook triggered ---" -echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - -# if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then -# echo "The step passed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..." -# buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "passed" -# else -# echo "The step failed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..." -# buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "failed" -# fi + +if [ -n "$EXECUTE_MODEL" ] && \ + [ -n "$EXECUTE_STAGE" ] && \ + [[ "$BUILDKITE_STEP_KEY" == "notifications_"* ]]; then + + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + + # If all conditions are true, execute the logic here. + echo "EXECUTE_MODEL: $EXECUTE_MODEL" + echo "EXECUTE_STAGE: $EXECUTE_STAGE" + echo "BUILDKITE_STEP_KEY: $BUILDKITE_STEP_KEY" + + if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then + echo "The step passed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..." + buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "passed" + else + echo "The step failed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..." + buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "failed" + fi + +else + # If any condition is false, print a message and exit. + echo "One or more conditions were not met. 
Skipping execution." +fi diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml index 58cf2f9b92..95dcb99a19 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -19,9 +19,9 @@ steps: - | .buildkite/scripts/check_results.sh \ "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct - plugins: - - buildkite-plugins/hooks-plugin#v1.1.0: - directory: ".buildkite" + # plugins: + # - buildkite-plugins/hooks-plugin#v1.1.0: + # directory: ".buildkite" # plugins: # - ".buildkite": ~ From d5c417c3b4c26c89c330e82b8d5f9b6e8aa4e6a7 Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 11:30:52 +0800 Subject: [PATCH 15/38] test --- .buildkite/hooks/post-command | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command index 1f7b660299..e85e681c97 100644 --- a/.buildkite/hooks/post-command +++ b/.buildkite/hooks/post-command @@ -3,9 +3,9 @@ set -euo pipefail echo "--- Post-command hook triggered ---" -if [ -n "$EXECUTE_MODEL" ] && \ - [ -n "$EXECUTE_STAGE" ] && \ - [[ "$BUILDKITE_STEP_KEY" == "notifications_"* ]]; then +if [ -n "${EXECUTE_MODEL:-}" ] && \ + [ -n "${EXECUTE_STAGE:-}" ] && \ + [[ "${BUILDKITE_STEP_KEY:-}" == "notifications_"* ]]; then echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" @@ -25,4 +25,4 @@ if [ -n "$EXECUTE_MODEL" ] && \ else # If any condition is false, print a message and exit. echo "One or more conditions were not met. Skipping execution." 
-fi +fi \ No newline at end of file From 2c7458709e8a2f643f24e561b8da5e5a7a30e76e Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 11:33:46 +0800 Subject: [PATCH 16/38] test all post --- .buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml index 95dcb99a19..9c83f909ae 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -37,6 +37,9 @@ steps: depends_on: "it_meta-llama_Llama-3_1-8B-Instruct" agents: queue: tpu_v6e_queue + env: + EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct" + EXECUTE_STAGE: "IntTest" commands: - | .buildkite/scripts/check_results.sh \ @@ -53,6 +56,9 @@ steps: depends_on: "pb_meta-llama_Llama-3_1-8B-Instruct" agents: queue: tpu_v6e_queue + env: + EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct" + EXECUTE_STAGE: "Benchmark" commands: - | .buildkite/scripts/check_results.sh \ @@ -69,6 +75,9 @@ steps: depends_on: "st_meta-llama_Llama-3_1-8B-Instruct" agents: queue: tpu_v6e_queue + env: + EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct" + EXECUTE_STAGE: "StressTest" commands: - | .buildkite/scripts/check_results.sh \ From a09184ce5f216a0f0ee1e62b0de4207c4bdc657c Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 11:57:02 +0800 Subject: [PATCH 17/38] test to check_results --- .buildkite/buildkite_ci_feature_template.yml | 60 ++++---------------- .buildkite/buildkite_ci_model_template.yml | 60 ++++---------------- .buildkite/hooks/post-command | 28 --------- .buildkite/scripts/check_results.sh | 23 +++++++- 4 files changed, 46 insertions(+), 125 deletions(-) delete mode 100644 .buildkite/hooks/post-command diff --git a/.buildkite/buildkite_ci_feature_template.yml b/.buildkite/buildkite_ci_feature_template.yml index e907815416..e7286ccda4 100644 --- 
a/.buildkite/buildkite_ci_feature_template.yml +++ b/.buildkite/buildkite_ci_feature_template.yml @@ -12,22 +12,13 @@ steps: depends_on: "ut_{SAFE_FEATURE_NAME}" agents: queue: {QUEUE} + env: + EXECUTE_ENTITY: "{FEATURE_NAME}" + EXECUTE_STAGE: "UnitTest" commands: - | .buildkite/scripts/check_results.sh \ "Unit tests for {FEATURE_NAME}" ut_{SAFE_FEATURE_NAME} - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "{FEATURE_NAME}:UnitTest" "passed" - else - echo "The step failed. Uploading result..." - buildkite-agent meta-data set "{FEATURE_NAME}:UnitTest" "failed" - fi - label: "Integration tests for {FEATURE_NAME}" key: "it_{SAFE_FEATURE_NAME}" @@ -41,22 +32,13 @@ steps: depends_on: "it_{SAFE_FEATURE_NAME}" agents: queue: {QUEUE} + env: + EXECUTE_ENTITY: "{FEATURE_NAME}" + EXECUTE_STAGE: "IntTest" commands: - | .buildkite/scripts/check_results.sh \ "Integration tests for {FEATURE_NAME}" it_{SAFE_FEATURE_NAME} - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "{FEATURE_NAME}:IntTest" "passed" - else - echo "The step failed. Uploading result..." 
- buildkite-agent meta-data set "{FEATURE_NAME}:IntTest" "failed" - fi - label: "Performance benchmarks for {FEATURE_NAME}" key: "pb_{SAFE_FEATURE_NAME}" @@ -69,22 +51,13 @@ steps: depends_on: "pb_{SAFE_FEATURE_NAME}" agents: queue: {QUEUE} + env: + EXECUTE_ENTITY: "{FEATURE_NAME}" + EXECUTE_STAGE: "Benchmark" commands: - | .buildkite/scripts/check_results.sh \ "Performance benchmarks for {FEATURE_NAME}" pb_{SAFE_FEATURE_NAME} - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "{FEATURE_NAME}:Benchmark" "passed" - else - echo "The step failed. Uploading result..." - buildkite-agent meta-data set "{FEATURE_NAME}:Benchmark" "failed" - fi - label: "Stress tests for {FEATURE_NAME}" key: "st_{SAFE_FEATURE_NAME}" @@ -97,19 +70,10 @@ steps: depends_on: "st_{SAFE_FEATURE_NAME}" agents: queue: {QUEUE} + env: + EXECUTE_ENTITY: "{FEATURE_NAME}" + EXECUTE_STAGE: "StressTest" commands: - | .buildkite/scripts/check_results.sh \ "Stress tests for {FEATURE_NAME}" st_{SAFE_FEATURE_NAME} - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "{FEATURE_NAME}:StressTest" "passed" - else - echo "The step failed. Uploading result..." 
- buildkite-agent meta-data set "{FEATURE_NAME}:StressTest" "failed" - fi diff --git a/.buildkite/buildkite_ci_model_template.yml b/.buildkite/buildkite_ci_model_template.yml index 48e451bc58..6c6c13910c 100644 --- a/.buildkite/buildkite_ci_model_template.yml +++ b/.buildkite/buildkite_ci_model_template.yml @@ -12,22 +12,13 @@ steps: depends_on: "ut_{SAFE_MODEL_NAME}" agents: queue: {QUEUE} + env: + EXECUTE_ENTITY: "{MODEL_NAME}" + EXECUTE_STAGE: "UnitTest" commands: - | .buildkite/scripts/check_results.sh \ "Unit tests for {MODEL_NAME}" ut_{SAFE_MODEL_NAME} - plugins: - - git@github.com:buildkite-plugins/hooks-buildkite-plugin.git#v1:: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "{MODEL_NAME}:UnitTest" "passed" - else - echo "The step failed. Uploading result..." - buildkite-agent meta-data set "{MODEL_NAME}:UnitTest" "failed" - fi - label: "Integration tests for {MODEL_NAME}" key: "it_{SAFE_MODEL_NAME}" @@ -41,22 +32,13 @@ steps: depends_on: "it_{SAFE_MODEL_NAME}" agents: queue: {QUEUE} + env: + EXECUTE_ENTITY: "{MODEL_NAME}" + EXECUTE_STAGE: "IntTest" commands: - | .buildkite/scripts/check_results.sh \ "Integration tests for {MODEL_NAME}" it_{SAFE_MODEL_NAME} - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "{MODEL_NAME}:IntTest" "passed" - else - echo "The step failed. Uploading result..." 
- buildkite-agent meta-data set "{MODEL_NAME}:IntTest" "failed" - fi - label: "Performance benchmarks for {MODEL_NAME}" key: "pb_{SAFE_MODEL_NAME}" @@ -69,22 +51,13 @@ steps: depends_on: "pb_{SAFE_MODEL_NAME}" agents: queue: {QUEUE} + env: + EXECUTE_ENTITY: "{MODEL_NAME}" + EXECUTE_STAGE: "Benchmark" commands: - | .buildkite/scripts/check_results.sh \ "Performance benchmarks for {MODEL_NAME}" pb_{SAFE_MODEL_NAME} - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "{MODEL_NAME}:Benchmark" "passed" - else - echo "The step failed. Uploading result..." - buildkite-agent meta-data set "{MODEL_NAME}:Benchmark" "failed" - fi - label: "Stress tests for {MODEL_NAME}" key: "st_{SAFE_MODEL_NAME}" @@ -97,19 +70,10 @@ steps: depends_on: "st_{SAFE_MODEL_NAME}" agents: queue: {QUEUE} + env: + EXECUTE_ENTITY: "{MODEL_NAME}" + EXECUTE_STAGE: "StressTest" commands: - | .buildkite/scripts/check_results.sh \ "Stress tests for {MODEL_NAME}" st_{SAFE_MODEL_NAME} - plugins: - - hooks#v1: - post-command: | - echo "--- Post-command hook triggered ---" - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading result..." - buildkite-agent meta-data set "{MODEL_NAME}:StressTest" "passed" - else - echo "The step failed. Uploading result..." 
- buildkite-agent meta-data set "{MODEL_NAME}:StressTest" "failed" - fi diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command deleted file mode 100644 index e85e681c97..0000000000 --- a/.buildkite/hooks/post-command +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -set -euo pipefail - -echo "--- Post-command hook triggered ---" - -if [ -n "${EXECUTE_MODEL:-}" ] && \ - [ -n "${EXECUTE_STAGE:-}" ] && \ - [[ "${BUILDKITE_STEP_KEY:-}" == "notifications_"* ]]; then - - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" - - # If all conditions are true, execute the logic here. - echo "EXECUTE_MODEL: $EXECUTE_MODEL" - echo "EXECUTE_STAGE: $EXECUTE_STAGE" - echo "BUILDKITE_STEP_KEY: $BUILDKITE_STEP_KEY" - - if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then - echo "The step passed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..." - buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "passed" - else - echo "The step failed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..." - buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "failed" - fi - -else - # If any condition is false, print a message and exit. - echo "One or more conditions were not met. Skipping execution." -fi \ No newline at end of file diff --git a/.buildkite/scripts/check_results.sh b/.buildkite/scripts/check_results.sh index f57edbf1f3..913930dc2e 100755 --- a/.buildkite/scripts/check_results.sh +++ b/.buildkite/scripts/check_results.sh @@ -21,7 +21,28 @@ for KEY in "$@"; do fi done -if [ "${ANY_FAILED}" = "true" ] ; then +# Check Test Result and upload to buildkite meta-data +if [ -n "${EXECUTE_ENTITY:-}" ] && \ + [ -n "${EXECUTE_STAGE:-}" ] && \ + [[ "${BUILDKITE_STEP_KEY:-}" == "notifications_"* ]]; then + + # If all conditions are true, execute the logic here. 
+ echo "EXECUTE_ENTITY: $EXECUTE_ENTITY" + echo "EXECUTE_STAGE: $EXECUTE_STAGE" + echo "BUILDKITE_STEP_KEY: $BUILDKITE_STEP_KEY" + + echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" + + if [ "${ANY_FAILED}" = "true" ]; then + echo "The step failed. Uploading $EXECUTE_ENTITY:$EXECUTE_STAGE result..." + buildkite-agent meta-data set "$EXECUTE_ENTITY:$EXECUTE_STAGE" "failed" + else + echo "The step passed. Uploading $EXECUTE_ENTITY:$EXECUTE_STAGE result..." + buildkite-agent meta-data set "$EXECUTE_ENTITY:$EXECUTE_STAGE" "passed" + fi +fi + +if [ "${ANY_FAILED}" = "true" ]; then cat <<- YAML | buildkite-agent pipeline upload steps: - label: "${FAILURE_LABEL}" From 736df21c967d35977c63e88f47e58d2e09203a4d Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 12:11:18 +0800 Subject: [PATCH 18/38] test --- .../models/meta-llama_Llama-3_1-8B-Instruct.yml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml index 9c83f909ae..496407d1ac 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -13,17 +13,12 @@ steps: agents: queue: tpu_v6e_queue env: - EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct" + EXECUTE_ENTITY: "meta-llama/Llama-3.1-8B-Instruct" EXECUTE_STAGE: "UnitTest" commands: - | .buildkite/scripts/check_results.sh \ "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct - # plugins: - # - buildkite-plugins/hooks-plugin#v1.1.0: - # directory: ".buildkite" - # plugins: - # - ".buildkite": ~ - label: "Integration tests for meta-llama/Llama-3.1-8B-Instruct" key: "it_meta-llama_Llama-3_1-8B-Instruct" @@ -38,7 +33,7 @@ steps: agents: queue: tpu_v6e_queue env: - EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct" + EXECUTE_ENTITY: "meta-llama/Llama-3.1-8B-Instruct" EXECUTE_STAGE: "IntTest" commands: - | @@ 
-57,7 +52,7 @@ steps: agents: queue: tpu_v6e_queue env: - EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct" + EXECUTE_ENTITY: "meta-llama/Llama-3.1-8B-Instruct" EXECUTE_STAGE: "Benchmark" commands: - | @@ -76,7 +71,7 @@ steps: agents: queue: tpu_v6e_queue env: - EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct" + EXECUTE_ENTITY: "meta-llama/Llama-3.1-8B-Instruct" EXECUTE_STAGE: "StressTest" commands: - | From e15f755c1b3120c8924ccd14f23260044cc6608c Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 12:17:13 +0800 Subject: [PATCH 19/38] fix --- .buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml index 496407d1ac..9aac0432d6 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml @@ -17,7 +17,7 @@ steps: EXECUTE_STAGE: "UnitTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct - label: "Integration tests for meta-llama/Llama-3.1-8B-Instruct" @@ -37,7 +37,7 @@ steps: EXECUTE_STAGE: "IntTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Integration tests for meta-llama/Llama-3.1-8B-Instruct" it_meta-llama_Llama-3_1-8B-Instruct - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" @@ -56,7 +56,7 @@ steps: EXECUTE_STAGE: "Benchmark" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" pb_meta-llama_Llama-3_1-8B-Instruct - label: "Stress tests for meta-llama/Llama-3.1-8B-Instruct" @@ -75,5 +75,5 @@ steps: EXECUTE_STAGE: "StressTest" commands: - | - .buildkite/scripts/check_results.sh \ + 
bash .buildkite/scripts/check_results.sh \ "Stress tests for meta-llama/Llama-3.1-8B-Instruct" st_meta-llama_Llama-3_1-8B-Instruct From 1a9e6369bcdbcaa44073b9cd22811d5ca18cff5e Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 12:19:27 +0800 Subject: [PATCH 20/38] fix --- .buildkite/scripts/check_results.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/.buildkite/scripts/check_results.sh b/.buildkite/scripts/check_results.sh index 913930dc2e..c961e53548 100755 --- a/.buildkite/scripts/check_results.sh +++ b/.buildkite/scripts/check_results.sh @@ -30,8 +30,6 @@ if [ -n "${EXECUTE_ENTITY:-}" ] && \ echo "EXECUTE_ENTITY: $EXECUTE_ENTITY" echo "EXECUTE_STAGE: $EXECUTE_STAGE" echo "BUILDKITE_STEP_KEY: $BUILDKITE_STEP_KEY" - - echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS" if [ "${ANY_FAILED}" = "true" ]; then echo "The step failed. Uploading $EXECUTE_ENTITY:$EXECUTE_STAGE result..." From 5f93dd238ecf494da4538c90b45b233100800e1b Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 14:58:41 +0800 Subject: [PATCH 21/38] test --- .buildkite/pipeline_dynamic.yml | 48 +++++++- .buildkite/scripts/dynamic_bootstrap.sh | 6 +- .buildkite/scripts/export_support_matrix.sh | 115 ++++++++++++++++++++ 3 files changed, 163 insertions(+), 6 deletions(-) create mode 100644 .buildkite/scripts/export_support_matrix.sh diff --git a/.buildkite/pipeline_dynamic.yml b/.buildkite/pipeline_dynamic.yml index dbc315abfa..139d6385ac 100644 --- a/.buildkite/pipeline_dynamic.yml +++ b/.buildkite/pipeline_dynamic.yml @@ -6,12 +6,54 @@ steps: agents: queue: tpu_v6e_queue commands: - - .buildkite/scripts/dynamic_bootstrap.sh + - bash .buildkite/scripts/dynamic_bootstrap.sh - wait: ~ - - label: "Generate support matrices" + - label: "Export support matrix report" + key: export_support_matrix agents: queue: tpu_v6e_queue commands: - - echo "Generate support matrices..." \ No newline at end of file + - echo "Generate support matrices..." 
+ - bash .buildkite/scripts/export_support_matrix.sh + + # Handle PR builds: print model matrices and feature matrices + - label: "Handle Report" + if: build.pull_request.id != null + depends_on: export_support_matrix + agents: + queue: tpu_v6e_queue + command: | + buildkite-agent artifact download "model_support_matrix.csv" . + buildkite-agent artifact download "feature_support_matrix.csv" . + echo "--- Model Support Matrix ---" + cat model_support_matrix.csv + echo "--- Feature Support Matrix ---" + cat feature_support_matrix.csv + + # # Release Tag build: commit CSVs + # - label: "Commit CSVs on Release Tag" + # if: build.tag =~ /^v\.?[0-9]+(\.[0-9]+)*$/ + # depends_on: "set-results" + # command: | + # echo "=== Release Tag build ===" + # echo "BUILDKITE_TAG=$BUILDKITE_TAG" + + # # Checkout main branch and sync code + # git fetch origin main + # git checkout main + # git reset --hard origin/main + + # # Create target folder tpu_dev/result + # mkdir -p result + + # # Download all CSV artifacts + # buildkite-agent artifact download "model_support_matrix.csv" . + # buildkite-agent artifact download "feature_support_matrix.csv" . + + # # Stage and commit changes (skip CI to avoid infinite loop) + # git add *.csv + # git commit -m "[skip ci] Update CSVs for $BUILDKITE_TAG" || echo "No changes to commit" + + # git push origin main \ No newline at end of file diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh index 2b76261d3d..c8c667d261 100644 --- a/.buildkite/scripts/dynamic_bootstrap.sh +++ b/.buildkite/scripts/dynamic_bootstrap.sh @@ -109,7 +109,7 @@ if [[ -n "$tpu_model_list_str" ]]; then echo "${tpu_model_list_str}" | buildkite-agent meta-data set "${MODEL_LIST_KEY}" echo "Testing: $(buildkite-agent meta-data get "MODEL_LIST_KEY")" else - echo "--- No Model Names found to upload." + echo "--- No tpu-support Models found to upload." 
fi if [[ -n "$vllm_model_list_str" ]]; then @@ -117,7 +117,7 @@ if [[ -n "$vllm_model_list_str" ]]; then echo "${vllm_model_list_str}" | buildkite-agent meta-data set "${INFORMATIONAL_MODEL_LIST_KEY}" echo "Testing: $(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}")" else - echo "--- No Model Names found to upload." + echo "--- No vllm-native Models found to upload." fi if [[ -n "$popular_model_list_str" ]]; then @@ -125,7 +125,7 @@ if [[ -n "$popular_model_list_str" ]]; then echo "${popular_model_list_str}" | buildkite-agent meta-data set "${POPURLAR_MODEL_LIST_KEY}" echo "Testing: $(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}")" else - echo "--- No Model Names found to upload." + echo "--- No popular Models found to upload." fi diff --git a/.buildkite/scripts/export_support_matrix.sh b/.buildkite/scripts/export_support_matrix.sh new file mode 100644 index 0000000000..ff6a408d36 --- /dev/null +++ b/.buildkite/scripts/export_support_matrix.sh @@ -0,0 +1,115 @@ +#!/bin/bash +set -euo pipefail + +ANY_FAILED=false + +MODEL_LIST_KEY="tpu-model-list" +INFORMATIONAL_MODEL_LIST_KEY="vllm-model-list" +POPURLAR_MODEL_LIST_KEY="popular-model-list" + +FEATURE_LIST_METADATA_KEY="feature-list" + +# tpu_model_list="Qwen/Qwen2.5-0.5B-Instruct Qwen/Qwen2.5-2B-Instruct" +# vllm_model_list="NousResearch/Nous-Hermes-1.4B NousResearch/Nous-Hermes-2.5B" +# popular_model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.2-8B-Instruct" +tpu_model_list=$(buildkite-agent meta-data get "${MODEL_LIST_KEY}") +vllm_model_list=$(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}") +popular_model_list=$(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}") + +feature_list="f1 f2" +STAGES="UnitTest IntTest Benchmark StressTest" + +# Output CSV files +output_model_support_matrix_file="model_support_matrix.csv" +echo "Model,UnitTest,IntTest,Benchmark,StressTest" > "$output_model_support_matrix_file" + 
+output_feature_support_matrix_file="feature_support_matrix.csv" +echo "Feature,UnitTest,IntTest,Benchmark,StressTest" > "$output_feature_support_matrix_file" + +# All stages must pass for TPU models +check_tpu_model() { + local model="$1" + for stage in $STAGES; do + result=$(buildkite-agent meta-data get "${model}:${stage}" || echo "not_run") + if [[ "$result" != "passed" ]]; then + echo "TPU model $model failed at $stage ($result)" + ANY_FAILED=true + fi + done +} + +# Only UnitTest and IntTest must pass for VLLM models +check_vllm_model() { + local model="$1" + local required="UnitTest IntTest" + for stage in $required; do + result=$(buildkite-agent meta-data get "${model}:${stage}" || echo "not_run") + if [[ "$result" != "passed" ]]; then + echo "VLLM model $model failed at $stage ($result)" + ANY_FAILED=true + fi + done +} + +process_models() { + local model_list="$1" + local mode="$2" # tpu | vllm | popular + for model in $model_list; do + row="$model" + for stage in $STAGES; do + result=$(buildkite-agent meta-data get "${model}:${stage}" || echo "${model}:${stage} not_run") + row="$row,$result" + done + echo "$row" >> "$output_model_support_matrix_file" + + # run checks + case $mode in + tpu) check_tpu_model "$model" ;; + vllm) check_vllm_model "$model" ;; + popular) ;; + esac + done +} + +process_features() { + local feature_list="$1" + for feature in $feature_list; do + row="$feature" + for stage in $STAGES; do + result=$(buildkite-agent meta-data get "${feature}:${stage}" || echo "${feature}:${stage} not_run") + row="$row,$result" + done + echo "$row" >> "$output_feature_support_matrix_file" + done +} + +echo "--- Checking TPU models Outcomes and Generating Reports ---" +process_models "$tpu_model_list" tpu + +echo "--- Checking VLLM models Outcomes and Generating Reports ---" +process_models "$vllm_model_list" vllm + +echo "--- Checking popular models Outcomes and Generating Reports ---" +process_models "$popular_model_list" popular + +echo "--- 
Checking features Outcomes and Generating Reports ---" +process_features "$feature_list" + +# Get commit hashes +VLLM_COMMIT_HASH=$(buildkite-agent meta-data get 'VLLM_COMMIT_HASH' || echo "not_set") +TPU_COMMONS_COMMIT_HASH=$(buildkite-agent meta-data get 'TPU_COMMONS_COMMIT_HASH' || echo "not_set") + +if [ "$ANY_FAILED" = true ]; then + echo "Some checks failed!" + echo "VLLM_COMMIT_HASH: $VLLM_COMMIT_HASH" + echo "TPU_COMMONS_COMMIT_HASH: $TPU_COMMONS_COMMIT_HASH" + exit 1 +else + echo "--- Uploading Commit Hash to Repo ---" + echo "Will commit to tpu_commons main" +fi + +echo "--- Uploading CSV Reports as Buildkite Artifacts ---" +buildkite-agent artifact upload "$output_model_support_matrix_file" +buildkite-agent artifact upload "$output_feature_support_matrix_file" +echo "Reports uploaded successfully." \ No newline at end of file From 2d53e701622eac54b3c9475b15afbafa98cb70d7 Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 15:22:38 +0800 Subject: [PATCH 22/38] test --- .buildkite/scripts/dynamic_bootstrap.sh | 2 +- .buildkite/scripts/export_support_matrix.sh | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh index c8c667d261..5d3d3bc9f3 100644 --- a/.buildkite/scripts/dynamic_bootstrap.sh +++ b/.buildkite/scripts/dynamic_bootstrap.sh @@ -107,7 +107,7 @@ model_list_string=$(printf "%s\n" "${model_names[@]}") if [[ -n "$tpu_model_list_str" ]]; then echo "--- Uploading tpu_model_list_str to Meta-data:${MODEL_LIST_KEY}" echo "${tpu_model_list_str}" | buildkite-agent meta-data set "${MODEL_LIST_KEY}" - echo "Testing: $(buildkite-agent meta-data get "MODEL_LIST_KEY")" + echo "Testing: $(buildkite-agent meta-data get "${MODEL_LIST_KEY}")" else echo "--- No tpu-support Models found to upload." 
fi diff --git a/.buildkite/scripts/export_support_matrix.sh b/.buildkite/scripts/export_support_matrix.sh index ff6a408d36..6706ff8ab5 100644 --- a/.buildkite/scripts/export_support_matrix.sh +++ b/.buildkite/scripts/export_support_matrix.sh @@ -12,9 +12,9 @@ FEATURE_LIST_METADATA_KEY="feature-list" # tpu_model_list="Qwen/Qwen2.5-0.5B-Instruct Qwen/Qwen2.5-2B-Instruct" # vllm_model_list="NousResearch/Nous-Hermes-1.4B NousResearch/Nous-Hermes-2.5B" # popular_model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.2-8B-Instruct" -tpu_model_list=$(buildkite-agent meta-data get "${MODEL_LIST_KEY}") -vllm_model_list=$(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}") -popular_model_list=$(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}") +tpu_model_list=$(buildkite-agent meta-data get "${MODEL_LIST_KEY}" --default "") +vllm_model_list=$(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}" --default "") +popular_model_list=$(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}" --default "") feature_list="f1 f2" STAGES="UnitTest IntTest Benchmark StressTest" @@ -30,7 +30,7 @@ echo "Feature,UnitTest,IntTest,Benchmark,StressTest" > "$output_feature_support_ check_tpu_model() { local model="$1" for stage in $STAGES; do - result=$(buildkite-agent meta-data get "${model}:${stage}" || echo "not_run") + result=$(buildkite-agent meta-data get "${model}:${stage}" --default "not_run") if [[ "$result" != "passed" ]]; then echo "TPU model $model failed at $stage ($result)" ANY_FAILED=true @@ -43,7 +43,7 @@ check_vllm_model() { local model="$1" local required="UnitTest IntTest" for stage in $required; do - result=$(buildkite-agent meta-data get "${model}:${stage}" || echo "not_run") + result=$(buildkite-agent meta-data get "${model}:${stage}" --default "not_run") if [[ "$result" != "passed" ]]; then echo "VLLM model $model failed at $stage ($result)" ANY_FAILED=true @@ -57,7 +57,7 @@ process_models() { for model in $model_list; do 
row="$model" for stage in $STAGES; do - result=$(buildkite-agent meta-data get "${model}:${stage}" || echo "${model}:${stage} not_run") + result=$(buildkite-agent meta-data get "${model}:${stage}" --default "${model}:${stage} not_run") row="$row,$result" done echo "$row" >> "$output_model_support_matrix_file" @@ -76,7 +76,7 @@ process_features() { for feature in $feature_list; do row="$feature" for stage in $STAGES; do - result=$(buildkite-agent meta-data get "${feature}:${stage}" || echo "${feature}:${stage} not_run") + result=$(buildkite-agent meta-data get "${feature}:${stage}" --default "${feature}:${stage} not_run") row="$row,$result" done echo "$row" >> "$output_feature_support_matrix_file" @@ -96,8 +96,8 @@ echo "--- Checking features Outcomes and Generating Reports ---" process_features "$feature_list" # Get commit hashes -VLLM_COMMIT_HASH=$(buildkite-agent meta-data get 'VLLM_COMMIT_HASH' || echo "not_set") -TPU_COMMONS_COMMIT_HASH=$(buildkite-agent meta-data get 'TPU_COMMONS_COMMIT_HASH' || echo "not_set") +VLLM_COMMIT_HASH=$(buildkite-agent meta-data get 'VLLM_COMMIT_HASH' --default "not_set") +TPU_COMMONS_COMMIT_HASH=$(buildkite-agent meta-data get 'TPU_COMMONS_COMMIT_HASH' --default "not_set") if [ "$ANY_FAILED" = true ]; then echo "Some checks failed!" 
From ea6d7c800e0c850bb1859d466ee7c13b6cda570d Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 15:32:10 +0800 Subject: [PATCH 23/38] test --- .buildkite/scripts/export_support_matrix.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.buildkite/scripts/export_support_matrix.sh b/.buildkite/scripts/export_support_matrix.sh index 6706ff8ab5..5f2975ddee 100644 --- a/.buildkite/scripts/export_support_matrix.sh +++ b/.buildkite/scripts/export_support_matrix.sh @@ -109,6 +109,9 @@ else echo "Will commit to tpu_commons main" fi +echo "--- Print Model Report Content ---" +cat "$output_model_support_matrix_file" + echo "--- Uploading CSV Reports as Buildkite Artifacts ---" buildkite-agent artifact upload "$output_model_support_matrix_file" buildkite-agent artifact upload "$output_feature_support_matrix_file" From 99e502fc3828459011b85b1efcb9d24602cb8c3b Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 15:39:47 +0800 Subject: [PATCH 24/38] add test models --- .../NousResearch_Nous-Hermes-1_4B.yml | 79 +++++++++++++++++++ .../meta-llama_Llama-3_1-70B-Instruct.yml | 79 +++++++++++++++++++ .../popular/Qwen_Qwen2_5-2B-Instruct.yml | 79 +++++++++++++++++++ 3 files changed, 237 insertions(+) create mode 100644 .buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml create mode 100644 .buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml create mode 100644 .buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml diff --git a/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml b/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml new file mode 100644 index 0000000000..4e4890c79b --- /dev/null +++ b/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml @@ -0,0 +1,79 @@ +# NousResearch/Nous-Hermes-1.4B +agents: + queue: tpu_v6e_queue +steps: + - label: "Unit tests for NousResearch/Nous-Hermes-1.4B" + key: "ut_NousResearch_Nous-Hermes-1_4B" + commands: + # - replace_with_test_commands # TODO: Replaced to 
actual test commands + - echo "[DEBUG], unit testing for NousResearch/Nous-Hermes-1.4B" # TODO: Replace to actual test commands + - label: "Notifications: Unit tests for NousResearch/Nous-Hermes-1.4B" + key: "notifications_ut_NousResearch_Nous-Hermes-1_4B" + depends_on: "ut_NousResearch_Nous-Hermes-1_4B" + agents: + queue: tpu_v6e_queue + env: + EXECUTE_ENTITY: "NousResearch/Nous-Hermes-1.4B" + EXECUTE_STAGE: "UnitTest" + commands: + - | + .buildkite/scripts/check_results.sh \ + "Unit tests for NousResearch/Nous-Hermes-1.4B" ut_NousResearch_Nous-Hermes-1_4B + + - label: "Integration tests for NousResearch/Nous-Hermes-1.4B" + key: "it_NousResearch_Nous-Hermes-1_4B" + depends_on: "notifications_ut_NousResearch_Nous-Hermes-1_4B" + commands: + # TODO: expected_accuracy need parameterized + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "NousResearch/Nous-Hermes-1.4B" + - echo "[DEBUG], integration testing for NousResearch/Nous-Hermes-1.4B" # TODO: Replace to actual test commands + - label: "Notifications: Integration tests for NousResearch/Nous-Hermes-1.4B" + key: "notifications_it_NousResearch_Nous-Hermes-1_4B" + depends_on: "it_NousResearch_Nous-Hermes-1_4B" + agents: + queue: tpu_v6e_queue + env: + EXECUTE_ENTITY: "NousResearch/Nous-Hermes-1.4B" + EXECUTE_STAGE: "IntTest" + commands: + - | + .buildkite/scripts/check_results.sh \ + "Integration tests for NousResearch/Nous-Hermes-1.4B" it_NousResearch_Nous-Hermes-1_4B + + - label: "Performance benchmarks for NousResearch/Nous-Hermes-1.4B" + key: "pb_NousResearch_Nous-Hermes-1_4B" + depends_on: "notifications_it_NousResearch_Nous-Hermes-1_4B" + commands: + # - replace_with_test_command # TODO + - echo "[DEBUG], performance benchmarking for NousResearch/Nous-Hermes-1.4B" # TODO: Replace to actual test commands + - label: "Notifications: Performance benchmarks for NousResearch/Nous-Hermes-1.4B" + key: "notifications_pb_NousResearch_Nous-Hermes-1_4B" + 
depends_on: "pb_NousResearch_Nous-Hermes-1_4B" + agents: + queue: tpu_v6e_queue + env: + EXECUTE_ENTITY: "NousResearch/Nous-Hermes-1.4B" + EXECUTE_STAGE: "Benchmark" + commands: + - | + .buildkite/scripts/check_results.sh \ + "Performance benchmarks for NousResearch/Nous-Hermes-1.4B" pb_NousResearch_Nous-Hermes-1_4B + + - label: "Stress tests for NousResearch/Nous-Hermes-1.4B" + key: "st_NousResearch_Nous-Hermes-1_4B" + depends_on: "notifications_pb_NousResearch_Nous-Hermes-1_4B" + commands: + # - our_stress_tests_script NousResearch/Nous-Hermes-1.4B expected_throughput # TODO: expected_throughput need parameterized + - echo "[DEBUG], stress testing for NousResearch/Nous-Hermes-1.4B" # TODO: Replace to actual test commands + - label: "Notifications: Stress tests for NousResearch/Nous-Hermes-1.4B" + key: "notifications_st_NousResearch_Nous-Hermes-1_4B" + depends_on: "st_NousResearch_Nous-Hermes-1_4B" + agents: + queue: tpu_v6e_queue + env: + EXECUTE_ENTITY: "NousResearch/Nous-Hermes-1.4B" + EXECUTE_STAGE: "StressTest" + commands: + - | + .buildkite/scripts/check_results.sh \ + "Stress tests for NousResearch/Nous-Hermes-1.4B" st_NousResearch_Nous-Hermes-1_4B diff --git a/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml new file mode 100644 index 0000000000..901933a724 --- /dev/null +++ b/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml @@ -0,0 +1,79 @@ +# meta-llama/Llama-3.1-70B-Instruct +agents: + queue: tpu_v6e_8_queue +steps: + - label: "Unit tests for meta-llama/Llama-3.1-70B-Instruct" + key: "ut_meta-llama_Llama-3_1-70B-Instruct" + commands: + # - replace_with_test_commands # TODO: Replaced to actual test commands + - echo "[DEBUG], unit testing for meta-llama/Llama-3.1-70B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Unit tests for meta-llama/Llama-3.1-70B-Instruct" + key: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct" + depends_on: 
"ut_meta-llama_Llama-3_1-70B-Instruct" + agents: + queue: tpu_v6e_8_queue + env: + EXECUTE_ENTITY: "meta-llama/Llama-3.1-70B-Instruct" + EXECUTE_STAGE: "UnitTest" + commands: + - | + .buildkite/scripts/check_results.sh \ + "Unit tests for meta-llama/Llama-3.1-70B-Instruct" ut_meta-llama_Llama-3_1-70B-Instruct + + - label: "Integration tests for meta-llama/Llama-3.1-70B-Instruct" + key: "it_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct" + commands: + # TODO: expected_accuracy need parameterized + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-70B-Instruct" + - echo "[DEBUG], integration testing for meta-llama/Llama-3.1-70B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Integration tests for meta-llama/Llama-3.1-70B-Instruct" + key: "notifications_it_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "it_meta-llama_Llama-3_1-70B-Instruct" + agents: + queue: tpu_v6e_8_queue + env: + EXECUTE_ENTITY: "meta-llama/Llama-3.1-70B-Instruct" + EXECUTE_STAGE: "IntTest" + commands: + - | + .buildkite/scripts/check_results.sh \ + "Integration tests for meta-llama/Llama-3.1-70B-Instruct" it_meta-llama_Llama-3_1-70B-Instruct + + - label: "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" + key: "pb_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "notifications_it_meta-llama_Llama-3_1-70B-Instruct" + commands: + # - replace_with_test_command # TODO + - echo "[DEBUG], performance benchmarking for meta-llama/Llama-3.1-70B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" + key: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "pb_meta-llama_Llama-3_1-70B-Instruct" + agents: + queue: tpu_v6e_8_queue + env: + EXECUTE_ENTITY: "meta-llama/Llama-3.1-70B-Instruct" + EXECUTE_STAGE: "Benchmark" + 
commands: + - | + .buildkite/scripts/check_results.sh \ + "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" pb_meta-llama_Llama-3_1-70B-Instruct + + - label: "Stress tests for meta-llama/Llama-3.1-70B-Instruct" + key: "st_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct" + commands: + # - our_stress_tests_script meta-llama/Llama-3.1-70B-Instruct expected_throughput # TODO: expected_throughput need parameterized + - echo "[DEBUG], stress testing for meta-llama/Llama-3.1-70B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Stress tests for meta-llama/Llama-3.1-70B-Instruct" + key: "notifications_st_meta-llama_Llama-3_1-70B-Instruct" + depends_on: "st_meta-llama_Llama-3_1-70B-Instruct" + agents: + queue: tpu_v6e_8_queue + env: + EXECUTE_ENTITY: "meta-llama/Llama-3.1-70B-Instruct" + EXECUTE_STAGE: "StressTest" + commands: + - | + .buildkite/scripts/check_results.sh \ + "Stress tests for meta-llama/Llama-3.1-70B-Instruct" st_meta-llama_Llama-3_1-70B-Instruct diff --git a/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml b/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml new file mode 100644 index 0000000000..1531ca0551 --- /dev/null +++ b/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml @@ -0,0 +1,79 @@ +# Qwen/Qwen2.5-2B-Instruct +agents: + queue: tpu_v6e_queue +steps: + - label: "Unit tests for Qwen/Qwen2.5-2B-Instruct" + key: "ut_Qwen_Qwen2_5-2B-Instruct" + commands: + # - replace_with_test_commands # TODO: Replaced to actual test commands + - echo "[DEBUG], unit testing for Qwen/Qwen2.5-2B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Unit tests for Qwen/Qwen2.5-2B-Instruct" + key: "notifications_ut_Qwen_Qwen2_5-2B-Instruct" + depends_on: "ut_Qwen_Qwen2_5-2B-Instruct" + agents: + queue: tpu_v6e_queue + env: + EXECUTE_ENTITY: "Qwen/Qwen2.5-2B-Instruct" + EXECUTE_STAGE: "UnitTest" + commands: + - | + 
.buildkite/scripts/check_results.sh \ + "Unit tests for Qwen/Qwen2.5-2B-Instruct" ut_Qwen_Qwen2_5-2B-Instruct + + - label: "Integration tests for Qwen/Qwen2.5-2B-Instruct" + key: "it_Qwen_Qwen2_5-2B-Instruct" + depends_on: "notifications_ut_Qwen_Qwen2_5-2B-Instruct" + commands: + # TODO: expected_accuracy need parameterized + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "Qwen/Qwen2.5-2B-Instruct" + - echo "[DEBUG], integration testing for Qwen/Qwen2.5-2B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Integration tests for Qwen/Qwen2.5-2B-Instruct" + key: "notifications_it_Qwen_Qwen2_5-2B-Instruct" + depends_on: "it_Qwen_Qwen2_5-2B-Instruct" + agents: + queue: tpu_v6e_queue + env: + EXECUTE_ENTITY: "Qwen/Qwen2.5-2B-Instruct" + EXECUTE_STAGE: "IntTest" + commands: + - | + .buildkite/scripts/check_results.sh \ + "Integration tests for Qwen/Qwen2.5-2B-Instruct" it_Qwen_Qwen2_5-2B-Instruct + + - label: "Performance benchmarks for Qwen/Qwen2.5-2B-Instruct" + key: "pb_Qwen_Qwen2_5-2B-Instruct" + depends_on: "notifications_it_Qwen_Qwen2_5-2B-Instruct" + commands: + # - replace_with_test_command # TODO + - echo "[DEBUG], performance benchmarking for Qwen/Qwen2.5-2B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Performance benchmarks for Qwen/Qwen2.5-2B-Instruct" + key: "notifications_pb_Qwen_Qwen2_5-2B-Instruct" + depends_on: "pb_Qwen_Qwen2_5-2B-Instruct" + agents: + queue: tpu_v6e_queue + env: + EXECUTE_ENTITY: "Qwen/Qwen2.5-2B-Instruct" + EXECUTE_STAGE: "Benchmark" + commands: + - | + .buildkite/scripts/check_results.sh \ + "Performance benchmarks for Qwen/Qwen2.5-2B-Instruct" pb_Qwen_Qwen2_5-2B-Instruct + + - label: "Stress tests for Qwen/Qwen2.5-2B-Instruct" + key: "st_Qwen_Qwen2_5-2B-Instruct" + depends_on: "notifications_pb_Qwen_Qwen2_5-2B-Instruct" + commands: + # - our_stress_tests_script Qwen/Qwen2.5-2B-Instruct expected_throughput # 
TODO: expected_throughput need parameterized + - echo "[DEBUG], stress testing for Qwen/Qwen2.5-2B-Instruct" # TODO: Replace to actual test commands + - label: "Notifications: Stress tests for Qwen/Qwen2.5-2B-Instruct" + key: "notifications_st_Qwen_Qwen2_5-2B-Instruct" + depends_on: "st_Qwen_Qwen2_5-2B-Instruct" + agents: + queue: tpu_v6e_queue + env: + EXECUTE_ENTITY: "Qwen/Qwen2.5-2B-Instruct" + EXECUTE_STAGE: "StressTest" + commands: + - | + .buildkite/scripts/check_results.sh \ + "Stress tests for Qwen/Qwen2.5-2B-Instruct" st_Qwen_Qwen2_5-2B-Instruct From 3005ea2c35f6e123e83056d171230d40bd0162bd Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 15:46:42 +0800 Subject: [PATCH 25/38] fix for test --- .buildkite/buildkite_ci_feature_template.yml | 8 ++++---- .buildkite/buildkite_ci_model_template.yml | 8 ++++---- .../informational/NousResearch_Nous-Hermes-1_4B.yml | 8 ++++---- .buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml | 8 ++++---- .buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml | 8 ++++---- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.buildkite/buildkite_ci_feature_template.yml b/.buildkite/buildkite_ci_feature_template.yml index e7286ccda4..cab1a9250e 100644 --- a/.buildkite/buildkite_ci_feature_template.yml +++ b/.buildkite/buildkite_ci_feature_template.yml @@ -17,7 +17,7 @@ steps: EXECUTE_STAGE: "UnitTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Unit tests for {FEATURE_NAME}" ut_{SAFE_FEATURE_NAME} - label: "Integration tests for {FEATURE_NAME}" @@ -37,7 +37,7 @@ steps: EXECUTE_STAGE: "IntTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Integration tests for {FEATURE_NAME}" it_{SAFE_FEATURE_NAME} - label: "Performance benchmarks for {FEATURE_NAME}" @@ -56,7 +56,7 @@ steps: EXECUTE_STAGE: "Benchmark" commands: - | - .buildkite/scripts/check_results.sh \ + bash 
.buildkite/scripts/check_results.sh \ "Performance benchmarks for {FEATURE_NAME}" pb_{SAFE_FEATURE_NAME} - label: "Stress tests for {FEATURE_NAME}" @@ -75,5 +75,5 @@ steps: EXECUTE_STAGE: "StressTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Stress tests for {FEATURE_NAME}" st_{SAFE_FEATURE_NAME} diff --git a/.buildkite/buildkite_ci_model_template.yml b/.buildkite/buildkite_ci_model_template.yml index 6c6c13910c..595ae98c05 100644 --- a/.buildkite/buildkite_ci_model_template.yml +++ b/.buildkite/buildkite_ci_model_template.yml @@ -17,7 +17,7 @@ steps: EXECUTE_STAGE: "UnitTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Unit tests for {MODEL_NAME}" ut_{SAFE_MODEL_NAME} - label: "Integration tests for {MODEL_NAME}" @@ -37,7 +37,7 @@ steps: EXECUTE_STAGE: "IntTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Integration tests for {MODEL_NAME}" it_{SAFE_MODEL_NAME} - label: "Performance benchmarks for {MODEL_NAME}" @@ -56,7 +56,7 @@ steps: EXECUTE_STAGE: "Benchmark" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Performance benchmarks for {MODEL_NAME}" pb_{SAFE_MODEL_NAME} - label: "Stress tests for {MODEL_NAME}" @@ -75,5 +75,5 @@ steps: EXECUTE_STAGE: "StressTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Stress tests for {MODEL_NAME}" st_{SAFE_MODEL_NAME} diff --git a/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml b/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml index 4e4890c79b..ea71da11d4 100644 --- a/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml +++ b/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml @@ -17,7 +17,7 @@ steps: EXECUTE_STAGE: "UnitTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash 
.buildkite/scripts/check_results.sh \ "Unit tests for NousResearch/Nous-Hermes-1.4B" ut_NousResearch_Nous-Hermes-1_4B - label: "Integration tests for NousResearch/Nous-Hermes-1.4B" @@ -37,7 +37,7 @@ steps: EXECUTE_STAGE: "IntTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Integration tests for NousResearch/Nous-Hermes-1.4B" it_NousResearch_Nous-Hermes-1_4B - label: "Performance benchmarks for NousResearch/Nous-Hermes-1.4B" @@ -56,7 +56,7 @@ steps: EXECUTE_STAGE: "Benchmark" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Performance benchmarks for NousResearch/Nous-Hermes-1.4B" pb_NousResearch_Nous-Hermes-1_4B - label: "Stress tests for NousResearch/Nous-Hermes-1.4B" @@ -75,5 +75,5 @@ steps: EXECUTE_STAGE: "StressTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Stress tests for NousResearch/Nous-Hermes-1.4B" st_NousResearch_Nous-Hermes-1_4B diff --git a/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml index 901933a724..818165e019 100644 --- a/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml +++ b/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml @@ -17,7 +17,7 @@ steps: EXECUTE_STAGE: "UnitTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Unit tests for meta-llama/Llama-3.1-70B-Instruct" ut_meta-llama_Llama-3_1-70B-Instruct - label: "Integration tests for meta-llama/Llama-3.1-70B-Instruct" @@ -37,7 +37,7 @@ steps: EXECUTE_STAGE: "IntTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Integration tests for meta-llama/Llama-3.1-70B-Instruct" it_meta-llama_Llama-3_1-70B-Instruct - label: "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" @@ -56,7 +56,7 @@ steps: EXECUTE_STAGE: "Benchmark" commands: - | - 
.buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" pb_meta-llama_Llama-3_1-70B-Instruct - label: "Stress tests for meta-llama/Llama-3.1-70B-Instruct" @@ -75,5 +75,5 @@ steps: EXECUTE_STAGE: "StressTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Stress tests for meta-llama/Llama-3.1-70B-Instruct" st_meta-llama_Llama-3_1-70B-Instruct diff --git a/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml b/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml index 1531ca0551..d9191a3704 100644 --- a/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml +++ b/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml @@ -17,7 +17,7 @@ steps: EXECUTE_STAGE: "UnitTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Unit tests for Qwen/Qwen2.5-2B-Instruct" ut_Qwen_Qwen2_5-2B-Instruct - label: "Integration tests for Qwen/Qwen2.5-2B-Instruct" @@ -37,7 +37,7 @@ steps: EXECUTE_STAGE: "IntTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Integration tests for Qwen/Qwen2.5-2B-Instruct" it_Qwen_Qwen2_5-2B-Instruct - label: "Performance benchmarks for Qwen/Qwen2.5-2B-Instruct" @@ -56,7 +56,7 @@ steps: EXECUTE_STAGE: "Benchmark" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Performance benchmarks for Qwen/Qwen2.5-2B-Instruct" pb_Qwen_Qwen2_5-2B-Instruct - label: "Stress tests for Qwen/Qwen2.5-2B-Instruct" @@ -75,5 +75,5 @@ steps: EXECUTE_STAGE: "StressTest" commands: - | - .buildkite/scripts/check_results.sh \ + bash .buildkite/scripts/check_results.sh \ "Stress tests for Qwen/Qwen2.5-2B-Instruct" st_Qwen_Qwen2_5-2B-Instruct From 6a6af3ef623f889f8ad7762e53e9dbc5392c9d13 Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 16:05:12 +0800 Subject: [PATCH 26/38] test 
--- .buildkite/scripts/export_support_matrix.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.buildkite/scripts/export_support_matrix.sh b/.buildkite/scripts/export_support_matrix.sh index 5f2975ddee..0dc78eedd2 100644 --- a/.buildkite/scripts/export_support_matrix.sh +++ b/.buildkite/scripts/export_support_matrix.sh @@ -9,13 +9,14 @@ POPURLAR_MODEL_LIST_KEY="popular-model-list" FEATURE_LIST_METADATA_KEY="feature-list" -# tpu_model_list="Qwen/Qwen2.5-0.5B-Instruct Qwen/Qwen2.5-2B-Instruct" -# vllm_model_list="NousResearch/Nous-Hermes-1.4B NousResearch/Nous-Hermes-2.5B" -# popular_model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.2-8B-Instruct" tpu_model_list=$(buildkite-agent meta-data get "${MODEL_LIST_KEY}" --default "") vllm_model_list=$(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}" --default "") popular_model_list=$(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}" --default "") +echo "tl: $tpu_model_list" +echo "vl: $vllm_model_list" +echo "pl: $popular_model_list" + feature_list="f1 f2" STAGES="UnitTest IntTest Benchmark StressTest" @@ -106,12 +107,15 @@ if [ "$ANY_FAILED" = true ]; then exit 1 else echo "--- Uploading Commit Hash to Repo ---" - echo "Will commit to tpu_commons main" + # TODO: Will commit hash value to tpu_commons main fi echo "--- Print Model Report Content ---" cat "$output_model_support_matrix_file" +echo "--- Print Feature Report Content ---" +cat "$output_feature_support_matrix_file" + echo "--- Uploading CSV Reports as Buildkite Artifacts ---" buildkite-agent artifact upload "$output_model_support_matrix_file" buildkite-agent artifact upload "$output_feature_support_matrix_file" From 689d12b377c92411ca194abc24e1408283e41980 Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 16:23:21 +0800 Subject: [PATCH 27/38] test --- .buildkite/scripts/dynamic_bootstrap.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh index 5d3d3bc9f3..f981625464 100644 --- a/.buildkite/scripts/dynamic_bootstrap.sh +++ b/.buildkite/scripts/dynamic_bootstrap.sh @@ -4,7 +4,7 @@ echo "--- Starting Special Buildkite Bootstrap ---" # for loop features and models upload to buildkite BUILDKITE_DIR=".buildkite" -TARGET_FOLDERS="models features models/informational" +TARGET_FOLDERS="models models/informational models/popular features" MODEL_LIST_KEY="tpu-model-list" INFORMATIONAL_MODEL_LIST_KEY="vllm-model-list" From 281d1a8600fd1dbb7e1e3fbdfb72ac431ae3141c Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 16:31:02 +0800 Subject: [PATCH 28/38] test --- .buildkite/scripts/dynamic_bootstrap.sh | 1 - .buildkite/scripts/export_support_matrix.sh | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh index f981625464..16a32bd0c3 100644 --- a/.buildkite/scripts/dynamic_bootstrap.sh +++ b/.buildkite/scripts/dynamic_bootstrap.sh @@ -128,7 +128,6 @@ else echo "--- No popular Models found to upload." 
fi - # --- Upload Dynamic Pipeline --- if [[ -n "$pipeline_steps" ]]; then diff --git a/.buildkite/scripts/export_support_matrix.sh b/.buildkite/scripts/export_support_matrix.sh index 0dc78eedd2..c6de583f45 100644 --- a/.buildkite/scripts/export_support_matrix.sh +++ b/.buildkite/scripts/export_support_matrix.sh @@ -13,11 +13,7 @@ tpu_model_list=$(buildkite-agent meta-data get "${MODEL_LIST_KEY}" --default "") vllm_model_list=$(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}" --default "") popular_model_list=$(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}" --default "") -echo "tl: $tpu_model_list" -echo "vl: $vllm_model_list" -echo "pl: $popular_model_list" - -feature_list="f1 f2" +feature_list=$(buildkite-agent meta-data get "${FEATURE_LIST_METADATA_KEY}" --default "") STAGES="UnitTest IntTest Benchmark StressTest" # Output CSV files From 43352a810178d9d60d9c296ecbf0831340493d5a Mon Sep 17 00:00:00 2001 From: StingLin Date: Wed, 24 Sep 2025 16:32:19 +0800 Subject: [PATCH 29/38] remove gz --- buildkite-script-dynamic.gz | Bin 10583 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 buildkite-script-dynamic.gz diff --git a/buildkite-script-dynamic.gz b/buildkite-script-dynamic.gz deleted file mode 100644 index 9062e95d9fc13a790f058d1a16c52a94da6bff10..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10583 zcmV-dDX7*TiwFR*VA5y+1MPilTjNI3Xg>Q_bi|wh6D%xWfsFGk1D*+QU|_*yHz&yv z!j`duEjhZtuw>rnd^!JMKfb?if61xpZb_}1Ex?#d)X(k&ORetes;=tls_v>I?eh=1 z*4OQI`P?%`{Y%5uznXKYR4T1TgM8JZg?v?Qv>MeK!RM^NOQlh1*6a0Vt5zo!rKL1m zUy;xA1|+R`j;q-K%8hPLbR)ao#5Ts7eRE-3-X!fk8wm;i`$CG+AfC~0D^*3Q)HX}C z3K_e`SZ^z>#zs@AZZ?{7wOZM%Ha9k#rKbkAD043%*RMZXA3p!r>&W-5B>w}yMESo_ zZ8g6ljYoq>D1UMO7wCT!ECs;BQ-2VNva|$mUSD(=5Wiryv z_0blQPEErluI{)F8CZ6t9SJHDa;e`yHP2OVPqsgF)WrLqH6CkbpEiqS2^zO`m~cH^ zy*AvTijAv&EqvcS+&$bP$E;4DxE9fRuBVMC7(siyxiZmvLyJhi>~`Lte*A5Xcs{^} 
zd65J;(f*X?LnM5%Tw`GLG}o|9XN%;-i79@iW@D(YPju)}walrA45@}CM!7Uhm&u>} zFG2(f;Ph-`;yUZSq29YxZQb!kt|L1`@@r6=Os6|Z1Co9`8F?3mDL^o^tV?I-1s=D=YrhdeS%Yq0N1-75bVwS z)4l!OANRT)b?5NlVEfaq+WF^RS3T)&cTZ2EO&J673z24gLCX3cq(XlCj<`eJjC>ye zGt>$46Paj^qxWTUIvH77A7>@Qxh%_4?8|^iWlC%^)?KaK*IbP_x=SRH2wO-X9I!+r z)VNOjnTZ5tNw(&2$FNm`g=CVCgHB7Z{^)7WAnd@AGf7Y)p`_h?!*jaZc z9u;~3;5r@mx?Y9VSvPw_Sb;8q9Q1Jj0k%}rdZWz$DV(!eYvScaf@Bniey^cSILmD~T*s8uR8h1!3stw!PhSxA}U|7j|fdUdlQ*OcmJrM0nH`-`#0U!n;7 zKWDBF%Pjv-wN{JqzXD$r{+~sZz^}7}s>{O)ZKc~e>5`Lf=ja4xeFYaky=v+z{mI_N_1v8H-vl;D*&*dC(Ik8|8jZOv_)0+o@wzyo zQpUg)K5&3*VvMM|pEZ8v#f(1~d~{S`%oAE)m#?_gZc?Jl8KIIuCyi(@=PCOX^z|QPlw&T4|_Y?-Mzz4 zbUXf7TkM|{MnlmV#i=Fuk3!tWx7?9uO3Jfp%ClO^vwF(2M#{5h%ClC=vyGHzn`tno z4kT?PO4?BTNqqPDN+f!6ERP3mS)i0dK2XQUvOo#}90BnHz6CT3*cFf};8H-LfH@Ii zBA!I_h**&dYx5#Xs{hAdwbX&Gxt^`7K_!q;a^H2Y$r*^XWs z*cK?XnmZgB=Y+pJf}f?*64})UhNGmaJe`TLcfhM4ku7$>iWW3Y+-B-|2%` zO&?I0r>46ypa+#sU~1C?XRa0Cf|B6_>yj+rik*L1rpK378SqPF2h~VCb(Eku9%TYe zlJ)GrQE}S%v`A2OmDYvEOv{aaA{&m1jjpVQxbjEQv2DwSQAavozD=|apkuoO%QO35 zORGUUwhp5+{Q-4M2S<&=NlaBfBC$2U2E$32jYP@fqH(=Z8DG%yp`1$HKZsNunuv&UQc(Nfj1h_;b3k3 zb5EbRq(lFMw;`ZYJ%hBg6*@!3wztTw{w0%&{t?OL^<^q`Bw1p3&`bj4*KMa~*=(L) zX(OW_$S8+a54d;PL3;v2s-q3Ub&oP)JHnSTKia+`txIb}Ql-`KiOiq)@SS+)j%&D{ z%Vrv_HMB+u7=x2OwXG|oulG^%dSl%BrBK)B*}g!WIK5wJzI1SKYJA%itdGe!v!=}QNX|hGO zMLZ(iaEg#?_$J_cVwhQ|m69-yU?~E2qaqHc%GhuKjquwc=em2X>n6FeJUb@$a00Pi zwwyUXg=LaUnaCnZMINt8#lQa&!~$&S0d9+pBQhP?1Sh!k0pfu04qZI48E$F}+JTuX z-6jb6g{_*x2V7hW>*#vCEq_S=unj(&gDDwEDp*_GVGaQ$^a|qXI;VIV$k*8SN`e*Q z6dg$a`Jev>yhr8H*JQ^s2gU^>S_re|Zhx(R5A&|-vc}waQ z@-2~mHGQ3AtQn@i()nxl0X!oE&+M_wV-}&hm~ZJRqn|RI%ybgjo+*;UiYo#5y$!_m z^*VW*e||*Ygwmp~v*`>Swi$_0gp5V<#xc zew1bSYRh{;3V48jE>8jpu%gQsGR$4BJ(ahYS7j1y5wwS)cE#2vm?+Egvfql|HWjL% zRsd&=Mj&eZOH=$@__7*&5l&l*c)ARQIFWqB=RqHp*!Ra<98c_^HPcHuu6*1?43-QB z{-X{4a5Xw-0AL4ZDjx#<4-UfN`bQ2OzKzx8$g0JFV=w5R&>85SDt)V}qz!uvRYhJ@ zRf#bpW6Zyrjbpm~kDUQ{s2$+$_P=_wS}*K>izxZ*f14Gh(JJhJudt-q|K`K>VVPX;C)A6LzOeE=nUznNg7>oV3BtUv@D&!m7i;08 
z^Z1eNdrSeEVcnzg14biltgY|kzvY^&lsDd&_e^%Q_bx_1)?s1tn~TXWlyuClCoEru zx$nMZdi;m}4It3yJ@790UF?6&Ruj+v)GF1&{&hOVaRESD6-2YEji+|nBH~Z`rw@J z0&)IOJ=(ME4EepLYt4Y&$A8CeIRz@W<4veJ6*i!@0ehG^0LP+cW%hv4vS!dfb^Z6x zQO;?ORdVNlY5?CW!2gw2t5&T4izqYlf7B`sxn9}agqO9#|Mv=u!2dqi&pi=TC3pTO z&i_hFLH)nT|Md7}`u^{C?eiS8z}fr1dYJ#Kh4Q~zuNCrtAtfgNtDCX>e;W z)oaD}?=_aR{oj1JJ}fi#f30S{6660$qf#mMe~TzDaQ_$GUUjM+aDw-xyG*MS!JqtzCq6FrjN`#!Abkv`koeh;9(YVV*+jw#uCXq-3 zX%QMp%W!WfrV^NYtQdkqZ2vq%7gG>0Jd}H6IV%QG`SWZvg`vb$ih(c?A;#(a=9_4j z_rm>HrN06Gf-@O5Jwrx!61QTlEWV> zzTrb~bYIj&TPc@~3)8ao_SA2ne-rhNdwdF%xZ`KS-(whh{Og`|Rb~fTvRaU#if#j4 z>>jI6gtxo}Tc@^kSGULIvEk6b)$9$@w@l9+wws$%rwP2^h)GPC$BF~_6cgqF#OI%8 zMMh(<)vKx>BW(_STwbX?tvA$FZ_>w8CSl_*G637%qOZjxuf?OT#Url8qpigwt;M6P z#Urf6qpQUts|lGXS>P{v@Jg+zin{S0!_elc+Nub3ACGyaCMIIUK=v1dtVzTVu*QR{ z@xW?4s2UHb#)GNxKx#aQ8pG6M@BdTnzf6mrTOW7u|3Rst{ij~!e_2eK?*D7nt1Wr6 zxw+AT3Y&!i`1KWm{~6cM-TQYZ{~zOjrP*j!3jg0CN~-*y8GY++ea!a%h37x2)hfvU zCd`;Z{x71;IRBw+H08Qdt<`JQ&3YmKUtbaUf6iPVmg)XKMQJpeG5)Vt8uh~ew}`St zZp8p#UrOQjDVE7K@DvJ1FGE8I&BPjMNz6Fw`B_<0YTPN2xRc0~+n1|xp)KX zOpu8;3e;S3@#e-%G#BqQJVmncPOG@se7w^~O40$P4HOx9$zx{ZjUapr$wW^?KZA_C znULklD!vz*fpRgJk(Z_Nb7E3r#d3uENwu;wgfBgw%Z-Z5AgvdANnE0!NmoikQi^~y zViCvPnQ0sXL%@D9WG82ziO~|OH57D2KAV->RwokA}2Rar&V06z{)>rk`8nswK9amqe-pz zKAE1M+!8^{Be_mo=-A1n8aVe;^rmrIn!K_+8*{t?R8(3yDr2JALF4r(FUGQ`KDNj65P=c$rKV9Stg zPi^f>mQ?@Gac6t?pd*j_^XVg}|EE!_RBB4S#qNI;{{MxPDgK|PQmI!r8*&X^Hk-|A zaRKZV7J>ig%=KZJasR7QZ?>ZRuQVI=M&bWiL|F>sz?Div*PS?9>!Bx525kgwj%99h zER}E_{b;y9c<1C=Kc`BH;qdTrp3S^5Tx+3m!@?+Y{`EqchGpw{wvB5lU6cLKnOTfS z9P{gMl}cx4XKC)It)L2D&4b_{QGZdop<-P!~+M~Oo7)MT4mNnQW> zxaZYJ{_}s$N>!ojf3w~!*8fG6>E}Q2-nU$@*EU4IGhhv3G<)-3-Uj}ABFs1M9G!^1>SDd8%6x5S6CAJzqxUJSZ3`1 zTCnkp@qY#WD)xViD8l|P44wGVk2#L}J=os$tsV_9jp5L-g5yWv$4KndVD8cDkw^6Y zAP=4IP_KjMT4UJO^|aCGCd@`6#vWha*$wu3v7I0W!FK3=(b0oeM>@N@8AR@;9r5HH zm~F;zSk!v%Ao1H~fSzM?avmcC&Fp^AGr3e+S|W$g^p&AsvwbcnIk~X35#N!i#z}G_Q5_IK2Mv| z8-}4OFvK>8dkxbUke+7Jc<~x>49vdZZ)qbLU^Mg%n0?LeqjxDpIScEL1GaSYNq={uF 
z)0+T9c!2tBb&a2Qpc7L_b#bNHhIWqRrd}MHlOjF9{u;|ppZ|BVZi#*~j|z+rQKH%MA@LgU-RSYbA>5bxMNAMhFjip@!9XnCVPhO*aeTQe^( z@dD1&(a5@H3>)vvfUj20_!XlwI)<|#;NaP65E`HHaX%(FSVp275JWT)2+hPJGrN#n z`p>`r=Rg1czwkdYWdaJdX5^MiYcFjUWwn@gavtjbo7WAjtQ7hZyVQ$x_`$kHem!+` z{JBJwUuAz%v-uU}y;A9v6KE>maqpv7s}v;^!&gd9YFwI-Kj2 zYtgZ{$6nEDm`MaAnZu+CNb%lKv$dX!$$3V`CEyO`B4FT5K3iimEImcTnR@1@Xh=); zf=CSIJvC`w>NhmU4;>C2{q&K9pNp9muBlMQ_rF``4g(GaJx{%G7BOa?ty1cBK5MuJ zJcxR;a$p1$japMVNm({53IJ6f{MCqpFRVvYx}u;OSv{V-h4>;Ogh1sGs1XRLIF$wd zSt8Xq!^GE!G?qzSAM!IzxH#lvo3!poG`%r-C;FA|!8~GP(JjRTS8TSn`zyXr(J%o` zsI^k=S_+iEASYY|5i4gimesS>V2a*`Yr~f_;UZ6vE=~UoG8%!SYLcJm3qX%94X9`s zOclSGa81F()LMYGp$BuIjLI3RRWNvc)aNpv%I7ocL^|WM{Wg{#4Z+$5CCdbD9F=9* z7C3@yN=)mMzQgG=)zDY<(v}^n0?B8=0uzpgj)7(?V=8d^FkJsc%NeXSthu!H39zQK zB#1OMp5fm3V`itRQVEcbdL|LV`QXdfTr(m~EfVKM6zva1 z(o^s2^Oh9*e`p8x6CH^D&wKQd-~QXIRH}8n|JiC4=YJMbX88Y_)rzcCtDBp(YNd$( z@e0e7^d=AF`moHp|6QxpWBgyMREzUJ3n@!qub&&{I#tk1;Zfuuhx7^GIESkKdw*K6 zGs&eTs;Z1EYZ4e!sWQTitSm_m-C_g>jlz@0yL-oNDV&2+sk?prvC~yQ9PaOSj!)V= zsdv~i-yRw!ON0xnw>APGnCiB_chXgV?EKu8@XQj^6h{U|wA}mj;qdr?24maaSL3xY z4sE3R(c#hQ@&5L4qUOY!cq7e@R4(}{2Ca6`>2B|Cceh!6UoYV``+9GLCf2eh*oCoeu?gHHvDV{naGV*h_`ovW`pO%9E>08JgrIG9 zi@dpwQtk_k79ku^ZIET61-1N5wAOc|Z-Ju4Dz)E4=EeFO;oDLoJ2`vFlX#cdA@*O$ z*QDGh68};nzlq1y8QA||*5obnfivL4y9cy0WOzTZJ(AR~kHp04 zLe}zuVLL7ftRWY=>jpM@sz8_6k#`#d^+B!nn-%T)k}Q8ZZnu?NiUsnf@*Taq@MRfD zCUlmbj$Y_7Fzey-WuAK8$C>N9c>?&Ex3HZg{Y9qn#6>}x9t@vIvla?ci+XEipgKGa!khDxn(Yh zBGr<@DSxy6$`Pp`%Cp=}y%nFwgvlo>G~!^RB$Rz1VQvM=d|8&s9tJH9*=0G>{c_|U zO3=UVXuVP?F<+f5@P(T&F<84%W`fPHze#soQj znS|uQPJ^_3DPHCiG+ywKrcnEXzn-x)@3Jq5)Xn=##Oc;#b^IU33IZt?JnJtCs!0cj|zY?Pe2Sx_jPTsxi9DXRpa}UNuW#mM0 z>9{gLb}Y2}y1Gn07$6$RJ}%E&fvUJ; znOC~)G988<5J0mm0$uj?n?5(J@Rl*E>H$*%SKiz%Mdj8nfBWsr>XuZZAPtE0znou9 z2^W)OHK`PRhL<>js(HR#^+)*=WK1L zuL>0t&$EEbXaV4hehcJG`&JQD_}5Xip@k}$~SQwPTNeyFq+a@K9uI}BR_&?HF{r+XKWTlLZoxlOj@LW7AHqW z|1nCWx0R0=d3v}D9WvXy_oOc|Z6hleFm>jM^oXpUGwG0M?}UCvw}Y_AsZmuyVpo-< zH_N}8%jtE#O%&iA#R;$mN+NZr 
zMo|)>3xX5xiz-)jhrmsO2tn=A$(U7Uzwq>tT+b;g}Ow=R&BhG6y1PeY# z#e>y#z+lQVhyISfg-b%j6K7&gwFBt;qewZbwIZOt*AoYal?yB|8HM7s%xpy zVaW3LsF_gE6smC?9G-sas-1rxb#}Vyap&Z8zuT5RXrKg1rNizIonv+T_~S`?1vaP5 zuntP(8$iW{iy-|L*wVQXR*9Hi@w}2}x@BfZE3>oOq8$;_BBt~o$@)i&Gc*PWviK~^ zI7vtdI5iFZO^Uugw{`6@tto0{WQLt3(tW?XMY@lk^rtGr@Xp1*t zXs9NCJUXS04K&#VhQljuW97k4j%iF1YZXaqeYH1cL;MP`oYn5_lGr5E}6q6@!D^1 zjc{v!Y#8M$0AXcv-CteLf@CWWBX*fpc~yX$O5f%JM_sn!7f<=)DTY+SU z?KCKm%SbvTiTYDP)RQ0rC=5Psp!k|x*l7IygGm49hYxD^@W;-lwnWx-x3{jFSB7nw zXl)KCe9;?OULUrjJzIAj|J;^Cv>v^IL~jA%p=4H}eLiN}jq-3%cCAa@qz6b1kT8GJ zz+aBnw}`7#*Jiv=@;zDa>sRZhHyV+5@7@t90Fc^%SEWGyZ1?flm;V~~KJmun*nT#s ztBuezcf;M6$Q>!4BC?i%@%7Pm%!$Aq!Rw|oH$+0}UrzgJSVzc}&M1M4-*Isw4 z{6)$Vqw%^#M2auGefI!JYgr=Se^U)wp@$dwxoO7yPO?dKVQK8z$}XENu&7?>VN;=4 z7Df?I92*@(>}`86(mt!E`I)1yYg0Qj&G-hi;s*#y6P!XAY?Nk#>l=1kS|6M4I-vN{ zS?6!p{dBHFYpjb^-~4cR&jZPt)oV% zl-82WdJHI|SHJw);uzR1Ouun9oE2x4hqmKz*<(^3WORoVwMKoFAtD?L`yIAi{$sTy zKpuWF1z@N&8#IH#CNzj3v6g%S_Iwu;%&)zJ?T_qiFF(bDNGen^u`kD`=a& zT4&OSx(Lh6G#oVaWh*H+LEyj~LjPd^s%O%rq>S#cD`*Grz_AA=tTS{PvY*ayTy~6q zfz)n(F z2LOinQ~JU`N@UN*591Hv;Gqr*R4lQ@|WF4N6`kuT)ho5?>->8cCqxwAw;w=k%KJ1nrq~3nt*>9sQtwhaU zqFa!j2}OO9hQ2=ZTv#hnO{Axdf9(8>izP0zfnDzKw7Y|PP|`0=osrZfZLww~>XX!Q z!*xsj(d5rR`4AmE5v=zk1`m3GjnyQ)1$55x4t;A3M_>4ueBBOwp5!<3%{OW9{r;dr zEaJ96pCDsAAdeaC{LaEkU#Q8w7Ez}AfAJocyiusO>ql+>u^p0CfBpEq6-QQP_ zJAXgzoOGGaL)Fcs1 zq{nN7`D|!FF|dh&6Z)>vlEpy~VQSWIR@ z1GfdC)_om&)-?n{3Iyi`f7d?K&{g<-@*rQH7F^5j4S5?eb?uWC8wfkr$%o@#zl)AW zJXy%qzzX4q4(Qdm3fjvIvv1IAc#g-MgDa-xA4vG!L+1cj0qMz1UyTfD_vx@3pJ_bW z(z&gfy&=qMXSjxLtTi6PkKtxDF<;S7mO5uM$%P|(7^xmgsH`KH#nH=GT7>DqFW`mh zf8>(i1Nz9X|2Lr2Q2(zfwL<@2M47JtH|y1wyxH8`XepJAdcp0luO#_D!u4}W1~5ba zr~DG-f2Cfn7V-ZVQKrfNJTVya^ijzj|EW={HPQaxg8vore-UN6{BKk@K>lwus;z3R zQJnvOg(b=V^WyrjRHny&swoX6#{cl6IRC$pvNSW+Zz*)+q=qC&Nz8U?8m^E{cUBK# zE@97AXHCF+ zZ$4?8?}{6eV>B%EN0HO{5pz1vqSF@A^_k>&z6-2*^4ag1u71I@o7|79jeGI&J!L^s z$i_0*YrURlYrUK7{Pif8b6hecH23B@hRmjc%pz8Kim7$Is;{5xW^af=zJRFoaR33f zQFbw!&3HE1q%0|K#OSkFVgK;i5vB9}=1T6boKB 
z<`MiyEC&GI03m@8yE}hR&L1xQbMyC|RBT@WW=Ymb8Uhw8Ugt!EC^qj$k=$QBg^Ya& zo=A?76@oTHdM!}UzW3?+5=5;CY#w2I;@ar>zn==_%t67<&E9Du*79r&%>^T%0@hLa zEuoo7nF6ic9om+6F~mF|35{KNm#MQSgtrqSbg?4)L6nTrbqlyv|<>`PIFBhY3D=%+Sx7bUgK^8e592Dd6X97A{T|2FE)2C3APR-?%Ovyd|5 z{76XYqNKGwn|DHb0*~*jX$#`3$_p^B3R`BWFH;@eJbT|LkeR_B~LP l=`=2c^)nOJ&);pF4Q}B#E=o~~Qj|rO{|^8;-8%qO0RW1;3nBmj From b7eed10e64cb9530db9afc784fb437cbccd5eb07 Mon Sep 17 00:00:00 2001 From: StingLin Date: Thu, 25 Sep 2025 15:13:14 +0800 Subject: [PATCH 30/38] ready for test --- .buildkite/README_dynamic.md | 138 +++++++++++++++++++ .buildkite/buildkite_ci_feature_template.yml | 17 ++- .buildkite/buildkite_ci_model_template.yml | 17 ++- .buildkite/pipeline_dynamic.yml | 4 +- .buildkite/scripts/run_in_docker.sh | 32 ----- 5 files changed, 156 insertions(+), 52 deletions(-) create mode 100644 .buildkite/README_dynamic.md diff --git a/.buildkite/README_dynamic.md b/.buildkite/README_dynamic.md new file mode 100644 index 0000000000..9be88140d4 --- /dev/null +++ b/.buildkite/README_dynamic.md @@ -0,0 +1,138 @@ +# Buildkite Pipeline Generator + +This document outlines the process for using Python scripts to automatically generate Buildkite CI/CD pipeline configuration files. These scripts leverage templates to create consistent testing pipelines for both models and features. + +## Overview + +The primary goal of these tools is to streamline the creation of Buildkite pipelines. Instead of manually creating and editing YAML files for each new model or feature, you can run a simple command to generate a standardized pipeline file. + +There are two main generators: + +1. **Model Pipeline Generator** (`generate_model_buildkite.py`): Creates a pipeline file for testing a specific machine learning model. +2. **Feature Pipeline Generator** (`generate_feature_buildkite.py`): Creates a pipeline file for testing a new feature. 
+ +Both scripts work by reading a corresponding template file (`.yml`), replacing placeholder variables with your command-line arguments, and saving the result as a new YAML file in a designated output directory. + +## Directory Structure + +To use the scripts, your files should be arranged as follows. The output directories (`models/` and `features/`) will be created automatically if they do not exist. + +``` +. +├── generate_model_buildkite.py +├── buildkite_ci_model_template.yml +├── generate_feature_buildkite.py +├── buildkite_ci_feature_template.yml +└── README.md +``` + +----- + +## How to Use + +### 1\. Generating a Model Pipeline + +Use the `generate_model_buildkite.py` script to create a CI pipeline for a new model. + +**Command:** + +```bash +python generate_model_buildkite.py --model-name MODEL_NAME --queue QUEUE +``` + +**Arguments:** + + * `--model-name` (required): The name of the model to be tested. If the name contains special characters like `/` or `.`, they will be replaced with `_` in the output filename and for Buildkite step keys. + * `--queue` (required): The name of the Buildkite agent queue where the jobs will run (e.g., `tpu_v6e_queue`). + +**Example:** + +```bash +python generate_model_buildkite.py --model-name meta-llama/Llama-3.1-8B-Instruct --queue tpu_v6e_queue +``` + +**Output:** + +This command will generate a new file located at `models/meta-llama_Llama-3_1-8B-Instruct.yml`. + +----- + +### 2\. Generating a Feature Pipeline + +Use the `generate_feature_buildkite.py` script to create a CI pipeline for a new feature. + +**Command:** + +```bash +python generate_feature_buildkite.py --feature-name FEATURE_NAME --queue QUEUE +``` + +**Arguments:** + + * `--feature-name` (required): The name of the feature to be tested. + * `--queue` (required): The name of the Buildkite agent queue.
+ +**Example:** + +```bash +python generate_feature_buildkite.py --feature-name Feat-A --queue tpu_v6e_queue +``` + +**Output:** + +This command will generate a new file located at `features/Feat-A.yml`. + +----- + +## Important Notes: Placeholders & Customization + +The scripts work by performing a find-and-replace on specific placeholders within the template files. You can customize the `buildkite_ci_*_template.yml` files to change the structure of the generated pipelines. + +#### **Model Template Placeholders (`buildkite_ci_model_template.yml`)** + + * `{MODEL_NAME}`: Replaced with the exact string provided to `--model-name`. This is typically used in human-readable fields like step `label`. + * `{SAFE_MODEL_NAME}`: A sanitized version of the model name, automatically generated by replacing characters like `/` and `.` with `_`. This is used for machine-readable fields like the step `key` and the output filename to ensure validity. + * `{QUEUE}`: Replaced with the string provided to `--queue`. + +#### **Feature Template Placeholders (`buildkite_ci_feature_template.yml`)** + + * `{FEATURE_NAME}`: Replaced with the exact string provided to `--feature-name`. + * `{SAFE_FEATURE_NAME}`: A sanitized version of the feature name. + * `{QUEUE}`: Replaced with the string provided to `--queue`. + +## Integration with the Main Pipeline + +After generating a pipeline `.yml` file, you must place it in the correct subdirectory within the `.buildkite/` folder. The generator scripts create these files in the top-level `models/` and `features/` directories, so you will need to **manually move** them to the corresponding location inside `.buildkite/`. + +This is a crucial step, as the main CI process relies on the `dynamic_bootstrap.sh` script to automatically scan these specific directories to discover and upload the pipeline steps. Also, remember to **change your main pipeline's configuration so that it executes `dynamic_bootstrap.sh` in place of the existing bootstrap script**.
+ +### Target Directories + +Place your generated `.yml` files into the following directories for detection: + + * **Standard Models**: Move the generated file to `.buildkite/models/`. + * **Informational Models**: For models considered "informational" (e.g., VLLM-native models), move the generated file to `.buildkite/models/informational/`. + * **Popular Models**: For models designated as "popular," move the generated file to `.buildkite/models/popular/`. + * **Features**: Move the generated feature file to `.buildkite/features/`. + +### Example Workflow + +1. Generate a pipeline for a new model that you consider "popular": + + ```bash + python generate_model_buildkite.py --model-name my-popular-model --queue tpu_v6e_queue + ``` + + This creates `models/my-popular-model.yml`. + +2. Move the file to the correct directory for the bootstrap script to find it: + + ```bash + # Create the directory if it doesn't exist + mkdir -p .buildkite/models/popular + + # Move the file + mv models/my-popular-model.yml .buildkite/models/popular/ + ``` + +Once the file is in the correct `.buildkite/` subdirectory and committed, the `dynamic_bootstrap.sh` script will automatically find it and add its steps to the Buildkite pipeline. 
\ No newline at end of file diff --git a/.buildkite/buildkite_ci_feature_template.yml b/.buildkite/buildkite_ci_feature_template.yml index cab1a9250e..6825c44a4e 100644 --- a/.buildkite/buildkite_ci_feature_template.yml +++ b/.buildkite/buildkite_ci_feature_template.yml @@ -5,8 +5,8 @@ steps: - label: "Unit tests for {FEATURE_NAME}" key: "ut_{SAFE_FEATURE_NAME}" commands: - # - replace_with_test_commands # TODO: Replaced to actual test commands - - echo "[DEBUG], unit testing for {FEATURE_NAME}" # TODO: Replace to actual test commands + # Replace to actual test commands + - echo "[DEBUG], unit testing for {FEATURE_NAME}" - label: "Notifications: Unit tests for {FEATURE_NAME}" key: "notifications_ut_{SAFE_FEATURE_NAME}" depends_on: "ut_{SAFE_FEATURE_NAME}" @@ -24,9 +24,8 @@ steps: key: "it_{SAFE_FEATURE_NAME}" depends_on: "notifications_ut_{SAFE_FEATURE_NAME}" commands: - # TODO: expected_accuracy need parameterized - # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "{FEATURE_NAME}" - - echo "[DEBUG], integration testing for {FEATURE_NAME}" # TODO: Replace to actual test commands + # Replace to actual test commands + - echo "[DEBUG], integration testing for {FEATURE_NAME}" - label: "Notifications: Integration tests for {FEATURE_NAME}" key: "notifications_it_{SAFE_FEATURE_NAME}" depends_on: "it_{SAFE_FEATURE_NAME}" @@ -44,8 +43,8 @@ steps: key: "pb_{SAFE_FEATURE_NAME}" depends_on: "notifications_it_{SAFE_FEATURE_NAME}" commands: - # - replace_with_test_command # TODO - - echo "[DEBUG], performance benchmarking for {FEATURE_NAME}" # TODO: Replace to actual test commands + # Replace to actual test commands + - echo "[DEBUG], performance benchmarking for {FEATURE_NAME}" - label: "Notifications: Performance benchmarks for {FEATURE_NAME}" key: "notifications_pb_{SAFE_FEATURE_NAME}" depends_on: "pb_{SAFE_FEATURE_NAME}" @@ -63,8 +62,8 @@ steps: key: "st_{SAFE_FEATURE_NAME}" depends_on: 
"notifications_pb_{SAFE_FEATURE_NAME}" commands: - # - our_stress_tests_script {FEATURE_NAME} expected_throughput # TODO: expected_throughput need parameterized - - echo "[DEBUG], stress testing for {FEATURE_NAME}" # TODO: Replace to actual test commands + # Replace to actual test commands + - echo "[DEBUG], stress testing for {FEATURE_NAME}" - label: "Notifications: Stress tests for {FEATURE_NAME}" key: "notifications_st_{SAFE_FEATURE_NAME}" depends_on: "st_{SAFE_FEATURE_NAME}" diff --git a/.buildkite/buildkite_ci_model_template.yml b/.buildkite/buildkite_ci_model_template.yml index 595ae98c05..9cf355a4a3 100644 --- a/.buildkite/buildkite_ci_model_template.yml +++ b/.buildkite/buildkite_ci_model_template.yml @@ -5,8 +5,8 @@ steps: - label: "Unit tests for {MODEL_NAME}" key: "ut_{SAFE_MODEL_NAME}" commands: - # - replace_with_test_commands # TODO: Replaced to actual test commands - - echo "[DEBUG], unit testing for {MODEL_NAME}" # TODO: Replace to actual test commands + # Replace to actual test commands + - echo "[DEBUG], unit testing for {MODEL_NAME}" - label: "Notifications: Unit tests for {MODEL_NAME}" key: "notifications_ut_{SAFE_MODEL_NAME}" depends_on: "ut_{SAFE_MODEL_NAME}" @@ -24,9 +24,8 @@ steps: key: "it_{SAFE_MODEL_NAME}" depends_on: "notifications_ut_{SAFE_MODEL_NAME}" commands: - # TODO: expected_accuracy need parameterized - # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "{MODEL_NAME}" - - echo "[DEBUG], integration testing for {MODEL_NAME}" # TODO: Replace to actual test commands + # Replace to actual test commands + - echo "[DEBUG], integration testing for {MODEL_NAME}" - label: "Notifications: Integration tests for {MODEL_NAME}" key: "notifications_it_{SAFE_MODEL_NAME}" depends_on: "it_{SAFE_MODEL_NAME}" @@ -44,8 +43,8 @@ steps: key: "pb_{SAFE_MODEL_NAME}" depends_on: "notifications_it_{SAFE_MODEL_NAME}" commands: - # - replace_with_test_command # TODO - - echo "[DEBUG], 
performance benchmarking for {MODEL_NAME}" # TODO: Replace to actual test commands + # Replace to actual test commands + - echo "[DEBUG], performance benchmarking for {MODEL_NAME}" - label: "Notifications: Performance benchmarks for {MODEL_NAME}" key: "notifications_pb_{SAFE_MODEL_NAME}" depends_on: "pb_{SAFE_MODEL_NAME}" @@ -63,8 +62,8 @@ steps: key: "st_{SAFE_MODEL_NAME}" depends_on: "notifications_pb_{SAFE_MODEL_NAME}" commands: - # - our_stress_tests_script {MODEL_NAME} expected_throughput # TODO: expected_throughput need parameterized - - echo "[DEBUG], stress testing for {MODEL_NAME}" # TODO: Replace to actual test commands + # Replace to actual test commands + - echo "[DEBUG], stress testing for {MODEL_NAME}" - label: "Notifications: Stress tests for {MODEL_NAME}" key: "notifications_st_{SAFE_MODEL_NAME}" depends_on: "st_{SAFE_MODEL_NAME}" diff --git a/.buildkite/pipeline_dynamic.yml b/.buildkite/pipeline_dynamic.yml index 139d6385ac..ef80b1efcd 100644 --- a/.buildkite/pipeline_dynamic.yml +++ b/.buildkite/pipeline_dynamic.yml @@ -15,10 +15,10 @@ steps: agents: queue: tpu_v6e_queue commands: - - echo "Generate support matrices..." + - echo "Generate support matrices report..." 
- bash .buildkite/scripts/export_support_matrix.sh - # Handle PR builds: print model matrices and feature matrices + # Print model matrices and feature matrices - label: "Handle Report" if: build.pull_request.id != null depends_on: export_support_matrix diff --git a/.buildkite/scripts/run_in_docker.sh b/.buildkite/scripts/run_in_docker.sh index 241347d172..5c105c5a60 100755 --- a/.buildkite/scripts/run_in_docker.sh +++ b/.buildkite/scripts/run_in_docker.sh @@ -11,36 +11,6 @@ if [ "$#" -eq 0 ]; then exit 1 fi -MOUNT_EXPECT_RESULT="False" -OTHER_ARGS=() - -while [[ $# -gt 0 ]]; do - case "$1" in - --mount-expect-result) - MOUNT_EXPECT_RESULT="True" - shift 1 - ;; - *) - OTHER_ARGS+=("$@") - break - ;; - esac -done - -# TBD: To support the functionality of connecting GPU and TPU expected values in the future -EXPECT_VOLUME=() -EXPECT_ENV=() -if [ "$MOUNT_EXPECT_RESULT" = "True" ]; then - touch "$EXPECT_VALUES_FILENAME" - echo "[DEBUG] Path: $EXPECT_VALUES_PATH, Filename: $EXPECT_VALUES_FILENAME, " - - EXPECT_VOLUME=(-v "$(pwd)/$EXPECT_VALUES_FILENAME":"$EXPECT_VALUES_PATH$EXPECT_VALUES_FILENAME") - echo "docker -v cmd: " "${EXPECT_VOLUME[@]}" - - EXPECT_ENV=(-e EXPECT_VALUES_PATH="$EXPECT_VALUES_PATH" -e EXPECT_VALUES_FILENAME="$EXPECT_VALUES_FILENAME") - echo "docker -e cmd: " "${EXPECT_ENV[@]}" -fi - if ! 
grep -q "^HF_TOKEN=" /etc/environment; then gcloud secrets versions access latest --secret=bm-agent-hf-token --quiet | \ sudo tee -a /etc/environment > /dev/null <<< "HF_TOKEN=$(cat)" @@ -103,8 +73,6 @@ exec docker run \ --shm-size=16G \ --rm \ -v "$LOCAL_HF_HOME":"$DOCKER_HF_HOME" \ - "${EXPECT_VOLUME[@]}" \ - "${EXPECT_ENV[@]}" \ -e HF_HOME="$DOCKER_HF_HOME" \ -e MODEL_IMPL_TYPE="$MODEL_IMPL_TYPE" \ -e HF_TOKEN="$HF_TOKEN" \ From 3b203be00d3664f1cfbbf7a1d2552354b0cde9f1 Mon Sep 17 00:00:00 2001 From: StingLin Date: Thu, 25 Sep 2025 15:17:13 +0800 Subject: [PATCH 31/38] test accuracy --- .buildkite/pipeline_jax.yml | 221 +++++++++++++++++++----------------- 1 file changed, 114 insertions(+), 107 deletions(-) diff --git a/.buildkite/pipeline_jax.yml b/.buildkite/pipeline_jax.yml index 3184e36e60..187d6c4b31 100644 --- a/.buildkite/pipeline_jax.yml +++ b/.buildkite/pipeline_jax.yml @@ -2,120 +2,127 @@ steps: # ----------------------------------------------------------------- # TEST STEPS - Calling wrapper # ----------------------------------------------------------------- - - label: "E2E MLPerf tests for JAX models" - key: test_0 - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh + - label: "Integration Test llama-3.1-8B on TPU" + key: integration_test_llama_3_1_8B_tpu + soft_fail: true + agents: + queue: tpu_v6e_queue + commands: + - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-8B-Instruct" -e "0.8" + # - label: "E2E MLPerf tests for JAX models" + # key: test_0 + # soft_fail: true + # agents: + # queue: tpu_v6e_queue + # commands: + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - label: "E2E MLPerf tests for JAX models with quantization" - key: test_1 - soft_fail: true - env: - QUANTIZATION: "True" - agents: - 
queue: tpu_v6e_queue - commands: - - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh + # - label: "E2E MLPerf tests for JAX models with quantization" + # key: test_1 + # soft_fail: true + # env: + # QUANTIZATION: "True" + # agents: + # queue: tpu_v6e_queue + # commands: + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - label: "E2E MLPerf tests for JAX new models" - key: test_2 - soft_fail: true - env: - NEW_MODEL_DESIGN: "True" - agents: - queue: tpu_v6e_queue - commands: - - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh + # - label: "E2E MLPerf tests for JAX new models" + # key: test_2 + # soft_fail: true + # env: + # NEW_MODEL_DESIGN: "True" + # agents: + # queue: tpu_v6e_queue + # commands: + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - label: "E2E MLPerf tests for JAX + vLLM models" - key: test_3 - soft_fail: true - env: - MODEL_IMPL_TYPE: "vllm" - agents: - queue: tpu_v6e_queue - commands: - - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh + # - label: "E2E MLPerf tests for JAX + vLLM models" + # key: test_3 + # soft_fail: true + # env: + # MODEL_IMPL_TYPE: "vllm" + # agents: + # queue: tpu_v6e_queue + # commands: + # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - label: "E2E MLperf tests for Llama4 models" - key: test_4 - soft_fail: true - env: - NEW_MODEL_DESIGN: "True" - USE_V6E8_QUEUE: "True" - agents: - queue: tpu_v6e_8_queue - commands: - - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh + # - label: "E2E MLperf tests for Llama4 models" + # key: test_4 + # soft_fail: true + # env: + # NEW_MODEL_DESIGN: "True" + # USE_V6E8_QUEUE: "True" + # agents: + # queue: tpu_v6e_8_queue + # commands: + 
# - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh - - label: "E2E multi modality test" - key: test_5 - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/run_in_docker.sh \ - bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_multi_modal_inference.py && \ - bash /workspace/tpu_commons/tests/e2e/benchmarking/mm_bench.sh' + # - label: "E2E multi modality test" + # key: test_5 + # soft_fail: true + # agents: + # queue: tpu_v6e_queue + # commands: + # - | + # .buildkite/scripts/run_in_docker.sh \ + # bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_multi_modal_inference.py && \ + # bash /workspace/tpu_commons/tests/e2e/benchmarking/mm_bench.sh' - - label: "E2E speculative decoding test" - key: test_6 - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/run_in_docker.sh \ - bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_speculative_decoding.py' + # - label: "E2E speculative decoding test" + # key: test_6 + # soft_fail: true + # agents: + # queue: tpu_v6e_queue + # commands: + # - | + # .buildkite/scripts/run_in_docker.sh \ + # bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_speculative_decoding.py' - - label: "JAX unit tests" - key: test_7 - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/run_in_docker.sh \ - python3 -m pytest -s -v -x /workspace/tpu_commons/tests/ \ - --ignore=/workspace/tpu_commons/tests/kernels \ - --ignore=/workspace/tpu_commons/tests/e2e \ - --ignore=/workspace/tpu_commons/tpu_commons/mock \ - --cov-config=/workspace/tpu_commons/.coveragerc --cov tpu_commons --cov-report term-missing --cov-fail-under=69 + # - label: "JAX unit tests" + # key: test_7 + # soft_fail: true + # agents: + # queue: tpu_v6e_queue + # commands: + # - | + # .buildkite/scripts/run_in_docker.sh \ + # python3 -m 
pytest -s -v -x /workspace/tpu_commons/tests/ \ + # --ignore=/workspace/tpu_commons/tests/kernels \ + # --ignore=/workspace/tpu_commons/tests/e2e \ + # --ignore=/workspace/tpu_commons/tpu_commons/mock \ + # --cov-config=/workspace/tpu_commons/.coveragerc --cov tpu_commons --cov-report term-missing --cov-fail-under=69 - - label: "JAX unit tests - kernels" - key: test_8 - soft_fail: true - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/run_in_docker.sh \ - python3 -m pytest -s -v -x /workspace/tpu_commons/tests/kernels \ - --ignore=/workspace/tpu_commons/tests/kernels/ragged_paged_attention_kernel_v2_test.py \ - --ignore=/workspace/tpu_commons/tests/kernels/ragged_kv_cache_update_v2_test.py + # - label: "JAX unit tests - kernels" + # key: test_8 + # soft_fail: true + # agents: + # queue: tpu_v6e_queue + # commands: + # - | + # .buildkite/scripts/run_in_docker.sh \ + # python3 -m pytest -s -v -x /workspace/tpu_commons/tests/kernels \ + # --ignore=/workspace/tpu_commons/tests/kernels/ragged_paged_attention_kernel_v2_test.py \ + # --ignore=/workspace/tpu_commons/tests/kernels/ragged_kv_cache_update_v2_test.py - # ----------------------------------------------------------------- - # NOTIFICATION STEP - # ----------------------------------------------------------------- - - label: "TPU Test Notification" - depends_on: - - test_0 - - test_1 - - test_2 - - test_3 - - test_4 - - test_5 - - test_6 - - test_7 - - test_8 - agents: - queue: tpu_v6e_queue - commands: - - | - .buildkite/scripts/check_results.sh \ - "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 + # # ----------------------------------------------------------------- + # # NOTIFICATION STEP + # # ----------------------------------------------------------------- + # - label: "TPU Test Notification" + # depends_on: + # - test_0 + # - test_1 + # - test_2 + # - test_3 + # - test_4 + # - test_5 + # - test_6 + # - test_7 + # - test_8 + # agents: + # queue: 
tpu_v6e_queue + # commands: + # - | + # .buildkite/scripts/check_results.sh \ + # "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 From f87e0c47097881956144e64c2afd6e72d959f5f7 Mon Sep 17 00:00:00 2001 From: StingLin Date: Thu, 25 Sep 2025 15:18:24 +0800 Subject: [PATCH 32/38] test --- .buildkite/scripts/bootstrap.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh index 44fa7bf64b..aff2c6b9d4 100755 --- a/.buildkite/scripts/bootstrap.sh +++ b/.buildkite/scripts/bootstrap.sh @@ -23,6 +23,6 @@ echo "--- Starting Buildkite Bootstrap ---" # # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml # fi -buildkite-agent pipeline upload .buildkite/pipeline_dynamic.yml +buildkite-agent pipeline upload .buildkite/pipeline_jax.yml echo "--- Buildkite Bootstrap Finished ---" From df4fd874a268afbb5436d6e7abd4f63cfac74ee2 Mon Sep 17 00:00:00 2001 From: StingLin Date: Thu, 25 Sep 2025 15:32:52 +0800 Subject: [PATCH 33/38] test --- scripts/vllm/integration/conftest.py | 5 +++++ scripts/vllm/integration/test_accuracy.py | 12 +++++++++++- tests/e2e/benchmarking/test_accuracy.sh | 14 ++++++-------- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/scripts/vllm/integration/conftest.py b/scripts/vllm/integration/conftest.py index b1c2ba1872..2020620933 100644 --- a/scripts/vllm/integration/conftest.py +++ b/scripts/vllm/integration/conftest.py @@ -16,6 +16,11 @@ def pytest_addoption(parser): help="This is used to specify the JSON file that stores the expected values. 
" + "The results from running test_accuracy on a GPU will be saved to this file, " + "and when running on a TPU, the results will be read from this file for comparison.") + parser.addoption( + "--expected-value", + type=float, + default=None, + help="This value will be used to compare the measure value and determine if the test passes or fails.") parser.addoption( "--model-names", action="store", diff --git a/scripts/vllm/integration/test_accuracy.py b/scripts/vllm/integration/test_accuracy.py index 68fb69e643..1a06017fdd 100644 --- a/scripts/vllm/integration/test_accuracy.py +++ b/scripts/vllm/integration/test_accuracy.py @@ -150,9 +150,14 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch, requ tp_size = request.config.getoption("--tensor-parallel-size") expected_json_filepath = request.config.getoption("--expected-values-file") - + expected_value = request.config.getoption("--expected-value") + expected_values_data = read_expected_value(expected_json_filepath) + # Add expected-value to expected_values_data with model name + if expected-value is not None: + expected_values_data[model] = float(expected_value) + if tp_size is None: tp_size = 1 elif tp_size < 1 or tp_size > 8: @@ -181,9 +186,14 @@ def test_lm_eval_accuracy_v1_engine_fp8_kv_cache( tp_size = request.config.getoption("--tensor-parallel-size") expected_json_filepath = request.config.getoption("--expected-values-file") + expected_value = request.config.getoption("--expected-value") expected_values_data = read_expected_value(expected_json_filepath) + # Add expected-value to expected_values_data with model name + if expected-value is not None: + expected_values_data[model] = float(expected_value) + if tp_size is None: tp_size = 1 elif tp_size < 1 or tp_size > 8: diff --git a/tests/e2e/benchmarking/test_accuracy.sh b/tests/e2e/benchmarking/test_accuracy.sh index 0ce96d9e42..5f003533dd 100644 --- a/tests/e2e/benchmarking/test_accuracy.sh +++ 
b/tests/e2e/benchmarking/test_accuracy.sh @@ -2,7 +2,7 @@ model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.1-70B-Instruct" tensor_parallel_size=1 -gpu_enabled=false +expected_value=0 extra_serve_args=() echo extra_serve_args: "${extra_serve_args[@]}" @@ -17,6 +17,7 @@ helpFunction() echo -e "\t-r The path your root directory containing both 'vllm' and 'tpu_commons' (default: /workspace/, which is used in the Dockerfile)" echo -e "\t-m A space-separated list of HuggingFace model ids to use (default: meta-llama/Llama-3.1-8B-Instruct and meta-llama/Llama-3.1-70B-Instruct)" echo -e "\t-t Tensor parallel size (default: 1)" + echo -e "\t-e Excepted value" exit 1 } @@ -37,8 +38,9 @@ while [[ "$#" -gt 0 ]]; do shift shift ;; - -g|--gpu) - gpu_enabled=true + -e|--excepted-value) + expected_value="$2" + shift shift ;; -h|--help) @@ -66,10 +68,6 @@ echo "Running integration for models: $comma_model_list" echo "--------------------------------------------------" # Default action -if $gpu_enabled; then - python3 -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list" -else - python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list" -fi +python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list" --expected_value="$expected_value" exit $exit_code \ No newline at end of file From 66cada4869b8a6c96748c7dac5ccfce4e32f7028 Mon Sep 17 00:00:00 2001 From: StingLin Date: Thu, 25 Sep 2025 15:47:43 +0800 Subject: [PATCH 34/38] fix --- requirements_benchmarking.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements_benchmarking.txt b/requirements_benchmarking.txt index 4484d13728..cdfbc6e506 100644 --- a/requirements_benchmarking.txt +++ b/requirements_benchmarking.txt @@ 
-4,4 +4,5 @@ nltk evaluate datasets rouge-score -scikit-learn \ No newline at end of file +scikit-learn +tblib==3.1.0 \ No newline at end of file From 2c7e75b62e0021f99829cf2d7c2f1a1be8868daf Mon Sep 17 00:00:00 2001 From: StingLin Date: Thu, 25 Sep 2025 15:56:04 +0800 Subject: [PATCH 35/38] fix --- tests/e2e/benchmarking/test_accuracy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/benchmarking/test_accuracy.sh b/tests/e2e/benchmarking/test_accuracy.sh index 5f003533dd..1bae5455af 100644 --- a/tests/e2e/benchmarking/test_accuracy.sh +++ b/tests/e2e/benchmarking/test_accuracy.sh @@ -68,6 +68,6 @@ echo "Running integration for models: $comma_model_list" echo "--------------------------------------------------" # Default action -python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list" --expected_value="$expected_value" +python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list" --expected-value="$expected_value" exit $exit_code \ No newline at end of file From 277559eea9b4444e26938aaa4d3bd25b599e3336 Mon Sep 17 00:00:00 2001 From: StingLin Date: Thu, 25 Sep 2025 16:04:41 +0800 Subject: [PATCH 36/38] fix --- scripts/vllm/integration/test_accuracy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/vllm/integration/test_accuracy.py b/scripts/vllm/integration/test_accuracy.py index 1a06017fdd..34916a4b0a 100644 --- a/scripts/vllm/integration/test_accuracy.py +++ b/scripts/vllm/integration/test_accuracy.py @@ -155,7 +155,7 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch, requ expected_values_data = read_expected_value(expected_json_filepath) # Add expected-value to expected_values_data with model name - if expected-value is not None: + if expected_value is not None: expected_values_data[model] = 
float(expected_value) if tp_size is None: @@ -191,7 +191,7 @@ def test_lm_eval_accuracy_v1_engine_fp8_kv_cache( expected_values_data = read_expected_value(expected_json_filepath) # Add expected-value to expected_values_data with model name - if expected-value is not None: + if expected_value is not None: expected_values_data[model] = float(expected_value) if tp_size is None: From 597e943d4b8b78ff03dcef5fb9a3dc76661319cd Mon Sep 17 00:00:00 2001 From: StingLin Date: Fri, 26 Sep 2025 10:46:59 +0800 Subject: [PATCH 37/38] test --- scripts/vllm/integration/conftest.py | 23 ++-- scripts/vllm/integration/test_accuracy.py | 134 ++++------------------ tests/e2e/benchmarking/test_accuracy.sh | 42 +++++-- 3 files changed, 61 insertions(+), 138 deletions(-) diff --git a/scripts/vllm/integration/conftest.py b/scripts/vllm/integration/conftest.py index 2020620933..612f7a264f 100644 --- a/scripts/vllm/integration/conftest.py +++ b/scripts/vllm/integration/conftest.py @@ -9,28 +9,21 @@ def pytest_addoption(parser): default=1, help="The tensor parallel size to use for the test." ) - parser.addoption( - "--expected-values-file", - type=str, - default=None, - help="This is used to specify the JSON file that stores the expected values. 
" + - "The results from running test_accuracy on a GPU will be saved to this file, " + - "and when running on a TPU, the results will be read from this file for comparison.") parser.addoption( "--expected-value", type=float, default=None, help="This value will be used to compare the measure value and determine if the test passes or fails.") parser.addoption( - "--model-names", - action="store", - # default="meta-llama/Llama-3.1-8B-Instruct", + "--model-name", + type=str, default=None, - help="Comma-separated list of model names to test (e.g., 'model1,model2')" - ) + help= + "Model name to test (e.g., 'model1')") parser.addoption( - "--fp8-kv-model-names", - action="store", + "--fp8-kv-model-name", + type=str, default=None, - help="Comma-separated list of model names to test fp8-kv (e.g., 'model1,model2')" + help= + "Model name to test fp8-kv (e.g., 'model1')" ) \ No newline at end of file diff --git a/scripts/vllm/integration/test_accuracy.py b/scripts/vllm/integration/test_accuracy.py index 34916a4b0a..e88bcf3545 100644 --- a/scripts/vllm/integration/test_accuracy.py +++ b/scripts/vllm/integration/test_accuracy.py @@ -28,77 +28,11 @@ RTOL = 0.03 _JSON_WRITE_LOCK = threading.Lock() -EXPECTED_VALUES = { - "Qwen/Qwen3-1.7B": 0.68, - "google/gemma-3-1b-it": 0.25, - "meta-llama/Llama-3.1-8B-Instruct": 0.76, - "meta-llama/Llama-3.1-70B-Instruct": 0.876, -} - -# Parametrize test cases based on CLI arguments or default values -def parametrize_by_cli_or_default(metafunc, fixture_name, cli_parameter, default_list): - if fixture_name in metafunc.fixturenames: - print(f"Checking CLI parameter '{cli_parameter}' for '{fixture_name}'") - names_str = metafunc.config.getoption(cli_parameter) - if names_str: - print(f"Using '{cli_parameter}' parameter for '{fixture_name}'") - param_list = [name.strip() for name in names_str.split(',') if name.strip()] - metafunc.parametrize(fixture_name, param_list) - else: - print(f"Using default list for '{fixture_name}'") - 
metafunc.parametrize(fixture_name, default_list) - -def pytest_generate_tests(metafunc): - parametrize_by_cli_or_default(metafunc, fixture_name="model", cli_parameter="--model-names", default_list=MODEL_NAMES) - parametrize_by_cli_or_default(metafunc, fixture_name="fp8_kv_model", cli_parameter="--fp8-kv-model-names", default_list=FP8_KV_MODEL_NAMES) - -# Write expected values to json file -# TBD: To support the functionality of connecting GPU and TPU expected values in the future -def write_expected_value_to_json(model_name, measured_value, json_filepath): - with _JSON_WRITE_LOCK: - data = {} - try: - with open(json_filepath, 'r', encoding='utf-8') as f: - data = json.load(f) - except (FileNotFoundError, json.JSONDecodeError): - print( - f"'{json_filepath}' not found or is empty/invalid. A new one will be created." - ) - data = {} - - data[model_name] = measured_value - - try: - with open(json_filepath, 'w', encoding='utf-8') as f: - json.dump(data, f, indent=4) - print( - f"Successfully updated '{json_filepath}' with the result for {model_name}." - ) - except IOError as e: - print( - f"Error: Failed to write to file '{json_filepath}'. 
Reason: {e}" - ) - raise - -# Read expected values from json file if exist -# TBD: To support the functionality of connecting GPU and TPU expected values in the future -def read_expected_value(expected_json_filepath=None): - expected_values_data = {} - if expected_json_filepath is None: - expected_values_data = EXPECTED_VALUES - else: - path_obj = Path(expected_json_filepath) - # Read expected values from json file if exist - if path_obj.is_file() and os.path.getsize(expected_json_filepath) > 0: - print(f"\n[Fixture] Loading from: {expected_json_filepath}") - with open(expected_json_filepath, 'r', encoding='utf-8') as f: - expected_values_data = json.load(f) - else: - raise FileNotFoundError(f"Expected values file not found: {expected_json_filepath}") - return expected_values_data - - -def run_test(model_name, expected_values_data, expected_json_filepath, more_args=None): + +def run_test(model_name, + expected_value, + expected_json_filepath, + more_args=None): """Run the end to end accuracy test.""" print(f"Running test for model: {model_name}") @@ -112,51 +46,25 @@ def run_test(model_name, expected_values_data, expected_json_filepath, more_args tasks="gsm8k", batch_size="auto", ) - - # Execute default behavior when `expected_json_filepath` is not set. 
- if expected_json_filepath is None: - print(f"Execute default behavior") - measured_value = results["results"][TASK][FILTER] - assert model_name in EXPECTED_VALUES, ( - f"Cannot find the expected value for the model {model_name=}") - expected_value = EXPECTED_VALUES[model_name] - assert (measured_value - RTOL < expected_value - and measured_value + RTOL > expected_value - ), f"Expected: {expected_value} | Measured: {measured_value}" - else: - print(f"Execute specific models behavior") - measured_value = results["results"][TASK][FILTER] - expected_value = expected_values_data.get(model_name) - - # Model expected value not exist, write in file - if model_name not in expected_values_data: - print(f"Warning: No expected value found for {model_name}. " - "Skipping accuracy check.") - print(f"Measured value: {measured_value}") - write_expected_value_to_json(model_name, measured_value, expected_json_filepath) - - else: - print(f"Found expected value! {model_name=}, {measured_value=}, {expected_value=}") - assert (measured_value - RTOL < expected_value - and measured_value + RTOL > expected_value - ), f"Expected: {expected_value} | Measured: {measured_value}" + + measured_value = results["results"][TASK][FILTER] + assert (measured_value - RTOL < expected_value < measured_value + RTOL + ), f"Expected: {expected_value} | Measured: {measured_value}" @pytest.mark.skipif(not current_platform.is_cuda() and not current_platform.is_tpu(), reason="V1 is currently only supported on CUDA and TPU") -def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest): +def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch, + request: pytest.FixtureRequest): """Run with the V1 Engine.""" + model = request.config.getoption("--model-name") print(f"Testing model: {model}...") tp_size = request.config.getoption("--tensor-parallel-size") - expected_json_filepath = request.config.getoption("--expected-values-file") expected_value = 
request.config.getoption("--expected-value") - expected_values_data = read_expected_value(expected_json_filepath) - - # Add expected-value to expected_values_data with model name - if expected_value is not None: - expected_values_data[model] = float(expected_value) + if expected_value is None: + raise ValueError if tp_size is None: tp_size = 1 @@ -173,26 +81,24 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch, requ print(f"common args: {more_args}") - run_test(model, expected_values_data, expected_json_filepath, more_args) + run_test(model, expected_value, expected_json_filepath, + more_args) @pytest.mark.skipif(not current_platform.is_cuda() and not current_platform.is_tpu(), reason="V1 is currently only supported on CUDA and TPU") def test_lm_eval_accuracy_v1_engine_fp8_kv_cache( - fp8_kv_model, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest): + monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest): """Run with the V1 Engine.""" + fp8_kv_model = request.config.getoption("--fp8-kv-model-name") print(f"Testing fp8_kv_model: {fp8_kv_model}...") tp_size = request.config.getoption("--tensor-parallel-size") - expected_json_filepath = request.config.getoption("--expected-values-file") expected_value = request.config.getoption("--expected-value") - expected_values_data = read_expected_value(expected_json_filepath) - - # Add expected-value to expected_values_data with model name - if expected_value is not None: - expected_values_data[model] = float(expected_value) + if expected_value is None: + raise ValueError if tp_size is None: tp_size = 1 diff --git a/tests/e2e/benchmarking/test_accuracy.sh b/tests/e2e/benchmarking/test_accuracy.sh index 1bae5455af..570e3fce65 100644 --- a/tests/e2e/benchmarking/test_accuracy.sh +++ b/tests/e2e/benchmarking/test_accuracy.sh @@ -1,6 +1,6 @@ #!/bin/bash -model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.1-70B-Instruct" +model_name="" tensor_parallel_size=1 
expected_value=0 @@ -15,9 +15,9 @@ helpFunction() echo "" echo "Usage: $0 [-r full_path_to_root_dir -m model_id]" echo -e "\t-r The path your root directory containing both 'vllm' and 'tpu_commons' (default: /workspace/, which is used in the Dockerfile)" - echo -e "\t-m A space-separated list of HuggingFace model ids to use (default: meta-llama/Llama-3.1-8B-Instruct and meta-llama/Llama-3.1-70B-Instruct)" + echo -e "\t-m The HuggingFace model id to use (Required)" echo -e "\t-t Tensor parallel size (default: 1)" - echo -e "\t-e Excepted value" + echo -e "\t-e Expected value (Required)" exit 1 } @@ -29,7 +29,7 @@ while [[ "$#" -gt 0 ]]; do shift ;; -m|--model) - model_list="$2" + model_name="$2" shift shift ;; @@ -53,21 +53,45 @@ while [[ "$#" -gt 0 ]]; do esac done +# Check if model_name is provided and not empty +if [[ -z "$model_name" ]]; then + echo "Error: Model name (-m) is a required argument." >&2 + has_error=1 +fi + +# Check if tensor_parallel_size is an integer and greater than 0 +if ! [[ "$tensor_parallel_size" =~ ^[1-9][0-9]*$ ]]; then + echo "Error: Tensor parallel size (-t) must be an integer greater than 0. Got: '$tensor_parallel_size'" >&2 + has_error=1 +fi + +# Check if expected_value is a float and greater than 0 +if ! awk -v num="$expected_value" 'BEGIN { exit !(num > 0) }'; then + echo "Error: Expected value (-e) must be a number greater than 0.
Got: '$expected_value'" >&2 + has_error=1 +fi + +# If any validation failed, print help and exit +if [[ "$has_error" -ne 0 ]]; then + helpFunction +fi + + echo "Using the root directory at $root_dir" -echo "Testing $model_list prompts" cd "$root_dir"/vllm/tests/entrypoints/llm || exit # Overwrite a few of the vLLM benchmarking scripts with the TPU Commons ones cp "$root_dir"/tpu_commons/scripts/vllm/integration/*.py "$root_dir"/vllm/tests/entrypoints/llm/ -comma_model_list=${model_list// /,} - echo "--------------------------------------------------" -echo "Running integration for models: $comma_model_list" +echo "Running integration for model: $model_name" echo "--------------------------------------------------" # Default action -python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list" --expected-value="$expected_value" +python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine \ + --tensor-parallel-size="$tensor_parallel_size" \ + --model-name="$model_name" \ + --expected-value="$expected_value" exit $exit_code \ No newline at end of file From cc8f5efa8757073943c81734ecdcbb06576bb8b2 Mon Sep 17 00:00:00 2001 From: StingLin Date: Fri, 26 Sep 2025 10:55:19 +0800 Subject: [PATCH 38/38] fix --- scripts/vllm/integration/test_accuracy.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/vllm/integration/test_accuracy.py b/scripts/vllm/integration/test_accuracy.py index e88bcf3545..d0511dfdd9 100644 --- a/scripts/vllm/integration/test_accuracy.py +++ b/scripts/vllm/integration/test_accuracy.py @@ -31,7 +31,6 @@ def run_test(model_name, expected_value, - expected_json_filepath, more_args=None): """Run the end to end accuracy test.""" print(f"Running test for model: {model_name}") @@ -81,8 +80,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch, print(f"common args: {more_args}") - run_test(model, expected_value, 
expected_json_filepath, - more_args) + run_test(model, expected_value, more_args) @pytest.mark.skipif(not current_platform.is_cuda() @@ -116,4 +114,4 @@ def test_lm_eval_accuracy_v1_engine_fp8_kv_cache( print(f"common args: {more_args}") - run_test(fp8_kv_model, expected_values_data, expected_json_filepath, more_args) \ No newline at end of file + run_test(fp8_kv_model, expected_value, more_args) \ No newline at end of file