From d73e3d1e2704930d1a9dd444e23f0375a5276295 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Thu, 21 Aug 2025 12:08:41 +0800
Subject: [PATCH 01/38] accomplish dev integration test, this change adds a
 new accuracy test on TPU to the CI pipeline. The test covers the
 Llama-3.1-8B-Instruct and Llama-3.1-70B-Instruct models, modifying the test
 to support comparing `EXPECTED_VALUE`. It also allows users to input
 `tensor-parallel-size` and `model-names` parameters for greater flexibility
 during execution

---
 .buildkite/pipeline_jax.yml               |  42 ++++-
 .buildkite/scripts/bootstrap.sh           |  39 ++--
 .buildkite/scripts/run_in_docker.sh       |  44 ++++-
 docker/Dockerfile                         |   4 +-
 requirements_benchmarking.txt             |   3 +
 scripts/vllm/integration/conftest.py      |  30 ++++
 scripts/vllm/integration/test_accuracy.py | 209 ++++++++++++++++++++++
 tests/e2e/benchmarking/test_accuracy.sh   |  66 +++++++
 8 files changed, 413 insertions(+), 24 deletions(-)
 create mode 100644 scripts/vllm/integration/conftest.py
 create mode 100644 scripts/vllm/integration/test_accuracy.py
 create mode 100644 tests/e2e/benchmarking/test_accuracy.sh

diff --git a/.buildkite/pipeline_jax.yml b/.buildkite/pipeline_jax.yml
index 060178fabd..385113eb2e 100644
--- a/.buildkite/pipeline_jax.yml
+++ b/.buildkite/pipeline_jax.yml
@@ -110,6 +110,43 @@ steps:
          .buildkite/scripts/run_in_docker.sh \
            bash -c 'SKIP_JAX_PRECOMPILE=1 MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_commons/tests/lora/test_lora.py'
 
+   - label: "Integration Test llama-3.1-8B on TPU"
+     key: integration_test_llama_3_1_8B_tpu
+     depends_on:
+       - test_0
+       - test_1
+       - test_2
+       - test_3
+       - test_4
+       - test_5
+       - test_6
+       - test_7
+       - test_8
+       - test_9
+     soft_fail: true
+     agents:
+       queue: tpu_v6e_queue
+     commands:
+       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-8B-Instruct"
+
+   - label: "Integration Test llama-3.1-70B on TPU"
+     key: integration_test_llama_3_1_70B_tpu
+     depends_on:
+       - test_0
+       - test_1
+       - test_2
+       - test_3
+       - test_4
+       - test_5
+       - test_6
+       - test_7
+       - test_8
+       - test_9
+     soft_fail: true
+     agents:
+       queue: tpu_v6e_8_queue
+     commands:
+       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 8 -m "meta-llama/Llama-3.1-70B-Instruct"
 
   # -----------------------------------------------------------------
   # NOTIFICATION STEP
@@ -126,9 +163,12 @@ steps:
        - test_7
        - test_8
        - test_9
+       - integration_test_llama_3_1_8B_tpu
+       - integration_test_llama_3_1_70B_tpu
      agents:
        queue: tpu_v6e_queue
      commands:
        - |
          .buildkite/scripts/check_results.sh \
-           "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9
+           "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 \
+           integration_test_llama_3_1_8B_tpu integration_test_llama_3_1_70B_tpu
diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh
index e5c892d31d..d4113bda16 100755
--- a/.buildkite/scripts/bootstrap.sh
+++ b/.buildkite/scripts/bootstrap.sh
@@ -2,25 +2,26 @@
 
 echo "--- Starting Buildkite Bootstrap ---"
 
-# Check if the current build is a pull request
-if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
-  echo "This is a Pull Request build."
-  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/tpu_commons/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
+# # Check if the current build is a pull request
+# if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
+#   echo "This is a Pull Request build."
+#   PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/tpu_commons/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
 
-  # If it's a PR, check for the specific label
-  if [[ $PR_LABELS == *"ready"* ]]; then
-    echo "Found 'ready' label on PR. Uploading main pipeline..."
-    buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
-    # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
-  else
-    echo "No 'ready' label found on PR. Skipping main pipeline upload."
-    exit 0 # Exit with 0 to indicate success (no error, just skipped)
-  fi
-else
-  # If it's NOT a Pull Request (e.g., branch push, tag, manual build)
-  echo "This is not a Pull Request build. Uploading main pipeline."
-  buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
-  # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
-fi
+#   # If it's a PR, check for the specific label
+#   if [[ $PR_LABELS == *"ready"* ]]; then
+#     echo "Found 'ready' label on PR. Uploading main pipeline..."
+#     buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
+#     # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
+#   else
+#     echo "No 'ready' label found on PR. Skipping main pipeline upload."
+#     exit 0 # Exit with 0 to indicate success (no error, just skipped)
+#   fi
+# else
+#   # If it's NOT a Pull Request (e.g., branch push, tag, manual build)
+#   echo "This is not a Pull Request build. Uploading main pipeline."
+#   buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
+#   # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
+# fi
 
+buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
 echo "--- Buildkite Bootstrap Finished ---"
diff --git a/.buildkite/scripts/run_in_docker.sh b/.buildkite/scripts/run_in_docker.sh
index 13292e206b..7f51f86c06 100755
--- a/.buildkite/scripts/run_in_docker.sh
+++ b/.buildkite/scripts/run_in_docker.sh
@@ -11,6 +11,36 @@ if [ "$#" -eq 0 ]; then
   exit 1
 fi
 
+MOUNT_EXPECT_RESULT="False"
+OTHER_ARGS=()
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --mount-expect-result)
+            MOUNT_EXPECT_RESULT="True"
+            shift 1
+            ;;
+        *)
+            OTHER_ARGS+=("$@")
+            break
+            ;;
+    esac
+done
+
+# TBD: To support the functionality of connecting GPU and TPU expected values in the future
+EXPECT_VOLUME=()
+EXPECT_ENV=()
+if [ "$MOUNT_EXPECT_RESULT" = "True" ]; then
+    touch "$EXPECT_VALUES_FILENAME"
+    echo "[DEBUG] Path: $EXPECT_VALUES_PATH, Filename: $EXPECT_VALUES_FILENAME, "
+
+    EXPECT_VOLUME=(-v "$(pwd)/$EXPECT_VALUES_FILENAME":"$EXPECT_VALUES_PATH$EXPECT_VALUES_FILENAME")
+    echo "docker -v cmd: ${EXPECT_VOLUME[@]}"
+
+    EXPECT_ENV=(-e EXPECT_VALUES_PATH="$EXPECT_VALUES_PATH" -e EXPECT_VALUES_FILENAME="$EXPECT_VALUES_FILENAME")
+    echo "docker -e cmd: ${EXPECT_ENV[@]}"
+fi
+
 if ! grep -q "^HF_TOKEN=" /etc/environment; then
   gcloud secrets versions access latest --secret=bm-agent-hf-token --quiet | \
   sudo tee -a /etc/environment > /dev/null <<< "HF_TOKEN=$(cat)"
@@ -46,6 +76,9 @@ else
 fi
 DOCKER_HF_HOME="/tmp/hf_home"
 
+# Prune older images on the host to save space.
+docker system prune -a -f --filter "until=3h"
+
 # (TODO): Consider creating a remote registry to cache and share between agents.
 # Subsequent builds on the same host should be cached.
 
@@ -76,7 +109,10 @@ fi
 
 echo "Cleanup complete."
 
-docker build --no-cache -f docker/Dockerfile -t "vllm-tpu:${BUILDKITE_COMMIT}" .
+IMAGE_NAME="vllm-tpu"
+docker build --no-cache -f docker/Dockerfile -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
+
+echo "Execute Cmd: $@ on Image: ${IMAGE_NAME}:${BUILDKITE_COMMIT}"
 
 exec docker run \
   --privileged \
@@ -84,6 +120,8 @@ exec docker run \
   --shm-size=16G \
   --rm \
   -v "$LOCAL_HF_HOME":"$DOCKER_HF_HOME" \
+  "${EXPECT_VOLUME[@]}" \
+  "${EXPECT_ENV[@]}" \
   -e HF_HOME="$DOCKER_HF_HOME" \
   -e MODEL_IMPL_TYPE="$MODEL_IMPL_TYPE" \
   -e HF_TOKEN="$HF_TOKEN" \
@@ -93,5 +131,5 @@ exec docker run \
   ${QUANTIZATION:+-e QUANTIZATION="$QUANTIZATION"} \
   ${NEW_MODEL_DESIGN:+-e NEW_MODEL_DESIGN="$NEW_MODEL_DESIGN"} \
   ${USE_V6E8_QUEUE:+-e USE_V6E8_QUEUE="$USE_V6E8_QUEUE"} \
-  "vllm-tpu:${BUILDKITE_COMMIT}" \
-  "$@" # Pass all script arguments as the command to run in the container
+  "${IMAGE_NAME}:${BUILDKITE_COMMIT}" \
+  "$@" # Pass all script arguments as the command to run in the container
\ No newline at end of file
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 168a22d337..a11b596c62 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -15,12 +15,14 @@ RUN apt-get update && apt-get install -y \
 WORKDIR /workspace/vllm
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 RUN git clone $VLLM_REPO /workspace/vllm
+
 RUN pip install -r requirements/tpu.txt
 RUN VLLM_TARGET_DEVICE="tpu" pip install -e .
 
 # Install test dependencies
 RUN python3 -m pip install -e tests/vllm_test_utils
-RUN python3 -m pip install --no-cache-dir git+https://github.com/thuml/depyf.git pytest pytest-asyncio tpu-info datasets 'lm_eval[api]==0.4.4'
+RUN python3 -m pip install --no-cache-dir git+https://github.com/thuml/depyf.git pytest pytest-asyncio tpu-info datasets
+RUN python3 -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 RUN python3 -m pip install pytest-cov
 RUN python3 -m pip install numba
 
diff --git a/requirements_benchmarking.txt b/requirements_benchmarking.txt
index 04350c2cae..e9fa110218 100644
--- a/requirements_benchmarking.txt
+++ b/requirements_benchmarking.txt
@@ -5,3 +5,6 @@ evaluate
 datasets
 rouge-score
 scikit-learn
+openai
+lm_eval
+pytest
\ No newline at end of file
diff --git a/scripts/vllm/integration/conftest.py b/scripts/vllm/integration/conftest.py
new file mode 100644
index 0000000000..9e16362777
--- /dev/null
+++ b/scripts/vllm/integration/conftest.py
@@ -0,0 +1,30 @@
+import pytest
+import json
+
+def pytest_addoption(parser):
+    """Adds custom command-line options to pytest."""
+    parser.addoption(
+        "--tensor-parallel-size",
+        type=int,
+        default=1,
+        help="The tensor parallel size to use for the test."
+    )
+    parser.addoption(
+        "--expected-values-file",
+        type=str,
+        default=None,
+        help="Path to a JSON file with expected accuracy values."
+    )
+    parser.addoption(
+        "--model-names",
+        action="store",
+        # default="meta-llama/Llama-3.1-8B-Instruct",
+        default=None,
+        help="Comma-separated list of model names to test (e.g., 'model1,model2')"
+    )
+    parser.addoption(
+        "--fp8-kv-model-names",
+        action="store",
+        default=None,
+        help="Comma-separated list of model names to test fp8-kv (e.g., 'model1,model2')"
+    )
\ No newline at end of file
diff --git a/scripts/vllm/integration/test_accuracy.py b/scripts/vllm/integration/test_accuracy.py
new file mode 100644
index 0000000000..381c3eabf1
--- /dev/null
+++ b/scripts/vllm/integration/test_accuracy.py
@@ -0,0 +1,209 @@
+# Copied from vLLM: https://github.com/vllm-project/vllm/blob/839ab00/tests/entrypoints/llm/test_accuracy.py
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This file tests the accuracy of the vLLM server via LMEval.
+It uses local-completions, which interacts with vLLM
+through the OAI API with N concurrent connections.
+This simulates real-world usage of the API and makes
+sure that the zmq frontend mp RPC message passing and
+AsyncLLMEngine are working correctly.
+"""
+
+import lm_eval
+import pytest
+import json
+import threading
+import os
+
+from pathlib import Path
+from vllm.platforms import current_platform
+
+MODEL_NAMES = [
+    "Qwen/Qwen3-1.7B",
+    "google/gemma-3-1b-it",
+    # "meta-llama/Llama-3.1-8B-Instruct",
+]
+FP8_KV_MODEL_NAMES = [
+    "Qwen/Qwen3-1.7B",
+]
+NUM_CONCURRENT = 500
+TASK = "gsm8k"
+FILTER = "exact_match,strict-match"
+RTOL = 0.03
+_JSON_WRITE_LOCK = threading.Lock()
+
+EXPECTED_VALUES = {
+    "Qwen/Qwen3-1.7B": 0.68,
+    "google/gemma-3-1b-it": 0.25,
+    "meta-llama/Llama-3.1-8B-Instruct": 0.76,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.876,
+}
+
+# Parametrize test cases based on CLI arguments or default values
+def parametrize_by_cli_or_default(metafunc, fixture_name, cli_parameter, default_list):
+    if fixture_name in metafunc.fixturenames:
+        print(f"Checking CLI parameter '{cli_parameter}' for '{fixture_name}'")
+        names_str = metafunc.config.getoption(cli_parameter)
+        if names_str:
+            print(f"Using '{cli_parameter}' parameter for '{fixture_name}'")
+            param_list = [name.strip() for name in names_str.split(',') if name.strip()]
+            metafunc.parametrize(fixture_name, param_list)
+        else:
+            print(f"Using default list for '{fixture_name}'")
+            metafunc.parametrize(fixture_name, default_list)
+
+def pytest_generate_tests(metafunc):
+    parametrize_by_cli_or_default(metafunc, fixture_name="model", cli_parameter="--model-names", default_list=MODEL_NAMES)
+    parametrize_by_cli_or_default(metafunc, fixture_name="fp8_kv_model", cli_parameter="--fp8-kv-model-names", default_list=FP8_KV_MODEL_NAMES)
+
+# Write expected values to json file
+# TBD: To support the functionality of connecting GPU and TPU expected values in the future
+def write_expected_value_to_json(model_name, measured_value, json_filepath):
+    with _JSON_WRITE_LOCK:
+        data = {}
+        try:
+            with open(json_filepath, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+        except (FileNotFoundError, json.JSONDecodeError):
+            print(f"'{json_filepath}' not found or is empty/invalid. A new one will be created.")
+            data = {}
+        
+        data[model_name] = measured_value
+        
+        try:
+            with open(json_filepath, 'w', encoding='utf-8') as f:
+                json.dump(data, f, indent=4)
+            print(f"Successfully updated '{json_filepath}' with the result for {model_name}.")
+        except IOError as e:
+            print(f"Error: Failed to write to file '{json_filepath}'. Reason: {e}")
+
+# Read expected values from the JSON file if it exists
+# TBD: To support the functionality of connecting GPU and TPU expected values in the future
+def read_expected_value(expected_json_filepath=None):
+    expected_values_data = {}
+    if expected_json_filepath is None:
+        expected_values_data = EXPECTED_VALUES
+    else:
+        path_obj = Path(expected_json_filepath)
+        # Read expected values from json file if exist
+        if path_obj.is_file() and os.path.getsize(expected_json_filepath) > 0:
+            print(f"\n[Fixture] Loading from: {expected_json_filepath}")
+            with open(expected_json_filepath, 'r', encoding='utf-8') as f:
+                expected_values_data = json.load(f)
+        else:
+            raise FileNotFoundError(f"Expected values file not found: {expected_json_filepath}")
+    return expected_values_data
+
+
+def run_test(model_name, expected_values_data, expected_json_filepath, more_args=None):
+    """Run the end to end accuracy test."""
+    print(f"Running test for model: {model_name}")
+
+    model_args = f"pretrained={model_name},max_model_len=4096"
+    
+    download_path = "/mnt/disks/persist"
+    # download_path = "/tmp/hf_model"
+    if os.path.isdir(download_path) and os.access(download_path, os.R_OK) and os.access(download_path, os.W_OK):
+        model_args = f"{model_args},download_dir={download_path}"
+    
+    if more_args is not None:
+        model_args = "{},{}".format(model_args, more_args)
+
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks="gsm8k",
+        batch_size="auto",
+    )
+
+    # Execute default behavior when `expected_json_filepath` is not set.
+    if expected_json_filepath is None:
+        print(f"Execute default behavior")
+        measured_value = results["results"][TASK][FILTER]
+        assert model_name in EXPECTED_VALUES, (
+            f"Cannot find the expected value for the model {model_name=}")
+        expected_value = EXPECTED_VALUES[model_name]
+        assert (measured_value - RTOL < expected_value
+                and measured_value + RTOL > expected_value
+                ), f"Expected: {expected_value} |  Measured: {measured_value}"
+    else:
+        print(f"Execute specific models behavior")
+        measured_value = results["results"][TASK][FILTER]
+        expected_value = expected_values_data.get(model_name)
+
+        # Model expected value not exist, write in file
+        if model_name not in expected_values_data:
+            print(f"Warning: No expected value found for {model_name}. "
+                "Skipping accuracy check.")
+            print(f"Measured value: {measured_value}")
+            write_expected_value_to_json(model_name, measured_value, expected_json_filepath)
+
+        else:
+            print(f"Found expected value! {model_name=}, {measured_value=}, {expected_value=}")
+            assert (measured_value - RTOL < expected_value
+                and measured_value + RTOL > expected_value
+                ), f"Expected: {expected_value} |  Measured: {measured_value}"
+
+@pytest.mark.skipif(not current_platform.is_cuda()
+                    and not current_platform.is_tpu(),
+                    reason="V1 is currently only supported on CUDA and TPU")
+def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest):
+    """Run with the V1 Engine."""
+    print(f"Testing model: {model}...")
+
+    tp_size = request.config.getoption("--tensor-parallel-size")
+    expected_json_filepath = request.config.getoption("--expected-values-file")
+            
+    expected_values_data = read_expected_value(expected_json_filepath)
+
+    if tp_size is None:
+        tp_size = 1
+    elif tp_size < 1 or tp_size > 8:
+        raise ValueError
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        
+        more_args = None
+        if current_platform.is_tpu():
+            more_args = "max_model_len=2048,max_num_seqs=64"
+            tp_size_str = f"tensor_parallel_size={tp_size}"
+            more_args += ",{}".format(tp_size_str)
+        
+        print(f"common args: {more_args}")
+
+        run_test(model, expected_values_data, expected_json_filepath, more_args)
+
+
+@pytest.mark.skipif(not current_platform.is_cuda()
+                    and not current_platform.is_tpu(),
+                    reason="V1 is currently only supported on CUDA and TPU")
+def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
+        fp8_kv_model, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest):
+    """Run with the V1 Engine."""
+    print(f"Testing fp8_kv_model: {fp8_kv_model}...")
+
+    tp_size = request.config.getoption("--tensor-parallel-size")
+    expected_json_filepath = request.config.getoption("--expected-values-file")
+
+    expected_values_data = read_expected_value(expected_json_filepath)
+
+    if tp_size is None:
+        tp_size = 1
+    elif tp_size < 1 or tp_size > 8:
+        raise ValueError
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        more_args = None
+        if current_platform.is_tpu():
+            more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
+            tp_size_str = f"tensor_parallel_size={tp_size}"
+            more_args += ",{}".format(tp_size_str)
+            
+        print(f"common args: {more_args}")
+
+        run_test(fp8_kv_model, expected_values_data, expected_json_filepath, more_args)
\ No newline at end of file
diff --git a/tests/e2e/benchmarking/test_accuracy.sh b/tests/e2e/benchmarking/test_accuracy.sh
new file mode 100644
index 0000000000..cf65860b2d
--- /dev/null
+++ b/tests/e2e/benchmarking/test_accuracy.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.1-70B-Instruct"
+tensor_parallel_size=1
+
+extra_serve_args=()
+echo extra_serve_args: "${extra_serve_args[@]}"
+
+root_dir=/workspace
+exit_code=0
+
+helpFunction()
+{
+   echo ""
+   echo "Usage: $0 [-r full_path_to_root_dir -m model_id]"
+   echo -e "\t-r The path your root directory containing both 'vllm' and 'tpu_commons' (default: /workspace/, which is used in the Dockerfile)"
+   echo -e "\t-m A space-separated list of HuggingFace model ids to use (default: meta-llama/Llama-3.1-8B-Instruct and meta-llama/Llama-3.1-70B-Instruct)"
+   echo -e "\t-t Tensor parallel size (default: 1)"
+   exit 1
+}
+
+while [[ "$#" -gt 0 ]]; do
+    case "$1" in
+        -r|--root-dir-path)
+            root_dir="$2"
+            shift
+            shift
+            ;;
+        -m|--model)
+            model_list="$2"
+            shift
+            shift
+            ;;
+        -t|--tensor-parallel-size)
+            tensor_parallel_size="$2"
+            shift
+            shift
+            ;;
+        -h|--help)
+            helpFunction
+            ;;
+        *) # unknown option
+            echo "Unknown option: $1"
+            helpFunction
+            ;;
+    esac
+done
+
+echo "Using the root directory at $root_dir"
+echo "Testing $model_list prompts"
+
+cd "$root_dir"/vllm/tests/entrypoints/llm || exit
+
+# Overwrite a few of the vLLM benchmarking scripts with the TPU Commons ones
+cp "$root_dir"/tpu_commons/scripts/vllm/integration/*.py "$root_dir"/vllm/tests/entrypoints/llm/
+
+comma_model_list=${model_list// /,}
+
+echo "--------------------------------------------------"
+echo "Running integration for models: $comma_model_list"
+echo "--------------------------------------------------"
+
+# Default action
+python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list"
+
+exit $exit_code
\ No newline at end of file

From ecd93ada2ef6647a45a55d2e8334114bfa93544e Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Thu, 4 Sep 2025 11:24:04 +0800
Subject: [PATCH 02/38] squash 32 commit for next dev

---
 .buildkite/README_generate.md                 | 101 ++++++
 .buildkite/buildkite_ci_feature_template.yml  |  70 ++++
 .buildkite/buildkite_ci_model_template.yml    |  70 ++++
 .buildkite/features/Feat-A.yml                |  70 ++++
 .buildkite/generate_feature_buildkite.py      |  97 ++++++
 .buildkite/generate_model_buildkite.py        |  97 ++++++
 .../meta-llama_Llama-3_1-70B-Instruct.yml     |  70 ++++
 .../meta-llama_Llama-3_1-8B-Instruct.yml      |  71 ++++
 .buildkite/pipeline_jax.yml                   | 321 +++++++++---------
 .buildkite/scripts/bootstrap.sh               |  26 +-
 .buildkite/scripts/dynamic_upload.sh          |  81 +++++
 .buildkite/scripts/run_in_docker.sh           |  18 +-
 docker/Dockerfile                             |  13 +-
 docker/Dockerfile.cuda                        |  72 ++++
 requirements_benchmarking.txt                 |   5 +-
 scripts/vllm/integration/conftest.py          |   5 +-
 scripts/vllm/integration/test_accuracy.py     |  32 +-
 tests/e2e/benchmarking/test_accuracy.sh       |  11 +-
 18 files changed, 1024 insertions(+), 206 deletions(-)
 create mode 100644 .buildkite/README_generate.md
 create mode 100644 .buildkite/buildkite_ci_feature_template.yml
 create mode 100644 .buildkite/buildkite_ci_model_template.yml
 create mode 100644 .buildkite/features/Feat-A.yml
 create mode 100644 .buildkite/generate_feature_buildkite.py
 create mode 100644 .buildkite/generate_model_buildkite.py
 create mode 100644 .buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml
 create mode 100644 .buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
 create mode 100644 .buildkite/scripts/dynamic_upload.sh
 create mode 100644 docker/Dockerfile.cuda

diff --git a/.buildkite/README_generate.md b/.buildkite/README_generate.md
new file mode 100644
index 0000000000..4f243068f0
--- /dev/null
+++ b/.buildkite/README_generate.md
@@ -0,0 +1,101 @@
+# Buildkite Pipeline Generator
+
+This document outlines the process for using Python scripts to automatically generate Buildkite CI/CD pipeline configuration files. These scripts leverage templates to create consistent testing pipelines for both models and features.
+
+## Overview
+
+The primary goal of these tools is to streamline the creation of Buildkite pipelines. Instead of manually creating and editing YAML files for each new model or feature, you can run a simple command to generate a standardized pipeline file.
+
+There are two main generators:
+
+1.  **Model Pipeline Generator** (`generate_model_buildkite.py`): Creates a pipeline file for testing a specific machine learning model.
+2.  **Feature Pipeline Generator** (`generate_feature_buildkite.py`): Creates a pipeline file for testing a new feature.
+
+Both scripts work by reading a corresponding template file (`.yml`), replacing placeholder variables with your command-line arguments, and saving the result as a new YAML file in a designated output directory.
+
+## Directory Structure
+
+To use the scripts, your files should be arranged as follows. The output directories (`models/` and `features/`) will be created automatically if they do not exist.
+
+```
+.
+├── generate_model_buildkite.py
+├── buildkite_ci_model_template.yml
+├── generate_feature_buildkite.py
+├── buildkite_ci_feature_template.yml
+└── README_generate.md
+```
+
+-----
+
+## How to Use
+
+### 1\. Generating a Model Pipeline
+
+Use the `generate_model_buildkite.py` script to create a CI pipeline for a new model.
+
+**Command:**
+
+```bash
+python generate_model_buildkite.py --model-name <MODEL_NAME> --queue <QUEUE_NAME>
+```
+
+**Arguments:**
+
+  * `--model-name` (required): The name of the model to be tested. If the name contains special characters like `/` or `.`, they will be replaced with `_` in the output filename and for Buildkite step keys.
+  * `--queue` (required): The name of the Buildkite agent queue where the jobs will run (e.g., `tpu_v6e_queue`).
+
+**Example:**
+
+```bash
+python generate_model_buildkite.py --model-name meta-llama/Llama-3.1-8B-Instruct --queue tpu_v6e_queue
+```
+
+**Output:**
+
+This command will generate a new file located at `models/meta-llama_Llama-3_1-8B-Instruct.yml`.
+
+-----
+
+### 2\. Generating a Feature Pipeline
+
+Use the `generate_feature_buildkite.py` script to create a CI pipeline for a new feature.
+
+**Command:**
+
+```bash
+python generate_feature_buildkite.py --feature-name <FEATURE_NAME> --queue <QUEUE_NAME>
+```
+
+**Arguments:**
+
+  * `--feature-name` (required): The name of the feature to be tested.
+  * `--queue` (required): The name of the Buildkite agent queue.
+
+**Example:**
+
+```bash
+python generate_feature_buildkite.py --feature-name Feat-A --queue tpu_v6e_queue
+```
+
+**Output:**
+
+This command will generate a new file located at `features/Feat-A.yml`.
+
+-----
+
+## Important Notes: Placeholders & Customization
+
+The scripts work by performing a find-and-replace on specific placeholders within the template files. You can customize the `buildkite_ci_*_template.yml` files to change the structure of the generated pipelines.
+
+#### **Model Template Placeholders (`buildkite_ci_model_template.yml`)**
+
+  * `{MODEL_NAME}`: Replaced with the exact string provided to `--model-name`. This is typically used in human-readable fields like step `label`.
+  * `{SAFE_MODEL_NAME}`: A sanitized version of the model name, automatically generated by replacing characters like `/` and `.` with `_`. This is used for machine-readable fields like the step `key` and the output filename to ensure validity.
+  * `{QUEUE}`: Replaced with the string provided to `--queue`.
+
+#### **Feature Template Placeholders (`buildkite_ci_feature_template.yml`)**
+
+  * `{FEATURE_NAME}`: Replaced with the exact string provided to `--feature-name`.
+  * `{SAFE_FEATURE_NAME}`: A sanitized version of the feature name.
+  * `{QUEUE}`: Replaced with the string provided to `--queue`.
\ No newline at end of file
diff --git a/.buildkite/buildkite_ci_feature_template.yml b/.buildkite/buildkite_ci_feature_template.yml
new file mode 100644
index 0000000000..52b6a350af
--- /dev/null
+++ b/.buildkite/buildkite_ci_feature_template.yml
@@ -0,0 +1,70 @@
+# {FEATURE_NAME}
+agents:
+  queue: {QUEUE}
+steps:
+  - label: "Unit tests for {FEATURE_NAME}"
+    key: "ut_{SAFE_FEATURE_NAME}"
+    commands:
+      # - replace_with_test_commands  # TODO: Replace with actual test commands
+      - echo "[DEBUG], unit testing for {FEATURE_NAME}"  # TODO: Replace with actual test commands
+  - label: "Notifications: Unit tests for {FEATURE_NAME}"
+    key: "notifications_ut_{SAFE_FEATURE_NAME}"
+    depends_on: "ut_{SAFE_FEATURE_NAME}"
+    soft_fail: true
+    agents:
+      queue: {QUEUE}
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Unit tests for {FEATURE_NAME}" ut_{SAFE_FEATURE_NAME}
+
+  - label: "Integration tests for {FEATURE_NAME}"
+    key: "it_{SAFE_FEATURE_NAME}"
+    depends_on: "notifications_ut_{SAFE_FEATURE_NAME}"
+    commands:
+      # TODO: expected_accuracy needs to be parameterized
+      # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "{FEATURE_NAME}"
+      - echo "[DEBUG], integration testing for {FEATURE_NAME}"  # TODO: Replace with actual test commands
+  - label: "Notifications: Integration tests for {FEATURE_NAME}"
+    key: "notifications_it_{SAFE_FEATURE_NAME}"
+    depends_on: "it_{SAFE_FEATURE_NAME}"
+    soft_fail: true
+    agents:
+      queue: {QUEUE}
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Integration tests for {FEATURE_NAME}" it_{SAFE_FEATURE_NAME}
+
+  - label: "Performance benchmarks for {FEATURE_NAME}"
+    key: "pb_{SAFE_FEATURE_NAME}"
+    depends_on: "notifications_it_{SAFE_FEATURE_NAME}"
+    commands:
+      # - replace_with_test_command  # TODO: Replace with actual test commands
+      - echo "[DEBUG], performance benchmarking for {FEATURE_NAME}"  # TODO: Replace with actual test commands
+  - label: "Notifications: Performance benchmarks for {FEATURE_NAME}"
+    key: "notifications_pb_{SAFE_FEATURE_NAME}"
+    depends_on: "pb_{SAFE_FEATURE_NAME}"
+    soft_fail: true
+    agents:
+      queue: {QUEUE}
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Performance benchmarks for {FEATURE_NAME}" pb_{SAFE_FEATURE_NAME}
+
+  - label: "Stress tests for {FEATURE_NAME}"
+    key: "st_{SAFE_FEATURE_NAME}"
+    depends_on: "notifications_pb_{SAFE_FEATURE_NAME}"
+    commands:
+      # - our_stress_tests_script {FEATURE_NAME} expected_throughput # TODO: expected_throughput needs to be parameterized
+      - echo "[DEBUG], stress testing for {FEATURE_NAME}"  # TODO: Replace with actual test commands
+  - label: "Notifications: Stress tests for {FEATURE_NAME}"
+    key: "notifications_st_{SAFE_FEATURE_NAME}"
+    depends_on: "st_{SAFE_FEATURE_NAME}"
+    agents:
+      queue: {QUEUE}
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Stress tests for {FEATURE_NAME}" st_{SAFE_FEATURE_NAME}
diff --git a/.buildkite/buildkite_ci_model_template.yml b/.buildkite/buildkite_ci_model_template.yml
new file mode 100644
index 0000000000..aa5425b11f
--- /dev/null
+++ b/.buildkite/buildkite_ci_model_template.yml
@@ -0,0 +1,70 @@
+# {MODEL_NAME}
+agents:
+  queue: {QUEUE}
+steps:
+  - label: "Unit tests for {MODEL_NAME}"
+    key: "ut_{SAFE_MODEL_NAME}"
+    commands:
+      # - replace_with_test_commands  # TODO: Replace with actual test commands
+      - echo "[DEBUG], unit testing for {MODEL_NAME}"  # TODO: Replace to actual test commands
+  - label: "Notifications: Unit tests for {MODEL_NAME}"
+    key: "notifications_ut_{SAFE_MODEL_NAME}"
+    depends_on: "ut_{SAFE_MODEL_NAME}"
+    soft_fail: true
+    agents:
+      queue: {QUEUE}
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Unit tests for {MODEL_NAME}" ut_{SAFE_MODEL_NAME}
+
+  - label: "Integration tests for {MODEL_NAME}"
+    key: "it_{SAFE_MODEL_NAME}"
+    depends_on: "notifications_ut_{SAFE_MODEL_NAME}"
+    commands:
+      # TODO: expected_accuracy needs to be parameterized
+      # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "{MODEL_NAME}"
+      - echo "[DEBUG], integration testing for {MODEL_NAME}"  # TODO: Replace to actual test commands
+  - label: "Notifications: Integration tests for {MODEL_NAME}"
+    key: "notifications_it_{SAFE_MODEL_NAME}"
+    depends_on: "it_{SAFE_MODEL_NAME}"
+    soft_fail: true
+    agents:
+      queue: {QUEUE}
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Integration tests for {MODEL_NAME}" it_{SAFE_MODEL_NAME}
+
+  - label: "Performance benchmarks for {MODEL_NAME}"
+    key: "pb_{SAFE_MODEL_NAME}"
+    depends_on: "notifications_it_{SAFE_MODEL_NAME}"
+    commands:
+      # - replace_with_test_command  # TODO
+      - echo "[DEBUG], performance benchmarking for {MODEL_NAME}"  # TODO: Replace to actual test commands
+  - label: "Notifications: Performance benchmarks for {MODEL_NAME}"
+    key: "notifications_pb_{SAFE_MODEL_NAME}"
+    depends_on: "pb_{SAFE_MODEL_NAME}"
+    soft_fail: true
+    agents:
+      queue: {QUEUE}
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Performance benchmarks for {MODEL_NAME}" pb_{SAFE_MODEL_NAME}
+
+  - label: "Stress tests for {MODEL_NAME}"
+    key: "st_{SAFE_MODEL_NAME}"
+    depends_on: "notifications_pb_{SAFE_MODEL_NAME}"
+    commands:
+      # - our_stress_tests_script {MODEL_NAME} expected_throughput # TODO: expected_throughput needs to be parameterized
+      - echo "[DEBUG], stress testing for {MODEL_NAME}"  # TODO: Replace to actual test commands
+  - label: "Notifications: Stress tests for {MODEL_NAME}"
+    key: "notifications_st_{SAFE_MODEL_NAME}"
+    depends_on: "st_{SAFE_MODEL_NAME}"
+    agents:
+      queue: {QUEUE}
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Stress tests for {MODEL_NAME}" st_{SAFE_MODEL_NAME}
diff --git a/.buildkite/features/Feat-A.yml b/.buildkite/features/Feat-A.yml
new file mode 100644
index 0000000000..20ee8caeb8
--- /dev/null
+++ b/.buildkite/features/Feat-A.yml
@@ -0,0 +1,70 @@
+# Feat-A
+agents:
+  queue: tpu_v6e_queue
+steps:
+  - label: "Unit tests for Feat-A"
+    key: "ut_Feat-A"
+    commands:
+      # - replace_with_test_commands  # TODO: Replaced to actual test commands
+      - echo "[DEBUG], unit testing for Feat-A"  # TODO: Replace to actual test commands
+  - label: "Notifications: Unit tests for Feat-A"
+    key: "notifications_ut_Feat-A"
+    depends_on: "ut_Feat-A"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Unit tests for Feat-A" ut_Feat-A
+
+  - label: "Integration tests for Feat-A"
+    key: "it_Feat-A"
+    depends_on: "notifications_ut_Feat-A"
+    commands:
+      # TODO: expected_accuracy need parameterized
+      # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "Feat-A"
+      - echo "[DEBUG], integration testing for Feat-A"  # TODO: Replace to actual test commands
+  - label: "Notifications: Integration tests for Feat-A"
+    key: "notifications_it_Feat-A"
+    depends_on: "it_Feat-A"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Integration tests for Feat-A" it_Feat-A
+
+  - label: "Performance benchmarks for Feat-A"
+    key: "pb_Feat-A"
+    depends_on: "notifications_it_Feat-A"
+    commands:
+      # - replace_with_test_command  # TODO
+      - echo "[DEBUG], performance benchmarking for Feat-A"  # TODO: Replace to actual test commands
+  - label: "Notifications: Performance benchmarks for Feat-A"
+    key: "notifications_pb_Feat-A"
+    depends_on: "pb_Feat-A"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Performance benchmarks for Feat-A" pb_Feat-A
+
+  - label: "Stress tests for Feat-A"
+    key: "st_Feat-A"
+    depends_on: "notifications_pb_Feat-A"
+    commands:
+      # - our_stress_tests_script Feat-A expected_throughput # TODO: expected_throughput need parameterized
+      - echo "[DEBUG], stress testing for Feat-A"  # TODO: Replace to actual test commands
+  - label: "Notifications: Stress tests for Feat-A"
+    key: "notifications_st_Feat-A"
+    depends_on: "st_Feat-A"
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Stress tests for Feat-A" st_Feat-A
diff --git a/.buildkite/generate_feature_buildkite.py b/.buildkite/generate_feature_buildkite.py
new file mode 100644
index 0000000000..4403469425
--- /dev/null
+++ b/.buildkite/generate_feature_buildkite.py
@@ -0,0 +1,97 @@
+import argparse
+from pathlib import Path
+
+# Define the template filename and output directory as constants for easy modification.
+TEMPLATE_FILENAME = "buildkite_ci_feature_template.yml"
+OUTPUT_DIR = Path("features")
+
+def generate_from_template(feature_name: str, queue: str) -> None:
+    """
+    Render the feature CI template into OUTPUT_DIR/<safe_feature_name>.yml.
+
+    Args:
+        feature_name (str): Feature name; '/' and '.' become '_' in keys and the file name.
+        queue (str): Buildkite agent queue substituted for the QUEUE placeholder.
+    """
+    print(f"--- Starting to generate for Feature '{feature_name}' ---")
+
+    # TEMPLATE_FILENAME is a relative path, so it is resolved against the current directory.
+    template_path = Path(TEMPLATE_FILENAME)
+    if not template_path.is_file():
+        print(f"Error: Template file '{TEMPLATE_FILENAME}' not found!")
+        return
+
+    # Ensure the output directory exists. If not, create it.
+    OUTPUT_DIR.mkdir(exist_ok=True)
+    print(f"Output directory '{OUTPUT_DIR}' is ready.")
+
+    # Read the content of the template file.
+    try:
+        with open(template_path, 'r', encoding='utf-8') as f:
+            template_content = f.read()
+        print("Template file read successfully.")
+    except Exception as e:
+        print(f"Error reading template file: {e}")
+        return
+
+    # Replace '/' and '.' with an underscore for valid filenames and Buildkite step keys.
+    safe_feature_name = feature_name.replace("/", "_").replace(".", "_")
+
+    # Substitute the placeholders; besides KeyError, str.format raises ValueError/IndexError on stray or empty braces.
+    try:
+        generated_content = template_content.format(
+            FEATURE_NAME=feature_name,
+            SAFE_FEATURE_NAME=safe_feature_name,
+            QUEUE=queue,
+        )
+        print("Parameter substitution complete.")
+    except (KeyError, IndexError, ValueError) as e:
+        print(f"Error: Template substitution failed ({type(e).__name__}): {e}")
+        print("Please check for mismatches between your template file and script.")
+        return
+
+    # Define the output filename and path.
+    # The filename is based on the sanitized feature name with a .yml extension.
+    output_filename = f"{safe_feature_name}.yml"
+    output_path = OUTPUT_DIR / output_filename
+
+    # Write the generated content to the file.
+    try:
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write(generated_content)
+        print(f"✅ Success! Config file generated at: '{output_path}'")
+    except Exception as e:
+        print(f"Error writing output file: {e}")
+
+    print("-" * 40 + "\n")
+
+def main() -> None:
+    """
+    Parse the --feature-name and --queue options and generate the matching config file.
+    """
+    parser = argparse.ArgumentParser(
+        description="Generate a Buildkite CI config file from a template."
+    )
+
+    # Both options are declared with required=True, so argparse rejects a call that omits either.
+    parser.add_argument(
+        "--feature-name",
+        type=str,
+        required=True,
+        help="""
+             The name of the feature to use in the template (e.g., 'Feature-A'). 
+             If have '/' or '.' in the feature name, it will be replaced with '_' in the generated file name.
+        """
+    )
+    parser.add_argument(
+        "--queue",
+        type=str,
+        required=True,
+        help="The name of the agent queue to use (e.g., 'tpu_v6e_queue' or 'tpu_v6e_8_queue')."
+    )
+
+    args = parser.parse_args()
+    generate_from_template(feature_name=args.feature_name, queue=args.queue)
+
+if __name__ == "__main__":
+    main()
diff --git a/.buildkite/generate_model_buildkite.py b/.buildkite/generate_model_buildkite.py
new file mode 100644
index 0000000000..8928345fbe
--- /dev/null
+++ b/.buildkite/generate_model_buildkite.py
@@ -0,0 +1,97 @@
+import argparse
+from pathlib import Path
+
+# Define the template filename and output directory as constants for easy modification.
+TEMPLATE_FILENAME = "buildkite_ci_model_template.yml"
+OUTPUT_DIR = Path("models")
+
+def generate_from_template(model_name: str, queue: str) -> None:
+    """
+    Render the model CI template into OUTPUT_DIR/<safe_model_name>.yml.
+
+    Args:
+        model_name (str): Model name; '/' and '.' become '_' in keys and the file name.
+        queue (str): Buildkite agent queue substituted for the QUEUE placeholder.
+    """
+    print(f"--- Starting to generate for model '{model_name}' ---")
+
+    # TEMPLATE_FILENAME is a relative path, so it is resolved against the current directory.
+    template_path = Path(TEMPLATE_FILENAME)
+    if not template_path.is_file():
+        print(f"Error: Template file '{TEMPLATE_FILENAME}' not found!")
+        return
+
+    # Ensure the output directory exists. If not, create it.
+    OUTPUT_DIR.mkdir(exist_ok=True)
+    print(f"Output directory '{OUTPUT_DIR}' is ready.")
+
+    # Read the content of the template file.
+    try:
+        with open(template_path, 'r', encoding='utf-8') as f:
+            template_content = f.read()
+        print("Template file read successfully.")
+    except Exception as e:
+        print(f"Error reading template file: {e}")
+        return
+
+    # Replace '/' and '.' with an underscore for valid filenames and Buildkite step keys.
+    safe_model_name = model_name.replace("/", "_").replace(".", "_")
+
+    # Substitute the placeholders; besides KeyError, str.format raises ValueError/IndexError on stray or empty braces.
+    try:
+        generated_content = template_content.format(
+            MODEL_NAME=model_name,
+            SAFE_MODEL_NAME=safe_model_name,
+            QUEUE=queue,
+        )
+        print("Parameter substitution complete.")
+    except (KeyError, IndexError, ValueError) as e:
+        print(f"Error: Template substitution failed ({type(e).__name__}): {e}")
+        print("Please check for mismatches between your template file and script.")
+        return
+
+    # Define the output filename and path.
+    # The filename is based on the sanitized model name with a .yml extension.
+    output_filename = f"{safe_model_name}.yml"
+    output_path = OUTPUT_DIR / output_filename
+
+    # Write the generated content to the file.
+    try:
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write(generated_content)
+        print(f"✅ Success! Config file generated at: '{output_path}'")
+    except Exception as e:
+        print(f"Error writing output file: {e}")
+
+    print("-" * 40 + "\n")
+
+def main() -> None:
+    """
+    Parse the --model-name and --queue options and generate the matching config file.
+    """
+    parser = argparse.ArgumentParser(
+        description="Generate a Buildkite CI config file from a template."
+    )
+
+    # Both options are declared with required=True, so argparse rejects a call that omits either.
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        required=True,
+        help="""
+             The name of the model to use in the template (e.g., 'meta-llama/Llama-3.1-8B-Instruct'). 
+             If have '/' or '.' in the model name, it will be replaced with '_' in the generated file name.
+        """
+    )
+    parser.add_argument(
+        "--queue",
+        type=str,
+        required=True,
+        help="The name of the agent queue to use (e.g., 'tpu_v6e_queue' or 'tpu_v6e_8_queue')."
+    )
+
+    args = parser.parse_args()
+    generate_from_template(model_name=args.model_name, queue=args.queue)
+
+if __name__ == "__main__":
+    main()
diff --git a/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml
new file mode 100644
index 0000000000..da93c2dc2d
--- /dev/null
+++ b/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml
@@ -0,0 +1,70 @@
+# meta-llama/Llama-3.1-70B-Instruct
+agents:
+  queue: tpu_v6e_8_queue
+steps:
+  - label: "Unit tests for meta-llama/Llama-3.1-70B-Instruct"
+    key: "ut_meta-llama_Llama-3_1-70B-Instruct"
+    commands:
+      # - replace_with_test_commands  # TODO: Replaced to actual test commands
+      - echo "[DEBUG], unit testing for meta-llama/Llama-3.1-70B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Unit tests for meta-llama/Llama-3.1-70B-Instruct"
+    key: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "ut_meta-llama_Llama-3_1-70B-Instruct"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_8_queue
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Unit tests for meta-llama/Llama-3.1-70B-Instruct" ut_meta-llama_Llama-3_1-70B-Instruct
+
+  - label: "Integration tests for meta-llama/Llama-3.1-70B-Instruct"
+    key: "it_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct"
+    commands:
+      # TODO: expected_accuracy need parameterized
+      # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-70B-Instruct"
+      - echo "[DEBUG], integration testing for meta-llama/Llama-3.1-70B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Integration tests for meta-llama/Llama-3.1-70B-Instruct"
+    key: "notifications_it_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "it_meta-llama_Llama-3_1-70B-Instruct"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_8_queue
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Integration tests for meta-llama/Llama-3.1-70B-Instruct" it_meta-llama_Llama-3_1-70B-Instruct
+
+  - label: "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct"
+    key: "pb_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "notifications_it_meta-llama_Llama-3_1-70B-Instruct"
+    commands:
+      # - replace_with_test_command  # TODO
+      - echo "[DEBUG], performance benchmarking for meta-llama/Llama-3.1-70B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct"
+    key: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "pb_meta-llama_Llama-3_1-70B-Instruct"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_8_queue
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" pb_meta-llama_Llama-3_1-70B-Instruct
+
+  - label: "Stress tests for meta-llama/Llama-3.1-70B-Instruct"
+    key: "st_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct"
+    commands:
+      # - our_stress_tests_script meta-llama/Llama-3.1-70B-Instruct expected_throughput # TODO: expected_throughput need parameterized
+      - echo "[DEBUG], stress testing for meta-llama/Llama-3.1-70B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Stress tests for meta-llama/Llama-3.1-70B-Instruct"
+    key: "notifications_st_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "st_meta-llama_Llama-3_1-70B-Instruct"
+    agents:
+      queue: tpu_v6e_8_queue
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Stress tests for meta-llama/Llama-3.1-70B-Instruct" st_meta-llama_Llama-3_1-70B-Instruct
diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
new file mode 100644
index 0000000000..6d4e4288ab
--- /dev/null
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -0,0 +1,71 @@
+# meta-llama/Llama-3.1-8B-Instruct
+agents:
+  queue: tpu_v6e_queue
+steps:
+  - label: "Unit tests for meta-llama/Llama-3.1-8B-Instruct"
+    key: "ut_meta-llama_Llama-3_1-8B-Instruct"
+    commands:
+      # - replace_with_test_commands  # TODO: Replaced to actual test commands
+      # - echo "[DEBUG], unit testing for meta-llama/Llama-3.1-8B-Instruct"  # TODO: Replace to actual test commands
+      - echo "Running..."; sleep 20;echo "End"
+  - label: "Notifications: Unit tests for meta-llama/Llama-3.1-8B-Instruct"
+    key: "notifications_ut_meta-llama_Llama-3_1-8B-Instruct"
+    depends_on: "ut_meta-llama_Llama-3_1-8B-Instruct"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct
+
+  - label: "Integration tests for meta-llama/Llama-3.1-8B-Instruct"
+    key: "it_meta-llama_Llama-3_1-8B-Instruct"
+    depends_on: "notifications_ut_meta-llama_Llama-3_1-8B-Instruct"
+    commands:
+      # TODO: expected_accuracy need parameterized
+      # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-8B-Instruct"
+      - echo "[DEBUG], integration testing for meta-llama/Llama-3.1-8B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Integration tests for meta-llama/Llama-3.1-8B-Instruct"
+    key: "notifications_it_meta-llama_Llama-3_1-8B-Instruct"
+    depends_on: "it_meta-llama_Llama-3_1-8B-Instruct"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Integration tests for meta-llama/Llama-3.1-8B-Instruct" it_meta-llama_Llama-3_1-8B-Instruct
+
+  - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct"
+    key: "pb_meta-llama_Llama-3_1-8B-Instruct"
+    depends_on: "notifications_it_meta-llama_Llama-3_1-8B-Instruct"
+    commands:
+      # - replace_with_test_command  # TODO
+      - echo "[DEBUG], performance benchmarking for meta-llama/Llama-3.1-8B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct"
+    key: "notifications_pb_meta-llama_Llama-3_1-8B-Instruct"
+    depends_on: "pb_meta-llama_Llama-3_1-8B-Instruct"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" pb_meta-llama_Llama-3_1-8B-Instruct
+
+  - label: "Stress tests for meta-llama/Llama-3.1-8B-Instruct"
+    key: "st_meta-llama_Llama-3_1-8B-Instruct"
+    depends_on: "notifications_pb_meta-llama_Llama-3_1-8B-Instruct"
+    commands:
+      # - our_stress_tests_script meta-llama/Llama-3.1-8B-Instruct expected_throughput # TODO: expected_throughput need parameterized
+      - echo "[DEBUG], stress testing for meta-llama/Llama-3.1-8B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Stress tests for meta-llama/Llama-3.1-8B-Instruct"
+    key: "notifications_st_meta-llama_Llama-3_1-8B-Instruct"
+    depends_on: "st_meta-llama_Llama-3_1-8B-Instruct"
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Stress tests for meta-llama/Llama-3.1-8B-Instruct" st_meta-llama_Llama-3_1-8B-Instruct
diff --git a/.buildkite/pipeline_jax.yml b/.buildkite/pipeline_jax.yml
index 385113eb2e..5170f89d57 100644
--- a/.buildkite/pipeline_jax.yml
+++ b/.buildkite/pipeline_jax.yml
@@ -1,174 +1,191 @@
 steps:
   # -----------------------------------------------------------------
   # TEST STEPS - Calling wrapper
-  # -----------------------------------------------------------------
-   - label: "E2E MLPerf tests for JAX models"
-     key: test_0
-     soft_fail: true
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
+  # -----------------------------------------------------------------
+  #  - label: "E2E MLPerf tests for JAX models"
+  #    key: test_0
+  #    soft_fail: true
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-   - label: "E2E MLPerf tests for JAX models with quantization"
-     key: test_1
-     soft_fail: true
-     env:
-       QUANTIZATION: "True"
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
+  #  - label: "E2E MLPerf tests for JAX models with quantization"
+  #    key: test_1
+  #    soft_fail: true
+  #    env:
+  #      QUANTIZATION: "True"
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-   - label: "E2E MLPerf tests for JAX new models"
-     key: test_2
-     soft_fail: true
-     env:
-       NEW_MODEL_DESIGN: "True"
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
+  #  - label: "E2E MLPerf tests for JAX new models"
+  #    key: test_2
+  #    soft_fail: true
+  #    env:
+  #      NEW_MODEL_DESIGN: "True"
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-   - label: "E2E MLPerf tests for JAX + vLLM models"
-     key: test_3
-     soft_fail: true
-     env:
-       MODEL_IMPL_TYPE: "vllm"
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
+  #  - label: "E2E MLPerf tests for JAX + vLLM models"
+  #    key: test_3
+  #    soft_fail: true
+  #    env:
+  #      MODEL_IMPL_TYPE: "vllm"
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-   - label: "E2E MLperf tests for Llama4 models"
-     key: test_4
-     soft_fail: true
-     env:
-       NEW_MODEL_DESIGN: "True"
-       USE_V6E8_QUEUE: "True"
-     agents:
-       queue: tpu_v6e_8_queue
-     commands:
-       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
+  #  - label: "E2E MLperf tests for Llama4 models"
+  #    key: test_4
+  #    soft_fail: true
+  #    env:
+  #      NEW_MODEL_DESIGN: "True"
+  #      USE_V6E8_QUEUE: "True"
+  #    agents:
+  #      queue: tpu_v6e_8_queue
+  #    commands:
+  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
+  #  - label: "E2E multi modality test"
+  #    key: test_5
+  #    soft_fail: true
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - |
+  #        .buildkite/scripts/run_in_docker.sh \
+  #          bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_multi_modal_inference.py && \
+  #           bash /workspace/tpu_commons/tests/e2e/benchmarking/mm_bench.sh'
 
-   - label: "E2E multi modality test"
-     key: test_5
-     soft_fail: true
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - |
-         .buildkite/scripts/run_in_docker.sh \
-           bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_multi_modal_inference.py && \
-            bash /workspace/tpu_commons/tests/e2e/benchmarking/mm_bench.sh'
+  #  - label: "E2E speculative decoding test"
+  #    key: test_6
+  #    soft_fail: true
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - |
+  #        .buildkite/scripts/run_in_docker.sh \
+  #          bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_speculative_decoding.py'
 
-   - label: "E2E speculative decoding test"
-     key: test_6
-     soft_fail: true
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - |
-         .buildkite/scripts/run_in_docker.sh \
-           bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_speculative_decoding.py'
+  #  - label: "JAX unit tests"
+  #    key: test_7
+  #    soft_fail: true
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - |
+  #        .buildkite/scripts/run_in_docker.sh \
+  #          python3 -m pytest -s -v -x /workspace/tpu_commons/tests/ \
+  #          --ignore=/workspace/tpu_commons/tests/kernels \
+  #          --ignore=/workspace/tpu_commons/tests/lora \
+  #          --ignore=/workspace/tpu_commons/tests/e2e \
+  #          --ignore=/workspace/tpu_commons/tpu_commons/mock \
+  #          --cov-config=/workspace/tpu_commons/.coveragerc --cov tpu_commons --cov-report term-missing --cov-fail-under=69
 
-   - label: "JAX unit tests"
-     key: test_7
-     soft_fail: true
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - |
-         .buildkite/scripts/run_in_docker.sh \
-           python3 -m pytest -s -v -x /workspace/tpu_commons/tests/ \
-           --ignore=/workspace/tpu_commons/tests/kernels \
-           --ignore=/workspace/tpu_commons/tests/lora \
-           --ignore=/workspace/tpu_commons/tests/e2e \
-           --ignore=/workspace/tpu_commons/tpu_commons/mock \
-           --cov-config=/workspace/tpu_commons/.coveragerc --cov tpu_commons --cov-report term-missing --cov-fail-under=69
+  #  - label: "JAX unit tests - kernels"
+  #    key: test_8
+  #    soft_fail: true
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - |
+  #        .buildkite/scripts/run_in_docker.sh \
+  #          python3 -m pytest -s -v -x /workspace/tpu_commons/tests/kernels \
+  #          --ignore=/workspace/tpu_commons/tests/kernels/ragged_paged_attention_kernel_v2_test.py \
+  #          --ignore=/workspace/tpu_commons/tests/kernels/ragged_kv_cache_update_v2_test.py
 
-   - label: "JAX unit tests - kernels"
-     key: test_8
-     soft_fail: true
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - |
-         .buildkite/scripts/run_in_docker.sh \
-           python3 -m pytest -s -v -x /workspace/tpu_commons/tests/kernels \
-           --ignore=/workspace/tpu_commons/tests/kernels/ragged_paged_attention_kernel_v2_test.py \
-           --ignore=/workspace/tpu_commons/tests/kernels/ragged_kv_cache_update_v2_test.py
+  #  - label: "lora tests for JAX + vLLM models"
+  #    key: test_9
+  #    soft_fail: true
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - |
+  #        .buildkite/scripts/run_in_docker.sh \
+  #          bash -c 'SKIP_JAX_PRECOMPILE=1 MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_commons/tests/lora/test_lora.py'
 
-   - label: "lora tests for JAX + vLLM models"
-     key: test_9
-     soft_fail: true
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - |
-         .buildkite/scripts/run_in_docker.sh \
-           bash -c 'SKIP_JAX_PRECOMPILE=1 MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_commons/tests/lora/test_lora.py'
+  #  - label: "Integration Test llama-3.1-8B on TPU"
+  #    key: integration_test_llama_3_1_8B_tpu
+  #    depends_on:
+  #      - test_0
+  #      - test_1
+  #      - test_2
+  #      - test_3
+  #      - test_4
+  #      - test_5
+  #      - test_6
+  #      - test_7
+  #      - test_8
+  #      - test_9
+  #    soft_fail: true
+  #    agents:
+  #      # Need check agent
+  #      queue: tpu_v6e_queue
+  #    env:
+  #      EXPECT_VALUES_PATH: "/workspace/"
+  #      EXPECT_VALUES_FILENAME: "expect_values.json"
+  #    commands:
+  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-8B-Instruct"
+
+  #  - label: "Integration Test llama-3.1-70B on TPU"
+  #    key: integration_test_llama_3_1_70B_tpu
+  #    depends_on:
+  #      - test_0
+  #      - test_1
+  #      - test_2
+  #      - test_3
+  #      - test_4
+  #      - test_5
+  #      - test_6
+  #      - test_7
+  #      - test_8
+  #      - test_9
+  #    soft_fail: true
+  #    agents:
+  #      queue: tpu_v6e_8_queue
+  #    commands:
+  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 8 -m "meta-llama/Llama-3.1-70B-Instruct"
+
+  # # -----------------------------------------------------------------
+  # # NOTIFICATION STEP
+  # # -----------------------------------------------------------------
+  #  - label: "TPU Test Notification"
+  #    depends_on:
+  #      - test_0
+  #      - test_1
+  #      - test_2
+  #      - test_3
+  #      - test_4
+  #      - test_5
+  #      - test_6
+  #      - test_7
+  #      - test_8
+  #      - test_9
+  #      - integration_test_llama_3_1_8B_tpu
+  #      - integration_test_llama_3_1_70B_tpu
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - |
+  #        .buildkite/scripts/check_results.sh \
+  #          "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 \
+  #          integration_test_llama_3_1_8B_tpu integration_test_llama_3_1_70B_tpu
 
-   - label: "Integration Test llama-3.1-8B on TPU"
-     key: integration_test_llama_3_1_8B_tpu
-     depends_on:
-       - test_0
-       - test_1
-       - test_2
-       - test_3
-       - test_4
-       - test_5
-       - test_6
-       - test_7
-       - test_8
-       - test_9
-     soft_fail: true
+   - label: "Upload Dynamic Pipeline Test"
      agents:
        queue: tpu_v6e_queue
      commands:
-       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-8B-Instruct"
+       - .buildkite/scripts/dynamic_upload.sh
 
-   - label: "Integration Test llama-3.1-70B on TPU"
-     key: integration_test_llama_3_1_70B_tpu
-     depends_on:
-       - test_0
-       - test_1
-       - test_2
-       - test_3
-       - test_4
-       - test_5
-       - test_6
-       - test_7
-       - test_8
-       - test_9
-     soft_fail: true
-     agents:
-       queue: tpu_v6e_8_queue
-     commands:
-       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 8 -m "meta-llama/Llama-3.1-70B-Instruct"
+   - wait: ~
 
-  # -----------------------------------------------------------------
-  # NOTIFICATION STEP
-  # -----------------------------------------------------------------
-   - label: "TPU Test Notification"
-     depends_on:
-       - test_0
-       - test_1
-       - test_2
-       - test_3
-       - test_4
-       - test_5
-       - test_6
-       - test_7
-       - test_8
-       - test_9
-       - integration_test_llama_3_1_8B_tpu
-       - integration_test_llama_3_1_70B_tpu
+   - label: "Generate support matrices"
      agents:
        queue: tpu_v6e_queue
      commands:
-       - |
-         .buildkite/scripts/check_results.sh \
-           "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 \
-           integration_test_llama_3_1_8B_tpu integration_test_llama_3_1_70B_tpu
+       - echo "Generate support matrices..."
\ No newline at end of file
diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh
index d4113bda16..d7a901f4ce 100755
--- a/.buildkite/scripts/bootstrap.sh
+++ b/.buildkite/scripts/bootstrap.sh
@@ -1,27 +1,7 @@
 #!/bin/bash
 
-echo "--- Starting Buildkite Bootstrap ---"
-
-# # Check if the current build is a pull request
-# if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
-#   echo "This is a Pull Request build."
-#   PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/tpu_commons/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
-
-#   # If it's a PR, check for the specific label
-#   if [[ $PR_LABELS == *"ready"* ]]; then
-#     echo "Found 'ready' label on PR. Uploading main pipeline..."
-#     buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
-#     # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
-#   else
-#     echo "No 'ready' label found on PR. Skipping main pipeline upload."
-#     exit 0 # Exit with 0 to indicate success (no error, just skipped)
-#   fi
-# else
-#   # If it's NOT a Pull Request (e.g., branch push, tag, manual build)
-#   echo "This is not a Pull Request build. Uploading main pipeline."
-#   buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
-#   # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
-# fi
+echo "--- Starting Special Buildkite Bootstrap ---"
 
 buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
-echo "--- Buildkite Bootstrap Finished ---"
+
+echo "--- Buildkite Special Bootstrap Finished ---"
diff --git a/.buildkite/scripts/dynamic_upload.sh b/.buildkite/scripts/dynamic_upload.sh
new file mode 100644
index 0000000000..d2376176f2
--- /dev/null
+++ b/.buildkite/scripts/dynamic_upload.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+echo "--- Starting Special Buildkite Bootstrap ---"
+
+# for loop features and models upload to buildkite
+BUILDKITE_DIR=".buildkite"
+TARGET_FOLDERS="models features"
+MODEL_LIST_METADATA_KEY="model-names-list"
+
+declare -a model_names
+declare -a pipeline_steps
+
+echo "--- Scanning: ${TARGET_FOLDERS}"
+
+for folder in $TARGET_FOLDERS; do
+  folder=$BUILDKITE_DIR/$folder
+  # Check if the folder exists
+  if [[ ! -d "$folder" ]]; then
+    echo "Warning: Folder '$folder' not found. Skipping."
+    continue
+  fi
+
+  # Use find command to locate all .yml or .yaml files
+  # -print0 and read -r -d '' are a safe way to handle filenames with special characters (like spaces)
+  while IFS= read -r -d '' yml_file; do
+    echo "--- handling yml file: ${yml_file}"
+
+    # Read the first line for getting model name
+    first_line=$(awk 'NR==1{print $0; exit}' "${yml_file}")
+
+    # Check if the first line contains the '# ' comment marker
+    if [[ "$first_line" == "# "* ]]; then
+      model_name=${first_line#\# }
+      echo "Model Name: ${model_name}"
+      model_names+=("${model_name}")
+    else
+      echo "Warning: The first line of ${yml_file} is not in the expected comment format (ex: '# model-name')."
+    fi
+
+    # --- Dynamic Buildkite Pipeline Step ---
+    # For each found .yml file, generate a command step
+    # Here we assume the .yml file itself is an executable buildkite pipeline step script
+    pipeline_yaml=$(cat <<EOF
+- label: "Upload: ${yml_file}"
+  command: "buildkite-agent pipeline upload ${yml_file}"
+  agents:
+    queue: tpu_v6e_queue
+EOF
+)
+
+  pipeline_steps+=("${pipeline_yaml}")
+
+  done < <(find "$folder" -type f \( -name "*.yml" -o -name "*.yaml" \) -print0)
+done
+
+# Convert array to a newline-separated string
+model_list_string=$(printf "%s\n" "${model_names[@]}")
+
+if [[ -n "$model_list_string" ]]; then
+  echo "--- Uploading Model Name List to Meta-data"
+  echo "${model_list_string}" | buildkite-agent meta-data set "${MODEL_LIST_METADATA_KEY}"
+  echo "Testing: $(buildkite-agent meta-data get "model-names-list")"
+else
+  echo "--- No Model Names found to upload."
+fi
+
+
+# --- Upload Dynamic Pipeline ---
+
+if [[ -n "$pipeline_steps" ]]; then
+  echo "--- Uploading Dynamic Pipeline Steps"
+  final_pipeline_yaml="steps:"$'\n'
+  final_pipeline_yaml+=$(printf "%s\n" "${pipeline_steps[@]}")
+  echo "Upload YML: ${final_pipeline_yaml}"
+  echo -e "${final_pipeline_yaml}" | buildkite-agent pipeline upload
+else
+  echo "--- No .yml files found, no new Pipeline Steps to upload."
+  # buildkite-agent step update --state "passed"
+fi
+
+echo "--- Buildkite Special Bootstrap Finished ---"
diff --git a/.buildkite/scripts/run_in_docker.sh b/.buildkite/scripts/run_in_docker.sh
index 7f51f86c06..57ced35f2a 100755
--- a/.buildkite/scripts/run_in_docker.sh
+++ b/.buildkite/scripts/run_in_docker.sh
@@ -12,6 +12,7 @@ if [ "$#" -eq 0 ]; then
 fi
 
 MOUNT_EXPECT_RESULT="False"
+GPU_BASE="False"
 OTHER_ARGS=()
 
 while [[ $# -gt 0 ]]; do
@@ -20,6 +21,10 @@ while [[ $# -gt 0 ]]; do
             MOUNT_EXPECT_RESULT="True"
             shift 1
             ;;
+        --gpu)
+            GPU_BASE="True"
+            shift 1
+            ;;
         *)
             OTHER_ARGS+=("$@")
             break
@@ -110,7 +115,18 @@ fi
 echo "Cleanup complete."
 
 IMAGE_NAME="vllm-tpu"
-docker build --no-cache -f docker/Dockerfile -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
+if [ $GPU_BASE == "True" ]; then
+  echo "Docker build gpu image"
+  IMAGE_NAME="vllm-gpu"
+  docker build --no-cache -f docker/Dockerfile.cuda -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
+  # DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 \
+  #   --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true \
+  #   --build-arg INSTALL_KV_CONNECTORS=true --tag "${IMAGE_NAME}:${BUILDKITE_COMMIT}" \
+  #   --target vllm-openai --progress plain -f docker/Dockerfile.cuda.dev .
+    # public.ecr.aws/q9t5s3a7/vllm-release-repo
+else
+  docker build --no-cache -f docker/Dockerfile -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
+fi
 
 echo "Execute Cmd: $@ on Image: ${IMAGE_NAME}:${BUILDKITE_COMMIT}"
 
diff --git a/docker/Dockerfile b/docker/Dockerfile
index a11b596c62..32f3ee1b25 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -21,10 +21,15 @@ RUN VLLM_TARGET_DEVICE="tpu" pip install -e .
 
 # Install test dependencies
 RUN python3 -m pip install -e tests/vllm_test_utils
-RUN python3 -m pip install --no-cache-dir git+https://github.com/thuml/depyf.git pytest pytest-asyncio tpu-info datasets
-RUN python3 -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
-RUN python3 -m pip install pytest-cov
-RUN python3 -m pip install numba
+RUN python3 -m pip install --no-cache-dir \
+    git+https://github.com/thuml/depyf.git \
+    pytest \
+    pytest-asyncio \
+    tpu-info \
+    datasets \
+    git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \
+    pytest-cov \
+    numba
 
 # Install tpu_commons
 WORKDIR /workspace/tpu_commons
diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda
new file mode 100644
index 0000000000..6f27ad8f93
--- /dev/null
+++ b/docker/Dockerfile.cuda
@@ -0,0 +1,72 @@
+ARG CUDA_VERSION=12.8.1
+ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+
+FROM $BASE_IMAGE
+
+ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
+ARG PYTHON_VERSION=3.12
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install Python and other dependencies
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install -y --no-install-recommends \
+       software-properties-common \
+       curl \
+    && add-apt-repository ppa:deadsnakes/ppa -y \
+    && apt-get update -y \
+    && apt-get install -y --no-install-recommends \
+       python${PYTHON_VERSION} \
+       python${PYTHON_VERSION}-dev \
+       python${PYTHON_VERSION}-venv \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
+    && python3 --version && python3 -m pip --version \
+    # Clean up apt caches to reduce image size
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+
+# Remove existing versions of dependencies
+RUN pip uninstall -y torch torch_xla torchvision
+
+# Install some basic utilities
+RUN apt-get update && apt-get install -y \
+    git \
+    libopenblas-base libopenmpi-dev libomp-dev
+
+# Build vLLM
+WORKDIR /workspace/vllm
+ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
+RUN git clone $VLLM_REPO /workspace/vllm
+
+RUN pip install -r requirements/cuda.txt
+RUN VLLM_TARGET_DEVICE="cuda" pip install -e .
+
+# Install test dependencies
+RUN python3 -m pip install -e tests/vllm_test_utils
+RUN python3 -m pip install --no-cache-dir \
+    git+https://github.com/thuml/depyf.git \
+    pytest \
+    pytest-asyncio \
+    tpu-info \
+    datasets \
+    git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \
+    pytest-cov \
+    numba
+
+# Install tpu_commons
+WORKDIR /workspace/tpu_commons
+# Install requirements first and cache so we don't need to re-install on code change.
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+COPY requirements_benchmarking.txt .
+# These are needed for the E2E benchmarking tests (i.e. tests/e2e/benchmarking/mlperf.sh)
+RUN pip install -r requirements_benchmarking.txt
+COPY . .
+RUN pip install -e .
+
+CMD ["/bin/bash"]
diff --git a/requirements_benchmarking.txt b/requirements_benchmarking.txt
index e9fa110218..4484d13728 100644
--- a/requirements_benchmarking.txt
+++ b/requirements_benchmarking.txt
@@ -4,7 +4,4 @@ nltk
 evaluate
 datasets
 rouge-score
-scikit-learn
-openai
-lm_eval
-pytest
\ No newline at end of file
+scikit-learn
\ No newline at end of file
diff --git a/scripts/vllm/integration/conftest.py b/scripts/vllm/integration/conftest.py
index 9e16362777..b1c2ba1872 100644
--- a/scripts/vllm/integration/conftest.py
+++ b/scripts/vllm/integration/conftest.py
@@ -13,8 +13,9 @@ def pytest_addoption(parser):
         "--expected-values-file",
         type=str,
         default=None,
-        help="Path to a JSON file with expected accuracy values."
-    )
+        help="This is used to specify the JSON file that stores the expected values. " +
+            "The results from running test_accuracy on a GPU will be saved to this file, " +
+            "and when running on a TPU, the results will be read from this file for comparison.")
     parser.addoption(
         "--model-names",
         action="store",
diff --git a/scripts/vllm/integration/test_accuracy.py b/scripts/vllm/integration/test_accuracy.py
index 381c3eabf1..68fb69e643 100644
--- a/scripts/vllm/integration/test_accuracy.py
+++ b/scripts/vllm/integration/test_accuracy.py
@@ -20,14 +20,8 @@
 from pathlib import Path
 from vllm.platforms import current_platform
 
-MODEL_NAMES = [
-    "Qwen/Qwen3-1.7B",
-    "google/gemma-3-1b-it",
-    # "meta-llama/Llama-3.1-8B-Instruct",
-]
-FP8_KV_MODEL_NAMES = [
-    "Qwen/Qwen3-1.7B",
-]
+MODEL_NAMES = []
+FP8_KV_MODEL_NAMES = []
 NUM_CONCURRENT = 500
 TASK = "gsm8k"
 FILTER = "exact_match,strict-match"
@@ -67,7 +61,9 @@ def write_expected_value_to_json(model_name, measured_value, json_filepath):
             with open(json_filepath, 'r', encoding='utf-8') as f:
                 data = json.load(f)
         except (FileNotFoundError, json.JSONDecodeError):
-            print(f"'{json_filepath}' not found or is empty/invalid. A new one will be created.")
+            print(
+                f"'{json_filepath}' not found or is empty/invalid. A new one will be created."
+            )
             data = {}
         
         data[model_name] = measured_value
@@ -75,9 +71,14 @@ def write_expected_value_to_json(model_name, measured_value, json_filepath):
         try:
             with open(json_filepath, 'w', encoding='utf-8') as f:
                 json.dump(data, f, indent=4)
-            print(f"Successfully updated '{json_filepath}' with the result for {model_name}.")
+            print(
+                f"Successfully updated '{json_filepath}' with the result for {model_name}."
+            )
         except IOError as e:
-            print(f"Error: Failed to write to file '{json_filepath}'. Reason: {e}")
+            print(
+                f"Error: Failed to write to file '{json_filepath}'. Reason: {e}"
+            )
+            raise
 
 # Read expected values from json file if exist
 # TBD: To support the functionality of connecting GPU and TPU expected values in the future
@@ -102,12 +103,6 @@ def run_test(model_name, expected_values_data, expected_json_filepath, more_args
     print(f"Running test for model: {model_name}")
 
     model_args = f"pretrained={model_name},max_model_len=4096"
-    
-    download_path = "/mnt/disks/persist"
-    # download_path = "/tmp/hf_model"
-    if os.path.isdir(download_path) and os.access(download_path, os.R_OK) and os.access(download_path, os.W_OK):
-        model_args = f"{model_args},download_dir={download_path}"
-    
     if more_args is not None:
         model_args = "{},{}".format(model_args, more_args)
 
@@ -166,9 +161,8 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch, requ
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
         
-        more_args = None
+        more_args = "max_model_len=2048,max_num_seqs=64"
         if current_platform.is_tpu():
-            more_args = "max_model_len=2048,max_num_seqs=64"
             tp_size_str = f"tensor_parallel_size={tp_size}"
             more_args += ",{}".format(tp_size_str)
         
diff --git a/tests/e2e/benchmarking/test_accuracy.sh b/tests/e2e/benchmarking/test_accuracy.sh
index cf65860b2d..0ce96d9e42 100644
--- a/tests/e2e/benchmarking/test_accuracy.sh
+++ b/tests/e2e/benchmarking/test_accuracy.sh
@@ -2,6 +2,7 @@
 
 model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.1-70B-Instruct"
 tensor_parallel_size=1
+gpu_enabled=false
 
 extra_serve_args=()
 echo extra_serve_args: "${extra_serve_args[@]}"
@@ -36,6 +37,10 @@ while [[ "$#" -gt 0 ]]; do
             shift
             shift
             ;;
+        -g|--gpu)
+            gpu_enabled=true
+            shift
+            ;;
         -h|--help)
             helpFunction
             ;;
@@ -61,6 +66,10 @@ echo "Running integration for models: $comma_model_list"
 echo "--------------------------------------------------"
 
 # Default action
-python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list"
+if $gpu_enabled; then
+    python3 -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list"
+else
+    python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list"
+fi
 
 exit $exit_code
\ No newline at end of file

From e78c465fe237fbd080da6cd3213910223f02c925 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Tue, 23 Sep 2025 16:58:49 +0800
Subject: [PATCH 03/38] clean for test

---
 .buildkite/buildkite_ci_feature_template.yml  |  51 +++-
 .buildkite/buildkite_ci_model_template.yml    |  51 +++-
 .buildkite/features/Feat-A.yml                |  70 -----
 .../meta-llama_Llama-3_1-70B-Instruct.yml     |  51 +++-
 .../meta-llama_Llama-3_1-8B-Instruct.yml      |  54 +++-
 .buildkite/pipeline_dynamic.yml               |  17 ++
 .buildkite/pipeline_jax.yml                   | 276 +++++++-----------
 .buildkite/scripts/bootstrap.sh               |  25 +-
 .buildkite/scripts/dynamic_bootstrap.sh       | 145 +++++++++
 .buildkite/scripts/dynamic_upload.sh          |  81 -----
 .buildkite/scripts/run_in_docker.sh           |  59 +---
 buildkite-script-dynamic.gz                   | Bin 0 -> 10583 bytes
 12 files changed, 494 insertions(+), 386 deletions(-)
 delete mode 100644 .buildkite/features/Feat-A.yml
 rename .buildkite/models/{ => informational}/meta-llama_Llama-3_1-70B-Instruct.yml (58%)
 create mode 100644 .buildkite/pipeline_dynamic.yml
 create mode 100644 .buildkite/scripts/dynamic_bootstrap.sh
 delete mode 100644 .buildkite/scripts/dynamic_upload.sh
 create mode 100644 buildkite-script-dynamic.gz

diff --git a/.buildkite/buildkite_ci_feature_template.yml b/.buildkite/buildkite_ci_feature_template.yml
index 52b6a350af..e907815416 100644
--- a/.buildkite/buildkite_ci_feature_template.yml
+++ b/.buildkite/buildkite_ci_feature_template.yml
@@ -10,13 +10,24 @@ steps:
   - label: "Notifications: Unit tests for {FEATURE_NAME}"
     key: "notifications_ut_{SAFE_FEATURE_NAME}"
     depends_on: "ut_{SAFE_FEATURE_NAME}"
-    soft_fail: true
     agents:
       queue: {QUEUE}
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Unit tests for {FEATURE_NAME}" ut_{SAFE_FEATURE_NAME}
+    plugins:
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "{FEATURE_NAME}:UnitTest" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "{FEATURE_NAME}:UnitTest" "failed"
+            fi
 
   - label: "Integration tests for {FEATURE_NAME}"
     key: "it_{SAFE_FEATURE_NAME}"
@@ -28,13 +39,24 @@ steps:
   - label: "Notifications: Integration tests for {FEATURE_NAME}"
     key: "notifications_it_{SAFE_FEATURE_NAME}"
     depends_on: "it_{SAFE_FEATURE_NAME}"
-    soft_fail: true
     agents:
       queue: {QUEUE}
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Integration tests for {FEATURE_NAME}" it_{SAFE_FEATURE_NAME}
+    plugins:
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "{FEATURE_NAME}:IntTest" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "{FEATURE_NAME}:IntTest" "failed"
+            fi
 
   - label: "Performance benchmarks for {FEATURE_NAME}"
     key: "pb_{SAFE_FEATURE_NAME}"
@@ -45,13 +67,24 @@ steps:
   - label: "Notifications: Performance benchmarks for {FEATURE_NAME}"
     key: "notifications_pb_{SAFE_FEATURE_NAME}"
     depends_on: "pb_{SAFE_FEATURE_NAME}"
-    soft_fail: true
     agents:
       queue: {QUEUE}
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Performance benchmarks for {FEATURE_NAME}" pb_{SAFE_FEATURE_NAME}
+    plugins:
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "{FEATURE_NAME}:Benchmark" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "{FEATURE_NAME}:Benchmark" "failed"
+            fi
 
   - label: "Stress tests for {FEATURE_NAME}"
     key: "st_{SAFE_FEATURE_NAME}"
@@ -68,3 +101,15 @@ steps:
       - |
         .buildkite/scripts/check_results.sh \
           "Stress tests for {FEATURE_NAME}" st_{SAFE_FEATURE_NAME}
+    plugins:
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "{FEATURE_NAME}:StressTest" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "{FEATURE_NAME}:StressTest" "failed"
+            fi
diff --git a/.buildkite/buildkite_ci_model_template.yml b/.buildkite/buildkite_ci_model_template.yml
index aa5425b11f..f58b557557 100644
--- a/.buildkite/buildkite_ci_model_template.yml
+++ b/.buildkite/buildkite_ci_model_template.yml
@@ -10,13 +10,24 @@ steps:
   - label: "Notifications: Unit tests for {MODEL_NAME}"
     key: "notifications_ut_{SAFE_MODEL_NAME}"
     depends_on: "ut_{SAFE_MODEL_NAME}"
-    soft_fail: true
     agents:
       queue: {QUEUE}
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Unit tests for {MODEL_NAME}" ut_{SAFE_MODEL_NAME}
+    plugins:
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "{MODEL_NAME}:UnitTest" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "{MODEL_NAME}:UnitTest" "failed"
+            fi
 
   - label: "Integration tests for {MODEL_NAME}"
     key: "it_{SAFE_MODEL_NAME}"
@@ -28,13 +39,24 @@ steps:
   - label: "Notifications: Integration tests for {MODEL_NAME}"
     key: "notifications_it_{SAFE_MODEL_NAME}"
     depends_on: "it_{SAFE_MODEL_NAME}"
-    soft_fail: true
     agents:
       queue: {QUEUE}
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Integration tests for {MODEL_NAME}" it_{SAFE_MODEL_NAME}
+    plugins:
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "{MODEL_NAME}:IntTest" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "{MODEL_NAME}:IntTest" "failed"
+            fi
 
   - label: "Performance benchmarks for {MODEL_NAME}"
     key: "pb_{SAFE_MODEL_NAME}"
@@ -45,13 +67,24 @@ steps:
   - label: "Notifications: Performance benchmarks for {MODEL_NAME}"
     key: "notifications_pb_{SAFE_MODEL_NAME}"
     depends_on: "pb_{SAFE_MODEL_NAME}"
-    soft_fail: true
     agents:
       queue: {QUEUE}
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Performance benchmarks for {MODEL_NAME}" pb_{SAFE_MODEL_NAME}
+    plugins:
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "{MODEL_NAME}:Benchmark" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "{MODEL_NAME}:Benchmark" "failed"
+            fi
 
   - label: "Stress tests for {MODEL_NAME}"
     key: "st_{SAFE_MODEL_NAME}"
@@ -68,3 +101,15 @@ steps:
       - |
         .buildkite/scripts/check_results.sh \
           "Stress tests for {MODEL_NAME}" st_{SAFE_MODEL_NAME}
+    plugins:
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "{MODEL_NAME}:StressTest" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "{MODEL_NAME}:StressTest" "failed"
+            fi
diff --git a/.buildkite/features/Feat-A.yml b/.buildkite/features/Feat-A.yml
deleted file mode 100644
index 20ee8caeb8..0000000000
--- a/.buildkite/features/Feat-A.yml
+++ /dev/null
@@ -1,70 +0,0 @@
-# Feat-A
-agents:
-  queue: tpu_v6e_queue
-steps:
-  - label: "Unit tests for Feat-A"
-    key: "ut_Feat-A"
-    commands:
-      # - replace_with_test_commands  # TODO: Replaced to actual test commands
-      - echo "[DEBUG], unit testing for Feat-A"  # TODO: Replace to actual test commands
-  - label: "Notifications: Unit tests for Feat-A"
-    key: "notifications_ut_Feat-A"
-    depends_on: "ut_Feat-A"
-    soft_fail: true
-    agents:
-      queue: tpu_v6e_queue
-    commands:
-      - |
-        .buildkite/scripts/check_results.sh \
-          "Unit tests for Feat-A" ut_Feat-A
-
-  - label: "Integration tests for Feat-A"
-    key: "it_Feat-A"
-    depends_on: "notifications_ut_Feat-A"
-    commands:
-      # TODO: expected_accuracy need parameterized
-      # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "Feat-A"
-      - echo "[DEBUG], integration testing for Feat-A"  # TODO: Replace to actual test commands
-  - label: "Notifications: Integration tests for Feat-A"
-    key: "notifications_it_Feat-A"
-    depends_on: "it_Feat-A"
-    soft_fail: true
-    agents:
-      queue: tpu_v6e_queue
-    commands:
-      - |
-        .buildkite/scripts/check_results.sh \
-          "Integration tests for Feat-A" it_Feat-A
-
-  - label: "Performance benchmarks for Feat-A"
-    key: "pb_Feat-A"
-    depends_on: "notifications_it_Feat-A"
-    commands:
-      # - replace_with_test_command  # TODO
-      - echo "[DEBUG], performance benchmarking for Feat-A"  # TODO: Replace to actual test commands
-  - label: "Notifications: Performance benchmarks for Feat-A"
-    key: "notifications_pb_Feat-A"
-    depends_on: "pb_Feat-A"
-    soft_fail: true
-    agents:
-      queue: tpu_v6e_queue
-    commands:
-      - |
-        .buildkite/scripts/check_results.sh \
-          "Performance benchmarks for Feat-A" pb_Feat-A
-
-  - label: "Stress tests for Feat-A"
-    key: "st_Feat-A"
-    depends_on: "notifications_pb_Feat-A"
-    commands:
-      # - our_stress_tests_script Feat-A expected_throughput # TODO: expected_throughput need parameterized
-      - echo "[DEBUG], stress testing for Feat-A"  # TODO: Replace to actual test commands
-  - label: "Notifications: Stress tests for Feat-A"
-    key: "notifications_st_Feat-A"
-    depends_on: "st_Feat-A"
-    agents:
-      queue: tpu_v6e_queue
-    commands:
-      - |
-        .buildkite/scripts/check_results.sh \
-          "Stress tests for Feat-A" st_Feat-A
diff --git a/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml b/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml
similarity index 58%
rename from .buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml
rename to .buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml
index da93c2dc2d..9539111d5b 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml
+++ b/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml
@@ -10,13 +10,24 @@ steps:
   - label: "Notifications: Unit tests for meta-llama/Llama-3.1-70B-Instruct"
     key: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct"
     depends_on: "ut_meta-llama_Llama-3_1-70B-Instruct"
-    soft_fail: true
     agents:
       queue: tpu_v6e_8_queue
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Unit tests for meta-llama/Llama-3.1-70B-Instruct" ut_meta-llama_Llama-3_1-70B-Instruct
+    plugins:
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:UnitTest" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:UnitTest" "failed"
+            fi
 
   - label: "Integration tests for meta-llama/Llama-3.1-70B-Instruct"
     key: "it_meta-llama_Llama-3_1-70B-Instruct"
@@ -28,13 +39,24 @@ steps:
   - label: "Notifications: Integration tests for meta-llama/Llama-3.1-70B-Instruct"
     key: "notifications_it_meta-llama_Llama-3_1-70B-Instruct"
     depends_on: "it_meta-llama_Llama-3_1-70B-Instruct"
-    soft_fail: true
     agents:
       queue: tpu_v6e_8_queue
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Integration tests for meta-llama/Llama-3.1-70B-Instruct" it_meta-llama_Llama-3_1-70B-Instruct
+    plugins:
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:IntTest" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:IntTest" "failed"
+            fi
 
   - label: "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct"
     key: "pb_meta-llama_Llama-3_1-70B-Instruct"
@@ -45,13 +67,24 @@ steps:
   - label: "Notifications: Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct"
     key: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct"
     depends_on: "pb_meta-llama_Llama-3_1-70B-Instruct"
-    soft_fail: true
     agents:
       queue: tpu_v6e_8_queue
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" pb_meta-llama_Llama-3_1-70B-Instruct
+    plugins:
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:Benchmark" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:Benchmark" "failed"
+            fi
 
   - label: "Stress tests for meta-llama/Llama-3.1-70B-Instruct"
     key: "st_meta-llama_Llama-3_1-70B-Instruct"
@@ -68,3 +101,15 @@ steps:
       - |
         .buildkite/scripts/check_results.sh \
           "Stress tests for meta-llama/Llama-3.1-70B-Instruct" st_meta-llama_Llama-3_1-70B-Instruct
+    plugins:
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:StressTest" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:StressTest" "failed"
+            fi
diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
index 6d4e4288ab..eff2e3c815 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -6,18 +6,28 @@ steps:
     key: "ut_meta-llama_Llama-3_1-8B-Instruct"
     commands:
       # - replace_with_test_commands  # TODO: Replaced to actual test commands
-      # - echo "[DEBUG], unit testing for meta-llama/Llama-3.1-8B-Instruct"  # TODO: Replace to actual test commands
-      - echo "Running..."; sleep 20;echo "End"
+      - echo "[DEBUG], unit testing for meta-llama/Llama-3.1-8B-Instruct"  # TODO: Replace to actual test commands
   - label: "Notifications: Unit tests for meta-llama/Llama-3.1-8B-Instruct"
     key: "notifications_ut_meta-llama_Llama-3_1-8B-Instruct"
     depends_on: "ut_meta-llama_Llama-3_1-8B-Instruct"
-    soft_fail: true
     agents:
       queue: tpu_v6e_queue
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct
+    plugins:  # doubled dollar signs below defer expansion to hook run time (pipeline upload substitutes single-dollar variables)
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $$BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "failed"
+            fi
 
   - label: "Integration tests for meta-llama/Llama-3.1-8B-Instruct"
     key: "it_meta-llama_Llama-3_1-8B-Instruct"
@@ -29,13 +39,24 @@ steps:
   - label: "Notifications: Integration tests for meta-llama/Llama-3.1-8B-Instruct"
     key: "notifications_it_meta-llama_Llama-3_1-8B-Instruct"
     depends_on: "it_meta-llama_Llama-3_1-8B-Instruct"
-    soft_fail: true
     agents:
       queue: tpu_v6e_queue
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Integration tests for meta-llama/Llama-3.1-8B-Instruct" it_meta-llama_Llama-3_1-8B-Instruct
+    plugins:  # doubled dollar signs below defer expansion to hook run time (pipeline upload substitutes single-dollar variables)
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $$BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:IntTest" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:IntTest" "failed"
+            fi
 
   - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct"
     key: "pb_meta-llama_Llama-3_1-8B-Instruct"
@@ -46,13 +67,24 @@ steps:
   - label: "Notifications: Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct"
     key: "notifications_pb_meta-llama_Llama-3_1-8B-Instruct"
     depends_on: "pb_meta-llama_Llama-3_1-8B-Instruct"
-    soft_fail: true
     agents:
       queue: tpu_v6e_queue
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" pb_meta-llama_Llama-3_1-8B-Instruct
+    plugins:  # doubled dollar signs below defer expansion to hook run time (pipeline upload substitutes single-dollar variables)
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $$BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:Benchmark" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:Benchmark" "failed"
+            fi
 
   - label: "Stress tests for meta-llama/Llama-3.1-8B-Instruct"
     key: "st_meta-llama_Llama-3_1-8B-Instruct"
@@ -69,3 +101,15 @@ steps:
       - |
         .buildkite/scripts/check_results.sh \
           "Stress tests for meta-llama/Llama-3.1-8B-Instruct" st_meta-llama_Llama-3_1-8B-Instruct
+    plugins:  # doubled dollar signs below defer expansion to hook run time (pipeline upload substitutes single-dollar variables)
+      - hooks#v1:
+          post-command: |
+            echo "--- Post-command hook triggered ---"
+            echo "Test exited with status: $$BUILDKITE_COMMAND_EXIT_STATUS"
+            if [ "$$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+              echo "The step passed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:StressTest" "passed"
+            else
+              echo "The step failed. Uploading result..."
+              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:StressTest" "failed"
+            fi
diff --git a/.buildkite/pipeline_dynamic.yml b/.buildkite/pipeline_dynamic.yml
new file mode 100644
index 0000000000..fa1355330e
--- /dev/null
+++ b/.buildkite/pipeline_dynamic.yml
@@ -0,0 +1,17 @@
+steps:
+  # -----------------------------------------------------------------
+  # TEST STEPS - Calling wrapper
+  # -----------------------------------------------------------------
+   - label: "Upload Dynamic Pipeline Test"
+     agents:
+       queue: tpu_v6e_queue
+     commands:
+       - .buildkite/scripts/dynamic_upload.sh
+
+   - wait: ~
+
+   - label: "Generate support matrices"
+     agents:
+       queue: tpu_v6e_queue
+     commands:
+       - echo "Generate support matrices..."
\ No newline at end of file
diff --git a/.buildkite/pipeline_jax.yml b/.buildkite/pipeline_jax.yml
index 5170f89d57..fc42dbde93 100644
--- a/.buildkite/pipeline_jax.yml
+++ b/.buildkite/pipeline_jax.yml
@@ -1,191 +1,123 @@
 steps:
   # -----------------------------------------------------------------
   # TEST STEPS - Calling wrapper
-  # -----------------------------------------------------------------"
-  #  - label: "E2E MLPerf tests for JAX models"
-  #    key: test_0
-  #    soft_fail: true
-  #    agents:
-  #      queue: tpu_v6e_queue
-  #    commands:
-  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
-
-  #  - label: "E2E MLPerf tests for JAX models with quantization"
-  #    key: test_1
-  #    soft_fail: true
-  #    env:
-  #      QUANTIZATION: "True"
-  #    agents:
-  #      queue: tpu_v6e_queue
-  #    commands:
-  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
-
-  #  - label: "E2E MLPerf tests for JAX new models"
-  #    key: test_2
-  #    soft_fail: true
-  #    env:
-  #      NEW_MODEL_DESIGN: "True"
-  #    agents:
-  #      queue: tpu_v6e_queue
-  #    commands:
-  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
-
-  #  - label: "E2E MLPerf tests for JAX + vLLM models"
-  #    key: test_3
-  #    soft_fail: true
-  #    env:
-  #      MODEL_IMPL_TYPE: "vllm"
-  #    agents:
-  #      queue: tpu_v6e_queue
-  #    commands:
-  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
-
-  #  - label: "E2E MLperf tests for Llama4 models"
-  #    key: test_4
-  #    soft_fail: true
-  #    env:
-  #      NEW_MODEL_DESIGN: "True"
-  #      USE_V6E8_QUEUE: "True"
-  #    agents:
-  #      queue: tpu_v6e_8_queue
-  #    commands:
-  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
-
-  #  - label: "E2E multi modality test"
-  #    key: test_5
-  #    soft_fail: true
-  #    agents:
-  #      queue: tpu_v6e_queue
-  #    commands:
-  #      - |
-  #        .buildkite/scripts/run_in_docker.sh \
-  #          bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_multi_modal_inference.py && \
-  #           bash /workspace/tpu_commons/tests/e2e/benchmarking/mm_bench.sh'
+  # -----------------------------------------------------------------
+   - label: "E2E MLPerf tests for JAX models"
+     key: test_0
+     soft_fail: true
+     agents:
+       queue: tpu_v6e_queue
+     commands:
+       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-  #  - label: "E2E speculative decoding test"
-  #    key: test_6
-  #    soft_fail: true
-  #    agents:
-  #      queue: tpu_v6e_queue
-  #    commands:
-  #      - |
-  #        .buildkite/scripts/run_in_docker.sh \
-  #          bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_speculative_decoding.py'
+   - label: "E2E MLPerf tests for JAX models with quantization"
+     key: test_1
+     soft_fail: true
+     env:
+       QUANTIZATION: "True"
+     agents:
+       queue: tpu_v6e_queue
+     commands:
+       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-  #  - label: "JAX unit tests"
-  #    key: test_7
-  #    soft_fail: true
-  #    agents:
-  #      queue: tpu_v6e_queue
-  #    commands:
-  #      - |
-  #        .buildkite/scripts/run_in_docker.sh \
-  #          python3 -m pytest -s -v -x /workspace/tpu_commons/tests/ \
-  #          --ignore=/workspace/tpu_commons/tests/kernels \
-  #          --ignore=/workspace/tpu_commons/tests/lora \
-  #          --ignore=/workspace/tpu_commons/tests/e2e \
-  #          --ignore=/workspace/tpu_commons/tpu_commons/mock \
-  #          --cov-config=/workspace/tpu_commons/.coveragerc --cov tpu_commons --cov-report term-missing --cov-fail-under=69
+   - label: "E2E MLPerf tests for JAX new models"
+     key: test_2
+     soft_fail: true
+     env:
+       NEW_MODEL_DESIGN: "True"
+     agents:
+       queue: tpu_v6e_queue
+     commands:
+       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-  #  - label: "JAX unit tests - kernels"
-  #    key: test_8
-  #    soft_fail: true
-  #    agents:
-  #      queue: tpu_v6e_queue
-  #    commands:
-  #      - |
-  #        .buildkite/scripts/run_in_docker.sh \
-  #          python3 -m pytest -s -v -x /workspace/tpu_commons/tests/kernels \
-  #          --ignore=/workspace/tpu_commons/tests/kernels/ragged_paged_attention_kernel_v2_test.py \
-  #          --ignore=/workspace/tpu_commons/tests/kernels/ragged_kv_cache_update_v2_test.py
+   - label: "E2E MLPerf tests for JAX + vLLM models"
+     key: test_3
+     soft_fail: true
+     env:
+       MODEL_IMPL_TYPE: "vllm"
+     agents:
+       queue: tpu_v6e_queue
+     commands:
+       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-  #  - label: "lora tests for JAX + vLLM models"
-  #    key: test_9
-  #    soft_fail: true
-  #    agents:
-  #      queue: tpu_v6e_queue
-  #    commands:
-  #      - |
-  #        .buildkite/scripts/run_in_docker.sh \
-  #          bash -c 'SKIP_JAX_PRECOMPILE=1 MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_commons/tests/lora/test_lora.py'
+   - label: "E2E MLperf tests for Llama4 models"
+     key: test_4
+     soft_fail: true
+     env:
+       NEW_MODEL_DESIGN: "True"
+       USE_V6E8_QUEUE: "True"
+     agents:
+       queue: tpu_v6e_8_queue
+     commands:
+       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-  #  - label: "Integration Test llama-3.1-8B on TPU"
-  #    key: integration_test_llama_3_1_8B_tpu
-  #    depends_on:
-  #      - test_0
-  #      - test_1
-  #      - test_2
-  #      - test_3
-  #      - test_4
-  #      - test_5
-  #      - test_6
-  #      - test_7
-  #      - test_8
-  #      - test_9
-  #    soft_fail: true
-  #    agents:
-  #      # Need check agent
-  #      queue: tpu_v6e_queue
-  #    env:
-  #      EXPECT_VALUES_PATH: "/workspace/"
-  #      EXPECT_VALUES_FILENAME: "expect_values.json"
-  #    commands:
-  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-8B-Instruct"
 
-  #  - label: "Integration Test llama-3.1-70B on TPU"
-  #    key: integration_test_llama_3_1_70B_tpu
-  #    depends_on:
-  #      - test_0
-  #      - test_1
-  #      - test_2
-  #      - test_3
-  #      - test_4
-  #      - test_5
-  #      - test_6
-  #      - test_7
-  #      - test_8
-  #      - test_9
-  #    soft_fail: true
-  #    agents:
-  #      queue: tpu_v6e_8_queue
-  #    commands:
-  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 8 -m "meta-llama/Llama-3.1-70B-Instruct"
+   - label: "E2E multi modality test"
+     key: test_5
+     soft_fail: true
+     agents:
+       queue: tpu_v6e_queue
+     commands:
+       - |
+         .buildkite/scripts/run_in_docker.sh \
+           bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_multi_modal_inference.py && \
+            bash /workspace/tpu_commons/tests/e2e/benchmarking/mm_bench.sh'
 
-  # # -----------------------------------------------------------------
-  # # NOTIFICATION STEP
-  # # -----------------------------------------------------------------
-  #  - label: "TPU Test Notification"
-  #    depends_on:
-  #      - test_0
-  #      - test_1
-  #      - test_2
-  #      - test_3
-  #      - test_4
-  #      - test_5
-  #      - test_6
-  #      - test_7
-  #      - test_8
-  #      - test_9
-  #      - integration_test_llama_3_1_8B_tpu
-  #      - integration_test_llama_3_1_70B_tpu
-  #    agents:
-  #      queue: tpu_v6e_queue
-  #    commands:
-  #      - |
-  #        .buildkite/scripts/check_results.sh \
-  #          "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 \
-  #          integration_test_llama_3_1_8B_tpu integration_test_llama_3_1_70B_tpu
+   - label: "E2E speculative decoding test"
+     key: test_6
+     soft_fail: true
+     agents:
+       queue: tpu_v6e_queue
+     commands:
+       - |
+         .buildkite/scripts/run_in_docker.sh \
+           bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_speculative_decoding.py'
 
-   - label: "Upload Dynamic Pipeline Test"
+   - label: "JAX unit tests"
+     key: test_7
+     soft_fail: true
      agents:
        queue: tpu_v6e_queue
      commands:
-       - .buildkite/scripts/dynamic_upload.sh
+       - |
+         .buildkite/scripts/run_in_docker.sh \
+           python3 -m pytest -s -v -x /workspace/tpu_commons/tests/ \
+           --ignore=/workspace/tpu_commons/tests/kernels \
+           --ignore=/workspace/tpu_commons/tests/e2e \
+           --ignore=/workspace/tpu_commons/tpu_commons/mock \
+           --cov-config=/workspace/tpu_commons/.coveragerc --cov tpu_commons --cov-report term-missing --cov-fail-under=69
 
-   - wait: ~
+   - label: "JAX unit tests - kernels"
+     key: test_8
+     soft_fail: true
+     agents:
+       queue: tpu_v6e_queue
+     commands:
+       - |
+         .buildkite/scripts/run_in_docker.sh \
+           python3 -m pytest -s -v -x /workspace/tpu_commons/tests/kernels \
+           --ignore=/workspace/tpu_commons/tests/kernels/ragged_paged_attention_kernel_v2_test.py \
+           --ignore=/workspace/tpu_commons/tests/kernels/ragged_kv_cache_update_v2_test.py
 
-   - label: "Generate support matrices"
+  # -----------------------------------------------------------------
+  # NOTIFICATION STEP
+  # -----------------------------------------------------------------
+   - label: "TPU Test Notification"
+     depends_on:
+       - test_0
+       - test_1
+       - test_2
+       - test_3
+       - test_4
+       - test_5
+       - test_6
+       - test_7
+       - test_8
+       # NOTE: integration_test_llama_3_1_8B_tpu / integration_test_llama_3_1_70B_tpu are intentionally not
+       # listed: no step in this pipeline defines those keys and check_results.sh below does not check them.
      agents:
        queue: tpu_v6e_queue
      commands:
-       - echo "Generate support matrices..."
\ No newline at end of file
+       - |
+         .buildkite/scripts/check_results.sh \
+           "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8
diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh
index d7a901f4ce..e5c892d31d 100755
--- a/.buildkite/scripts/bootstrap.sh
+++ b/.buildkite/scripts/bootstrap.sh
@@ -1,7 +1,26 @@
 #!/bin/bash
 
-echo "--- Starting Special Buildkite Bootstrap ---"
+echo "--- Starting Buildkite Bootstrap ---"
 
-buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
+# Check if the current build is a pull request
+if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
+  echo "This is a Pull Request build."
+  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/tpu_commons/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
 
-echo "--- Buildkite Special Bootstrap Finished ---"
+  # If it's a PR, check for the specific label
+  if [[ $PR_LABELS == *"ready"* ]]; then
+    echo "Found 'ready' label on PR. Uploading main pipeline..."
+    buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
+    # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
+  else
+    echo "No 'ready' label found on PR. Skipping main pipeline upload."
+    exit 0 # Exit with 0 to indicate success (no error, just skipped)
+  fi
+else
+  # If it's NOT a Pull Request (e.g., branch push, tag, manual build)
+  echo "This is not a Pull Request build. Uploading main pipeline."
+  buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
+  # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
+fi
+
+echo "--- Buildkite Bootstrap Finished ---"
diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh
new file mode 100644
index 0000000000..9736eaa08b
--- /dev/null
+++ b/.buildkite/scripts/dynamic_bootstrap.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+
+echo "--- Starting Special Buildkite Bootstrap ---"
+
+# for loop features and models upload to buildkite
+BUILDKITE_DIR=".buildkite"
+TARGET_FOLDERS="models features models/informational"
+
+MODEL_LIST_KEY="tpu-model-list"
+INFORMATIONAL_MODEL_LIST_KEY="vllm-model-list"
+POPURLAR_MODEL_LIST_KEY="popular-model-list"
+
+FEATURE_LIST_METADATA_KEY="feature-list"
+
+declare -a model_names
+declare -a pipeline_steps
+
+# Declare separate arrays for each list
+declare -a tpu_model_list
+declare -a vllm_model_list
+declare -a popular_model_list
+declare -a feature_list
+
+echo "--- Scanning: ${TARGET_FOLDERS}"
+
+for folder_path in $TARGET_FOLDERS; do
+  folder=$BUILDKITE_DIR/$folder_path
+  # Check if the folder exists
+  if [[ ! -d "$folder" ]]; then
+    echo "Warning: Folder '$folder' not found. Skipping."
+    continue
+  fi
+
+  # Use find command to locate all .yml or .yaml files
+  # -print0 and read -r -d '' are a safe way to handle filenames with special characters (like spaces)
+  while IFS= read -r -d '' yml_file; do
+    echo "--- handling yml file: ${yml_file}"
+
+    # Read the first line for getting model name
+    first_line=$(awk 'NR==1{print $0; exit}' "${yml_file}")
+
+    # Check if the first line contains the '# ' comment marker
+    if [[ "$first_line" == "# "* ]]; then
+      model_name=${first_line#\# }
+      echo "Model Name: ${model_name}"
+
+      # folder_name=$(basename "$folder_path")
+
+      # Based on the folder name, add the model to the correct list
+      case "$folder_path" in
+        "models")
+          tpu_model_list+=("${model_name}")
+          ;;
+        "models/informational")
+          vllm_model_list+=("${model_name}")
+          ;;
+        "models/popular")
+          popular_model_list+=("${model_name}")
+          ;;
+        "features")
+          feature_list+=("${model_name}")
+          ;;
+        *)
+          echo "Warning: No specific list for folder '${folder_path}'. Ignoring model '${model_name}'."
+          ;;
+      esac
+
+
+      model_names+=("${model_name}")
+    else
+      echo "Warning: The first line of ${yml_file} is not in the expected comment format (ex: '# model-name')."
+    fi
+
+    # --- Dynamic Buildkite Pipeline Step ---
+    # For each found .yml file, generate a command step
+    # Here we assume the .yml file itself is an executable buildkite pipeline step script
+    pipeline_yaml=$(cat <<EOF
+- label: "Upload: ${yml_file}"
+  command: "buildkite-agent pipeline upload ${yml_file}"
+  agents:
+    queue: tpu_v6e_queue
+EOF
+)
+
+  pipeline_steps+=("${pipeline_yaml}")
+
+  done < <(find "$folder" -maxdepth 1 -type f \( -name "*.yml" -o -name "*.yaml" \) -print0)
+done
+
+echo "--- Scan Complete. Final Lists: ---"
+
+# Convert array to a newline-separated string
+echo "TPU Models (${#tpu_model_list[@]}):"
+printf "%s\n" "${tpu_model_list[@]}"
+tpu_model_list_str=$(printf "%s\n" "${tpu_model_list[@]}")
+
+echo "VLLM Models (${#vllm_model_list[@]}):"
+printf "%s\n" "${vllm_model_list[@]}"
+vllm_model_list_str=$(printf "%s\n" "${vllm_model_list[@]}")
+
+echo "Popular Models (${#popular_model_list[@]}):"
+printf "%s\n" "${popular_model_list[@]}"
+popular_model_list_str=$(printf "%s\n" "${popular_model_list[@]}")
+
+model_list_string=$(printf "%s\n" "${model_names[@]}")
+
+if [[ -n "$tpu_model_list_str" ]]; then
+  echo "--- Uploading tpu_model_list_str to Meta-data:${MODEL_LIST_KEY}"
+  # echo "${tpu_model_list_str}" | buildkite-agent meta-data set "${MODEL_LIST_KEY}"
+  # echo "Testing: $(buildkite-agent meta-data get "MODEL_LIST_KEY")"
+else
+  echo "--- No Model Names found to upload."
+fi
+
+if [[ -n "$vllm_model_list_str" ]]; then
+  echo "--- Uploading vllm_model_list_str to Meta-data:${INFORMATIONAL_MODEL_LIST_KEY}"
+  # echo "${vllm_model_list_str}" | buildkite-agent meta-data set "${INFORMATIONAL_MODEL_LIST_KEY}"
+  # echo "Testing: $(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}")"
+else
+  echo "--- No Model Names found to upload."
+fi
+# Popular-model list: the meta-data upload below is still commented out, so this branch only logs.
+if [[ -n "$popular_model_list_str" ]]; then
+  echo "--- Uploading popular_model_list_str to Meta-data:${POPURLAR_MODEL_LIST_KEY}"
+  # echo "${popular_model_list_str}" | buildkite-agent meta-data set "${POPURLAR_MODEL_LIST_KEY}"
+  # echo "Testing: $(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}")"
+else
+  echo "--- No Model Names found to upload."
+fi
+
+
+# --- Upload Dynamic Pipeline ---
+
+if [[ -n "$pipeline_steps" ]]; then
+  echo "--- Uploading Dynamic Pipeline Steps"
+  final_pipeline_yaml="steps:"$'\n'
+  final_pipeline_yaml+=$(printf "%s\n" "${pipeline_steps[@]}")
+  echo "Upload YML: ${final_pipeline_yaml}"
+  # echo -e "${final_pipeline_yaml}" | buildkite-agent pipeline upload
+else
+  echo "--- No .yml files found, no new Pipeline Steps to upload."
+  # buildkite-agent step update --state "passed"
+fi
+
+echo "--- Buildkite Special Bootstrap Finished ---"
diff --git a/.buildkite/scripts/dynamic_upload.sh b/.buildkite/scripts/dynamic_upload.sh
deleted file mode 100644
index d2376176f2..0000000000
--- a/.buildkite/scripts/dynamic_upload.sh
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/bin/bash
-
-echo "--- Starting Special Buildkite Bootstrap ---"
-
-# for loop features and models upload to buildkite
-BUILDKITE_DIR=".buildkite"
-TARGET_FOLDERS="models features"
-MODEL_LIST_METADATA_KEY="model-names-list"
-
-declare -a model_names
-declare -a pipeline_steps
-
-echo "--- Scanning: ${TARGET_FOLDERS}"
-
-for folder in $TARGET_FOLDERS; do
-  folder=$BUILDKITE_DIR/$folder
-  # Check if the folder exists
-  if [[ ! -d "$folder" ]]; then
-    echo "Warning: Folder '$folder' not found. Skipping."
-    continue
-  fi
-
-  # Use find command to locate all .yml or .yaml files
-  # -print0 and read -r -d '' are a safe way to handle filenames with special characters (like spaces)
-  while IFS= read -r -d '' yml_file; do
-    echo "--- handling yml file: ${yml_file}"
-
-    # Read the first line for getting model name
-    first_line=$(awk 'NR==1{print $0; exit}' "${yml_file}")
-
-    # Check if the first line contains the '# ' comment marker
-    if [[ "$first_line" == "# "* ]]; then
-      model_name=${first_line#\# }
-      echo "Model Name: ${model_name}"
-      model_names+=("${model_name}")
-    else
-      echo "Warning: The first line of ${yml_file} is not in the expected comment format (ex: '# model-name')."
-    fi
-
-    # --- Dynamic Buildkite Pipeline Step ---
-    # For each found .yml file, generate a command step
-    # Here we assume the .yml file itself is an executable buildkite pipeline step script
-    pipeline_yaml=$(cat <<EOF
-- label: "Upload: ${yml_file}"
-  command: "buildkite-agent pipeline upload ${yml_file}"
-  agents:
-    queue: tpu_v6e_queue
-EOF
-)
-
-  pipeline_steps+=("${pipeline_yaml}")
-
-  done < <(find "$folder" -type f \( -name "*.yml" -o -name "*.yaml" \) -print0)
-done
-
-# Convert array to a newline-separated string
-model_list_string=$(printf "%s\n" "${model_names[@]}")
-
-if [[ -n "$model_list_string" ]]; then
-  echo "--- Uploading Model Name List to Meta-data"
-  echo "${model_list_string}" | buildkite-agent meta-data set "${MODEL_LIST_METADATA_KEY}"
-  echo "Testing: $(buildkite-agent meta-data get "model-names-list")"
-else
-  echo "--- No Model Names found to upload."
-fi
-
-
-# --- Upload Dynamic Pipeline ---
-
-if [[ -n "$pipeline_steps" ]]; then
-  echo "--- Uploading Dynamic Pipeline Steps"
-  final_pipeline_yaml="steps:"$'\n'
-  final_pipeline_yaml+=$(printf "%s\n" "${pipeline_steps[@]}")
-  echo "Upload YML: ${final_pipeline_yaml}"
-  echo -e "${final_pipeline_yaml}" | buildkite-agent pipeline upload
-else
-  echo "--- No .yml files found, no new Pipeline Steps to upload."
-  # buildkite-agent step update --state "passed"
-fi
-
-echo "--- Buildkite Special Bootstrap Finished ---"
diff --git a/.buildkite/scripts/run_in_docker.sh b/.buildkite/scripts/run_in_docker.sh
index 57ced35f2a..241347d172 100755
--- a/.buildkite/scripts/run_in_docker.sh
+++ b/.buildkite/scripts/run_in_docker.sh
@@ -12,7 +12,6 @@ if [ "$#" -eq 0 ]; then
 fi
 
 MOUNT_EXPECT_RESULT="False"
-GPU_BASE="False"
 OTHER_ARGS=()
 
 while [[ $# -gt 0 ]]; do
@@ -21,10 +20,6 @@ while [[ $# -gt 0 ]]; do
             MOUNT_EXPECT_RESULT="True"
             shift 1
             ;;
-        --gpu)
-            GPU_BASE="True"
-            shift 1
-            ;;
         *)
             OTHER_ARGS+=("$@")
             break
@@ -40,10 +35,10 @@ if [ "$MOUNT_EXPECT_RESULT" = "True" ]; then
     echo "[DEBUG] Path: $EXPECT_VALUES_PATH, Filename: $EXPECT_VALUES_FILENAME, "
 
     EXPECT_VOLUME=(-v "$(pwd)/$EXPECT_VALUES_FILENAME":"$EXPECT_VALUES_PATH$EXPECT_VALUES_FILENAME")
-    echo "docker -v cmd: ${EXPECT_VOLUME[@]}"
+    echo "docker -v cmd: " "${EXPECT_VOLUME[@]}"
 
     EXPECT_ENV=(-e EXPECT_VALUES_PATH="$EXPECT_VALUES_PATH" -e EXPECT_VALUES_FILENAME="$EXPECT_VALUES_FILENAME")
-    echo "docker -e cmd: ${EXPECT_ENV[@]}"
+    echo "docker -e cmd: " "${EXPECT_ENV[@]}"
 fi
 
 if ! grep -q "^HF_TOKEN=" /etc/environment; then
@@ -81,54 +76,26 @@ else
 fi
 DOCKER_HF_HOME="/tmp/hf_home"
 
-# Prune older images on the host to save space.
-docker system prune -a -f --filter "until=3h"
-
 # (TODO): Consider creating a remote registry to cache and share between agents.
 # Subsequent builds on the same host should be cached.
 
 # Cleanup of existing containers and images.
 echo "Starting cleanup for vllm-tpu..."
-# Get all unique image IDs for the repository 'vllm-tpu'
-old_images=$(docker images vllm-tpu -q | uniq)
-total_containers=""
+leftover_containers=$(docker ps -a -q --filter "ancestor=vllm-tpu")
+if [ -n "$leftover_containers" ]; then
+  echo "Removing leftover containers using vllm-tpu image(s)..."
+  echo "$leftover_containers" | xargs -r docker rm -f  # one ID per argument; the quoted list was passed as a single name
+fi
+old_images=$(docker images vllm-tpu -q)
 
 if [ -n "$old_images" ]; then
-    echo "Found old vllm-tpu images. Checking for dependent containers..."
-    # Loop through each image ID and find any containers (running or not) using it.
-    for img_id in $old_images; do
-        total_containers="$total_containers $(docker ps -a -q --filter "ancestor=$img_id")"
-    done
-
-    # Remove any found containers
-    if [ -n "$total_containers" ]; then
-        echo "Removing leftover containers using vllm-tpu image(s)..."
-        echo "$total_containers" | xargs -n1 | sort -u | xargs -r docker rm -f
-    fi
-
-    echo "Removing old vllm-tpu image(s)..."
-    docker rmi -f "$old_images"
-else
-    echo "No vllm-tpu images found to clean up."
+  echo "Removing old vllm-tpu image(s)..."
+  echo "$old_images" | sort -u | xargs -r docker rmi -f  # de-duplicate IDs and pass each one as its own argument
 fi
-
 echo "Cleanup complete."
 
 IMAGE_NAME="vllm-tpu"
-if [ $GPU_BASE == "True" ]; then
-  echo "Docker build gpu image"
-  IMAGE_NAME="vllm-gpu"
-  docker build --no-cache -f docker/Dockerfile.cuda -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
-  # DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 \
-  #   --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true \
-  #   --build-arg INSTALL_KV_CONNECTORS=true --tag "${IMAGE_NAME}:${BUILDKITE_COMMIT}" \
-  #   --target vllm-openai --progress plain -f docker/Dockerfile.cuda.dev .
-    # public.ecr.aws/q9t5s3a7/vllm-release-repo
-else
-  docker build --no-cache -f docker/Dockerfile -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
-fi
-
-echo "Execute Cmd: $@ on Image: ${IMAGE_NAME}:${BUILDKITE_COMMIT}"
+docker build --no-cache -f docker/Dockerfile -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
 
 exec docker run \
   --privileged \
@@ -141,11 +108,11 @@ exec docker run \
   -e HF_HOME="$DOCKER_HF_HOME" \
   -e MODEL_IMPL_TYPE="$MODEL_IMPL_TYPE" \
   -e HF_TOKEN="$HF_TOKEN" \
-  -e VLLM_XLA_CACHE_PATH="$DOCKER_HF_HOME/.cache/jax_cache" \
+  -e VLLM_XLA_CACHE_PATH= \
   -e VLLM_USE_V1=1 \
   -e VLLM_XLA_CHECK_RECOMPILATION=1 \
   ${QUANTIZATION:+-e QUANTIZATION="$QUANTIZATION"} \
   ${NEW_MODEL_DESIGN:+-e NEW_MODEL_DESIGN="$NEW_MODEL_DESIGN"} \
   ${USE_V6E8_QUEUE:+-e USE_V6E8_QUEUE="$USE_V6E8_QUEUE"} \
   "${IMAGE_NAME}:${BUILDKITE_COMMIT}" \
-  "$@" # Pass all script arguments as the command to run in the container
\ No newline at end of file
+  "$@" # Pass all script arguments as the command to run in the container
diff --git a/buildkite-script-dynamic.gz b/buildkite-script-dynamic.gz
new file mode 100644
index 0000000000000000000000000000000000000000..9062e95d9fc13a790f058d1a16c52a94da6bff10
GIT binary patch
literal 10583
zcmV-dDX7*TiwFR*VA5y+1MPilTjNI3Xg>Q_bi|wh6D%xWfsFGk1D*+QU|_*yHz&yv
z!j`duEjhZtuw>rnd^!JMKfb?if61xpZb_}1Ex?#d)X(k&ORetes;=tls_v>I?eh=1
z*4OQI`P?%`{Y%5uznXKYR4T1TgM8JZg?v?Qv>MeK!RM^NOQlh1*6a0Vt5zo!rKL1m
zUy;xA1|+R`j;q-K%8hPLbR)ao#5Ts7eRE-3-X!fk8wm;i`$CG+AfC~0D^*3Q)HX}C
z3K_e`SZ^z>#zs@AZZ?{7wOZM%Ha9k#rKbkAD043%*RMZXA3p!r>&W-5B>w}yMESo_
zZ8g6ljYoq>D1UMO7wCT!ECs<cwP&bftFMn#S07JCnybq<<I!FF0DJ&#^T_{3tx|81
zO0`jMDuw)CM42xC8<p0kTx)JL8ygj+UdaE~R|Ni_GuMa3=l^v1uPF6uHOBw7h63`x
zGQVMFm%ljwFOl1W!`;rl`f2;1^QEL+=%(vzl?eI6(>;BQ-2VNva|$mUSD(=5Wiryv
z_0blQPEErluI{)F8CZ6t9SJHDa;e`yHP2OVPqsgF)WrLqH6CkbpEiqS2^zO`m~cH^
zy*AvTijAv&EqvcS+&$bP$E;4DxE9fRuBVMC7(siyxiZmvLyJhi>~`Lte*A5Xcs{^}
zd65J;(f*X?LnM5%Tw`GLG}o|9XN%;-i79@iW@D(YPju)}walrA45@}CM!7Uhm&u>}
zFG2(f;Ph-`;yUZSq29YxZQb!kt|L1`@@r6=Os6|Z1Co9`8F?3mDL^o^tV?I<O4$<H
zo>-1s=D=<VFk)fEESJmVNT@}dA+Bv)T<A7T33xB1eAYz*>YrhdeS%Yq0N1-75bVwS
z)4l!OANRT)b?5NlVEfaq+WF^RS3T)&cTZ2EO&J673z24gLCX3cq(XlCj<`eJjC>ye
zGt>$46Paj^qxWTUIvH77A7>@Qxh%_4?8|^iWlC%^)?KaK*IbP_x=SRH2wO-X9I!+r
z)<X;<BS)VO*FZByj|&&;A==o$C`DLn&vf+*oAO8wmNN1&QC`kV$|fco^v@H$2eU}k
zdOgq9dN;(>VNOjnTZ5tNw(&2$FNm`g=CVCgHB7Z{^)7WAnd@AGf7Y)p`_h?!*jaZc
z9u;~3;5r@mx?Y9VSvPw_Sb;8q9Q1Jj0k%<gi9*WbD8i*IL1@G<`*`y3ZZwP%CnrU-
zt1VV6<`^nTgSC*38B*wxq$AMjISR)RtUR*uNVj2j0f~Vv1IaL*xhCiL5Rlo-btmP`
zmnB&OX_7Xa$CFC;z+MzBIdPd?ZLy*$?|4ZXz=eFu5Co4TpWgcfdTtU*1T2q?I&q;}
zC&#ky+(&iG=B+fzWO=sgP<W_Za#Y4R5s*UN&K<%o|6(}t+@#(oyvx-567tOnQSo?E
zEjPZSh|G!o>}rdZWz$DV(!eYvScaf@Bniey^c<C}2xJ*D?5Qnsi!G`CpNTQiM~11Z
zu4VUz4>SILmD~T*s8uR8h1!3stw!PhSxA}U|7j|fdUdlQ*OcmJrM0nH`-`#0U!n;7
zKWDBF%Pjv-wN{JqzXD$r{+~sZz^}7}s>{O)ZKc~e>5`Lf=ja4<tQ~DM!o~aA)+VsW
zUn8D_KPNZt&@yXiu!KKhHJ)4lf_B_JIwe%?s%%khQAHzzJh|ZuK2?R$)Joq~*h9w}
zxGL%e@XGeMjuO>xeFYaky=v+z{mI_N_1v8H-vl;D*&*dC(Ik8|8jZOv_)0+o@wzyo
zQpUg)K5&3*VvMM|pEZ8v#f(1~d~{S<IH+<QoNJEJ%Nl+4#f)C0zCWa_q1X7(13$s!
zhOAsEtEnicy%-b#EcVUBG61A?|EV~}tu6XwG*<O1Z4|swuM|}`VGF2d;j7N^^@pdK
zuJJEAv&4o|@m7B^cnjuD*7&mU(m?BnHlB=hXWbeMsES~i1K94G%v+U(o5nN1O&k|9
zN0NnwCZ8P$B0C%deVWi&#^9UJVDNX6DfkZdep!Re!b^+e#i4l3L`UnTp(8e^tl@9?
z)54lu;y;sFh}d|Mh)5e=7D_fbO2*y@wvtv~8)e~Q^F`t!GOkSBE6_SY1=gHQz3YW4
zp8lXRhwtm24h*N~s+I@ex+)AvxA8n-7CK&z9t!A4)kHD}oQW!5MIBrEtl9D?it3{(
z8<9a}q5xD?Tf4Z>`%oAE)m#?_gZc?Jl8KIIuCyi(@=PCOX^z|QPlw&T4|_Y?-Mzz4
zbUXf7TkM|{MnlmV#i=Fuk3!tWx7?9uO3Jfp%ClO^vwF(2M#{5h%ClC=vyGHzn`tno
z4kT?PO4?BTNqqPDN+f!6ERP3mS)i0dK2XQUvOo#}90BnHz6CT3*cFf};8H-LfH@Ii
zBA!I_h**&dYx5#Xs{hAdwbX&Gxt^`7K_!q<ckd(r{hwM3Ug7;8rBU4fTu7OE|EIE9
zZMEcTt5)CKsBg9kj(mMZ;Qu*ueORXZf8bqJiSd7}s<aCK&mxL3o>;a^H2Y$r*^XWs
z*cK?XnmZgB=Y+pJf}f?*64})UhN<JZ65?MBBLh$+Ol_<aJPX4%J*02gde61&8=^U&
zu9$#flONX6HRp!F-Z@Ye<x;nEaJ0YO?WiC2_B(h?gS0`j5$~e;^HIEwCY26PL8<Ag
zyL-p*0fM+9@rsVLS}OJR0SRX`j4Jqi#cvPc-omq4YreV#f3K4A_n^$0I#b0Yi7B%m
zc~cJ2!V@P10HGKl6m4377MbpJ$hLhEYIH>GmaJe`TLcfhM4ku7$>iWW3Y+-B-|2%`
zO&?I0r>46ypa+#sU~1C?XRa0Cf|B6_>yj+rik*L1rpK378SqPF2h~VCb(Eku9%TYe
zlJ)GrQE}S%v`A2OmDYvEOv{aaA{&m1jjpVQxbjEQv2DwSQAavozD=|apkuoO%QO35
zORGUUwhp5+{Q-4M2S<&=NlaBfBC<yzTq8YO#|{mb4Kkc<^7s<ITA}q->$2U2E$3<k
z^@s7cmv6(*0aC*OxU~L_9HROdep9%BU2s=OtQ4$sblW!}p6FRJ3$|G+5tOcxWqWyz
z=w{Et!&B{L&mELEmRFG>2jYP@fqH(=Z8DG%yp`1$HKZsNunuv&UQc(Nfj1h_;b3k3
zb5EbRq(lFMw;`ZYJ%hBg6*@!3wztTw{w0%&{t?OL^<^q`Bw1p3&`bj4*KMa~*=(L)
zX(OW_$S8+a54d;PL3;v2s-q3Ub&oP)JHnSTKia+`txIb}Ql-`KiOiq)@SS+)j%&D{
z%Vrv_HMB+u7=x2OwXG|oulG^%dSl%BrBK)B*}g!WIK5<gvfEvW5cNZ6yL)=vp{L&3
z;yBhK)#yRVNZkY;q94#hooy<R!WW5oa^y=Um>wJzI1SKYJA%itdGe!v!=}QNX|hGO
zMLZ(iaEg#?_$J_cVwhQ|m69-yU?~E2qaqHc%GhuKjquwc=em2X>n6FeJUb@$a00Pi
zwwyUXg=LaUnaCnZMINt8#lQa&!~$&S0d9+pBQhP?1Sh!k0pfu04qZI48E$F}+JTuX
z-6jb6g{_*x2V7hW>*#vCEq_S=unj(&gDDwEDp*_GVGaQ$^a|qXI;VIV$k*8SN`e*Q
z6dg$a`Jev>yhr8H*JQ^s2gU^>S_re|Zh<fsAo;R<AEAXHq0kb@>x(R5A&|-vc}waQ
z@-2~mHGQ3AtQn@i()nxl0X!oE&+M_wV-}&hm~ZJRqn|RI%ybgjo+*;UiYo#5y$!_m
z^*VW*e||*Ygwmp~v*`>Sw<TZ6fr<7$&~bM5kkkP(<C+fZFc5y*eSaNGgIy-?Eu6<T
zE+2n*ut4@@0bp6{_f@}g2qWk+d?uH1Qc9udac?Gi8x=T#6<M=%Sq>i$_0gp5V<#xc
zew1bSYRh{;3V48jE>8jpu%gQsGR$4BJ(ahYS7j1y5wwS)cE#2vm?+Egvfql|HWjL%
zRsd&=Mj&eZOH=$@__7*&5l&l*c)ARQIFWqB=RqHp*!Ra<98c_^HPcHuu6*1?43-QB
z{-X{4a5Xw-0AL4ZDjx#<4-UfN`bQ2OzKzx8$g0JFV=w5R&>85SDt)V}qz!uvRYhJ@
zRf#bpW6Zyrjbpm~kDUQ{s2$+$_P=_wS}*K>izxZ*f14Gh(JJhJudt-q|K`K>VVP<F
z3*x`|{9kQi{IA0P_xR-{*#DBTs8X$ewAw7&A8TvDmcOu5(TpcNU5wR_b_n~Qu;aqw
zS6KZ1$`(I?1=|8YzA$TFc*C_Fjv>X;C)A6LzOeE=nUznNg7>oV3BtUv@D&!m7i;08
z^Z1eNdrSeEVcnzg14biltgY|kzvY^&lsDd&_e^%Q_bx_1)?s1tn~TXWlyuClCoEru
zx$nMZdi;m}4It3yJ@790UF?6&Ruj+v)GF1&{<nxS!~R#RSLJ%Gve~NEHk7|Sv%JER
z7XM*BTpyMh_P<K2uEh9XsnrYn-$Kel<3G%&8>&hOVaRESD6-2YEji+|nBH~Z`rw@J
z0&)IOJ=(ME4EepLYt4Y&$A8CeIRz@W<4veJ6*i!@0ehG^0LP+cW%hv4vS!dfb^Z6x
zQO;?ORdVNlY5?CW!2gw2t5&T4izqYlf7B`sxn9}agqO9#|Mv=u!2dqi&pi=TC3pTO
z&i_hFLH)nT|Md7}`u^{C?eiS8z}fr1dYJ#Kh4Q~zuNCrtAtfgNtDCX>e;W<Cva!)>
z)oaD}?=_aR{oj1JJ}fi#f30S{6660$qf#mMe~TzDaQ_$GUUjM+a<GpP?uD4p|F!*3
z!jFTn%@PY7@zjz2?vd$bJfcDVf<0!=VKdKy80jm4h-sl#bn_}0+TTyNKXv#1H;p8L
zF(07P<7}&+1YM?noi$UY1zQmj^r`a??pojNoa}vkg-H6AT<z~4WZ`LA6c!Or3`cth
zNBe5`=c5jEhyi0>Dw-xyG*MS!JqtzCq6FrjN`#!Abkv`koeh;9(YVV*+jw#uCXq-3
zX%QMp%W!WfrV^NYtQdkqZ2vq%7gG>0Jd}H6IV%QG`SWZvg`vb$ih(c?A;#(a=9_4j
z_rm>HrN06G<rM5Y6TRnQ{M0L*^mTSlBb^4170U2=&_KY0>f-@O5Jwrx!61QTlEWV>
zzTrb~bYIj&TPc@~3)8ao_SA2ne-rhNdwdF%xZ`KS-(whh{Og`|Rb~fTvRaU#if#j4
z>>jI6gtxo}Tc@^kSGULIvEk6b)$9$@w@l9+wws$%rwP2^h)GPC$BF~_6cgqF#OI%8
zMMh(<)vKx>BW(_STwbX?tvA$FZ_>w8CSl_*G637%qOZjxuf?OT#Url8qpigwt;M6P
z#Urf6qpQUts|lGXS>P{v@Jg+zin{S0!_elc+Nub3ACGyaCMIIUK=v1dtVzTVu*QR{
z@xW?4s2UHb#)GNxKx#aQ8pG6M@BdTnzf6mrTOW7u|3Rst{ij~!e_2eK?*D7nt1Wr6
zxw+AT3Y&!i`1KWm{~6cM-TQYZ{~zOjrP*j!3jg0CN~-*y8GY++ea!a%h37x2)hfvU
zCd`;Z{x71;IRBw+H08Qdt<`JQ&3YmKUtbaUf6iPVmg)XKMQJpeG5)Vt8uh~ew}`St
zZp8p#UrOQjDVE7K@DvJ1FGE8I&BPjMNz6Fw`B_<0YTPN2xRc0~+n<sumu>1|xp)KX
zOpu8;3e;S3@#e-%G#BqQJVmncPOG@se7w^~O40$P4HOx9$zx{ZjUapr$wW^?KZA_C
znULklD!vz*fpRgJk(Z_Nb7E3r#d3uENwu;wgfBgw%Z-Z5AgvdANnE0!NmoikQi^~y
zVi<l*IeAxVMv0W0w$myuR$S&9FiD5EkeV5?>CvPnQ0sXL%@D9WG82z<VB823m^6?T
zvsi6%elNi}jn(d?>iO~|OH57D2KAV->RwokA}2Rar&V06z{)>rk`8nswK9amqe-pz
zKAE1M+!8^{Be_mo=-A1n8aVe;^rmrIn!K_+8*{t?R8(3yDr2J<V4*JP4sFZ37-AHW
zgq|q8%hVJTa?c4-@|e=@4w#Q3G&dclRa~r$n>ALF4r(FUGQ`KDNj65P=c$rKV9Stg
zPi^f>mQ?@Gac6t?pd*j_^XVg}|EE!_RBB4S#qNI;{{MxPDgK|PQmI!r8*&X^Hk-|A
zaRKZV7J>ig%=KZJasR7QZ?>ZRuQVI=M&bWiL|F>sz?Div*PS?9>!Bx525kgwj%99h
zER}E_{b;y9c<1C=Kc`BH;qdTrp3S^5Tx+3m!@?+Y{`EqchGpw{wvB5lU6cLKnOTfS
z9P{gMl}cx4XKC)It)L2D&4b_{Q<TW$LBK#S%gm>GZdop<-P!~+M~Oo7)MT4mNnQW>
zxaZYJ{_}s$N>!ojf3w~!*8fG6>E}Q2-nU$@*EU<V+D7B`%=)7EB}LlJy^s9o|2Y3E
zO{ia-|64?vvj0>4IGhhv3G<)-3-Uj}ABFs1M9G!^1>SDd8%6x5S6CAJzqxUJSZ3`1
zTCnkp@qY#WD)xViD8l|P44wGVk2#L}J=os$tsV_9jp5L-g5yWv$4KndVD8cDkw^6Y
zAP=4IP_KjMT4UJO^|aCGCd@`6#vWha*$wu3v7I0W!FK3=(b0oeM>@N@8AR@;9r5HH
zm~F;zSk!v%Ao1H~fSzM?avmcC&Fp^AGr3e+S|W$g^p&AsvwbcnIk~X35#N<M=u5HS
zTZ8tBz8Gyh@)H42AovzGx$(tRBPPHG`&0{~#G4*P3i}8EMR4>!i#z}G_Q5_IK2Mv|
z8-}4OFvK>8dkxbUke+7Jc<~x>49vdZZ)qbLU^Mg%n0?LeqjxDpIScEL1GaS<x81$A
zXmso_^gQgLONva$+qVa_|Kvn@`<ASnO-Ueiwz@@jD2XtQPNZ)-K%ZC|y_SyNBmki|
z#5yBgvrT+QTa`;y3W%S<dKM^Yx$N@;gDB@M0E%d2dw&|@nJjXC6QncI0O>YNq={uF
z)0+T9c!2tBb&a2Qpc7L_b#bNHhIWqRrd}MHlOjF9{u;|pp<U5VhB|j&U^W8=Z!mQ*
zsJw>Z|BVZi#*~j|z+rQKH%MA@LgU-RSYbA>5bxMNAMhFjip@!9XnCVPhO*aeTQe^(
z@dD1&(a5@H3>)vvfUj20_!XlwI)<|#;NaP65E`HHaX%(FSVp275JWT)2+hPJGrN#n
z`p>`r=Rg1czwkdYWdaJdX5^MiYcFjUWwn@gavtjbo7WAjtQ7hZyVQ$x_`$kHem!+`
z{JBJwUuAz%v-uU}y;A9v6KE>maq<ZwQp0=?;iSiuETL!tq`rN-!wq6@-%=E#RjkCW
zhEKyV5py_@cVV)b?+MRFK;BUYE_=v^|Msn)w!)#^0c(>pv7s}v;^!&gd9YFwI-Kj2
zYtgZ{$6nEDm`MaAnZu+CNb%lKv$dX!$$3V`CEyO`B4FT5K3iimEImcTnR@1@Xh=);
zf=CSIJvC`w>NhmU4;>C2{q&K9pNp9muBlMQ_rF``4g(GaJx{%G7BOa?ty1cBK5MuJ
zJcxR;a$p1$japMVNm({53IJ6f{MCqpFRVvYx}u;OSv{V-h4>;Ogh1sGs1XRLIF$wd
zSt8Xq!^GE!G?qzSAM!IzxH#lvo3!poG`%r-C;FA|!8~GP(<hyf?>J<jhfdnulQ0i1
zkc8#!N1SfK8pL`^Oe?ZSb0BIajj^EPF+efH*EeJbHW1dB2U%l!V_ZexCl%SG92l^q
z<Fd{r4Ca9fbe=@^uy~Rht`YytmA)SDoX_gWI`lTO0-f=d8>jRTS8TSn`zyXr(J%o`
zsI^k=S_+iEASYY|5i4gimesS>V2a*`Yr~f_;UZ6vE=~UoG8%!SYLcJm3qX%94X9`s
zOclSGa81F()LMYGp$BuIjLI3RRWNvc)aNpv%I7ocL^|WM{Wg{#4Z+$5CCdbD9F=9*
z7C3@yN=)mMzQgG=)zDY<(v}^n0?B8=0uzpgj)7(?V=8d^FkJsc%NeXSthu!H39zQK
zB#1OMp5fm3V`itRQVEcbdL|LV`QXdfTr(m~EfVKM<S@;hXGKKjMTi=2cM4g>6zva1
z(o^s2^Oh9*e`p8x6CH^D&wKQd-~QXIRH}8n|JiC4=YJMbX88Y_)rzcCtDBp(YNd$(
z@e0e7^d=AF`moHp|6QxpWBgyMREzUJ3n@!qub&&{I#tk1;Zfuuhx7^GIESkKdw*K6
zGs&eTs;Z1EYZ4e!sWQTitSm_m-C_g>jlz@0yL-oNDV&2+sk?prvC~yQ9PaOSj!)V=
zsdv~i-yRw!ON0xnw>APGnCiB_chXgV?EKu8@XQj^6h{U|wA}mj;qdr?24maaSL3xY
z4sE3R(c#hQ@&5L4qUOY!cq7e@R4(}{2Ca6`>2B|Cceh!6UoYV``+9GLCf2eh<b8F-
z$3YNwdftg;U1u*H9j^zt%;a_>*oCoeu?gHHvDV{naGV*h_`ovW`pO%9E>08JgrIG9
zi@dpwQtk_k79ku^ZIET61-1N5wAOc|Z-Ju4Dz)E4=EeFO;oDLoJ2`vFlX#cdA@*O$
z*QDGh68};nzlq1y8QA||*5obnfi<wqD+}qwWpZ+9OePoy)V~K}0yEI#V(fy-5}N7g
zR!%gvvwbCjjR4Ex3&WRhG@dBlp`Vp$;@Aqy@QR**vW*ZgFH_6AMzd0rYwZS`gKb)W
z6lM<Qrr{Jz)(YF!;00j^GiY89ahJRI;iMhw1AwB5e>vL4y9cy0WOzTZJ(AR~kHp04
zLe}zuVLL7ftRWY=>jpM@sz8_6k#`#d^+B!nn-%T)k}Q8ZZnu?NiUsnf@*Taq@MRfD
zCUlmbj$Y_7Fzey-<t4IAsX&kDYxX6LGw)9bm<s_663qj;M5MRTi9rNrY`?h;KUn&8
ziF_%E*9h1a_!Cq_{0O<1Iff8wK5bcRE12w=Zp0*UKKo<D`tQYb(0p3+y(Lez8<;s+
z-d!6_>WuAK8$C>N9c>?&Ex3HZg{Y9qn#6>}x9t@vIvla?ci+XEipgKGa!khDxn(Yh
zBGr<@DSxy6$`Pp`%Cp=}y%nFwgvlo>G~!^RB$Rz1VQvM=d|8&s9tJH9*=0G>{c_|U
zO3=UVXuVP?F<+f5@P(T&F<84%W`fPHz<g+bKtae2PySW4fWa8q$%_7Y3pp>e#soQj
znS|uQPJ^_3DPHCiG+ywKrcnEXzn-x)@3Jq5)<hegFKuyk<lp^(Ch}`|={Vk)nMHzc
ziQzi>Xn=##Oc;#b^IU33IZt?JnJtCs!0cj|zY?Pe2Sx_jPTsxi9DXRpa}UNuW#mM0
z><UrP8&`{BABrcIQ_m}7_f&L5^#nEvA`o*i+P9#ycjVm)U97^Du{_p3_w@-ZC<-ZK
zZfG(fzpfBwb(P-IRPv;32@f=U^6RR<gsqma>9{gLb}Y2}y1Gn07$6$RJ}%E&fvUJ;
znOC~)G988<5J0mm0$uj?n?5(J@Rl*E>H$*%SKiz%Mdj8nfBWsr>XuZZAPtE0znou9
z2^W)OHK`PRhL<>js(HR#^+)*=W<d~uiOIT5Xj1A)rP#wPc+$QN06JoVQG_WeD>K1L
zuL>0t&$EEbXaV4hehcJG`&<xX{&#HEE}QT=Nw0`P7f!tJDqr#woMQ+6y}hkBw^4<j
zGTjnyDo%U+{EI~XoOKUyYNK6tmH<S=N_I0B*i5t$X;mt55grZ{R+&%%ak!wwZ0sx%
zsZ>JQD_}5Xip@k}$~SQwPTNeyFq+a@K9uI}BR_&?HF{r+XKWTlLZoxlOj@LW7AHqW
z|1nCWx0R0=d3v}D9WvXy_oOc|Z6hleFm>jM^oXpUGwG0M?}UCvw}Y_AsZmuyVpo-<
zH_N}8%jtE#O%<jQptwNv0pcV3`CuQljg%%uVw7<=kX|W;+%cUbl>&iA#R;$mN+NZr
zMo|)>3xX5xiz-<x?miF&FRSa}MQ$Wy*lVaU1~#a`@&vkt;SBw2@lS0-d&y<0|0f=f
z?4Et(zyE~^0P*~9T~Vrq|7Q_py8lPntSEA$qSTdotJN$R?e&!$K?on*$Bg@5O(-$`
zuPV*L|Fe)H`hS){ixTETd>)jhrmsO2tn=A$(U7Uzwq>tT+b;g}Ow=R&BhG6y1PeY#
z#e>y#<PugX>z+lQVhyISfg-b%j6K7&gwFBt;qewZbwIZOt*AoYal?yB|8HM7s%xpy
zVaW3LsF_gE6smC?9G-sas-1rxb#}Vyap&Z8zuT5RXrKg1rNizIonv+T_~S`?1vaP5
zuntP(8$iW{iy-|L*wVQXR*9Hi@w}2}x@BfZE3>oOq8$;_BBt~o$@)i&Gc*PWviK~^
zI7vtdI5iFZO^Uugw{`6@tto0{WQLt3(tW?XMY<Moya`K%g@!}V>@lk^rtGr@Xp1*t
zXs9NCJUXS04K&#VhQlju<bl@6?TG`Ao-gO4{poQ3^q_;N<qtZae!^dz@F|#)NSh};
zj6@=GEw49(FZ_15_fI<~>W97k4j%iF1YZXaqeYH1cL;MP`oYn5_lGr5E}6q6@!D^1
zjc{v!Y#8M$0AXcv-CteLf@CWWBX*fpc~yX$O<zpL)f=N>5f%JM_sn!7f<=)DTY+SU
z?KCKm%SbvTiTYDP)RQ0rC=5Psp!k|x*l7IygGm49hYxD^@W;-lwnWx-x3{jFSB7nw
zXl)KCe9;?OULUrjJzIAj|J;^Cv>v^IL~jA%p=4H}eLiN}jq-3%cCAa@qz6b1kT8GJ
zz+aBnw}`7#*Jiv=@;zDa>sRZhHyV+5@7@t90Fc^%SEWGyZ1?flm;V~~KJmun*nT#s
ztBuezcf;M6$Q>!4BC?i%@%7P&#xEm%!$Aq!Rw|oH$+0}UrzgJSVzc}&M1M4-*Isw4
z{6)$Vqw%^#M2auGefI!JYgr=Se^U)wp@$dwxoO7yPO?dKVQK8z$}XENu&7?>VN;=4
z7Df?I92*@(>}`86(mt!E`I)1yYg0Qj&G-hi;s*#y6P!XAY?Nk#>l=1kS|6M4I-vN{
zS?6!p{dBHFYpjb^-~4cR&<WlmtSe-E39m?bg64&I;}Tg7;N3sm+1^)SCSV)ww8}7N
z;w?r2nATjZW_12_aIxLuxygj%?YaT2*(3=I`mhsCigemN-1!k!nxOx6cRX1i4pjVx
zTKrcq5+Po=!kc;M4h!$Q5nCTyF5b1cfKEj^Q5Vk;UE=4ud#wWlGtY<&4W4-Cj{b*-
zvy$p6j_*2k&{HwAKoEr2lW22&nYtcEx@LM4T(c<G(Q~=Xk&I4s?2=AK=!79FI0Mt;
zU!qTs9u;vX9t{^a(g!Xkl2pTXKuB;oIB`(CfQT!X2gb;SClUrxbzIAC`>jZPt)oV%
zl-82WdJHI|SHJw);uzR1Ouun9oE2x4hqmKz*<(^3WORoVwMKoFAtD?L`yIAi{$sTy
zKpuWF1z@N&8#IH#CNzj3v6g%S_Iwu;%&)zJ?T_qiFF(bDNG<W^==1>en^u`kD`=a&
zT4&OSx(Lh6G#oVaWh*H+LEyj~LjPd^s%O%rq>S#cD`*Grz_AA=tTS{PvY*ayTy~6q
zfz)n(<d1*^^bgb*BK(W`L6oB;S~GRAkG=8NSk^SkI$tEDwFnjBf)f?4$P=+CzC^$A
z{|w%tPfY#i{<gZaz4JqdsTYxVG-tKaR+1n5(AoJB^g2NC(cV6_%JW)pZX-$OzlE>F
z2LOinQ~JU`N@UN*591Hv<M^YXfmnKN{5<*yK8`-%pXQlBtlr-wvINQ}C@pwQh_9Ey
z<xads$<BBLhl+5bapgH+)HxbrrrQ7FQB&{PM^5{n5dX2RREqdd3n{tcKh~P9&1$tc
z|NRO}j#$YL?qi1iPpP%y{9jct|7&6Yd;G%ff6kB_;7X<KPd}?4wqb+6+a915Of<ud
zTv8>;Gqr*R4lQ@|WF4N6`kuT)ho5?>->8cCqxwAw;w=k%KJ1nrq~3nt*>9sQtwhaU
zqFa!j2}OO9hQ2=ZTv#hnO{Axdf9(8>izP0zfnDzKw7Y|PP|`0=osrZfZLww~>XX!Q
z!*xsj(d5rR`4AmE5v=zk1`m3GjnyQ)1$55x4t;A3M_>4ueBBOwp5!<3%{OW9{r;dr
zEaJ96pCDsAAdeaC{LaEkU#Q8<WhOJREq0!*%;H7N_6k!Qq#uJ*jKcZmHbS;9q1iH)
zi933tP0Ykigd=nQd4+hR2&On+g7b~s7K1nA^^tysH+9hf$%45vG?NvW8`pSFkfU{#
zSy!H#`JO?Us{hUD2GAY+zpYAL!TUeeR=v>w7Ez}AfAJocyiu<M0hMa4;Fs4|W<)!l
zOCK}zziOq`it)cvE8@Q`rp&tkH|^eE<a&-D(3=_E3>sO>ql+>u^p0CfBpEq6-QQP_
zJAXgzoOGGaL)Fcs1<?<SppheN@R&8oI-)w{(J|8!PS8W+*&|YRh~&pI*CvL1VYox@
zoCY(u^@-)IQ&Zu@wtm-pZX~B7g6ynkLis28{SUlOwJcNZ$NA+qncaP0m%{c2#Bi4>
zq{nN7`D|!FF|<wx*F*5jl7BzxO#nPzPI@a*w^00o62rp>dh&6Z)>vlEpy~VQSWIR@
z1GfdC)_om&)-?n{3Iyi`f7d?K&{g<-@*rQH7F^5j4S5?eb?uWC8wfkr$%o@#zl)AW
zJXy%qzzX4q4(Qdm3fjvIvv1IAc#g-MgDa-xA4vG!L+1cj0qMz1UyTfD_vx@3pJ_bW
z(z&gfy&=qMXSjxLtTi6PkKtxDF<;S7mO5uM$%P|(7^xmgsH`KH#nH=GT7>DqFW`mh
zf8>(i1Nz9X|2Lr2Q2(zfwL<@2M47JtH|y1wyxH8`XepJAdcp0luO#_D!u4}W1~5ba
zr~DG-f2Cfn7V-ZVQKrfNJTVya^ijzj|EW={HPQaxg8vore-UN6{BKk@K>lwus;z3R
zQJnvOg(b=V^WyrjRHny&swoX6#{cl6IRC$pvNSW+Zz*)+q=qC&Nz8U?8m^E{cUBK#
zE@97AXHCF+<dSz1DUthAaw0STRxItZ=a0!xx;%4Kb4kPe08~cPGUrBYBu8`ZukMy>
zZ$4?8?}{6eV>B%EN0HO{5pz1vqSF@A^_k>&z6-2*^4ag1u71I@o7|79jeGI&J!L^s
z$i_0*YrURlYrUK7{Pif8b6hecH23B@hRmjc%pz8Kim7$Is;{5xW^af=zJRFoaR33f
zQFbw!&3HE1q%0|K#OSkFVgK;i5vB9}=1T6bo<io|6GbG)pbF8SAx;-4`hmPJMe>KB
z<`MiyEC&GI03m@8yE}hR&L1xQbMyC|RBT@WW=Ymb8Uhw8Ugt!EC^qj$k=$QBg^Ya&
zo=A?76@oTHdM!}UzW3?+5=5;CY#w2I;@ar>zn==_%t67<&E9Du*79r&%>^T%0@hLa
zEuoo7nF6ic9om+6F~mF|35{KNm#MQSgtrqSbg?4)L6nTr<bHI`{nb+l>bqly<QPvO
zo--ug0>v|<>`PIFBhY3D=%+Sx7bUgK^8e592Dd6X97A{T|2FE)2C3APR-?%Ovyd|5
z{7<a~yxzd%@70as{^u(!v-to1TpyNM{@-d{iSa+47%Kd~izpfXUy;Ju4&aT4xq!t^
z^7w$0^`EB`I0EOwyuex5%;)+^ex1t?ocwI+K4{}9UBOYZ<QY-16Ux|IJn~LRq`|ZI
z24?{~XLoQGduLhmXE5}ArNXQ72q%a4P?zxhMgJV8NIv(^j0uzfv!_`B?u(~P&v3C2
zJhO%1nfrz_V0{eda2BIyxgGAb7UXC2C-x7|U)Iix4*8rxGpLgPv!@Zb4?<R^o4Am=
z&rIq*dq;65wDa>6XYqNKGwn|DHb0*~*jX$#`3$_p^B3R`BWFH;@eJbT|LkeR_B~LP
l=`=2c^)nOJ&);pF4Q}B#E=o~~Qj|rO{|^8;-8%qO0RW1;3nBmj

literal 0
HcmV?d00001


From 86f4667d1786298d2e3a8901eac6e5dfbcfa5016 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Tue, 23 Sep 2025 17:04:31 +0800
Subject: [PATCH 04/38] test new dynamic

---
 .buildkite/pipeline_dynamic.yml |  2 +-
 .buildkite/pipeline_jax.yml     |  2 --
 .buildkite/scripts/bootstrap.sh | 38 +++++++++++++++++----------------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/.buildkite/pipeline_dynamic.yml b/.buildkite/pipeline_dynamic.yml
index fa1355330e..dbc315abfa 100644
--- a/.buildkite/pipeline_dynamic.yml
+++ b/.buildkite/pipeline_dynamic.yml
@@ -6,7 +6,7 @@ steps:
      agents:
        queue: tpu_v6e_queue
      commands:
-       - .buildkite/scripts/dynamic_upload.sh
+       - .buildkite/scripts/dynamic_bootstrap.sh
 
    - wait: ~
 
diff --git a/.buildkite/pipeline_jax.yml b/.buildkite/pipeline_jax.yml
index fc42dbde93..3184e36e60 100644
--- a/.buildkite/pipeline_jax.yml
+++ b/.buildkite/pipeline_jax.yml
@@ -113,8 +113,6 @@ steps:
        - test_6
        - test_7
        - test_8
-       - integration_test_llama_3_1_8B_tpu
-       - integration_test_llama_3_1_70B_tpu
      agents:
        queue: tpu_v6e_queue
      commands:
diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh
index e5c892d31d..44fa7bf64b 100755
--- a/.buildkite/scripts/bootstrap.sh
+++ b/.buildkite/scripts/bootstrap.sh
@@ -3,24 +3,26 @@
 echo "--- Starting Buildkite Bootstrap ---"
 
 # Check if the current build is a pull request
-if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
-  echo "This is a Pull Request build."
-  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/tpu_commons/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
+# if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
+#   echo "This is a Pull Request build."
+#   PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/tpu_commons/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
 
-  # If it's a PR, check for the specific label
-  if [[ $PR_LABELS == *"ready"* ]]; then
-    echo "Found 'ready' label on PR. Uploading main pipeline..."
-    buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
-    # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
-  else
-    echo "No 'ready' label found on PR. Skipping main pipeline upload."
-    exit 0 # Exit with 0 to indicate success (no error, just skipped)
-  fi
-else
-  # If it's NOT a Pull Request (e.g., branch push, tag, manual build)
-  echo "This is not a Pull Request build. Uploading main pipeline."
-  buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
-  # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
-fi
+#   # If it's a PR, check for the specific label
+#   if [[ $PR_LABELS == *"ready"* ]]; then
+#     echo "Found 'ready' label on PR. Uploading main pipeline..."
+#     buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
+#     # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
+#   else
+#     echo "No 'ready' label found on PR. Skipping main pipeline upload."
+#     exit 0 # Exit with 0 to indicate success (no error, just skipped)
+#   fi
+# else
+#   # If it's NOT a Pull Request (e.g., branch push, tag, manual build)
+#   echo "This is not a Pull Request build. Uploading main pipeline."
+#   buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
+#   # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
+# fi
+
+buildkite-agent pipeline upload .buildkite/pipeline_dynamic.yml
 
 echo "--- Buildkite Bootstrap Finished ---"

From 6603df2477739269cd9d1be6f60d85bfbd8607f6 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Tue, 23 Sep 2025 17:08:10 +0800
Subject: [PATCH 05/38] test

---
 .buildkite/scripts/dynamic_bootstrap.sh | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh
index 9736eaa08b..5b9fd47294 100644
--- a/.buildkite/scripts/dynamic_bootstrap.sh
+++ b/.buildkite/scripts/dynamic_bootstrap.sh
@@ -106,24 +106,24 @@ model_list_string=$(printf "%s\n" "${model_names[@]}")
 
 if [[ -n "$tpu_model_list_str" ]]; then
   echo "--- Uploading tpu_model_list_str to Meta-data:${MODEL_LIST_KEY}"
-  # echo "${tpu_model_list_str}" | buildkite-agent meta-data set "${MODEL_LIST_KEY}"
-  # echo "Testing: $(buildkite-agent meta-data get "MODEL_LIST_KEY")"
+  echo "${tpu_model_list_str}" | buildkite-agent meta-data set "${MODEL_LIST_KEY}"
+  echo "Testing: $(buildkite-agent meta-data get "MODEL_LIST_KEY")"
 else
   echo "--- No Model Names found to upload."
 fi
 
 if [[ -n "$vllm_model_list_str" ]]; then
   echo "--- Uploading vllm_model_list_str to Meta-data:${INFORMATIONAL_MODEL_LIST_KEY}"
-  # echo "${vllm_model_list_str}" | buildkite-agent meta-data set "${INFORMATIONAL_MODEL_LIST_KEY}"
-  # echo "Testing: $(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}")"
+  echo "${vllm_model_list_str}" | buildkite-agent meta-data set "${INFORMATIONAL_MODEL_LIST_KEY}"
+  echo "Testing: $(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}")"
 else
   echo "--- No Model Names found to upload."
 fi
 POPURLAR_MODEL_LIST_KEY
 if [[ -n "$popular_model_list_str" ]]; then
   echo "--- Uploading popular_model_list_str to Meta-data:${POPURLAR_MODEL_LIST_KEY}"
-  # echo "${popular_model_list_str}" | buildkite-agent meta-data set "${POPURLAR_MODEL_LIST_KEY}"
-  # echo "Testing: $(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}")"
+  echo "${popular_model_list_str}" | buildkite-agent meta-data set "${POPURLAR_MODEL_LIST_KEY}"
+  echo "Testing: $(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}")"
 else
   echo "--- No Model Names found to upload."
 fi
@@ -136,10 +136,10 @@ if [[ -n "$pipeline_steps" ]]; then
   final_pipeline_yaml="steps:"$'\n'
   final_pipeline_yaml+=$(printf "%s\n" "${pipeline_steps[@]}")
   echo "Upload YML: ${final_pipeline_yaml}"
-  # echo -e "${final_pipeline_yaml}" | buildkite-agent pipeline upload
+  echo -e "${final_pipeline_yaml}" | buildkite-agent pipeline upload
 else
   echo "--- No .yml files found, no new Pipeline Steps to upload."
-  # buildkite-agent step update --state "passed"
+  buildkite-agent step update --state "passed"
 fi
 
 echo "--- Buildkite Special Bootstrap Finished ---"

From 89ef6c4e561a020cb368b4ee44b586d00c6c3e30 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Tue, 23 Sep 2025 17:18:17 +0800
Subject: [PATCH 06/38] test ssh

---
 .buildkite/buildkite_ci_model_template.yml | 2 +-
 .buildkite/scripts/dynamic_bootstrap.sh    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/buildkite_ci_model_template.yml b/.buildkite/buildkite_ci_model_template.yml
index f58b557557..48e451bc58 100644
--- a/.buildkite/buildkite_ci_model_template.yml
+++ b/.buildkite/buildkite_ci_model_template.yml
@@ -17,7 +17,7 @@ steps:
         .buildkite/scripts/check_results.sh \
           "Unit tests for {MODEL_NAME}" ut_{SAFE_MODEL_NAME}
     plugins:
-      - hooks#v1:
+      - git@github.com:buildkite-plugins/hooks-buildkite-plugin.git#v1::
           post-command: |
             echo "--- Post-command hook triggered ---"
             echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh
index 5b9fd47294..2b76261d3d 100644
--- a/.buildkite/scripts/dynamic_bootstrap.sh
+++ b/.buildkite/scripts/dynamic_bootstrap.sh
@@ -119,7 +119,7 @@ if [[ -n "$vllm_model_list_str" ]]; then
 else
   echo "--- No Model Names found to upload."
 fi
-POPURLAR_MODEL_LIST_KEY
+
 if [[ -n "$popular_model_list_str" ]]; then
   echo "--- Uploading popular_model_list_str to Meta-data:${POPURLAR_MODEL_LIST_KEY}"
   echo "${popular_model_list_str}" | buildkite-agent meta-data set "${POPURLAR_MODEL_LIST_KEY}"

From f0720f332ddd48afc0d75f67bd0fac63ade9eac1 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Tue, 23 Sep 2025 17:19:02 +0800
Subject: [PATCH 07/38] test git ssh

---
 .../models/informational/meta-llama_Llama-3_1-70B-Instruct.yml  | 2 +-
 .buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml b/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml
index 9539111d5b..5adbb2c503 100644
--- a/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml
+++ b/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml
@@ -17,7 +17,7 @@ steps:
         .buildkite/scripts/check_results.sh \
           "Unit tests for meta-llama/Llama-3.1-70B-Instruct" ut_meta-llama_Llama-3_1-70B-Instruct
     plugins:
-      - hooks#v1:
+      - git@github.com:buildkite-plugins/hooks-buildkite-plugin.git#v1::
           post-command: |
             echo "--- Post-command hook triggered ---"
             echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
index eff2e3c815..47bd3be519 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -17,7 +17,7 @@ steps:
         .buildkite/scripts/check_results.sh \
           "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct
     plugins:
-      - hooks#v1:
+      - git@github.com:buildkite-plugins/hooks-buildkite-plugin.git#v1::
           post-command: |
             echo "--- Post-command hook triggered ---"
             echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"

From 7060e48de9d385bcf8028084521ec5b3d8b1afd0 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Tue, 23 Sep 2025 17:41:47 +0800
Subject: [PATCH 08/38] test

---
 .buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
index 47bd3be519..5aa88e108b 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -17,7 +17,7 @@ steps:
         .buildkite/scripts/check_results.sh \
           "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct
     plugins:
-      - git@github.com:buildkite-plugins/hooks-buildkite-plugin.git#v1::
+      - buildkite-plugins/hooks-plugin#v1.1.0:
           post-command: |
             echo "--- Post-command hook triggered ---"
             echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"

From a337edba393f983b9837c7e18b773204fb8b2a41 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Tue, 23 Sep 2025 17:51:14 +0800
Subject: [PATCH 09/38] test

---
 .buildkite/hooks/post-command                       | 13 +++++++++++++
 .../models/meta-llama_Llama-3_1-8B-Instruct.yml     | 12 +-----------
 2 files changed, 14 insertions(+), 11 deletions(-)
 create mode 100644 .buildkite/hooks/post-command

diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command
new file mode 100644
index 0000000000..84b464ad45
--- /dev/null
+++ b/.buildkite/hooks/post-command
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -euo pipefail
+
+echo "--- Post-command hook triggered ---"
+echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+
+if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+  echo "The step passed. Uploading result..."
+  buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "passed"
+else
+  echo "The step failed. Uploading result..."
+  buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "failed"
+fi
diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
index 5aa88e108b..f75ac13f81 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -17,17 +17,7 @@ steps:
         .buildkite/scripts/check_results.sh \
           "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct
     plugins:
-      - buildkite-plugins/hooks-plugin#v1.1.0:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "failed"
-            fi
+      - ".buildkite": ~
 
   - label: "Integration tests for meta-llama/Llama-3.1-8B-Instruct"
     key: "it_meta-llama_Llama-3_1-8B-Instruct"

From 6d96a0a1a7a17e877ceadf76d173122b3ab9cbb2 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Tue, 23 Sep 2025 17:53:33 +0800
Subject: [PATCH 10/38] test

---
 .../meta-llama_Llama-3_1-8B-Instruct.yml      | 36 -------------------
 1 file changed, 36 deletions(-)

diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
index f75ac13f81..8d66a393f3 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -35,18 +35,6 @@ steps:
       - |
         .buildkite/scripts/check_results.sh \
           "Integration tests for meta-llama/Llama-3.1-8B-Instruct" it_meta-llama_Llama-3_1-8B-Instruct
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:IntTest" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:IntTest" "failed"
-            fi
 
   - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct"
     key: "pb_meta-llama_Llama-3_1-8B-Instruct"
@@ -63,18 +51,6 @@ steps:
       - |
         .buildkite/scripts/check_results.sh \
           "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" pb_meta-llama_Llama-3_1-8B-Instruct
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:Benchmark" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:Benchmark" "failed"
-            fi
 
   - label: "Stress tests for meta-llama/Llama-3.1-8B-Instruct"
     key: "st_meta-llama_Llama-3_1-8B-Instruct"
@@ -91,15 +67,3 @@ steps:
       - |
         .buildkite/scripts/check_results.sh \
           "Stress tests for meta-llama/Llama-3.1-8B-Instruct" st_meta-llama_Llama-3_1-8B-Instruct
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:StressTest" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:StressTest" "failed"
-            fi

From 3d7dcd8c0e924addf051abe695f49f37ee23fd56 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 10:41:12 +0800
Subject: [PATCH 11/38] test

---
 .buildkite/hooks/post-command                 |   8 +-
 .../meta-llama_Llama-3_1-70B-Instruct.yml     | 115 ------------------
 .../meta-llama_Llama-3_1-8B-Instruct.yml      |   3 +
 3 files changed, 7 insertions(+), 119 deletions(-)
 delete mode 100644 .buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml

diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command
index 84b464ad45..3a7a0f3a11 100644
--- a/.buildkite/hooks/post-command
+++ b/.buildkite/hooks/post-command
@@ -5,9 +5,9 @@ echo "--- Post-command hook triggered ---"
 echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
 
 if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-  echo "The step passed. Uploading result..."
-  buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "passed"
+  echo "The step passed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..."
+  buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "passed"
 else
-  echo "The step failed. Uploading result..."
-  buildkite-agent meta-data set "meta-llama/Llama-3.1-8B-Instruct:UnitTest" "failed"
+  echo "The step failed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..."
+  buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "failed"
 fi
diff --git a/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml b/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml
deleted file mode 100644
index 5adbb2c503..0000000000
--- a/.buildkite/models/informational/meta-llama_Llama-3_1-70B-Instruct.yml
+++ /dev/null
@@ -1,115 +0,0 @@
-# meta-llama/Llama-3.1-70B-Instruct
-agents:
-  queue: tpu_v6e_8_queue
-steps:
-  - label: "Unit tests for meta-llama/Llama-3.1-70B-Instruct"
-    key: "ut_meta-llama_Llama-3_1-70B-Instruct"
-    commands:
-      # - replace_with_test_commands  # TODO: Replaced to actual test commands
-      - echo "[DEBUG], unit testing for meta-llama/Llama-3.1-70B-Instruct"  # TODO: Replace to actual test commands
-  - label: "Notifications: Unit tests for meta-llama/Llama-3.1-70B-Instruct"
-    key: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct"
-    depends_on: "ut_meta-llama_Llama-3_1-70B-Instruct"
-    agents:
-      queue: tpu_v6e_8_queue
-    commands:
-      - |
-        .buildkite/scripts/check_results.sh \
-          "Unit tests for meta-llama/Llama-3.1-70B-Instruct" ut_meta-llama_Llama-3_1-70B-Instruct
-    plugins:
-      - git@github.com:buildkite-plugins/hooks-buildkite-plugin.git#v1::
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:UnitTest" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:UnitTest" "failed"
-            fi
-
-  - label: "Integration tests for meta-llama/Llama-3.1-70B-Instruct"
-    key: "it_meta-llama_Llama-3_1-70B-Instruct"
-    depends_on: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct"
-    commands:
-      # TODO: expected_accuracy need parameterized
-      # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-70B-Instruct"
-      - echo "[DEBUG], integration testing for meta-llama/Llama-3.1-70B-Instruct"  # TODO: Replace to actual test commands
-  - label: "Notifications: Integration tests for meta-llama/Llama-3.1-70B-Instruct"
-    key: "notifications_it_meta-llama_Llama-3_1-70B-Instruct"
-    depends_on: "it_meta-llama_Llama-3_1-70B-Instruct"
-    agents:
-      queue: tpu_v6e_8_queue
-    commands:
-      - |
-        .buildkite/scripts/check_results.sh \
-          "Integration tests for meta-llama/Llama-3.1-70B-Instruct" it_meta-llama_Llama-3_1-70B-Instruct
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:IntTest" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:IntTest" "failed"
-            fi
-
-  - label: "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct"
-    key: "pb_meta-llama_Llama-3_1-70B-Instruct"
-    depends_on: "notifications_it_meta-llama_Llama-3_1-70B-Instruct"
-    commands:
-      # - replace_with_test_command  # TODO
-      - echo "[DEBUG], performance benchmarking for meta-llama/Llama-3.1-70B-Instruct"  # TODO: Replace to actual test commands
-  - label: "Notifications: Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct"
-    key: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct"
-    depends_on: "pb_meta-llama_Llama-3_1-70B-Instruct"
-    agents:
-      queue: tpu_v6e_8_queue
-    commands:
-      - |
-        .buildkite/scripts/check_results.sh \
-          "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" pb_meta-llama_Llama-3_1-70B-Instruct
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:Benchmark" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:Benchmark" "failed"
-            fi
-
-  - label: "Stress tests for meta-llama/Llama-3.1-70B-Instruct"
-    key: "st_meta-llama_Llama-3_1-70B-Instruct"
-    depends_on: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct"
-    commands:
-      # - our_stress_tests_script meta-llama/Llama-3.1-70B-Instruct expected_throughput # TODO: expected_throughput need parameterized
-      - echo "[DEBUG], stress testing for meta-llama/Llama-3.1-70B-Instruct"  # TODO: Replace to actual test commands
-  - label: "Notifications: Stress tests for meta-llama/Llama-3.1-70B-Instruct"
-    key: "notifications_st_meta-llama_Llama-3_1-70B-Instruct"
-    depends_on: "st_meta-llama_Llama-3_1-70B-Instruct"
-    agents:
-      queue: tpu_v6e_8_queue
-    commands:
-      - |
-        .buildkite/scripts/check_results.sh \
-          "Stress tests for meta-llama/Llama-3.1-70B-Instruct" st_meta-llama_Llama-3_1-70B-Instruct
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:StressTest" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "meta-llama/Llama-3.1-70B-Instruct:StressTest" "failed"
-            fi
diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
index 8d66a393f3..fbd6e7ae2e 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -12,6 +12,9 @@ steps:
     depends_on: "ut_meta-llama_Llama-3_1-8B-Instruct"
     agents:
       queue: tpu_v6e_queue
+    env:
+      EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct"
+      EXECUTE_STAGE: "UnitTest"
     commands:
       - |
         .buildkite/scripts/check_results.sh \

From 6adad371fda1c7495db18caf15ff97b6403fdc2f Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 10:50:50 +0800
Subject: [PATCH 12/38] Temporarily comment out meta-data upload in post-command hook and try a step-level post-command key

---
 .buildkite/hooks/post-command                      | 14 +++++++-------
 .../models/meta-llama_Llama-3_1-8B-Instruct.yml    |  6 ++++--
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command
index 3a7a0f3a11..a0da4e5802 100644
--- a/.buildkite/hooks/post-command
+++ b/.buildkite/hooks/post-command
@@ -4,10 +4,10 @@ set -euo pipefail
 echo "--- Post-command hook triggered ---"
 echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
 
-if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-  echo "The step passed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..."
-  buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "passed"
-else
-  echo "The step failed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..."
-  buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "failed"
-fi
+# if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+#   echo "The step passed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..."
+#   buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "passed"
+# else
+#   echo "The step failed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..."
+#   buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "failed"
+# fi
diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
index fbd6e7ae2e..f6e2868ec3 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -19,8 +19,10 @@ steps:
       - |
         .buildkite/scripts/check_results.sh \
           "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct
-    plugins:
-      - ".buildkite": ~
+    post-command: |
+      echo "Test post-command?"
+    # plugins:
+    #   - ".buildkite": ~
 
   - label: "Integration tests for meta-llama/Llama-3.1-8B-Instruct"
     key: "it_meta-llama_Llama-3_1-8B-Instruct"

From 7615b6e90dfded480397fdf890f713a2852a69f8 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 10:54:50 +0800
Subject: [PATCH 13/38] Try buildkite-plugins/hooks-plugin for the 8B unit-test notification step

---
 .buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
index f6e2868ec3..58cf2f9b92 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -19,8 +19,9 @@ steps:
       - |
         .buildkite/scripts/check_results.sh \
           "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct
-    post-command: |
-      echo "Test post-command?"
+    plugins:
+      - buildkite-plugins/hooks-plugin#v1.1.0:
+        directory: ".buildkite"
     # plugins:
     #   - ".buildkite": ~
 

From 942b25f3052025e7957ba7a748687cf6457fafd4 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 11:27:48 +0800
Subject: [PATCH 14/38] Gate post-command meta-data upload on EXECUTE_* variables and notifications_ step keys

---
 .buildkite/hooks/post-command                 | 33 ++++++++++++++-----
 .../meta-llama_Llama-3_1-8B-Instruct.yml      |  6 ++--
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command
index a0da4e5802..1f7b660299 100644
--- a/.buildkite/hooks/post-command
+++ b/.buildkite/hooks/post-command
@@ -2,12 +2,27 @@
 set -euo pipefail
 
 echo "--- Post-command hook triggered ---"
-echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-
-# if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-#   echo "The step passed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..."
-#   buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "passed"
-# else
-#   echo "The step failed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..."
-#   buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "failed"
-# fi
+
+if [ -n "$EXECUTE_MODEL" ] && \
+   [ -n "$EXECUTE_STAGE" ] && \
+   [[ "$BUILDKITE_STEP_KEY" == "notifications_"* ]]; then
+
+    echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+
+    # If all conditions are true, execute the logic here.
+    echo "EXECUTE_MODEL: $EXECUTE_MODEL"
+    echo "EXECUTE_STAGE: $EXECUTE_STAGE"
+    echo "BUILDKITE_STEP_KEY: $BUILDKITE_STEP_KEY"
+    
+    if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
+      echo "The step passed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..."
+      buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "passed"
+    else
+      echo "The step failed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..."
+      buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "failed"
+    fi
+
+else
+    # If any condition is false, print a message and exit.
+    echo "One or more conditions were not met. Skipping execution."
+fi
diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
index 58cf2f9b92..95dcb99a19 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -19,9 +19,9 @@ steps:
       - |
         .buildkite/scripts/check_results.sh \
           "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct
-    plugins:
-      - buildkite-plugins/hooks-plugin#v1.1.0:
-        directory: ".buildkite"
+    # plugins:
+    #   - buildkite-plugins/hooks-plugin#v1.1.0:
+    #     directory: ".buildkite"
     # plugins:
     #   - ".buildkite": ~
 

From d5c417c3b4c26c89c330e82b8d5f9b6e8aa4e6a7 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 11:30:52 +0800
Subject: [PATCH 15/38] Default unset variables in post-command hook conditions so `set -u` does not abort

---
 .buildkite/hooks/post-command | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command
index 1f7b660299..e85e681c97 100644
--- a/.buildkite/hooks/post-command
+++ b/.buildkite/hooks/post-command
@@ -3,9 +3,9 @@ set -euo pipefail
 
 echo "--- Post-command hook triggered ---"
 
-if [ -n "$EXECUTE_MODEL" ] && \
-   [ -n "$EXECUTE_STAGE" ] && \
-   [[ "$BUILDKITE_STEP_KEY" == "notifications_"* ]]; then
+if [ -n "${EXECUTE_MODEL:-}" ] && \
+   [ -n "${EXECUTE_STAGE:-}" ] && \
+   [[ "${BUILDKITE_STEP_KEY:-}" == "notifications_"* ]]; then
 
     echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
 
@@ -25,4 +25,4 @@ if [ -n "$EXECUTE_MODEL" ] && \
 else
     # If any condition is false, print a message and exit.
     echo "One or more conditions were not met. Skipping execution."
-fi
+fi
\ No newline at end of file

From 2c7458709e8a2f643f24e561b8da5e5a7a30e76e Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 11:33:46 +0800
Subject: [PATCH 16/38] Add EXECUTE_MODEL/EXECUTE_STAGE env to the remaining 8B notification steps

---
 .buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
index 95dcb99a19..9c83f909ae 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -37,6 +37,9 @@ steps:
     depends_on: "it_meta-llama_Llama-3_1-8B-Instruct"
     agents:
       queue: tpu_v6e_queue
+    env:
+      EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct"
+      EXECUTE_STAGE: "IntTest"
     commands:
       - |
         .buildkite/scripts/check_results.sh \
@@ -53,6 +56,9 @@ steps:
     depends_on: "pb_meta-llama_Llama-3_1-8B-Instruct"
     agents:
       queue: tpu_v6e_queue
+    env:
+      EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct"
+      EXECUTE_STAGE: "Benchmark"
     commands:
       - |
         .buildkite/scripts/check_results.sh \
@@ -69,6 +75,9 @@ steps:
     depends_on: "st_meta-llama_Llama-3_1-8B-Instruct"
     agents:
       queue: tpu_v6e_queue
+    env:
+      EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct"
+      EXECUTE_STAGE: "StressTest"
     commands:
       - |
         .buildkite/scripts/check_results.sh \

From a09184ce5f216a0f0ee1e62b0de4207c4bdc657c Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 11:57:02 +0800
Subject: [PATCH 17/38] Move meta-data result upload from post-command hook into check_results.sh

---
 .buildkite/buildkite_ci_feature_template.yml | 60 ++++----------------
 .buildkite/buildkite_ci_model_template.yml   | 60 ++++----------------
 .buildkite/hooks/post-command                | 28 ---------
 .buildkite/scripts/check_results.sh          | 23 +++++++-
 4 files changed, 46 insertions(+), 125 deletions(-)
 delete mode 100644 .buildkite/hooks/post-command

diff --git a/.buildkite/buildkite_ci_feature_template.yml b/.buildkite/buildkite_ci_feature_template.yml
index e907815416..e7286ccda4 100644
--- a/.buildkite/buildkite_ci_feature_template.yml
+++ b/.buildkite/buildkite_ci_feature_template.yml
@@ -12,22 +12,13 @@ steps:
     depends_on: "ut_{SAFE_FEATURE_NAME}"
     agents:
       queue: {QUEUE}
+    env:
+      EXECUTE_ENTITY: "{FEATURE_NAME}"
+      EXECUTE_STAGE: "UnitTest"
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Unit tests for {FEATURE_NAME}" ut_{SAFE_FEATURE_NAME}
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "{FEATURE_NAME}:UnitTest" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "{FEATURE_NAME}:UnitTest" "failed"
-            fi
 
   - label: "Integration tests for {FEATURE_NAME}"
     key: "it_{SAFE_FEATURE_NAME}"
@@ -41,22 +32,13 @@ steps:
     depends_on: "it_{SAFE_FEATURE_NAME}"
     agents:
       queue: {QUEUE}
+    env:
+      EXECUTE_ENTITY: "{FEATURE_NAME}"
+      EXECUTE_STAGE: "IntTest"
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Integration tests for {FEATURE_NAME}" it_{SAFE_FEATURE_NAME}
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "{FEATURE_NAME}:IntTest" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "{FEATURE_NAME}:IntTest" "failed"
-            fi
 
   - label: "Performance benchmarks for {FEATURE_NAME}"
     key: "pb_{SAFE_FEATURE_NAME}"
@@ -69,22 +51,13 @@ steps:
     depends_on: "pb_{SAFE_FEATURE_NAME}"
     agents:
       queue: {QUEUE}
+    env:
+      EXECUTE_ENTITY: "{FEATURE_NAME}"
+      EXECUTE_STAGE: "Benchmark"
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Performance benchmarks for {FEATURE_NAME}" pb_{SAFE_FEATURE_NAME}
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "{FEATURE_NAME}:Benchmark" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "{FEATURE_NAME}:Benchmark" "failed"
-            fi
 
   - label: "Stress tests for {FEATURE_NAME}"
     key: "st_{SAFE_FEATURE_NAME}"
@@ -97,19 +70,10 @@ steps:
     depends_on: "st_{SAFE_FEATURE_NAME}"
     agents:
       queue: {QUEUE}
+    env:
+      EXECUTE_ENTITY: "{FEATURE_NAME}"
+      EXECUTE_STAGE: "StressTest"
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Stress tests for {FEATURE_NAME}" st_{SAFE_FEATURE_NAME}
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "{FEATURE_NAME}:StressTest" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "{FEATURE_NAME}:StressTest" "failed"
-            fi
diff --git a/.buildkite/buildkite_ci_model_template.yml b/.buildkite/buildkite_ci_model_template.yml
index 48e451bc58..6c6c13910c 100644
--- a/.buildkite/buildkite_ci_model_template.yml
+++ b/.buildkite/buildkite_ci_model_template.yml
@@ -12,22 +12,13 @@ steps:
     depends_on: "ut_{SAFE_MODEL_NAME}"
     agents:
       queue: {QUEUE}
+    env:
+      EXECUTE_ENTITY: "{MODEL_NAME}"
+      EXECUTE_STAGE: "UnitTest"
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Unit tests for {MODEL_NAME}" ut_{SAFE_MODEL_NAME}
-    plugins:
-      - git@github.com:buildkite-plugins/hooks-buildkite-plugin.git#v1::
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "{MODEL_NAME}:UnitTest" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "{MODEL_NAME}:UnitTest" "failed"
-            fi
 
   - label: "Integration tests for {MODEL_NAME}"
     key: "it_{SAFE_MODEL_NAME}"
@@ -41,22 +32,13 @@ steps:
     depends_on: "it_{SAFE_MODEL_NAME}"
     agents:
       queue: {QUEUE}
+    env:
+      EXECUTE_ENTITY: "{MODEL_NAME}"
+      EXECUTE_STAGE: "IntTest"
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Integration tests for {MODEL_NAME}" it_{SAFE_MODEL_NAME}
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "{MODEL_NAME}:IntTest" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "{MODEL_NAME}:IntTest" "failed"
-            fi
 
   - label: "Performance benchmarks for {MODEL_NAME}"
     key: "pb_{SAFE_MODEL_NAME}"
@@ -69,22 +51,13 @@ steps:
     depends_on: "pb_{SAFE_MODEL_NAME}"
     agents:
       queue: {QUEUE}
+    env:
+      EXECUTE_ENTITY: "{MODEL_NAME}"
+      EXECUTE_STAGE: "Benchmark"
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Performance benchmarks for {MODEL_NAME}" pb_{SAFE_MODEL_NAME}
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "{MODEL_NAME}:Benchmark" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "{MODEL_NAME}:Benchmark" "failed"
-            fi
 
   - label: "Stress tests for {MODEL_NAME}"
     key: "st_{SAFE_MODEL_NAME}"
@@ -97,19 +70,10 @@ steps:
     depends_on: "st_{SAFE_MODEL_NAME}"
     agents:
       queue: {QUEUE}
+    env:
+      EXECUTE_ENTITY: "{MODEL_NAME}"
+      EXECUTE_STAGE: "StressTest"
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Stress tests for {MODEL_NAME}" st_{SAFE_MODEL_NAME}
-    plugins:
-      - hooks#v1:
-          post-command: |
-            echo "--- Post-command hook triggered ---"
-            echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-            if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-              echo "The step passed. Uploading result..."
-              buildkite-agent meta-data set "{MODEL_NAME}:StressTest" "passed"
-            else
-              echo "The step failed. Uploading result..."
-              buildkite-agent meta-data set "{MODEL_NAME}:StressTest" "failed"
-            fi
diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command
deleted file mode 100644
index e85e681c97..0000000000
--- a/.buildkite/hooks/post-command
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-echo "--- Post-command hook triggered ---"
-
-if [ -n "${EXECUTE_MODEL:-}" ] && \
-   [ -n "${EXECUTE_STAGE:-}" ] && \
-   [[ "${BUILDKITE_STEP_KEY:-}" == "notifications_"* ]]; then
-
-    echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
-
-    # If all conditions are true, execute the logic here.
-    echo "EXECUTE_MODEL: $EXECUTE_MODEL"
-    echo "EXECUTE_STAGE: $EXECUTE_STAGE"
-    echo "BUILDKITE_STEP_KEY: $BUILDKITE_STEP_KEY"
-    
-    if [ "$BUILDKITE_COMMAND_EXIT_STATUS" -eq 0 ]; then
-      echo "The step passed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..."
-      buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "passed"
-    else
-      echo "The step failed. Uploading $EXECUTE_MODEL:$EXECUTE_STAGE result..."
-      buildkite-agent meta-data set "$EXECUTE_MODEL:$EXECUTE_STAGE" "failed"
-    fi
-
-else
-    # If any condition is false, print a message and exit.
-    echo "One or more conditions were not met. Skipping execution."
-fi
\ No newline at end of file
diff --git a/.buildkite/scripts/check_results.sh b/.buildkite/scripts/check_results.sh
index f57edbf1f3..913930dc2e 100755
--- a/.buildkite/scripts/check_results.sh
+++ b/.buildkite/scripts/check_results.sh
@@ -21,7 +21,28 @@ for KEY in "$@"; do
     fi
 done
 
-if [ "${ANY_FAILED}" = "true" ] ; then
+# Check Test Result and upload to buildkite meta-data
+if [ -n "${EXECUTE_ENTITY:-}" ] && \
+   [ -n "${EXECUTE_STAGE:-}" ] && \
+   [[ "${BUILDKITE_STEP_KEY:-}" == "notifications_"* ]]; then
+
+    # If all conditions are true, execute the logic here.
+    echo "EXECUTE_ENTITY: $EXECUTE_ENTITY"
+    echo "EXECUTE_STAGE: $EXECUTE_STAGE"
+    echo "BUILDKITE_STEP_KEY: $BUILDKITE_STEP_KEY"
+
+    echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
+    
+    if [ "${ANY_FAILED}" = "true" ]; then
+      echo "The step failed. Uploading $EXECUTE_ENTITY:$EXECUTE_STAGE result..."
+      buildkite-agent meta-data set "$EXECUTE_ENTITY:$EXECUTE_STAGE" "failed"
+    else
+      echo "The step passed. Uploading $EXECUTE_ENTITY:$EXECUTE_STAGE result..."
+      buildkite-agent meta-data set "$EXECUTE_ENTITY:$EXECUTE_STAGE" "passed"
+    fi
+fi
+
+if [ "${ANY_FAILED}" = "true" ]; then
     cat <<- YAML | buildkite-agent pipeline upload
     steps:
     - label: "${FAILURE_LABEL}"

From 736df21c967d35977c63e88f47e58d2e09203a4d Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 12:11:18 +0800
Subject: [PATCH 18/38] Rename EXECUTE_MODEL to EXECUTE_ENTITY in the 8B pipeline and drop commented-out plugin stanzas

---
 .../models/meta-llama_Llama-3_1-8B-Instruct.yml     | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
index 9c83f909ae..496407d1ac 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -13,17 +13,12 @@ steps:
     agents:
       queue: tpu_v6e_queue
     env:
-      EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct"
+      EXECUTE_ENTITY: "meta-llama/Llama-3.1-8B-Instruct"
       EXECUTE_STAGE: "UnitTest"
     commands:
       - |
         .buildkite/scripts/check_results.sh \
           "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct
-    # plugins:
-    #   - buildkite-plugins/hooks-plugin#v1.1.0:
-    #     directory: ".buildkite"
-    # plugins:
-    #   - ".buildkite": ~
 
   - label: "Integration tests for meta-llama/Llama-3.1-8B-Instruct"
     key: "it_meta-llama_Llama-3_1-8B-Instruct"
@@ -38,7 +33,7 @@ steps:
     agents:
       queue: tpu_v6e_queue
     env:
-      EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct"
+      EXECUTE_ENTITY: "meta-llama/Llama-3.1-8B-Instruct"
       EXECUTE_STAGE: "IntTest"
     commands:
       - |
@@ -57,7 +52,7 @@ steps:
     agents:
       queue: tpu_v6e_queue
     env:
-      EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct"
+      EXECUTE_ENTITY: "meta-llama/Llama-3.1-8B-Instruct"
       EXECUTE_STAGE: "Benchmark"
     commands:
       - |
@@ -76,7 +71,7 @@ steps:
     agents:
       queue: tpu_v6e_queue
     env:
-      EXECUTE_MODEL: "meta-llama/Llama-3.1-8B-Instruct"
+      EXECUTE_ENTITY: "meta-llama/Llama-3.1-8B-Instruct"
       EXECUTE_STAGE: "StressTest"
     commands:
       - |

From e15f755c1b3120c8924ccd14f23260044cc6608c Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 12:17:13 +0800
Subject: [PATCH 19/38] Invoke check_results.sh through bash in the 8B pipeline

---
 .buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
index 496407d1ac..9aac0432d6 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-8B-Instruct.yml
@@ -17,7 +17,7 @@ steps:
       EXECUTE_STAGE: "UnitTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Unit tests for meta-llama/Llama-3.1-8B-Instruct" ut_meta-llama_Llama-3_1-8B-Instruct
 
   - label: "Integration tests for meta-llama/Llama-3.1-8B-Instruct"
@@ -37,7 +37,7 @@ steps:
       EXECUTE_STAGE: "IntTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Integration tests for meta-llama/Llama-3.1-8B-Instruct" it_meta-llama_Llama-3_1-8B-Instruct
 
   - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct"
@@ -56,7 +56,7 @@ steps:
       EXECUTE_STAGE: "Benchmark"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct" pb_meta-llama_Llama-3_1-8B-Instruct
 
   - label: "Stress tests for meta-llama/Llama-3.1-8B-Instruct"
@@ -75,5 +75,5 @@ steps:
       EXECUTE_STAGE: "StressTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Stress tests for meta-llama/Llama-3.1-8B-Instruct" st_meta-llama_Llama-3_1-8B-Instruct

From 1a9e6369bcdbcaa44073b9cd22811d5ca18cff5e Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 12:19:27 +0800
Subject: [PATCH 20/38] Remove BUILDKITE_COMMAND_EXIT_STATUS echo from check_results.sh

---
 .buildkite/scripts/check_results.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.buildkite/scripts/check_results.sh b/.buildkite/scripts/check_results.sh
index 913930dc2e..c961e53548 100755
--- a/.buildkite/scripts/check_results.sh
+++ b/.buildkite/scripts/check_results.sh
@@ -30,8 +30,6 @@ if [ -n "${EXECUTE_ENTITY:-}" ] && \
     echo "EXECUTE_ENTITY: $EXECUTE_ENTITY"
     echo "EXECUTE_STAGE: $EXECUTE_STAGE"
     echo "BUILDKITE_STEP_KEY: $BUILDKITE_STEP_KEY"
-
-    echo "Test exited with status: $BUILDKITE_COMMAND_EXIT_STATUS"
     
     if [ "${ANY_FAILED}" = "true" ]; then
       echo "The step failed. Uploading $EXECUTE_ENTITY:$EXECUTE_STAGE result..."

From 5f93dd238ecf494da4538c90b45b233100800e1b Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 14:58:41 +0800
Subject: [PATCH 21/38] Add support matrix export script and report steps to dynamic pipeline

---
 .buildkite/pipeline_dynamic.yml             |  48 +++++++-
 .buildkite/scripts/dynamic_bootstrap.sh     |   6 +-
 .buildkite/scripts/export_support_matrix.sh | 115 ++++++++++++++++++++
 3 files changed, 163 insertions(+), 6 deletions(-)
 create mode 100644 .buildkite/scripts/export_support_matrix.sh

diff --git a/.buildkite/pipeline_dynamic.yml b/.buildkite/pipeline_dynamic.yml
index dbc315abfa..139d6385ac 100644
--- a/.buildkite/pipeline_dynamic.yml
+++ b/.buildkite/pipeline_dynamic.yml
@@ -6,12 +6,54 @@ steps:
      agents:
        queue: tpu_v6e_queue
      commands:
-       - .buildkite/scripts/dynamic_bootstrap.sh
+       - bash .buildkite/scripts/dynamic_bootstrap.sh
 
    - wait: ~
 
-   - label: "Generate support matrices"
+   - label: "Export support matrix report"
+     key: export_support_matrix
      agents:
        queue: tpu_v6e_queue
      commands:
-       - echo "Generate support matrices..."
\ No newline at end of file
+       - echo "Generate support matrices..."
+       - bash .buildkite/scripts/export_support_matrix.sh
+
+   # Handle PR builds: print model matrices and feature matrices
+   - label: "Handle Report"
+     if: build.pull_request.id != null
+     depends_on: export_support_matrix
+     agents:
+       queue: tpu_v6e_queue
+     command: |
+       buildkite-agent artifact download "model_support_matrix.csv" .
+       buildkite-agent artifact download "feature_support_matrix.csv" .
+       echo "--- Model Support Matrix ---"
+       cat model_support_matrix.csv
+       echo "--- Feature Support Matrix ---"
+       cat feature_support_matrix.csv
+
+  #  # Release Tag build: commit CSVs
+  #  - label: "Commit CSVs on Release Tag"
+  #    if: build.tag =~ /^v\.?[0-9]+(\.[0-9]+)*$/
+  #    depends_on: "set-results"
+  #    command: |
+  #      echo "=== Release Tag build ==="
+  #      echo "BUILDKITE_TAG=$BUILDKITE_TAG"
+
+  #      # Checkout main branch and sync code
+  #      git fetch origin main
+  #      git checkout main
+  #      git reset --hard origin/main
+
+  #      # Create target folder tpu_dev/result
+  #      mkdir -p result
+
+  #      # Download all CSV artifacts
+  #      buildkite-agent artifact download "model_support_matrix.csv" .
+  #      buildkite-agent artifact download "feature_support_matrix.csv" .
+
+  #      # Stage and commit changes (skip CI to avoid infinite loop)
+  #      git add *.csv
+  #      git commit -m "[skip ci] Update CSVs for $BUILDKITE_TAG" || echo "No changes to commit"
+
+  #      git push origin main
\ No newline at end of file
diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh
index 2b76261d3d..c8c667d261 100644
--- a/.buildkite/scripts/dynamic_bootstrap.sh
+++ b/.buildkite/scripts/dynamic_bootstrap.sh
@@ -109,7 +109,7 @@ if [[ -n "$tpu_model_list_str" ]]; then
   echo "${tpu_model_list_str}" | buildkite-agent meta-data set "${MODEL_LIST_KEY}"
   echo "Testing: $(buildkite-agent meta-data get "MODEL_LIST_KEY")"
 else
-  echo "--- No Model Names found to upload."
+  echo "--- No tpu-support Models found to upload."
 fi
 
 if [[ -n "$vllm_model_list_str" ]]; then
@@ -117,7 +117,7 @@ if [[ -n "$vllm_model_list_str" ]]; then
   echo "${vllm_model_list_str}" | buildkite-agent meta-data set "${INFORMATIONAL_MODEL_LIST_KEY}"
   echo "Testing: $(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}")"
 else
-  echo "--- No Model Names found to upload."
+  echo "--- No vllm-native Models found to upload."
 fi
 
 if [[ -n "$popular_model_list_str" ]]; then
@@ -125,7 +125,7 @@ if [[ -n "$popular_model_list_str" ]]; then
   echo "${popular_model_list_str}" | buildkite-agent meta-data set "${POPURLAR_MODEL_LIST_KEY}"
   echo "Testing: $(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}")"
 else
-  echo "--- No Model Names found to upload."
+  echo "--- No popular Models found to upload."
 fi
 
 
diff --git a/.buildkite/scripts/export_support_matrix.sh b/.buildkite/scripts/export_support_matrix.sh
new file mode 100644
index 0000000000..ff6a408d36
--- /dev/null
+++ b/.buildkite/scripts/export_support_matrix.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+set -euo pipefail
+
+ANY_FAILED=false
+
+MODEL_LIST_KEY="tpu-model-list"
+INFORMATIONAL_MODEL_LIST_KEY="vllm-model-list"
+POPURLAR_MODEL_LIST_KEY="popular-model-list"
+
+FEATURE_LIST_METADATA_KEY="feature-list"
+
+# tpu_model_list="Qwen/Qwen2.5-0.5B-Instruct Qwen/Qwen2.5-2B-Instruct"
+# vllm_model_list="NousResearch/Nous-Hermes-1.4B NousResearch/Nous-Hermes-2.5B"
+# popular_model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.2-8B-Instruct"
+tpu_model_list=$(buildkite-agent meta-data get "${MODEL_LIST_KEY}")
+vllm_model_list=$(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}")
+popular_model_list=$(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}")
+
+feature_list="f1 f2"
+STAGES="UnitTest IntTest Benchmark StressTest"
+
+# Output CSV files
+output_model_support_matrix_file="model_support_matrix.csv"
+echo "Model,UnitTest,IntTest,Benchmark,StressTest" > "$output_model_support_matrix_file"
+
+output_feature_support_matrix_file="feature_support_matrix.csv"
+echo "Feature,UnitTest,IntTest,Benchmark,StressTest" > "$output_feature_support_matrix_file"
+
+# All stages must pass for TPU models
+check_tpu_model() {
+    local model="$1"
+    for stage in $STAGES; do
+        result=$(buildkite-agent meta-data get "${model}:${stage}" || echo "not_run")
+        if [[ "$result" != "passed" ]]; then
+            echo "TPU model $model failed at $stage ($result)"
+            ANY_FAILED=true
+        fi
+    done
+}
+
+# Only UnitTest and IntTest must pass for VLLM models
+check_vllm_model() {
+    local model="$1"
+    local required="UnitTest IntTest"
+    for stage in $required; do
+        result=$(buildkite-agent meta-data get "${model}:${stage}" || echo "not_run")
+        if [[ "$result" != "passed" ]]; then
+            echo "VLLM model $model failed at $stage ($result)"
+            ANY_FAILED=true
+        fi
+    done
+}
+
+process_models() {
+    local model_list="$1"
+    local mode="$2"   # tpu | vllm | popular
+    for model in $model_list; do
+        row="$model"
+        for stage in $STAGES; do
+            result=$(buildkite-agent meta-data get "${model}:${stage}" || echo "${model}:${stage} not_run")
+            row="$row,$result"
+        done
+        echo "$row" >> "$output_model_support_matrix_file"
+
+        # run checks
+        case $mode in
+            tpu) check_tpu_model "$model" ;;
+            vllm) check_vllm_model "$model" ;;
+            popular) ;;
+        esac
+    done
+}
+
+process_features() {
+    local feature_list="$1"
+    for feature in $feature_list; do
+        row="$feature"
+        for stage in $STAGES; do
+            result=$(buildkite-agent meta-data get "${feature}:${stage}" || echo "${feature}:${stage} not_run")
+            row="$row,$result"
+        done
+        echo "$row" >> "$output_feature_support_matrix_file"
+    done
+}
+
+echo "--- Checking TPU models Outcomes and Generating Reports ---"
+process_models "$tpu_model_list" tpu
+
+echo "--- Checking VLLM models Outcomes and Generating Reports ---"
+process_models "$vllm_model_list" vllm
+
+echo "--- Checking popular models Outcomes and Generating Reports ---"
+process_models "$popular_model_list" popular
+
+echo "--- Checking features Outcomes and Generating Reports ---"
+process_features "$feature_list"
+
+# Get commit hashes
+VLLM_COMMIT_HASH=$(buildkite-agent meta-data get 'VLLM_COMMIT_HASH' || echo "not_set")
+TPU_COMMONS_COMMIT_HASH=$(buildkite-agent meta-data get 'TPU_COMMONS_COMMIT_HASH' || echo "not_set")
+
+if [ "$ANY_FAILED" = true ]; then
+    echo "Some checks failed!"
+    echo "VLLM_COMMIT_HASH: $VLLM_COMMIT_HASH"
+    echo "TPU_COMMONS_COMMIT_HASH: $TPU_COMMONS_COMMIT_HASH"
+    exit 1
+else
+    echo "--- Uploading Commit Hash to Repo ---"
+    echo "Will commit to tpu_commons main"
+fi
+
+echo "--- Uploading CSV Reports as Buildkite Artifacts ---"
+buildkite-agent artifact upload "$output_model_support_matrix_file"
+buildkite-agent artifact upload "$output_feature_support_matrix_file"
+echo "Reports uploaded successfully."
\ No newline at end of file

From 2d53e701622eac54b3c9475b15afbafa98cb70d7 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 15:22:38 +0800
Subject: [PATCH 22/38] Use --default for meta-data lookups and fix MODEL_LIST_KEY expansion

---
 .buildkite/scripts/dynamic_bootstrap.sh     |  2 +-
 .buildkite/scripts/export_support_matrix.sh | 18 +++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh
index c8c667d261..5d3d3bc9f3 100644
--- a/.buildkite/scripts/dynamic_bootstrap.sh
+++ b/.buildkite/scripts/dynamic_bootstrap.sh
@@ -107,7 +107,7 @@ model_list_string=$(printf "%s\n" "${model_names[@]}")
 if [[ -n "$tpu_model_list_str" ]]; then
   echo "--- Uploading tpu_model_list_str to Meta-data:${MODEL_LIST_KEY}"
   echo "${tpu_model_list_str}" | buildkite-agent meta-data set "${MODEL_LIST_KEY}"
-  echo "Testing: $(buildkite-agent meta-data get "MODEL_LIST_KEY")"
+  echo "Testing: $(buildkite-agent meta-data get "${MODEL_LIST_KEY}")"
 else
   echo "--- No tpu-support Models found to upload."
 fi
diff --git a/.buildkite/scripts/export_support_matrix.sh b/.buildkite/scripts/export_support_matrix.sh
index ff6a408d36..6706ff8ab5 100644
--- a/.buildkite/scripts/export_support_matrix.sh
+++ b/.buildkite/scripts/export_support_matrix.sh
@@ -12,9 +12,9 @@ FEATURE_LIST_METADATA_KEY="feature-list"
 # tpu_model_list="Qwen/Qwen2.5-0.5B-Instruct Qwen/Qwen2.5-2B-Instruct"
 # vllm_model_list="NousResearch/Nous-Hermes-1.4B NousResearch/Nous-Hermes-2.5B"
 # popular_model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.2-8B-Instruct"
-tpu_model_list=$(buildkite-agent meta-data get "${MODEL_LIST_KEY}")
-vllm_model_list=$(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}")
-popular_model_list=$(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}")
+tpu_model_list=$(buildkite-agent meta-data get "${MODEL_LIST_KEY}" --default "")
+vllm_model_list=$(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}" --default "")
+popular_model_list=$(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}" --default "")
 
 feature_list="f1 f2"
 STAGES="UnitTest IntTest Benchmark StressTest"
@@ -30,7 +30,7 @@ echo "Feature,UnitTest,IntTest,Benchmark,StressTest" > "$output_feature_support_
 check_tpu_model() {
     local model="$1"
     for stage in $STAGES; do
-        result=$(buildkite-agent meta-data get "${model}:${stage}" || echo "not_run")
+        result=$(buildkite-agent meta-data get "${model}:${stage}" --default "not_run")
         if [[ "$result" != "passed" ]]; then
             echo "TPU model $model failed at $stage ($result)"
             ANY_FAILED=true
@@ -43,7 +43,7 @@ check_vllm_model() {
     local model="$1"
     local required="UnitTest IntTest"
     for stage in $required; do
-        result=$(buildkite-agent meta-data get "${model}:${stage}" || echo "not_run")
+        result=$(buildkite-agent meta-data get "${model}:${stage}" --default "not_run")
         if [[ "$result" != "passed" ]]; then
             echo "VLLM model $model failed at $stage ($result)"
             ANY_FAILED=true
@@ -57,7 +57,7 @@ process_models() {
     for model in $model_list; do
         row="$model"
         for stage in $STAGES; do
-            result=$(buildkite-agent meta-data get "${model}:${stage}" || echo "${model}:${stage} not_run")
+            result=$(buildkite-agent meta-data get "${model}:${stage}" --default "${model}:${stage} not_run")
             row="$row,$result"
         done
         echo "$row" >> "$output_model_support_matrix_file"
@@ -76,7 +76,7 @@ process_features() {
     for feature in $feature_list; do
         row="$feature"
         for stage in $STAGES; do
-            result=$(buildkite-agent meta-data get "${feature}:${stage}" || echo "${feature}:${stage} not_run")
+            result=$(buildkite-agent meta-data get "${feature}:${stage}" --default "${feature}:${stage} not_run")
             row="$row,$result"
         done
         echo "$row" >> "$output_feature_support_matrix_file"
@@ -96,8 +96,8 @@ echo "--- Checking features Outcomes and Generating Reports ---"
 process_features "$feature_list"
 
 # Get commit hashes
-VLLM_COMMIT_HASH=$(buildkite-agent meta-data get 'VLLM_COMMIT_HASH' || echo "not_set")
-TPU_COMMONS_COMMIT_HASH=$(buildkite-agent meta-data get 'TPU_COMMONS_COMMIT_HASH' || echo "not_set")
+VLLM_COMMIT_HASH=$(buildkite-agent meta-data get 'VLLM_COMMIT_HASH' --default "not_set")
+TPU_COMMONS_COMMIT_HASH=$(buildkite-agent meta-data get 'TPU_COMMONS_COMMIT_HASH' --default "not_set")
 
 if [ "$ANY_FAILED" = true ]; then
     echo "Some checks failed!"

From ea6d7c800e0c850bb1859d466ee7c13b6cda570d Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 15:32:10 +0800
Subject: [PATCH 23/38] Print model support matrix before uploading artifacts

---
 .buildkite/scripts/export_support_matrix.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.buildkite/scripts/export_support_matrix.sh b/.buildkite/scripts/export_support_matrix.sh
index 6706ff8ab5..5f2975ddee 100644
--- a/.buildkite/scripts/export_support_matrix.sh
+++ b/.buildkite/scripts/export_support_matrix.sh
@@ -109,6 +109,9 @@ else
     echo "Will commit to tpu_commons main"
 fi
 
+echo "--- Print Model Report Content ---"
+cat "$output_model_support_matrix_file"
+
 echo "--- Uploading CSV Reports as Buildkite Artifacts ---"
 buildkite-agent artifact upload "$output_model_support_matrix_file"
 buildkite-agent artifact upload "$output_feature_support_matrix_file"

From 99e502fc3828459011b85b1efcb9d24602cb8c3b Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 15:39:47 +0800
Subject: [PATCH 24/38] Add placeholder pipelines for Nous-Hermes-1.4B, Llama-3.1-70B-Instruct and Qwen2.5-2B-Instruct

---
 .../NousResearch_Nous-Hermes-1_4B.yml         | 79 +++++++++++++++++++
 .../meta-llama_Llama-3_1-70B-Instruct.yml     | 79 +++++++++++++++++++
 .../popular/Qwen_Qwen2_5-2B-Instruct.yml      | 79 +++++++++++++++++++
 3 files changed, 237 insertions(+)
 create mode 100644 .buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml
 create mode 100644 .buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml
 create mode 100644 .buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml

diff --git a/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml b/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml
new file mode 100644
index 0000000000..4e4890c79b
--- /dev/null
+++ b/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml
@@ -0,0 +1,79 @@
+# NousResearch/Nous-Hermes-1.4B
+agents:
+  queue: tpu_v6e_queue
+steps:
+  - label: "Unit tests for NousResearch/Nous-Hermes-1.4B"
+    key: "ut_NousResearch_Nous-Hermes-1_4B"
+    commands:
+      # - replace_with_test_commands  # TODO: Replaced to actual test commands
+      - echo "[DEBUG], unit testing for NousResearch/Nous-Hermes-1.4B"  # TODO: Replace to actual test commands
+  - label: "Notifications: Unit tests for NousResearch/Nous-Hermes-1.4B"
+    key: "notifications_ut_NousResearch_Nous-Hermes-1_4B"
+    depends_on: "ut_NousResearch_Nous-Hermes-1_4B"
+    agents:
+      queue: tpu_v6e_queue
+    env:
+      EXECUTE_ENTITY: "NousResearch/Nous-Hermes-1.4B"
+      EXECUTE_STAGE: "UnitTest"
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Unit tests for NousResearch/Nous-Hermes-1.4B" ut_NousResearch_Nous-Hermes-1_4B
+
+  - label: "Integration tests for NousResearch/Nous-Hermes-1.4B"
+    key: "it_NousResearch_Nous-Hermes-1_4B"
+    depends_on: "notifications_ut_NousResearch_Nous-Hermes-1_4B"
+    commands:
+      # TODO: expected_accuracy need parameterized
+      # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "NousResearch/Nous-Hermes-1.4B"
+      - echo "[DEBUG], integration testing for NousResearch/Nous-Hermes-1.4B"  # TODO: Replace to actual test commands
+  - label: "Notifications: Integration tests for NousResearch/Nous-Hermes-1.4B"
+    key: "notifications_it_NousResearch_Nous-Hermes-1_4B"
+    depends_on: "it_NousResearch_Nous-Hermes-1_4B"
+    agents:
+      queue: tpu_v6e_queue
+    env:
+      EXECUTE_ENTITY: "NousResearch/Nous-Hermes-1.4B"
+      EXECUTE_STAGE: "IntTest"
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Integration tests for NousResearch/Nous-Hermes-1.4B" it_NousResearch_Nous-Hermes-1_4B
+
+  - label: "Performance benchmarks for NousResearch/Nous-Hermes-1.4B"
+    key: "pb_NousResearch_Nous-Hermes-1_4B"
+    depends_on: "notifications_it_NousResearch_Nous-Hermes-1_4B"
+    commands:
+      # - replace_with_test_command  # TODO
+      - echo "[DEBUG], performance benchmarking for NousResearch/Nous-Hermes-1.4B"  # TODO: Replace to actual test commands
+  - label: "Notifications: Performance benchmarks for NousResearch/Nous-Hermes-1.4B"
+    key: "notifications_pb_NousResearch_Nous-Hermes-1_4B"
+    depends_on: "pb_NousResearch_Nous-Hermes-1_4B"
+    agents:
+      queue: tpu_v6e_queue
+    env:
+      EXECUTE_ENTITY: "NousResearch/Nous-Hermes-1.4B"
+      EXECUTE_STAGE: "Benchmark"
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Performance benchmarks for NousResearch/Nous-Hermes-1.4B" pb_NousResearch_Nous-Hermes-1_4B
+
+  - label: "Stress tests for NousResearch/Nous-Hermes-1.4B"
+    key: "st_NousResearch_Nous-Hermes-1_4B"
+    depends_on: "notifications_pb_NousResearch_Nous-Hermes-1_4B"
+    commands:
+      # - our_stress_tests_script NousResearch/Nous-Hermes-1.4B expected_throughput # TODO: expected_throughput need parameterized
+      - echo "[DEBUG], stress testing for NousResearch/Nous-Hermes-1.4B"  # TODO: Replace to actual test commands
+  - label: "Notifications: Stress tests for NousResearch/Nous-Hermes-1.4B"
+    key: "notifications_st_NousResearch_Nous-Hermes-1_4B"
+    depends_on: "st_NousResearch_Nous-Hermes-1_4B"
+    agents:
+      queue: tpu_v6e_queue
+    env:
+      EXECUTE_ENTITY: "NousResearch/Nous-Hermes-1.4B"
+      EXECUTE_STAGE: "StressTest"
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Stress tests for NousResearch/Nous-Hermes-1.4B" st_NousResearch_Nous-Hermes-1_4B
diff --git a/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml
new file mode 100644
index 0000000000..901933a724
--- /dev/null
+++ b/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml
@@ -0,0 +1,79 @@
+# meta-llama/Llama-3.1-70B-Instruct
+agents:
+  queue: tpu_v6e_8_queue
+steps:
+  - label: "Unit tests for meta-llama/Llama-3.1-70B-Instruct"
+    key: "ut_meta-llama_Llama-3_1-70B-Instruct"
+    commands:
+      # - replace_with_test_commands  # TODO: Replaced to actual test commands
+      - echo "[DEBUG], unit testing for meta-llama/Llama-3.1-70B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Unit tests for meta-llama/Llama-3.1-70B-Instruct"
+    key: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "ut_meta-llama_Llama-3_1-70B-Instruct"
+    agents:
+      queue: tpu_v6e_8_queue
+    env:
+      EXECUTE_ENTITY: "meta-llama/Llama-3.1-70B-Instruct"
+      EXECUTE_STAGE: "UnitTest"
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Unit tests for meta-llama/Llama-3.1-70B-Instruct" ut_meta-llama_Llama-3_1-70B-Instruct
+
+  - label: "Integration tests for meta-llama/Llama-3.1-70B-Instruct"
+    key: "it_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "notifications_ut_meta-llama_Llama-3_1-70B-Instruct"
+    commands:
+      # TODO: expected_accuracy need parameterized
+      # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-70B-Instruct"
+      - echo "[DEBUG], integration testing for meta-llama/Llama-3.1-70B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Integration tests for meta-llama/Llama-3.1-70B-Instruct"
+    key: "notifications_it_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "it_meta-llama_Llama-3_1-70B-Instruct"
+    agents:
+      queue: tpu_v6e_8_queue
+    env:
+      EXECUTE_ENTITY: "meta-llama/Llama-3.1-70B-Instruct"
+      EXECUTE_STAGE: "IntTest"
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Integration tests for meta-llama/Llama-3.1-70B-Instruct" it_meta-llama_Llama-3_1-70B-Instruct
+
+  - label: "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct"
+    key: "pb_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "notifications_it_meta-llama_Llama-3_1-70B-Instruct"
+    commands:
+      # - replace_with_test_command  # TODO
+      - echo "[DEBUG], performance benchmarking for meta-llama/Llama-3.1-70B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct"
+    key: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "pb_meta-llama_Llama-3_1-70B-Instruct"
+    agents:
+      queue: tpu_v6e_8_queue
+    env:
+      EXECUTE_ENTITY: "meta-llama/Llama-3.1-70B-Instruct"
+      EXECUTE_STAGE: "Benchmark"
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" pb_meta-llama_Llama-3_1-70B-Instruct
+
+  - label: "Stress tests for meta-llama/Llama-3.1-70B-Instruct"
+    key: "st_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "notifications_pb_meta-llama_Llama-3_1-70B-Instruct"
+    commands:
+      # - our_stress_tests_script meta-llama/Llama-3.1-70B-Instruct expected_throughput # TODO: expected_throughput need parameterized
+      - echo "[DEBUG], stress testing for meta-llama/Llama-3.1-70B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Stress tests for meta-llama/Llama-3.1-70B-Instruct"
+    key: "notifications_st_meta-llama_Llama-3_1-70B-Instruct"
+    depends_on: "st_meta-llama_Llama-3_1-70B-Instruct"
+    agents:
+      queue: tpu_v6e_8_queue
+    env:
+      EXECUTE_ENTITY: "meta-llama/Llama-3.1-70B-Instruct"
+      EXECUTE_STAGE: "StressTest"
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Stress tests for meta-llama/Llama-3.1-70B-Instruct" st_meta-llama_Llama-3_1-70B-Instruct
diff --git a/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml b/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml
new file mode 100644
index 0000000000..1531ca0551
--- /dev/null
+++ b/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml
@@ -0,0 +1,79 @@
+# Qwen/Qwen2.5-2B-Instruct
+agents:
+  queue: tpu_v6e_queue
+steps:
+  - label: "Unit tests for Qwen/Qwen2.5-2B-Instruct"
+    key: "ut_Qwen_Qwen2_5-2B-Instruct"
+    commands:
+      # - replace_with_test_commands  # TODO: Replaced to actual test commands
+      - echo "[DEBUG], unit testing for Qwen/Qwen2.5-2B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Unit tests for Qwen/Qwen2.5-2B-Instruct"
+    key: "notifications_ut_Qwen_Qwen2_5-2B-Instruct"
+    depends_on: "ut_Qwen_Qwen2_5-2B-Instruct"
+    agents:
+      queue: tpu_v6e_queue
+    env:
+      EXECUTE_ENTITY: "Qwen/Qwen2.5-2B-Instruct"
+      EXECUTE_STAGE: "UnitTest"
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Unit tests for Qwen/Qwen2.5-2B-Instruct" ut_Qwen_Qwen2_5-2B-Instruct
+
+  - label: "Integration tests for Qwen/Qwen2.5-2B-Instruct"
+    key: "it_Qwen_Qwen2_5-2B-Instruct"
+    depends_on: "notifications_ut_Qwen_Qwen2_5-2B-Instruct"
+    commands:
+      # TODO: expected_accuracy need parameterized
+      # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "Qwen/Qwen2.5-2B-Instruct"
+      - echo "[DEBUG], integration testing for Qwen/Qwen2.5-2B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Integration tests for Qwen/Qwen2.5-2B-Instruct"
+    key: "notifications_it_Qwen_Qwen2_5-2B-Instruct"
+    depends_on: "it_Qwen_Qwen2_5-2B-Instruct"
+    agents:
+      queue: tpu_v6e_queue
+    env:
+      EXECUTE_ENTITY: "Qwen/Qwen2.5-2B-Instruct"
+      EXECUTE_STAGE: "IntTest"
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Integration tests for Qwen/Qwen2.5-2B-Instruct" it_Qwen_Qwen2_5-2B-Instruct
+
+  - label: "Performance benchmarks for Qwen/Qwen2.5-2B-Instruct"
+    key: "pb_Qwen_Qwen2_5-2B-Instruct"
+    depends_on: "notifications_it_Qwen_Qwen2_5-2B-Instruct"
+    commands:
+      # - replace_with_test_command  # TODO
+      - echo "[DEBUG], performance benchmarking for Qwen/Qwen2.5-2B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Performance benchmarks for Qwen/Qwen2.5-2B-Instruct"
+    key: "notifications_pb_Qwen_Qwen2_5-2B-Instruct"
+    depends_on: "pb_Qwen_Qwen2_5-2B-Instruct"
+    agents:
+      queue: tpu_v6e_queue
+    env:
+      EXECUTE_ENTITY: "Qwen/Qwen2.5-2B-Instruct"
+      EXECUTE_STAGE: "Benchmark"
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Performance benchmarks for Qwen/Qwen2.5-2B-Instruct" pb_Qwen_Qwen2_5-2B-Instruct
+
+  - label: "Stress tests for Qwen/Qwen2.5-2B-Instruct"
+    key: "st_Qwen_Qwen2_5-2B-Instruct"
+    depends_on: "notifications_pb_Qwen_Qwen2_5-2B-Instruct"
+    commands:
+      # - our_stress_tests_script Qwen/Qwen2.5-2B-Instruct expected_throughput # TODO: expected_throughput need parameterized
+      - echo "[DEBUG], stress testing for Qwen/Qwen2.5-2B-Instruct"  # TODO: Replace to actual test commands
+  - label: "Notifications: Stress tests for Qwen/Qwen2.5-2B-Instruct"
+    key: "notifications_st_Qwen_Qwen2_5-2B-Instruct"
+    depends_on: "st_Qwen_Qwen2_5-2B-Instruct"
+    agents:
+      queue: tpu_v6e_queue
+    env:
+      EXECUTE_ENTITY: "Qwen/Qwen2.5-2B-Instruct"
+      EXECUTE_STAGE: "StressTest"
+    commands:
+      - |
+        .buildkite/scripts/check_results.sh \
+          "Stress tests for Qwen/Qwen2.5-2B-Instruct" st_Qwen_Qwen2_5-2B-Instruct

From 3005ea2c35f6e123e83056d171230d40bd0162bd Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 15:46:42 +0800
Subject: [PATCH 25/38] Invoke check_results.sh via bash in templates and model pipelines

---
 .buildkite/buildkite_ci_feature_template.yml              | 8 ++++----
 .buildkite/buildkite_ci_model_template.yml                | 8 ++++----
 .../informational/NousResearch_Nous-Hermes-1_4B.yml       | 8 ++++----
 .buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml   | 8 ++++----
 .buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml    | 8 ++++----
 5 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/.buildkite/buildkite_ci_feature_template.yml b/.buildkite/buildkite_ci_feature_template.yml
index e7286ccda4..cab1a9250e 100644
--- a/.buildkite/buildkite_ci_feature_template.yml
+++ b/.buildkite/buildkite_ci_feature_template.yml
@@ -17,7 +17,7 @@ steps:
       EXECUTE_STAGE: "UnitTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Unit tests for {FEATURE_NAME}" ut_{SAFE_FEATURE_NAME}
 
   - label: "Integration tests for {FEATURE_NAME}"
@@ -37,7 +37,7 @@ steps:
       EXECUTE_STAGE: "IntTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Integration tests for {FEATURE_NAME}" it_{SAFE_FEATURE_NAME}
 
   - label: "Performance benchmarks for {FEATURE_NAME}"
@@ -56,7 +56,7 @@ steps:
       EXECUTE_STAGE: "Benchmark"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Performance benchmarks for {FEATURE_NAME}" pb_{SAFE_FEATURE_NAME}
 
   - label: "Stress tests for {FEATURE_NAME}"
@@ -75,5 +75,5 @@ steps:
       EXECUTE_STAGE: "StressTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Stress tests for {FEATURE_NAME}" st_{SAFE_FEATURE_NAME}
diff --git a/.buildkite/buildkite_ci_model_template.yml b/.buildkite/buildkite_ci_model_template.yml
index 6c6c13910c..595ae98c05 100644
--- a/.buildkite/buildkite_ci_model_template.yml
+++ b/.buildkite/buildkite_ci_model_template.yml
@@ -17,7 +17,7 @@ steps:
       EXECUTE_STAGE: "UnitTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Unit tests for {MODEL_NAME}" ut_{SAFE_MODEL_NAME}
 
   - label: "Integration tests for {MODEL_NAME}"
@@ -37,7 +37,7 @@ steps:
       EXECUTE_STAGE: "IntTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Integration tests for {MODEL_NAME}" it_{SAFE_MODEL_NAME}
 
   - label: "Performance benchmarks for {MODEL_NAME}"
@@ -56,7 +56,7 @@ steps:
       EXECUTE_STAGE: "Benchmark"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Performance benchmarks for {MODEL_NAME}" pb_{SAFE_MODEL_NAME}
 
   - label: "Stress tests for {MODEL_NAME}"
@@ -75,5 +75,5 @@ steps:
       EXECUTE_STAGE: "StressTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Stress tests for {MODEL_NAME}" st_{SAFE_MODEL_NAME}
diff --git a/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml b/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml
index 4e4890c79b..ea71da11d4 100644
--- a/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml
+++ b/.buildkite/models/informational/NousResearch_Nous-Hermes-1_4B.yml
@@ -17,7 +17,7 @@ steps:
       EXECUTE_STAGE: "UnitTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Unit tests for NousResearch/Nous-Hermes-1.4B" ut_NousResearch_Nous-Hermes-1_4B
 
   - label: "Integration tests for NousResearch/Nous-Hermes-1.4B"
@@ -37,7 +37,7 @@ steps:
       EXECUTE_STAGE: "IntTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Integration tests for NousResearch/Nous-Hermes-1.4B" it_NousResearch_Nous-Hermes-1_4B
 
   - label: "Performance benchmarks for NousResearch/Nous-Hermes-1.4B"
@@ -56,7 +56,7 @@ steps:
       EXECUTE_STAGE: "Benchmark"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Performance benchmarks for NousResearch/Nous-Hermes-1.4B" pb_NousResearch_Nous-Hermes-1_4B
 
   - label: "Stress tests for NousResearch/Nous-Hermes-1.4B"
@@ -75,5 +75,5 @@ steps:
       EXECUTE_STAGE: "StressTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Stress tests for NousResearch/Nous-Hermes-1.4B" st_NousResearch_Nous-Hermes-1_4B
diff --git a/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml b/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml
index 901933a724..818165e019 100644
--- a/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml
+++ b/.buildkite/models/meta-llama_Llama-3_1-70B-Instruct.yml
@@ -17,7 +17,7 @@ steps:
       EXECUTE_STAGE: "UnitTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Unit tests for meta-llama/Llama-3.1-70B-Instruct" ut_meta-llama_Llama-3_1-70B-Instruct
 
   - label: "Integration tests for meta-llama/Llama-3.1-70B-Instruct"
@@ -37,7 +37,7 @@ steps:
       EXECUTE_STAGE: "IntTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Integration tests for meta-llama/Llama-3.1-70B-Instruct" it_meta-llama_Llama-3_1-70B-Instruct
 
   - label: "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct"
@@ -56,7 +56,7 @@ steps:
       EXECUTE_STAGE: "Benchmark"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Performance benchmarks for meta-llama/Llama-3.1-70B-Instruct" pb_meta-llama_Llama-3_1-70B-Instruct
 
   - label: "Stress tests for meta-llama/Llama-3.1-70B-Instruct"
@@ -75,5 +75,5 @@ steps:
       EXECUTE_STAGE: "StressTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Stress tests for meta-llama/Llama-3.1-70B-Instruct" st_meta-llama_Llama-3_1-70B-Instruct
diff --git a/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml b/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml
index 1531ca0551..d9191a3704 100644
--- a/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml
+++ b/.buildkite/models/popular/Qwen_Qwen2_5-2B-Instruct.yml
@@ -17,7 +17,7 @@ steps:
       EXECUTE_STAGE: "UnitTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Unit tests for Qwen/Qwen2.5-2B-Instruct" ut_Qwen_Qwen2_5-2B-Instruct
 
   - label: "Integration tests for Qwen/Qwen2.5-2B-Instruct"
@@ -37,7 +37,7 @@ steps:
       EXECUTE_STAGE: "IntTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Integration tests for Qwen/Qwen2.5-2B-Instruct" it_Qwen_Qwen2_5-2B-Instruct
 
   - label: "Performance benchmarks for Qwen/Qwen2.5-2B-Instruct"
@@ -56,7 +56,7 @@ steps:
       EXECUTE_STAGE: "Benchmark"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Performance benchmarks for Qwen/Qwen2.5-2B-Instruct" pb_Qwen_Qwen2_5-2B-Instruct
 
   - label: "Stress tests for Qwen/Qwen2.5-2B-Instruct"
@@ -75,5 +75,5 @@ steps:
       EXECUTE_STAGE: "StressTest"
     commands:
       - |
-        .buildkite/scripts/check_results.sh \
+        bash .buildkite/scripts/check_results.sh \
           "Stress tests for Qwen/Qwen2.5-2B-Instruct" st_Qwen_Qwen2_5-2B-Instruct

From 6a6af3ef623f889f8ad7762e53e9dbc5392c9d13 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 16:05:12 +0800
Subject: [PATCH 26/38] test

---
 .buildkite/scripts/export_support_matrix.sh | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/.buildkite/scripts/export_support_matrix.sh b/.buildkite/scripts/export_support_matrix.sh
index 5f2975ddee..0dc78eedd2 100644
--- a/.buildkite/scripts/export_support_matrix.sh
+++ b/.buildkite/scripts/export_support_matrix.sh
@@ -9,13 +9,14 @@ POPURLAR_MODEL_LIST_KEY="popular-model-list"
 
 FEATURE_LIST_METADATA_KEY="feature-list"
 
-# tpu_model_list="Qwen/Qwen2.5-0.5B-Instruct Qwen/Qwen2.5-2B-Instruct"
-# vllm_model_list="NousResearch/Nous-Hermes-1.4B NousResearch/Nous-Hermes-2.5B"
-# popular_model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.2-8B-Instruct"
 tpu_model_list=$(buildkite-agent meta-data get "${MODEL_LIST_KEY}" --default "")
 vllm_model_list=$(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}" --default "")
 popular_model_list=$(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}" --default "")
 
+echo "tl: $tpu_model_list"
+echo "vl: $vllm_model_list"
+echo "pl: $popular_model_list"
+
 feature_list="f1 f2"
 STAGES="UnitTest IntTest Benchmark StressTest"
 
@@ -106,12 +107,15 @@ if [ "$ANY_FAILED" = true ]; then
     exit 1
 else
     echo "--- Uploading Commit Hash to Repo ---"
-    echo "Will commit to tpu_commons main"
+    # TODO: Will commit hash value to tpu_commons main
 fi
 
 echo "--- Print Model Report Content ---"
 cat "$output_model_support_matrix_file"
 
+echo "--- Print Feature Report Content ---"
+cat "$output_feature_support_matrix_file"
+
 echo "--- Uploading CSV Reports as Buildkite Artifacts ---"
 buildkite-agent artifact upload "$output_model_support_matrix_file"
 buildkite-agent artifact upload "$output_feature_support_matrix_file"

From 689d12b377c92411ca194abc24e1408283e41980 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 16:23:21 +0800
Subject: [PATCH 27/38] test

---
 .buildkite/scripts/dynamic_bootstrap.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh
index 5d3d3bc9f3..f981625464 100644
--- a/.buildkite/scripts/dynamic_bootstrap.sh
+++ b/.buildkite/scripts/dynamic_bootstrap.sh
@@ -4,7 +4,7 @@ echo "--- Starting Special Buildkite Bootstrap ---"
 
 # for loop features and models upload to buildkite
 BUILDKITE_DIR=".buildkite"
-TARGET_FOLDERS="models features models/informational"
+TARGET_FOLDERS="models models/informational models/popular features"
 
 MODEL_LIST_KEY="tpu-model-list"
 INFORMATIONAL_MODEL_LIST_KEY="vllm-model-list"

From 281d1a8600fd1dbb7e1e3fbdfb72ac431ae3141c Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 16:31:02 +0800
Subject: [PATCH 28/38] test

---
 .buildkite/scripts/dynamic_bootstrap.sh     | 1 -
 .buildkite/scripts/export_support_matrix.sh | 6 +-----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/.buildkite/scripts/dynamic_bootstrap.sh b/.buildkite/scripts/dynamic_bootstrap.sh
index f981625464..16a32bd0c3 100644
--- a/.buildkite/scripts/dynamic_bootstrap.sh
+++ b/.buildkite/scripts/dynamic_bootstrap.sh
@@ -128,7 +128,6 @@ else
   echo "--- No popular Models found to upload."
 fi
 
-
 # --- Upload Dynamic Pipeline ---
 
 if [[ -n "$pipeline_steps" ]]; then
diff --git a/.buildkite/scripts/export_support_matrix.sh b/.buildkite/scripts/export_support_matrix.sh
index 0dc78eedd2..c6de583f45 100644
--- a/.buildkite/scripts/export_support_matrix.sh
+++ b/.buildkite/scripts/export_support_matrix.sh
@@ -13,11 +13,7 @@ tpu_model_list=$(buildkite-agent meta-data get "${MODEL_LIST_KEY}" --default "")
 vllm_model_list=$(buildkite-agent meta-data get "${INFORMATIONAL_MODEL_LIST_KEY}" --default "")
 popular_model_list=$(buildkite-agent meta-data get "${POPURLAR_MODEL_LIST_KEY}" --default "")
 
-echo "tl: $tpu_model_list"
-echo "vl: $vllm_model_list"
-echo "pl: $popular_model_list"
-
-feature_list="f1 f2"
+feature_list=$(buildkite-agent meta-data get "${FEATURE_LIST_METADATA_KEY}" --default "")
 STAGES="UnitTest IntTest Benchmark StressTest"
 
 # Output CSV files

From 43352a810178d9d60d9c296ecbf0831340493d5a Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Wed, 24 Sep 2025 16:32:19 +0800
Subject: [PATCH 29/38] remove gz

---
 buildkite-script-dynamic.gz | Bin 10583 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 buildkite-script-dynamic.gz

diff --git a/buildkite-script-dynamic.gz b/buildkite-script-dynamic.gz
deleted file mode 100644
index 9062e95d9fc13a790f058d1a16c52a94da6bff10..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 10583
zcmV-dDX7*TiwFR*VA5y+1MPilTjNI3Xg>Q_bi|wh6D%xWfsFGk1D*+QU|_*yHz&yv
z!j`duEjhZtuw>rnd^!JMKfb?if61xpZb_}1Ex?#d)X(k&ORetes;=tls_v>I?eh=1
z*4OQI`P?%`{Y%5uznXKYR4T1TgM8JZg?v?Qv>MeK!RM^NOQlh1*6a0Vt5zo!rKL1m
zUy;xA1|+R`j;q-K%8hPLbR)ao#5Ts7eRE-3-X!fk8wm;i`$CG+AfC~0D^*3Q)HX}C
z3K_e`SZ^z>#zs@AZZ?{7wOZM%Ha9k#rKbkAD043%*RMZXA3p!r>&W-5B>w}yMESo_
zZ8g6ljYoq>D1UMO7wCT!ECs<cwP&bftFMn#S07JCnybq<<I!FF0DJ&#^T_{3tx|81
zO0`jMDuw)CM42xC8<p0kTx)JL8ygj+UdaE~R|Ni_GuMa3=l^v1uPF6uHOBw7h63`x
zGQVMFm%ljwFOl1W!`;rl`f2;1^QEL+=%(vzl?eI6(>;BQ-2VNva|$mUSD(=5Wiryv
z_0blQPEErluI{)F8CZ6t9SJHDa;e`yHP2OVPqsgF)WrLqH6CkbpEiqS2^zO`m~cH^
zy*AvTijAv&EqvcS+&$bP$E;4DxE9fRuBVMC7(siyxiZmvLyJhi>~`Lte*A5Xcs{^}
zd65J;(f*X?LnM5%Tw`GLG}o|9XN%;-i79@iW@D(YPju)}walrA45@}CM!7Uhm&u>}
zFG2(f;Ph-`;yUZSq29YxZQb!kt|L1`@@r6=Os6|Z1Co9`8F?3mDL^o^tV?I<O4$<H
zo>-1s=D=<VFk)fEESJmVNT@}dA+Bv)T<A7T33xB1eAYz*>YrhdeS%Yq0N1-75bVwS
z)4l!OANRT)b?5NlVEfaq+WF^RS3T)&cTZ2EO&J673z24gLCX3cq(XlCj<`eJjC>ye
zGt>$46Paj^qxWTUIvH77A7>@Qxh%_4?8|^iWlC%^)?KaK*IbP_x=SRH2wO-X9I!+r
z)<X;<BS)VO*FZByj|&&;A==o$C`DLn&vf+*oAO8wmNN1&QC`kV$|fco^v@H$2eU}k
zdOgq9dN;(>VNOjnTZ5tNw(&2$FNm`g=CVCgHB7Z{^)7WAnd@AGf7Y)p`_h?!*jaZc
z9u;~3;5r@mx?Y9VSvPw_Sb;8q9Q1Jj0k%<gi9*WbD8i*IL1@G<`*`y3ZZwP%CnrU-
zt1VV6<`^nTgSC*38B*wxq$AMjISR)RtUR*uNVj2j0f~Vv1IaL*xhCiL5Rlo-btmP`
zmnB&OX_7Xa$CFC;z+MzBIdPd?ZLy*$?|4ZXz=eFu5Co4TpWgcfdTtU*1T2q?I&q;}
zC&#ky+(&iG=B+fzWO=sgP<W_Za#Y4R5s*UN&K<%o|6(}t+@#(oyvx-567tOnQSo?E
zEjPZSh|G!o>}rdZWz$DV(!eYvScaf@Bniey^c<C}2xJ*D?5Qnsi!G`CpNTQiM~11Z
zu4VUz4>SILmD~T*s8uR8h1!3stw!PhSxA}U|7j|fdUdlQ*OcmJrM0nH`-`#0U!n;7
zKWDBF%Pjv-wN{JqzXD$r{+~sZz^}7}s>{O)ZKc~e>5`Lf=ja4<tQ~DM!o~aA)+VsW
zUn8D_KPNZt&@yXiu!KKhHJ)4lf_B_JIwe%?s%%khQAHzzJh|ZuK2?R$)Joq~*h9w}
zxGL%e@XGeMjuO>xeFYaky=v+z{mI_N_1v8H-vl;D*&*dC(Ik8|8jZOv_)0+o@wzyo
zQpUg)K5&3*VvMM|pEZ8v#f(1~d~{S<IH+<QoNJEJ%Nl+4#f)C0zCWa_q1X7(13$s!
zhOAsEtEnicy%-b#EcVUBG61A?|EV~}tu6XwG*<O1Z4|swuM|}`VGF2d;j7N^^@pdK
zuJJEAv&4o|@m7B^cnjuD*7&mU(m?BnHlB=hXWbeMsES~i1K94G%v+U(o5nN1O&k|9
zN0NnwCZ8P$B0C%deVWi&#^9UJVDNX6DfkZdep!Re!b^+e#i4l3L`UnTp(8e^tl@9?
z)54lu;y;sFh}d|Mh)5e=7D_fbO2*y@wvtv~8)e~Q^F`t!GOkSBE6_SY1=gHQz3YW4
zp8lXRhwtm24h*N~s+I@ex+)AvxA8n-7CK&z9t!A4)kHD}oQW!5MIBrEtl9D?it3{(
z8<9a}q5xD?Tf4Z>`%oAE)m#?_gZc?Jl8KIIuCyi(@=PCOX^z|QPlw&T4|_Y?-Mzz4
zbUXf7TkM|{MnlmV#i=Fuk3!tWx7?9uO3Jfp%ClO^vwF(2M#{5h%ClC=vyGHzn`tno
z4kT?PO4?BTNqqPDN+f!6ERP3mS)i0dK2XQUvOo#}90BnHz6CT3*cFf};8H-LfH@Ii
zBA!I_h**&dYx5#Xs{hAdwbX&Gxt^`7K_!q<ckd(r{hwM3Ug7;8rBU4fTu7OE|EIE9
zZMEcTt5)CKsBg9kj(mMZ;Qu*ueORXZf8bqJiSd7}s<aCK&mxL3o>;a^H2Y$r*^XWs
z*cK?XnmZgB=Y+pJf}f?*64})UhN<JZ65?MBBLh$+Ol_<aJPX4%J*02gde61&8=^U&
zu9$#flONX6HRp!F-Z@Ye<x;nEaJ0YO?WiC2_B(h?gS0`j5$~e;^HIEwCY26PL8<Ag
zyL-p*0fM+9@rsVLS}OJR0SRX`j4Jqi#cvPc-omq4YreV#f3K4A_n^$0I#b0Yi7B%m
zc~cJ2!V@P10HGKl6m4377MbpJ$hLhEYIH>GmaJe`TLcfhM4ku7$>iWW3Y+-B-|2%`
zO&?I0r>46ypa+#sU~1C?XRa0Cf|B6_>yj+rik*L1rpK378SqPF2h~VCb(Eku9%TYe
zlJ)GrQE}S%v`A2OmDYvEOv{aaA{&m1jjpVQxbjEQv2DwSQAavozD=|apkuoO%QO35
zORGUUwhp5+{Q-4M2S<&=NlaBfBC<yzTq8YO#|{mb4Kkc<^7s<ITA}q->$2U2E$3<k
z^@s7cmv6(*0aC*OxU~L_9HROdep9%BU2s=OtQ4$sblW!}p6FRJ3$|G+5tOcxWqWyz
z=w{Et!&B{L&mELEmRFG>2jYP@fqH(=Z8DG%yp`1$HKZsNunuv&UQc(Nfj1h_;b3k3
zb5EbRq(lFMw;`ZYJ%hBg6*@!3wztTw{w0%&{t?OL^<^q`Bw1p3&`bj4*KMa~*=(L)
zX(OW_$S8+a54d;PL3;v2s-q3Ub&oP)JHnSTKia+`txIb}Ql-`KiOiq)@SS+)j%&D{
z%Vrv_HMB+u7=x2OwXG|oulG^%dSl%BrBK)B*}g!WIK5<gvfEvW5cNZ6yL)=vp{L&3
z;yBhK)#yRVNZkY;q94#hooy<R!WW5oa^y=Um>wJzI1SKYJA%itdGe!v!=}QNX|hGO
zMLZ(iaEg#?_$J_cVwhQ|m69-yU?~E2qaqHc%GhuKjquwc=em2X>n6FeJUb@$a00Pi
zwwyUXg=LaUnaCnZMINt8#lQa&!~$&S0d9+pBQhP?1Sh!k0pfu04qZI48E$F}+JTuX
z-6jb6g{_*x2V7hW>*#vCEq_S=unj(&gDDwEDp*_GVGaQ$^a|qXI;VIV$k*8SN`e*Q
z6dg$a`Jev>yhr8H*JQ^s2gU^>S_re|Zh<fsAo;R<AEAXHq0kb@>x(R5A&|-vc}waQ
z@-2~mHGQ3AtQn@i()nxl0X!oE&+M_wV-}&hm~ZJRqn|RI%ybgjo+*;UiYo#5y$!_m
z^*VW*e||*Ygwmp~v*`>Sw<TZ6fr<7$&~bM5kkkP(<C+fZFc5y*eSaNGgIy-?Eu6<T
zE+2n*ut4@@0bp6{_f@}g2qWk+d?uH1Qc9udac?Gi8x=T#6<M=%Sq>i$_0gp5V<#xc
zew1bSYRh{;3V48jE>8jpu%gQsGR$4BJ(ahYS7j1y5wwS)cE#2vm?+Egvfql|HWjL%
zRsd&=Mj&eZOH=$@__7*&5l&l*c)ARQIFWqB=RqHp*!Ra<98c_^HPcHuu6*1?43-QB
z{-X{4a5Xw-0AL4ZDjx#<4-UfN`bQ2OzKzx8$g0JFV=w5R&>85SDt)V}qz!uvRYhJ@
zRf#bpW6Zyrjbpm~kDUQ{s2$+$_P=_wS}*K>izxZ*f14Gh(JJhJudt-q|K`K>VVP<F
z3*x`|{9kQi{IA0P_xR-{*#DBTs8X$ewAw7&A8TvDmcOu5(TpcNU5wR_b_n~Qu;aqw
zS6KZ1$`(I?1=|8YzA$TFc*C_Fjv>X;C)A6LzOeE=nUznNg7>oV3BtUv@D&!m7i;08
z^Z1eNdrSeEVcnzg14biltgY|kzvY^&lsDd&_e^%Q_bx_1)?s1tn~TXWlyuClCoEru
zx$nMZdi;m}4It3yJ@790UF?6&Ruj+v)GF1&{<nxS!~R#RSLJ%Gve~NEHk7|Sv%JER
z7XM*BTpyMh_P<K2uEh9XsnrYn-$Kel<3G%&8>&hOVaRESD6-2YEji+|nBH~Z`rw@J
z0&)IOJ=(ME4EepLYt4Y&$A8CeIRz@W<4veJ6*i!@0ehG^0LP+cW%hv4vS!dfb^Z6x
zQO;?ORdVNlY5?CW!2gw2t5&T4izqYlf7B`sxn9}agqO9#|Mv=u!2dqi&pi=TC3pTO
z&i_hFLH)nT|Md7}`u^{C?eiS8z}fr1dYJ#Kh4Q~zuNCrtAtfgNtDCX>e;W<Cva!)>
z)oaD}?=_aR{oj1JJ}fi#f30S{6660$qf#mMe~TzDaQ_$GUUjM+a<GpP?uD4p|F!*3
z!jFTn%@PY7@zjz2?vd$bJfcDVf<0!=VKdKy80jm4h-sl#bn_}0+TTyNKXv#1H;p8L
zF(07P<7}&+1YM?noi$UY1zQmj^r`a??pojNoa}vkg-H6AT<z~4WZ`LA6c!Or3`cth
zNBe5`=c5jEhyi0>Dw-xyG*MS!JqtzCq6FrjN`#!Abkv`koeh;9(YVV*+jw#uCXq-3
zX%QMp%W!WfrV^NYtQdkqZ2vq%7gG>0Jd}H6IV%QG`SWZvg`vb$ih(c?A;#(a=9_4j
z_rm>HrN06G<rM5Y6TRnQ{M0L*^mTSlBb^4170U2=&_KY0>f-@O5Jwrx!61QTlEWV>
zzTrb~bYIj&TPc@~3)8ao_SA2ne-rhNdwdF%xZ`KS-(whh{Og`|Rb~fTvRaU#if#j4
z>>jI6gtxo}Tc@^kSGULIvEk6b)$9$@w@l9+wws$%rwP2^h)GPC$BF~_6cgqF#OI%8
zMMh(<)vKx>BW(_STwbX?tvA$FZ_>w8CSl_*G637%qOZjxuf?OT#Url8qpigwt;M6P
z#Urf6qpQUts|lGXS>P{v@Jg+zin{S0!_elc+Nub3ACGyaCMIIUK=v1dtVzTVu*QR{
z@xW?4s2UHb#)GNxKx#aQ8pG6M@BdTnzf6mrTOW7u|3Rst{ij~!e_2eK?*D7nt1Wr6
zxw+AT3Y&!i`1KWm{~6cM-TQYZ{~zOjrP*j!3jg0CN~-*y8GY++ea!a%h37x2)hfvU
zCd`;Z{x71;IRBw+H08Qdt<`JQ&3YmKUtbaUf6iPVmg)XKMQJpeG5)Vt8uh~ew}`St
zZp8p#UrOQjDVE7K@DvJ1FGE8I&BPjMNz6Fw`B_<0YTPN2xRc0~+n<sumu>1|xp)KX
zOpu8;3e;S3@#e-%G#BqQJVmncPOG@se7w^~O40$P4HOx9$zx{ZjUapr$wW^?KZA_C
znULklD!vz*fpRgJk(Z_Nb7E3r#d3uENwu;wgfBgw%Z-Z5AgvdANnE0!NmoikQi^~y
zVi<l*IeAxVMv0W0w$myuR$S&9FiD5EkeV5?>CvPnQ0sXL%@D9WG82z<VB823m^6?T
zvsi6%elNi}jn(d?>iO~|OH57D2KAV->RwokA}2Rar&V06z{)>rk`8nswK9amqe-pz
zKAE1M+!8^{Be_mo=-A1n8aVe;^rmrIn!K_+8*{t?R8(3yDr2J<V4*JP4sFZ37-AHW
zgq|q8%hVJTa?c4-@|e=@4w#Q3G&dclRa~r$n>ALF4r(FUGQ`KDNj65P=c$rKV9Stg
zPi^f>mQ?@Gac6t?pd*j_^XVg}|EE!_RBB4S#qNI;{{MxPDgK|PQmI!r8*&X^Hk-|A
zaRKZV7J>ig%=KZJasR7QZ?>ZRuQVI=M&bWiL|F>sz?Div*PS?9>!Bx525kgwj%99h
zER}E_{b;y9c<1C=Kc`BH;qdTrp3S^5Tx+3m!@?+Y{`EqchGpw{wvB5lU6cLKnOTfS
z9P{gMl}cx4XKC)It)L2D&4b_{Q<TW$LBK#S%gm>GZdop<-P!~+M~Oo7)MT4mNnQW>
zxaZYJ{_}s$N>!ojf3w~!*8fG6>E}Q2-nU$@*EU<V+D7B`%=)7EB}LlJy^s9o|2Y3E
zO{ia-|64?vvj0>4IGhhv3G<)-3-Uj}ABFs1M9G!^1>SDd8%6x5S6CAJzqxUJSZ3`1
zTCnkp@qY#WD)xViD8l|P44wGVk2#L}J=os$tsV_9jp5L-g5yWv$4KndVD8cDkw^6Y
zAP=4IP_KjMT4UJO^|aCGCd@`6#vWha*$wu3v7I0W!FK3=(b0oeM>@N@8AR@;9r5HH
zm~F;zSk!v%Ao1H~fSzM?avmcC&Fp^AGr3e+S|W$g^p&AsvwbcnIk~X35#N<M=u5HS
zTZ8tBz8Gyh@)H42AovzGx$(tRBPPHG`&0{~#G4*P3i}8EMR4>!i#z}G_Q5_IK2Mv|
z8-}4OFvK>8dkxbUke+7Jc<~x>49vdZZ)qbLU^Mg%n0?LeqjxDpIScEL1GaS<x81$A
zXmso_^gQgLONva$+qVa_|Kvn@`<ASnO-Ueiwz@@jD2XtQPNZ)-K%ZC|y_SyNBmki|
z#5yBgvrT+QTa`;y3W%S<dKM^Yx$N@;gDB@M0E%d2dw&|@nJjXC6QncI0O>YNq={uF
z)0+T9c!2tBb&a2Qpc7L_b#bNHhIWqRrd}MHlOjF9{u;|pp<U5VhB|j&U^W8=Z!mQ*
zsJw>Z|BVZi#*~j|z+rQKH%MA@LgU-RSYbA>5bxMNAMhFjip@!9XnCVPhO*aeTQe^(
z@dD1&(a5@H3>)vvfUj20_!XlwI)<|#;NaP65E`HHaX%(FSVp275JWT)2+hPJGrN#n
z`p>`r=Rg1czwkdYWdaJdX5^MiYcFjUWwn@gavtjbo7WAjtQ7hZyVQ$x_`$kHem!+`
z{JBJwUuAz%v-uU}y;A9v6KE>maq<ZwQp0=?;iSiuETL!tq`rN-!wq6@-%=E#RjkCW
zhEKyV5py_@cVV)b?+MRFK;BUYE_=v^|Msn)w!)#^0c(>pv7s}v;^!&gd9YFwI-Kj2
zYtgZ{$6nEDm`MaAnZu+CNb%lKv$dX!$$3V`CEyO`B4FT5K3iimEImcTnR@1@Xh=);
zf=CSIJvC`w>NhmU4;>C2{q&K9pNp9muBlMQ_rF``4g(GaJx{%G7BOa?ty1cBK5MuJ
zJcxR;a$p1$japMVNm({53IJ6f{MCqpFRVvYx}u;OSv{V-h4>;Ogh1sGs1XRLIF$wd
zSt8Xq!^GE!G?qzSAM!IzxH#lvo3!poG`%r-C;FA|!8~GP(<hyf?>J<jhfdnulQ0i1
zkc8#!N1SfK8pL`^Oe?ZSb0BIajj^EPF+efH*EeJbHW1dB2U%l!V_ZexCl%SG92l^q
z<Fd{r4Ca9fbe=@^uy~Rht`YytmA)SDoX_gWI`lTO0-f=d8>jRTS8TSn`zyXr(J%o`
zsI^k=S_+iEASYY|5i4gimesS>V2a*`Yr~f_;UZ6vE=~UoG8%!SYLcJm3qX%94X9`s
zOclSGa81F()LMYGp$BuIjLI3RRWNvc)aNpv%I7ocL^|WM{Wg{#4Z+$5CCdbD9F=9*
z7C3@yN=)mMzQgG=)zDY<(v}^n0?B8=0uzpgj)7(?V=8d^FkJsc%NeXSthu!H39zQK
zB#1OMp5fm3V`itRQVEcbdL|LV`QXdfTr(m~EfVKM<S@;hXGKKjMTi=2cM4g>6zva1
z(o^s2^Oh9*e`p8x6CH^D&wKQd-~QXIRH}8n|JiC4=YJMbX88Y_)rzcCtDBp(YNd$(
z@e0e7^d=AF`moHp|6QxpWBgyMREzUJ3n@!qub&&{I#tk1;Zfuuhx7^GIESkKdw*K6
zGs&eTs;Z1EYZ4e!sWQTitSm_m-C_g>jlz@0yL-oNDV&2+sk?prvC~yQ9PaOSj!)V=
zsdv~i-yRw!ON0xnw>APGnCiB_chXgV?EKu8@XQj^6h{U|wA}mj;qdr?24maaSL3xY
z4sE3R(c#hQ@&5L4qUOY!cq7e@R4(}{2Ca6`>2B|Cceh!6UoYV``+9GLCf2eh<b8F-
z$3YNwdftg;U1u*H9j^zt%;a_>*oCoeu?gHHvDV{naGV*h_`ovW`pO%9E>08JgrIG9
zi@dpwQtk_k79ku^ZIET61-1N5wAOc|Z-Ju4Dz)E4=EeFO;oDLoJ2`vFlX#cdA@*O$
z*QDGh68};nzlq1y8QA||*5obnfi<wqD+}qwWpZ+9OePoy)V~K}0yEI#V(fy-5}N7g
zR!%gvvwbCjjR4Ex3&WRhG@dBlp`Vp$;@Aqy@QR**vW*ZgFH_6AMzd0rYwZS`gKb)W
z6lM<Qrr{Jz)(YF!;00j^GiY89ahJRI;iMhw1AwB5e>vL4y9cy0WOzTZJ(AR~kHp04
zLe}zuVLL7ftRWY=>jpM@sz8_6k#`#d^+B!nn-%T)k}Q8ZZnu?NiUsnf@*Taq@MRfD
zCUlmbj$Y_7Fzey-<t4IAsX&kDYxX6LGw)9bm<s_663qj;M5MRTi9rNrY`?h;KUn&8
ziF_%E*9h1a_!Cq_{0O<1Iff8wK5bcRE12w=Zp0*UKKo<D`tQYb(0p3+y(Lez8<;s+
z-d!6_>WuAK8$C>N9c>?&Ex3HZg{Y9qn#6>}x9t@vIvla?ci+XEipgKGa!khDxn(Yh
zBGr<@DSxy6$`Pp`%Cp=}y%nFwgvlo>G~!^RB$Rz1VQvM=d|8&s9tJH9*=0G>{c_|U
zO3=UVXuVP?F<+f5@P(T&F<84%W`fPHz<g+bKtae2PySW4fWa8q$%_7Y3pp>e#soQj
znS|uQPJ^_3DPHCiG+ywKrcnEXzn-x)@3Jq5)<hegFKuyk<lp^(Ch}`|={Vk)nMHzc
ziQzi>Xn=##Oc;#b^IU33IZt?JnJtCs!0cj|zY?Pe2Sx_jPTsxi9DXRpa}UNuW#mM0
z><UrP8&`{BABrcIQ_m}7_f&L5^#nEvA`o*i+P9#ycjVm)U97^Du{_p3_w@-ZC<-ZK
zZfG(fzpfBwb(P-IRPv;32@f=U^6RR<gsqma>9{gLb}Y2}y1Gn07$6$RJ}%E&fvUJ;
znOC~)G988<5J0mm0$uj?n?5(J@Rl*E>H$*%SKiz%Mdj8nfBWsr>XuZZAPtE0znou9
z2^W)OHK`PRhL<>js(HR#^+)*=W<d~uiOIT5Xj1A)rP#wPc+$QN06JoVQG_WeD>K1L
zuL>0t&$EEbXaV4hehcJG`&<xX{&#HEE}QT=Nw0`P7f!tJDqr#woMQ+6y}hkBw^4<j
zGTjnyDo%U+{EI~XoOKUyYNK6tmH<S=N_I0B*i5t$X;mt55grZ{R+&%%ak!wwZ0sx%
zsZ>JQD_}5Xip@k}$~SQwPTNeyFq+a@K9uI}BR_&?HF{r+XKWTlLZoxlOj@LW7AHqW
z|1nCWx0R0=d3v}D9WvXy_oOc|Z6hleFm>jM^oXpUGwG0M?}UCvw}Y_AsZmuyVpo-<
zH_N}8%jtE#O%<jQptwNv0pcV3`CuQljg%%uVw7<=kX|W;+%cUbl>&iA#R;$mN+NZr
zMo|)>3xX5xiz-<x?miF&FRSa}MQ$Wy*lVaU1~#a`@&vkt;SBw2@lS0-d&y<0|0f=f
z?4Et(zyE~^0P*~9T~Vrq|7Q_py8lPntSEA$qSTdotJN$R?e&!$K?on*$Bg@5O(-$`
zuPV*L|Fe)H`hS){ixTETd>)jhrmsO2tn=A$(U7Uzwq>tT+b;g}Ow=R&BhG6y1PeY#
z#e>y#<PugX>z+lQVhyISfg-b%j6K7&gwFBt;qewZbwIZOt*AoYal?yB|8HM7s%xpy
zVaW3LsF_gE6smC?9G-sas-1rxb#}Vyap&Z8zuT5RXrKg1rNizIonv+T_~S`?1vaP5
zuntP(8$iW{iy-|L*wVQXR*9Hi@w}2}x@BfZE3>oOq8$;_BBt~o$@)i&Gc*PWviK~^
zI7vtdI5iFZO^Uugw{`6@tto0{WQLt3(tW?XMY<Moya`K%g@!}V>@lk^rtGr@Xp1*t
zXs9NCJUXS04K&#VhQlju<bl@6?TG`Ao-gO4{poQ3^q_;N<qtZae!^dz@F|#)NSh};
zj6@=GEw49(FZ_15_fI<~>W97k4j%iF1YZXaqeYH1cL;MP`oYn5_lGr5E}6q6@!D^1
zjc{v!Y#8M$0AXcv-CteLf@CWWBX*fpc~yX$O<zpL)f=N>5f%JM_sn!7f<=)DTY+SU
z?KCKm%SbvTiTYDP)RQ0rC=5Psp!k|x*l7IygGm49hYxD^@W;-lwnWx-x3{jFSB7nw
zXl)KCe9;?OULUrjJzIAj|J;^Cv>v^IL~jA%p=4H}eLiN}jq-3%cCAa@qz6b1kT8GJ
zz+aBnw}`7#*Jiv=@;zDa>sRZhHyV+5@7@t90Fc^%SEWGyZ1?flm;V~~KJmun*nT#s
ztBuezcf;M6$Q>!4BC?i%@%7P&#xEm%!$Aq!Rw|oH$+0}UrzgJSVzc}&M1M4-*Isw4
z{6)$Vqw%^#M2auGefI!JYgr=Se^U)wp@$dwxoO7yPO?dKVQK8z$}XENu&7?>VN;=4
z7Df?I92*@(>}`86(mt!E`I)1yYg0Qj&G-hi;s*#y6P!XAY?Nk#>l=1kS|6M4I-vN{
zS?6!p{dBHFYpjb^-~4cR&<WlmtSe-E39m?bg64&I;}Tg7;N3sm+1^)SCSV)ww8}7N
z;w?r2nATjZW_12_aIxLuxygj%?YaT2*(3=I`mhsCigemN-1!k!nxOx6cRX1i4pjVx
zTKrcq5+Po=!kc;M4h!$Q5nCTyF5b1cfKEj^Q5Vk;UE=4ud#wWlGtY<&4W4-Cj{b*-
zvy$p6j_*2k&{HwAKoEr2lW22&nYtcEx@LM4T(c<G(Q~=Xk&I4s?2=AK=!79FI0Mt;
zU!qTs9u;vX9t{^a(g!Xkl2pTXKuB;oIB`(CfQT!X2gb;SClUrxbzIAC`>jZPt)oV%
zl-82WdJHI|SHJw);uzR1Ouun9oE2x4hqmKz*<(^3WORoVwMKoFAtD?L`yIAi{$sTy
zKpuWF1z@N&8#IH#CNzj3v6g%S_Iwu;%&)zJ?T_qiFF(bDNG<W^==1>en^u`kD`=a&
zT4&OSx(Lh6G#oVaWh*H+LEyj~LjPd^s%O%rq>S#cD`*Grz_AA=tTS{PvY*ayTy~6q
zfz)n(<d1*^^bgb*BK(W`L6oB;S~GRAkG=8NSk^SkI$tEDwFnjBf)f?4$P=+CzC^$A
z{|w%tPfY#i{<gZaz4JqdsTYxVG-tKaR+1n5(AoJB^g2NC(cV6_%JW)pZX-$OzlE>F
z2LOinQ~JU`N@UN*591Hv<M^YXfmnKN{5<*yK8`-%pXQlBtlr-wvINQ}C@pwQh_9Ey
z<xads$<BBLhl+5bapgH+)HxbrrrQ7FQB&{PM^5{n5dX2RREqdd3n{tcKh~P9&1$tc
z|NRO}j#$YL?qi1iPpP%y{9jct|7&6Yd;G%ff6kB_;7X<KPd}?4wqb+6+a915Of<ud
zTv8>;Gqr*R4lQ@|WF4N6`kuT)ho5?>->8cCqxwAw;w=k%KJ1nrq~3nt*>9sQtwhaU
zqFa!j2}OO9hQ2=ZTv#hnO{Axdf9(8>izP0zfnDzKw7Y|PP|`0=osrZfZLww~>XX!Q
z!*xsj(d5rR`4AmE5v=zk1`m3GjnyQ)1$55x4t;A3M_>4ueBBOwp5!<3%{OW9{r;dr
zEaJ96pCDsAAdeaC{LaEkU#Q8<WhOJREq0!*%;H7N_6k!Qq#uJ*jKcZmHbS;9q1iH)
zi933tP0Ykigd=nQd4+hR2&On+g7b~s7K1nA^^tysH+9hf$%45vG?NvW8`pSFkfU{#
zSy!H#`JO?Us{hUD2GAY+zpYAL!TUeeR=v>w7Ez}AfAJocyiu<M0hMa4;Fs4|W<)!l
zOCK}zziOq`it)cvE8@Q`rp&tkH|^eE<a&-D(3=_E3>sO>ql+>u^p0CfBpEq6-QQP_
zJAXgzoOGGaL)Fcs1<?<SppheN@R&8oI-)w{(J|8!PS8W+*&|YRh~&pI*CvL1VYox@
zoCY(u^@-)IQ&Zu@wtm-pZX~B7g6ynkLis28{SUlOwJcNZ$NA+qncaP0m%{c2#Bi4>
zq{nN7`D|!FF|<wx*F*5jl7BzxO#nPzPI@a*w^00o62rp>dh&6Z)>vlEpy~VQSWIR@
z1GfdC)_om&)-?n{3Iyi`f7d?K&{g<-@*rQH7F^5j4S5?eb?uWC8wfkr$%o@#zl)AW
zJXy%qzzX4q4(Qdm3fjvIvv1IAc#g-MgDa-xA4vG!L+1cj0qMz1UyTfD_vx@3pJ_bW
z(z&gfy&=qMXSjxLtTi6PkKtxDF<;S7mO5uM$%P|(7^xmgsH`KH#nH=GT7>DqFW`mh
zf8>(i1Nz9X|2Lr2Q2(zfwL<@2M47JtH|y1wyxH8`XepJAdcp0luO#_D!u4}W1~5ba
zr~DG-f2Cfn7V-ZVQKrfNJTVya^ijzj|EW={HPQaxg8vore-UN6{BKk@K>lwus;z3R
zQJnvOg(b=V^WyrjRHny&swoX6#{cl6IRC$pvNSW+Zz*)+q=qC&Nz8U?8m^E{cUBK#
zE@97AXHCF+<dSz1DUthAaw0STRxItZ=a0!xx;%4Kb4kPe08~cPGUrBYBu8`ZukMy>
zZ$4?8?}{6eV>B%EN0HO{5pz1vqSF@A^_k>&z6-2*^4ag1u71I@o7|79jeGI&J!L^s
z$i_0*YrURlYrUK7{Pif8b6hecH23B@hRmjc%pz8Kim7$Is;{5xW^af=zJRFoaR33f
zQFbw!&3HE1q%0|K#OSkFVgK;i5vB9}=1T6bo<io|6GbG)pbF8SAx;-4`hmPJMe>KB
z<`MiyEC&GI03m@8yE}hR&L1xQbMyC|RBT@WW=Ymb8Uhw8Ugt!EC^qj$k=$QBg^Ya&
zo=A?76@oTHdM!}UzW3?+5=5;CY#w2I;@ar>zn==_%t67<&E9Du*79r&%>^T%0@hLa
zEuoo7nF6ic9om+6F~mF|35{KNm#MQSgtrqSbg?4)L6nTr<bHI`{nb+l>bqly<QPvO
zo--ug0>v|<>`PIFBhY3D=%+Sx7bUgK^8e592Dd6X97A{T|2FE)2C3APR-?%Ovyd|5
z{7<a~yxzd%@70as{^u(!v-to1TpyNM{@-d{iSa+47%Kd~izpfXUy;Ju4&aT4xq!t^
z^7w$0^`EB`I0EOwyuex5%;)+^ex1t?ocwI+K4{}9UBOYZ<QY-16Ux|IJn~LRq`|ZI
z24?{~XLoQGduLhmXE5}ArNXQ72q%a4P?zxhMgJV8NIv(^j0uzfv!_`B?u(~P&v3C2
zJhO%1nfrz_V0{eda2BIyxgGAb7UXC2C-x7|U)Iix4*8rxGpLgPv!@Zb4?<R^o4Am=
z&rIq*dq;65wDa>6XYqNKGwn|DHb0*~*jX$#`3$_p^B3R`BWFH;@eJbT|LkeR_B~LP
l=`=2c^)nOJ&);pF4Q}B#E=o~~Qj|rO{|^8;-8%qO0RW1;3nBmj


From b7eed10e64cb9530db9afc784fb437cbccd5eb07 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Thu, 25 Sep 2025 15:13:14 +0800
Subject: [PATCH 30/38] ready for test

---
 .buildkite/README_dynamic.md                 | 138 +++++++++++++++++++
 .buildkite/buildkite_ci_feature_template.yml |  17 ++-
 .buildkite/buildkite_ci_model_template.yml   |  17 ++-
 .buildkite/pipeline_dynamic.yml              |   4 +-
 .buildkite/scripts/run_in_docker.sh          |  32 -----
 5 files changed, 156 insertions(+), 52 deletions(-)
 create mode 100644 .buildkite/README_dynamic.md

diff --git a/.buildkite/README_dynamic.md b/.buildkite/README_dynamic.md
new file mode 100644
index 0000000000..9be88140d4
--- /dev/null
+++ b/.buildkite/README_dynamic.md
@@ -0,0 +1,138 @@
+# Buildkite Pipeline Generator
+
+This document outlines the process for using Python scripts to automatically generate Buildkite CI/CD pipeline configuration files. These scripts leverage templates to create consistent testing pipelines for both models and features.
+
+## Overview
+
+The primary goal of these tools is to streamline the creation of Buildkite pipelines. Instead of manually creating and editing YAML files for each new model or feature, you can run a simple command to generate a standardized pipeline file.
+
+There are two main generators:
+
+1.  **Model Pipeline Generator** (`generate_model_buildkite.py`): Creates a pipeline file for testing a specific machine learning model.
+2.  **Feature Pipeline Generator** (`generate_feature_buildkite.py`): Creates a pipeline file for testing a new feature.
+
+Both scripts work by reading a corresponding template file (`.yml`), replacing placeholder variables with your command-line arguments, and saving the result as a new YAML file in a designated output directory.
+
+## Directory Structure
+
+To use the scripts, your files should be arranged as follows. The output directories (`models/` and `features/`) will be created automatically if they do not exist.
+
+```
+.
+├── generate_model_buildkite.py
+├── buildkite_ci_model_template.yml
+├── generate_feature_buildkite.py
+├── buildkite_ci_feature_template.yml
+└── README_dynamic.md
+```
+
+-----
+
+## How to Use
+
+### 1\. Generating a Model Pipeline
+
+Use the `generate_model_buildkite.py` script to create a CI pipeline for a new model.
+
+**Command:**
+
+```bash
+python generate_model_buildkite.py --model-name <MODEL_NAME> --queue <QUEUE_NAME>
+```
+
+**Arguments:**
+
+  * `--model-name` (required): The name of the model to be tested. If the name contains special characters like `/` or `.`, they will be replaced with `_` in the output filename and for Buildkite step keys.
+  * `--queue` (required): The name of the Buildkite agent queue where the jobs will run (e.g., `tpu_v6e_queue`).
+
+**Example:**
+
+```bash
+python generate_model_buildkite.py --model-name meta-llama/Llama-3.1-8B-Instruct --queue tpu_v6e_queue
+```
+
+**Output:**
+
+This command will generate a new file located at `models/meta-llama_Llama-3_1-8B-Instruct.yml`.
+
+-----
+
+### 2\. Generating a Feature Pipeline
+
+Use the `generate_feature_buildkite.py` script to create a CI pipeline for a new feature.
+
+**Command:**
+
+```bash
+python generate_feature_buildkite.py --feature-name <FEATURE_NAME> --queue <QUEUE_NAME>
+```
+
+**Arguments:**
+
+  * `--feature-name` (required): The name of the feature to be tested.
+  * `--queue` (required): The name of the Buildkite agent queue.
+
+**Example:**
+
+```bash
+python generate_feature_buildkite.py --feature-name Feat-A --queue tpu_v6e_queue
+```
+
+**Output:**
+
+This command will generate a new file located at `features/Feat-A.yml`.
+
+-----
+
+## Important Notes: Placeholders & Customization
+
+The scripts work by performing a find-and-replace on specific placeholders within the template files. You can customize the `buildkite_ci_*_template.yml` files to change the structure of the generated pipelines.
+
+#### **Model Template Placeholders (`buildkite_ci_model_template.yml`)**
+
+  * `{MODEL_NAME}`: Replaced with the exact string provided to `--model-name`. This is typically used in human-readable fields like step `label`.
+  * `{SAFE_MODEL_NAME}`: A sanitized version of the model name, automatically generated by replacing characters like `/` and `.` with `_`. This is used for machine-readable fields like the step `key` and the output filename to ensure validity.
+  * `{QUEUE}`: Replaced with the string provided to `--queue`.
+
+#### **Feature Template Placeholders (`buildkite_ci_feature_template.yml`)**
+
+  * `{FEATURE_NAME}`: Replaced with the exact string provided to `--feature-name`.
+  * `{SAFE_FEATURE_NAME}`: A sanitized version of the feature name.
+  * `{QUEUE}`: Replaced with the string provided to `--queue`.
+
+## Integration with the Main Pipeline
+
+After generating a pipeline `.yml` file, you must place it in the correct subdirectory within the `.buildkite/` folder. The generator scripts create these files in the top-level `models/` and `features/` directories, so you will need to **manually move** them to the corresponding location inside `.buildkite/`.
+
+This is a crucial step, because the main CI process relies on the `dynamic_bootstrap.sh` script to scan these specific directories, discover the pipeline files, and upload their steps. Also, remember to **update your main pipeline's configuration so that it executes `dynamic_bootstrap.sh`**.
+
+### Target Directories
+
+Place your generated `.yml` files into the following directories for detection:
+
+  * **Standard Models**: Move the generated file to `.buildkite/models/`.
+  * **Informational Models**: For models considered "informational" (e.g., vLLM-native models), move the generated file to `.buildkite/models/informational/`.
+  * **Popular Models**: For models designated as "popular," move the generated file to `.buildkite/models/popular/`.
+  * **Features**: Move the generated feature file to `.buildkite/features/`.
+
+### Example Workflow
+
+1.  Generate a pipeline for a new model that you consider "popular":
+
+    ```bash
+    python generate_model_buildkite.py --model-name my-popular-model --queue tpu_v6e_queue
+    ```
+
+    This creates `models/my-popular-model.yml`.
+
+2.  Move the file to the correct directory for the bootstrap script to find it:
+
+    ```bash
+    # Create the directory if it doesn't exist
+    mkdir -p .buildkite/models/popular
+
+    # Move the file
+    mv models/my-popular-model.yml .buildkite/models/popular/
+    ```
+
+Once the file is in the correct `.buildkite/` subdirectory and committed, the `dynamic_bootstrap.sh` script will automatically find it and add its steps to the Buildkite pipeline.
\ No newline at end of file
diff --git a/.buildkite/buildkite_ci_feature_template.yml b/.buildkite/buildkite_ci_feature_template.yml
index cab1a9250e..6825c44a4e 100644
--- a/.buildkite/buildkite_ci_feature_template.yml
+++ b/.buildkite/buildkite_ci_feature_template.yml
@@ -5,8 +5,8 @@ steps:
   - label: "Unit tests for {FEATURE_NAME}"
     key: "ut_{SAFE_FEATURE_NAME}"
     commands:
-      # - replace_with_test_commands  # TODO: Replaced to actual test commands
-      - echo "[DEBUG], unit testing for {FEATURE_NAME}"  # TODO: Replace to actual test commands
+      # Replace to actual test commands
+      - echo "[DEBUG], unit testing for {FEATURE_NAME}"
   - label: "Notifications: Unit tests for {FEATURE_NAME}"
     key: "notifications_ut_{SAFE_FEATURE_NAME}"
     depends_on: "ut_{SAFE_FEATURE_NAME}"
@@ -24,9 +24,8 @@ steps:
     key: "it_{SAFE_FEATURE_NAME}"
     depends_on: "notifications_ut_{SAFE_FEATURE_NAME}"
     commands:
-      # TODO: expected_accuracy need parameterized
-      # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "{FEATURE_NAME}"
-      - echo "[DEBUG], integration testing for {FEATURE_NAME}"  # TODO: Replace to actual test commands
+      # Replace to actual test commands
+      - echo "[DEBUG], integration testing for {FEATURE_NAME}"
   - label: "Notifications: Integration tests for {FEATURE_NAME}"
     key: "notifications_it_{SAFE_FEATURE_NAME}"
     depends_on: "it_{SAFE_FEATURE_NAME}"
@@ -44,8 +43,8 @@ steps:
     key: "pb_{SAFE_FEATURE_NAME}"
     depends_on: "notifications_it_{SAFE_FEATURE_NAME}"
     commands:
-      # - replace_with_test_command  # TODO
-      - echo "[DEBUG], performance benchmarking for {FEATURE_NAME}"  # TODO: Replace to actual test commands
+      # Replace to actual test commands
+      - echo "[DEBUG], performance benchmarking for {FEATURE_NAME}"
   - label: "Notifications: Performance benchmarks for {FEATURE_NAME}"
     key: "notifications_pb_{SAFE_FEATURE_NAME}"
     depends_on: "pb_{SAFE_FEATURE_NAME}"
@@ -63,8 +62,8 @@ steps:
     key: "st_{SAFE_FEATURE_NAME}"
     depends_on: "notifications_pb_{SAFE_FEATURE_NAME}"
     commands:
-      # - our_stress_tests_script {FEATURE_NAME} expected_throughput # TODO: expected_throughput need parameterized
-      - echo "[DEBUG], stress testing for {FEATURE_NAME}"  # TODO: Replace to actual test commands
+      # Replace to actual test commands
+      - echo "[DEBUG], stress testing for {FEATURE_NAME}"
   - label: "Notifications: Stress tests for {FEATURE_NAME}"
     key: "notifications_st_{SAFE_FEATURE_NAME}"
     depends_on: "st_{SAFE_FEATURE_NAME}"
diff --git a/.buildkite/buildkite_ci_model_template.yml b/.buildkite/buildkite_ci_model_template.yml
index 595ae98c05..9cf355a4a3 100644
--- a/.buildkite/buildkite_ci_model_template.yml
+++ b/.buildkite/buildkite_ci_model_template.yml
@@ -5,8 +5,8 @@ steps:
   - label: "Unit tests for {MODEL_NAME}"
     key: "ut_{SAFE_MODEL_NAME}"
     commands:
-      # - replace_with_test_commands  # TODO: Replaced to actual test commands
-      - echo "[DEBUG], unit testing for {MODEL_NAME}"  # TODO: Replace to actual test commands
+      # Replace to actual test commands
+      - echo "[DEBUG], unit testing for {MODEL_NAME}"
   - label: "Notifications: Unit tests for {MODEL_NAME}"
     key: "notifications_ut_{SAFE_MODEL_NAME}"
     depends_on: "ut_{SAFE_MODEL_NAME}"
@@ -24,9 +24,8 @@ steps:
     key: "it_{SAFE_MODEL_NAME}"
     depends_on: "notifications_ut_{SAFE_MODEL_NAME}"
     commands:
-      # TODO: expected_accuracy need parameterized
-      # - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "{MODEL_NAME}"
-      - echo "[DEBUG], integration testing for {MODEL_NAME}"  # TODO: Replace to actual test commands
+      # Replace to actual test commands
+      - echo "[DEBUG], integration testing for {MODEL_NAME}"
   - label: "Notifications: Integration tests for {MODEL_NAME}"
     key: "notifications_it_{SAFE_MODEL_NAME}"
     depends_on: "it_{SAFE_MODEL_NAME}"
@@ -44,8 +43,8 @@ steps:
     key: "pb_{SAFE_MODEL_NAME}"
     depends_on: "notifications_it_{SAFE_MODEL_NAME}"
     commands:
-      # - replace_with_test_command  # TODO
-      - echo "[DEBUG], performance benchmarking for {MODEL_NAME}"  # TODO: Replace to actual test commands
+      # Replace to actual test commands
+      - echo "[DEBUG], performance benchmarking for {MODEL_NAME}"
   - label: "Notifications: Performance benchmarks for {MODEL_NAME}"
     key: "notifications_pb_{SAFE_MODEL_NAME}"
     depends_on: "pb_{SAFE_MODEL_NAME}"
@@ -63,8 +62,8 @@ steps:
     key: "st_{SAFE_MODEL_NAME}"
     depends_on: "notifications_pb_{SAFE_MODEL_NAME}"
     commands:
-      # - our_stress_tests_script {MODEL_NAME} expected_throughput # TODO: expected_throughput need parameterized
-      - echo "[DEBUG], stress testing for {MODEL_NAME}"  # TODO: Replace to actual test commands
+      # Replace to actual test commands
+      - echo "[DEBUG], stress testing for {MODEL_NAME}"
   - label: "Notifications: Stress tests for {MODEL_NAME}"
     key: "notifications_st_{SAFE_MODEL_NAME}"
     depends_on: "st_{SAFE_MODEL_NAME}"
diff --git a/.buildkite/pipeline_dynamic.yml b/.buildkite/pipeline_dynamic.yml
index 139d6385ac..ef80b1efcd 100644
--- a/.buildkite/pipeline_dynamic.yml
+++ b/.buildkite/pipeline_dynamic.yml
@@ -15,10 +15,10 @@ steps:
      agents:
        queue: tpu_v6e_queue
      commands:
-       - echo "Generate support matrices..."
+       - echo "Generate support matrices report..."
        - bash .buildkite/scripts/export_support_matrix.sh
 
-   # Handle PR builds: print model matrices and feature matrices
+   # Print model matrices and feature matrices
    - label: "Handle Report"
      if: build.pull_request.id != null
      depends_on: export_support_matrix
diff --git a/.buildkite/scripts/run_in_docker.sh b/.buildkite/scripts/run_in_docker.sh
index 241347d172..5c105c5a60 100755
--- a/.buildkite/scripts/run_in_docker.sh
+++ b/.buildkite/scripts/run_in_docker.sh
@@ -11,36 +11,6 @@ if [ "$#" -eq 0 ]; then
   exit 1
 fi
 
-MOUNT_EXPECT_RESULT="False"
-OTHER_ARGS=()
-
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --mount-expect-result)
-            MOUNT_EXPECT_RESULT="True"
-            shift 1
-            ;;
-        *)
-            OTHER_ARGS+=("$@")
-            break
-            ;;
-    esac
-done
-
-# TBD: To support the functionality of connecting GPU and TPU expected values in the future
-EXPECT_VOLUME=()
-EXPECT_ENV=()
-if [ "$MOUNT_EXPECT_RESULT" = "True" ]; then
-    touch "$EXPECT_VALUES_FILENAME"
-    echo "[DEBUG] Path: $EXPECT_VALUES_PATH, Filename: $EXPECT_VALUES_FILENAME, "
-
-    EXPECT_VOLUME=(-v "$(pwd)/$EXPECT_VALUES_FILENAME":"$EXPECT_VALUES_PATH$EXPECT_VALUES_FILENAME")
-    echo "docker -v cmd: " "${EXPECT_VOLUME[@]}"
-
-    EXPECT_ENV=(-e EXPECT_VALUES_PATH="$EXPECT_VALUES_PATH" -e EXPECT_VALUES_FILENAME="$EXPECT_VALUES_FILENAME")
-    echo "docker -e cmd: " "${EXPECT_ENV[@]}"
-fi
-
 if ! grep -q "^HF_TOKEN=" /etc/environment; then
   gcloud secrets versions access latest --secret=bm-agent-hf-token --quiet | \
   sudo tee -a /etc/environment > /dev/null <<< "HF_TOKEN=$(cat)"
@@ -103,8 +73,6 @@ exec docker run \
   --shm-size=16G \
   --rm \
   -v "$LOCAL_HF_HOME":"$DOCKER_HF_HOME" \
-  "${EXPECT_VOLUME[@]}" \
-  "${EXPECT_ENV[@]}" \
   -e HF_HOME="$DOCKER_HF_HOME" \
   -e MODEL_IMPL_TYPE="$MODEL_IMPL_TYPE" \
   -e HF_TOKEN="$HF_TOKEN" \

From 3b203be00d3664f1cfbbf7a1d2552354b0cde9f1 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Thu, 25 Sep 2025 15:17:13 +0800
Subject: [PATCH 31/38] ci: run only the Llama-3.1-8B accuracy step in
 pipeline_jax.yml (all other steps commented out)

---
 .buildkite/pipeline_jax.yml | 221 +++++++++++++++++++-----------------
 1 file changed, 114 insertions(+), 107 deletions(-)

diff --git a/.buildkite/pipeline_jax.yml b/.buildkite/pipeline_jax.yml
index 3184e36e60..187d6c4b31 100644
--- a/.buildkite/pipeline_jax.yml
+++ b/.buildkite/pipeline_jax.yml
@@ -2,120 +2,127 @@ steps:
   # -----------------------------------------------------------------
   # TEST STEPS - Calling wrapper
   # -----------------------------------------------------------------
-   - label: "E2E MLPerf tests for JAX models"
-     key: test_0
-     soft_fail: true
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
+    - label: "Integration Test llama-3.1-8B on TPU"
+      key: integration_test_llama_3_1_8B_tpu
+      soft_fail: true
+      agents:
+        queue: tpu_v6e_queue
+      commands:
+        - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/test_accuracy.sh -t 1 -m "meta-llama/Llama-3.1-8B-Instruct" -e "0.8"
+  #  - label: "E2E MLPerf tests for JAX models"
+  #    key: test_0
+  #    soft_fail: true
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-   - label: "E2E MLPerf tests for JAX models with quantization"
-     key: test_1
-     soft_fail: true
-     env:
-       QUANTIZATION: "True"
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
+  #  - label: "E2E MLPerf tests for JAX models with quantization"
+  #    key: test_1
+  #    soft_fail: true
+  #    env:
+  #      QUANTIZATION: "True"
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-   - label: "E2E MLPerf tests for JAX new models"
-     key: test_2
-     soft_fail: true
-     env:
-       NEW_MODEL_DESIGN: "True"
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
+  #  - label: "E2E MLPerf tests for JAX new models"
+  #    key: test_2
+  #    soft_fail: true
+  #    env:
+  #      NEW_MODEL_DESIGN: "True"
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-   - label: "E2E MLPerf tests for JAX + vLLM models"
-     key: test_3
-     soft_fail: true
-     env:
-       MODEL_IMPL_TYPE: "vllm"
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
+  #  - label: "E2E MLPerf tests for JAX + vLLM models"
+  #    key: test_3
+  #    soft_fail: true
+  #    env:
+  #      MODEL_IMPL_TYPE: "vllm"
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
-   - label: "E2E MLperf tests for Llama4 models"
-     key: test_4
-     soft_fail: true
-     env:
-       NEW_MODEL_DESIGN: "True"
-       USE_V6E8_QUEUE: "True"
-     agents:
-       queue: tpu_v6e_8_queue
-     commands:
-       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
+  #  - label: "E2E MLperf tests for Llama4 models"
+  #    key: test_4
+  #    soft_fail: true
+  #    env:
+  #      NEW_MODEL_DESIGN: "True"
+  #      USE_V6E8_QUEUE: "True"
+  #    agents:
+  #      queue: tpu_v6e_8_queue
+  #    commands:
+  #      - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_commons/tests/e2e/benchmarking/mlperf.sh
 
 
-   - label: "E2E multi modality test"
-     key: test_5
-     soft_fail: true
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - |
-         .buildkite/scripts/run_in_docker.sh \
-           bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_multi_modal_inference.py && \
-            bash /workspace/tpu_commons/tests/e2e/benchmarking/mm_bench.sh'
+  #  - label: "E2E multi modality test"
+  #    key: test_5
+  #    soft_fail: true
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - |
+  #        .buildkite/scripts/run_in_docker.sh \
+  #          bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_multi_modal_inference.py && \
+  #           bash /workspace/tpu_commons/tests/e2e/benchmarking/mm_bench.sh'
 
-   - label: "E2E speculative decoding test"
-     key: test_6
-     soft_fail: true
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - |
-         .buildkite/scripts/run_in_docker.sh \
-           bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_speculative_decoding.py'
+  #  - label: "E2E speculative decoding test"
+  #    key: test_6
+  #    soft_fail: true
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - |
+  #        .buildkite/scripts/run_in_docker.sh \
+  #          bash -c 'python3 -m pytest -s -v -x /workspace/tpu_commons/tests/e2e/test_speculative_decoding.py'
 
-   - label: "JAX unit tests"
-     key: test_7
-     soft_fail: true
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - |
-         .buildkite/scripts/run_in_docker.sh \
-           python3 -m pytest -s -v -x /workspace/tpu_commons/tests/ \
-           --ignore=/workspace/tpu_commons/tests/kernels \
-           --ignore=/workspace/tpu_commons/tests/e2e \
-           --ignore=/workspace/tpu_commons/tpu_commons/mock \
-           --cov-config=/workspace/tpu_commons/.coveragerc --cov tpu_commons --cov-report term-missing --cov-fail-under=69
+  #  - label: "JAX unit tests"
+  #    key: test_7
+  #    soft_fail: true
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - |
+  #        .buildkite/scripts/run_in_docker.sh \
+  #          python3 -m pytest -s -v -x /workspace/tpu_commons/tests/ \
+  #          --ignore=/workspace/tpu_commons/tests/kernels \
+  #          --ignore=/workspace/tpu_commons/tests/e2e \
+  #          --ignore=/workspace/tpu_commons/tpu_commons/mock \
+  #          --cov-config=/workspace/tpu_commons/.coveragerc --cov tpu_commons --cov-report term-missing --cov-fail-under=69
 
-   - label: "JAX unit tests - kernels"
-     key: test_8
-     soft_fail: true
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - |
-         .buildkite/scripts/run_in_docker.sh \
-           python3 -m pytest -s -v -x /workspace/tpu_commons/tests/kernels \
-           --ignore=/workspace/tpu_commons/tests/kernels/ragged_paged_attention_kernel_v2_test.py \
-           --ignore=/workspace/tpu_commons/tests/kernels/ragged_kv_cache_update_v2_test.py
+  #  - label: "JAX unit tests - kernels"
+  #    key: test_8
+  #    soft_fail: true
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - |
+  #        .buildkite/scripts/run_in_docker.sh \
+  #          python3 -m pytest -s -v -x /workspace/tpu_commons/tests/kernels \
+  #          --ignore=/workspace/tpu_commons/tests/kernels/ragged_paged_attention_kernel_v2_test.py \
+  #          --ignore=/workspace/tpu_commons/tests/kernels/ragged_kv_cache_update_v2_test.py
 
-  # -----------------------------------------------------------------
-  # NOTIFICATION STEP
-  # -----------------------------------------------------------------
-   - label: "TPU Test Notification"
-     depends_on:
-       - test_0
-       - test_1
-       - test_2
-       - test_3
-       - test_4
-       - test_5
-       - test_6
-       - test_7
-       - test_8
-     agents:
-       queue: tpu_v6e_queue
-     commands:
-       - |
-         .buildkite/scripts/check_results.sh \
-           "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8
+  # # -----------------------------------------------------------------
+  # # NOTIFICATION STEP
+  # # -----------------------------------------------------------------
+  #  - label: "TPU Test Notification"
+  #    depends_on:
+  #      - test_0
+  #      - test_1
+  #      - test_2
+  #      - test_3
+  #      - test_4
+  #      - test_5
+  #      - test_6
+  #      - test_7
+  #      - test_8
+  #    agents:
+  #      queue: tpu_v6e_queue
+  #    commands:
+  #      - |
+  #        .buildkite/scripts/check_results.sh \
+  #          "TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8

From f87e0c47097881956144e64c2afd6e72d959f5f7 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Thu, 25 Sep 2025 15:18:24 +0800
Subject: [PATCH 32/38] ci: upload pipeline_jax.yml instead of
 pipeline_dynamic.yml in bootstrap.sh

---
 .buildkite/scripts/bootstrap.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh
index 44fa7bf64b..aff2c6b9d4 100755
--- a/.buildkite/scripts/bootstrap.sh
+++ b/.buildkite/scripts/bootstrap.sh
@@ -23,6 +23,6 @@ echo "--- Starting Buildkite Bootstrap ---"
 #   # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
 # fi
 
-buildkite-agent pipeline upload .buildkite/pipeline_dynamic.yml
+buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
 
 echo "--- Buildkite Bootstrap Finished ---"

From df4fd874a268afbb5436d6e7abd4f63cfac74ee2 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Thu, 25 Sep 2025 15:32:52 +0800
Subject: [PATCH 33/38] Add --expected-value pytest option and -e flag to
 test_accuracy.sh, replacing the -g/--gpu flag

---
 scripts/vllm/integration/conftest.py      |  5 +++++
 scripts/vllm/integration/test_accuracy.py | 12 +++++++++++-
 tests/e2e/benchmarking/test_accuracy.sh   | 14 ++++++--------
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/scripts/vllm/integration/conftest.py b/scripts/vllm/integration/conftest.py
index b1c2ba1872..2020620933 100644
--- a/scripts/vllm/integration/conftest.py
+++ b/scripts/vllm/integration/conftest.py
@@ -16,6 +16,11 @@ def pytest_addoption(parser):
         help="This is used to specify the JSON file that stores the expected values. " +
             "The results from running test_accuracy on a GPU will be saved to this file, " +
             "and when running on a TPU, the results will be read from this file for comparison.")
+    parser.addoption(
+        "--expected-value",
+        type=float,
+        default=None,
+        help="This value will be used to compare the measure value and determine if the test passes or fails.")
     parser.addoption(
         "--model-names",
         action="store",
diff --git a/scripts/vllm/integration/test_accuracy.py b/scripts/vllm/integration/test_accuracy.py
index 68fb69e643..1a06017fdd 100644
--- a/scripts/vllm/integration/test_accuracy.py
+++ b/scripts/vllm/integration/test_accuracy.py
@@ -150,9 +150,14 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch, requ
 
     tp_size = request.config.getoption("--tensor-parallel-size")
     expected_json_filepath = request.config.getoption("--expected-values-file")
-            
+    expected_value = request.config.getoption("--expected-value")
+
     expected_values_data = read_expected_value(expected_json_filepath)
 
+    # Add expected-value to expected_values_data with model name
+    if expected-value is not None:
+        expected_values_data[model] = float(expected_value)
+
     if tp_size is None:
         tp_size = 1
     elif tp_size < 1 or tp_size > 8:
@@ -181,9 +186,14 @@ def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
 
     tp_size = request.config.getoption("--tensor-parallel-size")
     expected_json_filepath = request.config.getoption("--expected-values-file")
+    expected_value = request.config.getoption("--expected-value")
 
     expected_values_data = read_expected_value(expected_json_filepath)
 
+    # Add expected-value to expected_values_data with model name
+    if expected-value is not None:
+        expected_values_data[model] = float(expected_value)
+
     if tp_size is None:
         tp_size = 1
     elif tp_size < 1 or tp_size > 8:
diff --git a/tests/e2e/benchmarking/test_accuracy.sh b/tests/e2e/benchmarking/test_accuracy.sh
index 0ce96d9e42..5f003533dd 100644
--- a/tests/e2e/benchmarking/test_accuracy.sh
+++ b/tests/e2e/benchmarking/test_accuracy.sh
@@ -2,7 +2,7 @@
 
 model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.1-70B-Instruct"
 tensor_parallel_size=1
-gpu_enabled=false
+expected_value=0
 
 extra_serve_args=()
 echo extra_serve_args: "${extra_serve_args[@]}"
@@ -17,6 +17,7 @@ helpFunction()
    echo -e "\t-r The path your root directory containing both 'vllm' and 'tpu_commons' (default: /workspace/, which is used in the Dockerfile)"
    echo -e "\t-m A space-separated list of HuggingFace model ids to use (default: meta-llama/Llama-3.1-8B-Instruct and meta-llama/Llama-3.1-70B-Instruct)"
    echo -e "\t-t Tensor parallel size (default: 1)"
+   echo -e "\t-e Excepted value"
    exit 1
 }
 
@@ -37,8 +38,9 @@ while [[ "$#" -gt 0 ]]; do
             shift
             shift
             ;;
-        -g|--gpu)
-            gpu_enabled=true
+        -e|--excepted-value)
+            expected_value="$2"
+            shift
             shift
             ;;
         -h|--help)
@@ -66,10 +68,6 @@ echo "Running integration for models: $comma_model_list"
 echo "--------------------------------------------------"
 
 # Default action
-if $gpu_enabled; then
-    python3 -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list"
-else
-    python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list"
-fi
+python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list" --expected_value="$expected_value"
 
 exit $exit_code
\ No newline at end of file

From 66cada4869b8a6c96748c7dac5ccfce4e32f7028 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Thu, 25 Sep 2025 15:47:43 +0800
Subject: [PATCH 34/38] Add tblib==3.1.0 to requirements_benchmarking.txt

---
 requirements_benchmarking.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements_benchmarking.txt b/requirements_benchmarking.txt
index 4484d13728..cdfbc6e506 100644
--- a/requirements_benchmarking.txt
+++ b/requirements_benchmarking.txt
@@ -4,4 +4,5 @@ nltk
 evaluate
 datasets
 rouge-score
-scikit-learn
\ No newline at end of file
+scikit-learn
+tblib==3.1.0
\ No newline at end of file

From 2c7e75b62e0021f99829cf2d7c2f1a1be8868daf Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Thu, 25 Sep 2025 15:56:04 +0800
Subject: [PATCH 35/38] Fix pytest option spelling in test_accuracy.sh
 (--expected_value -> --expected-value)

---
 tests/e2e/benchmarking/test_accuracy.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/benchmarking/test_accuracy.sh b/tests/e2e/benchmarking/test_accuracy.sh
index 5f003533dd..1bae5455af 100644
--- a/tests/e2e/benchmarking/test_accuracy.sh
+++ b/tests/e2e/benchmarking/test_accuracy.sh
@@ -68,6 +68,6 @@ echo "Running integration for models: $comma_model_list"
 echo "--------------------------------------------------"
 
 # Default action
-python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list" --expected_value="$expected_value"
+python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list" --expected-value="$expected_value"
 
 exit $exit_code
\ No newline at end of file

From 277559eea9b4444e26938aaa4d3bd25b599e3336 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Thu, 25 Sep 2025 16:04:41 +0800
Subject: [PATCH 36/38] Fix expected_value identifier typo in
 test_accuracy.py

---
 scripts/vllm/integration/test_accuracy.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/vllm/integration/test_accuracy.py b/scripts/vllm/integration/test_accuracy.py
index 1a06017fdd..34916a4b0a 100644
--- a/scripts/vllm/integration/test_accuracy.py
+++ b/scripts/vllm/integration/test_accuracy.py
@@ -155,7 +155,7 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch, requ
     expected_values_data = read_expected_value(expected_json_filepath)
 
     # Add expected-value to expected_values_data with model name
-    if expected-value is not None:
+    if expected_value is not None:
         expected_values_data[model] = float(expected_value)
 
     if tp_size is None:
@@ -191,7 +191,7 @@ def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
     expected_values_data = read_expected_value(expected_json_filepath)
 
     # Add expected-value to expected_values_data with model name
-    if expected-value is not None:
+    if expected_value is not None:
         expected_values_data[model] = float(expected_value)
 
     if tp_size is None:

From 597e943d4b8b78ff03dcef5fb9a3dc76661319cd Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Fri, 26 Sep 2025 10:46:59 +0800
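Subject: [PATCH 37/38] Accuracy test: take a single --model-name and a
 required --expected-value; drop the expected-values JSON file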

---
 scripts/vllm/integration/conftest.py      |  23 ++--
 scripts/vllm/integration/test_accuracy.py | 134 ++++------------------
 tests/e2e/benchmarking/test_accuracy.sh   |  42 +++++--
 3 files changed, 61 insertions(+), 138 deletions(-)

diff --git a/scripts/vllm/integration/conftest.py b/scripts/vllm/integration/conftest.py
index 2020620933..612f7a264f 100644
--- a/scripts/vllm/integration/conftest.py
+++ b/scripts/vllm/integration/conftest.py
@@ -9,28 +9,21 @@ def pytest_addoption(parser):
         default=1,
         help="The tensor parallel size to use for the test."
     )
-    parser.addoption(
-        "--expected-values-file",
-        type=str,
-        default=None,
-        help="This is used to specify the JSON file that stores the expected values. " +
-            "The results from running test_accuracy on a GPU will be saved to this file, " +
-            "and when running on a TPU, the results will be read from this file for comparison.")
     parser.addoption(
         "--expected-value",
         type=float,
         default=None,
         help="This value will be used to compare the measure value and determine if the test passes or fails.")
     parser.addoption(
-        "--model-names",
-        action="store",
-        # default="meta-llama/Llama-3.1-8B-Instruct",
+        "--model-name",
+        type=str,
         default=None,
-        help="Comma-separated list of model names to test (e.g., 'model1,model2')"
-    )
+        help=
+        "Model name to test (e.g., 'model1')")
     parser.addoption(
-        "--fp8-kv-model-names",
-        action="store",
+        "--fp8-kv-model-name",
+        type=str,
         default=None,
-        help="Comma-separated list of model names to test fp8-kv (e.g., 'model1,model2')"
+        help=
+        "Model name to test fp8-kv (e.g., 'model1')"
     )
\ No newline at end of file
diff --git a/scripts/vllm/integration/test_accuracy.py b/scripts/vllm/integration/test_accuracy.py
index 34916a4b0a..e88bcf3545 100644
--- a/scripts/vllm/integration/test_accuracy.py
+++ b/scripts/vllm/integration/test_accuracy.py
@@ -28,77 +28,11 @@
 RTOL = 0.03
 _JSON_WRITE_LOCK = threading.Lock()
 
-EXPECTED_VALUES = {
-    "Qwen/Qwen3-1.7B": 0.68,
-    "google/gemma-3-1b-it": 0.25,
-    "meta-llama/Llama-3.1-8B-Instruct": 0.76,
-    "meta-llama/Llama-3.1-70B-Instruct": 0.876,
-}
-
-# Parametrize test cases based on CLI arguments or default values
-def parametrize_by_cli_or_default(metafunc, fixture_name, cli_parameter, default_list):
-    if fixture_name in metafunc.fixturenames:
-        print(f"Checking CLI parameter '{cli_parameter}' for '{fixture_name}'")
-        names_str = metafunc.config.getoption(cli_parameter)
-        if names_str:
-            print(f"Using '{cli_parameter}' parameter for '{fixture_name}'")
-            param_list = [name.strip() for name in names_str.split(',') if name.strip()]
-            metafunc.parametrize(fixture_name, param_list)
-        else:
-            print(f"Using default list for '{fixture_name}'")
-            metafunc.parametrize(fixture_name, default_list)
-
-def pytest_generate_tests(metafunc):
-    parametrize_by_cli_or_default(metafunc, fixture_name="model", cli_parameter="--model-names", default_list=MODEL_NAMES)
-    parametrize_by_cli_or_default(metafunc, fixture_name="fp8_kv_model", cli_parameter="--fp8-kv-model-names", default_list=FP8_KV_MODEL_NAMES)
-
-# Write expected values to json file
-# TBD: To support the functionality of connecting GPU and TPU expected values in the future
-def write_expected_value_to_json(model_name, measured_value, json_filepath):
-    with _JSON_WRITE_LOCK:
-        data = {}
-        try:
-            with open(json_filepath, 'r', encoding='utf-8') as f:
-                data = json.load(f)
-        except (FileNotFoundError, json.JSONDecodeError):
-            print(
-                f"'{json_filepath}' not found or is empty/invalid. A new one will be created."
-            )
-            data = {}
-        
-        data[model_name] = measured_value
-        
-        try:
-            with open(json_filepath, 'w', encoding='utf-8') as f:
-                json.dump(data, f, indent=4)
-            print(
-                f"Successfully updated '{json_filepath}' with the result for {model_name}."
-            )
-        except IOError as e:
-            print(
-                f"Error: Failed to write to file '{json_filepath}'. Reason: {e}"
-            )
-            raise
-
-# Read expected values from json file if exist
-# TBD: To support the functionality of connecting GPU and TPU expected values in the future
-def read_expected_value(expected_json_filepath=None):
-    expected_values_data = {}
-    if expected_json_filepath is None:
-        expected_values_data = EXPECTED_VALUES
-    else:
-        path_obj = Path(expected_json_filepath)
-        # Read expected values from json file if exist
-        if path_obj.is_file() and os.path.getsize(expected_json_filepath) > 0:
-            print(f"\n[Fixture] Loading from: {expected_json_filepath}")
-            with open(expected_json_filepath, 'r', encoding='utf-8') as f:
-                expected_values_data = json.load(f)
-        else:
-            raise FileNotFoundError(f"Expected values file not found: {expected_json_filepath}")
-    return expected_values_data
-
-
-def run_test(model_name, expected_values_data, expected_json_filepath, more_args=None):
+
+def run_test(model_name,
+             expected_value,
+             expected_json_filepath,
+             more_args=None):
     """Run the end to end accuracy test."""
     print(f"Running test for model: {model_name}")
 
@@ -112,51 +46,25 @@ def run_test(model_name, expected_values_data, expected_json_filepath, more_args
         tasks="gsm8k",
         batch_size="auto",
     )
-
-    # Execute default behavior when `expected_json_filepath` is not set.
-    if expected_json_filepath is None:
-        print(f"Execute default behavior")
-        measured_value = results["results"][TASK][FILTER]
-        assert model_name in EXPECTED_VALUES, (
-            f"Cannot find the expected value for the model {model_name=}")
-        expected_value = EXPECTED_VALUES[model_name]
-        assert (measured_value - RTOL < expected_value
-                and measured_value + RTOL > expected_value
-                ), f"Expected: {expected_value} |  Measured: {measured_value}"
-    else:
-        print(f"Execute specific models behavior")
-        measured_value = results["results"][TASK][FILTER]
-        expected_value = expected_values_data.get(model_name)
-
-        # Model expected value not exist, write in file
-        if model_name not in expected_values_data:
-            print(f"Warning: No expected value found for {model_name}. "
-                "Skipping accuracy check.")
-            print(f"Measured value: {measured_value}")
-            write_expected_value_to_json(model_name, measured_value, expected_json_filepath)
-
-        else:
-            print(f"Found expected value! {model_name=}, {measured_value=}, {expected_value=}")
-            assert (measured_value - RTOL < expected_value
-                and measured_value + RTOL > expected_value
-                ), f"Expected: {expected_value} |  Measured: {measured_value}"
+    
+    measured_value = results["results"][TASK][FILTER]
+    assert (measured_value - RTOL < expected_value < measured_value + RTOL
+            ), f"Expected: {expected_value} |  Measured: {measured_value}"
 
 @pytest.mark.skipif(not current_platform.is_cuda()
                     and not current_platform.is_tpu(),
                     reason="V1 is currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch,
+                                    request: pytest.FixtureRequest):
     """Run with the V1 Engine."""
+    model = request.config.getoption("--model-name")
     print(f"Testing model: {model}...")
 
     tp_size = request.config.getoption("--tensor-parallel-size")
-    expected_json_filepath = request.config.getoption("--expected-values-file")
     expected_value = request.config.getoption("--expected-value")
 
-    expected_values_data = read_expected_value(expected_json_filepath)
-
-    # Add expected-value to expected_values_data with model name
-    if expected_value is not None:
-        expected_values_data[model] = float(expected_value)
+    if expected_value is None:
+        raise ValueError
 
     if tp_size is None:
         tp_size = 1
@@ -173,26 +81,24 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch, requ
         
         print(f"common args: {more_args}")
 
-        run_test(model, expected_values_data, expected_json_filepath, more_args)
+        run_test(model, expected_value, expected_json_filepath,
+                 more_args)
 
 
 @pytest.mark.skipif(not current_platform.is_cuda()
                     and not current_platform.is_tpu(),
                     reason="V1 is currently only supported on CUDA and TPU")
 def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
-        fp8_kv_model, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest):
+        monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest):
     """Run with the V1 Engine."""
+    fp8_kv_model = request.config.getoption("--fp8-kv-model-name")
     print(f"Testing fp8_kv_model: {fp8_kv_model}...")
 
     tp_size = request.config.getoption("--tensor-parallel-size")
-    expected_json_filepath = request.config.getoption("--expected-values-file")
     expected_value = request.config.getoption("--expected-value")
 
-    expected_values_data = read_expected_value(expected_json_filepath)
-
-    # Add expected-value to expected_values_data with model name
-    if expected_value is not None:
-        expected_values_data[model] = float(expected_value)
+    if expected_value is None:
+        raise ValueError
 
     if tp_size is None:
         tp_size = 1
diff --git a/tests/e2e/benchmarking/test_accuracy.sh b/tests/e2e/benchmarking/test_accuracy.sh
index 1bae5455af..570e3fce65 100644
--- a/tests/e2e/benchmarking/test_accuracy.sh
+++ b/tests/e2e/benchmarking/test_accuracy.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-model_list="meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.1-70B-Instruct"
+model_name=""
 tensor_parallel_size=1
 expected_value=0
 
@@ -15,9 +15,9 @@ helpFunction()
    echo ""
    echo "Usage: $0 [-r full_path_to_root_dir -m model_id]"
    echo -e "\t-r The path your root directory containing both 'vllm' and 'tpu_commons' (default: /workspace/, which is used in the Dockerfile)"
-   echo -e "\t-m A space-separated list of HuggingFace model ids to use (default: meta-llama/Llama-3.1-8B-Instruct and meta-llama/Llama-3.1-70B-Instruct)"
+   echo -e "\t-m A space-separated list of HuggingFace model ids to use (Required)"
    echo -e "\t-t Tensor parallel size (default: 1)"
-   echo -e "\t-e Excepted value"
+   echo -e "\t-e Excepted value (Required)"
    exit 1
 }
 
@@ -29,7 +29,7 @@ while [[ "$#" -gt 0 ]]; do
             shift
             ;;
         -m|--model)
-            model_list="$2"
+            model_name="$2"
             shift
             shift
             ;;
@@ -53,21 +53,45 @@ while [[ "$#" -gt 0 ]]; do
     esac
 done
 
+# Check if model_name is provided and not empty
+if [[ -z "$model_name" ]]; then
+    echo "Error: Model name (-m) is a required argument." >&2
+    has_error=1
+fi
+
+# Check if tensor_parallel_size is an integer and greater than 0
+if ! [[ "$tensor_parallel_size" =~ ^[1-9][0-9]*$ ]]; then
+    echo "Error: Tensor parallel size (-t) must be an integer greater than 0. Got: '$tensor_parallel_size'" >&2
+    has_error=1
+fi
+
+# Check that expected_value is a plain decimal number greater than 0 (the regex rejects non-numeric input, which awk would otherwise compare as a string)
+if ! awk -v num="$expected_value" 'BEGIN { exit !((num ~ /^[0-9]*\.?[0-9]+$/) && (num + 0 > 0)) }'; then
+    echo "Error: Expected value (-e) must be a number greater than 0. Got: '$expected_value'" >&2
+    has_error=1
+fi
+
+# If any validation failed, print help and exit
+if [[ "$has_error" -ne 0 ]]; then
+    helpFunction
+fi
+
+
 echo "Using the root directory at $root_dir"
-echo "Testing $model_list prompts"
 
 cd "$root_dir"/vllm/tests/entrypoints/llm || exit
 
 # Overwrite a few of the vLLM benchmarking scripts with the TPU Commons ones
 cp "$root_dir"/tpu_commons/scripts/vllm/integration/*.py "$root_dir"/vllm/tests/entrypoints/llm/
 
-comma_model_list=${model_list// /,}
-
 echo "--------------------------------------------------"
-echo "Running integration for models: $comma_model_list"
+echo "Running integration for model: $model_name"
 echo "--------------------------------------------------"
 
 # Default action
-python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine --tensor-parallel-size="$tensor_parallel_size" --model-names="$comma_model_list" --expected-value="$expected_value"
+python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine \
+    --tensor-parallel-size="$tensor_parallel_size" \
+    --model-name="$model_name" \
+    --expected-value="$expected_value"
 
 exit $exit_code
\ No newline at end of file

From cc8f5efa8757073943c81734ecdcbb06576bb8b2 Mon Sep 17 00:00:00 2001
From: StingLin <sting.lin@cienet.com>
Date: Fri, 26 Sep 2025 10:55:19 +0800
Subject: [PATCH 38/38] Drop unused expected_json_filepath argument from run_test

---
 scripts/vllm/integration/test_accuracy.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/scripts/vllm/integration/test_accuracy.py b/scripts/vllm/integration/test_accuracy.py
index e88bcf3545..d0511dfdd9 100644
--- a/scripts/vllm/integration/test_accuracy.py
+++ b/scripts/vllm/integration/test_accuracy.py
@@ -31,7 +31,6 @@
 
 def run_test(model_name,
              expected_value,
-             expected_json_filepath,
              more_args=None):
     """Run the end to end accuracy test."""
     print(f"Running test for model: {model_name}")
@@ -81,8 +80,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch,
         
         print(f"common args: {more_args}")
 
-        run_test(model, expected_value, expected_json_filepath,
-                 more_args)
+        run_test(model, expected_value, more_args)
 
 
 @pytest.mark.skipif(not current_platform.is_cuda()
@@ -116,4 +114,4 @@ def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
             
         print(f"common args: {more_args}")
 
-        run_test(fp8_kv_model, expected_values_data, expected_json_filepath, more_args)
\ No newline at end of file
+        run_test(fp8_kv_model, expected_value, more_args)
\ No newline at end of file