vllm-project
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
Lines changed: 21 additions & 2 deletions
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
Lines changed: 0 additions & 12 deletions
diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt
Lines changed: 0 additions & 1 deletion
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
Lines changed: 5 additions & 0 deletions
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
Lines changed: 6 additions & 1 deletion
diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh
Lines changed: 1 addition & 1 deletion
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
Lines changed: 13 additions & 2 deletions
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
Lines changed: 14 additions & 4 deletions
diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
Lines changed: 0 additions & 89 deletions
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
Lines changed: 0 additions & 111 deletions
@@ -8,7 +8,8 @@
 <html>
     <body>
     <h1>Links for vLLM</h1/>
-        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
     </body>
 </html>
 """
@@ -21,7 +22,25 @@
 
 with open("index.html", "w") as f:
     print(f"Generated index.html for {args.wheel}")
+    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
+    if "x86_64" in filename:
+        x86_wheel = filename
+        arm_wheel = filename.replace("x86_64", "aarch64").replace(
+            "manylinux1", "manylinux2014"
+        )
+    elif "aarch64" in filename:
+        x86_wheel = filename.replace("aarch64", "x86_64").replace(
+            "manylinux2014", "manylinux1"
+        )
+        arm_wheel = filename
+    else:
+        raise ValueError(f"Unsupported wheel: {filename}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+        template.format(
+            x86_wheel=x86_wheel,
+            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
+            arm_wheel=arm_wheel,
+            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
+        )
     )
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
-Meta-Llama-3-8B-QQQ.yaml
@@ -27,7 +27,12 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  - block: "Build CUDA 12.6 wheel"
+    key: block-build-cu126-wheel
+    depends_on: ~
+
   - label: "Build wheel - CUDA 12.6"
+    depends_on: block-build-cu126-wheel
     id: build-wheel-cuda-12-6
     agents:
       queue: cpu_queue_postmerge
 
@@ -46,6 +46,11 @@ function cpu_tests() {
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
+  # Run kernel tests
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -v -s tests/kernels/test_onednn.py"
+
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
@@ -99,4 +104,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
@@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then
   # Remove dangling images (those that are not tagged and not used by any container)
   docker image prune -f
   # Remove unused volumes / force the system prune for old images as well.
-  docker volume prune -f && docker system prune --force --filter "until=72h" --all
+  docker volume prune -f && docker system prune --force --filter "until=24h" --all
   echo "Docker images and volumes cleanup completed."
 else
   echo "Disk usage is below $threshold%. No cleanup needed."
 
@@ -14,8 +14,19 @@ fi
 # Get the single wheel file
 wheel="${wheel_files[0]}"
 
-# Rename 'linux' to 'manylinux1' in the wheel filename
-new_wheel="${wheel/linux/manylinux1}"
+# Detect architecture and rename 'linux' to appropriate manylinux version
+arch=$(uname -m)
+if [[ $arch == "x86_64" ]]; then
+    manylinux_version="manylinux1"
+elif [[ $arch == "aarch64" ]]; then
+    manylinux_version="manylinux2014"
+else
+    echo "Warning: Unknown architecture $arch, using manylinux1 as default"
+    manylinux_version="manylinux1"
+fi
+
+# Rename 'linux' to the appropriate manylinux version in the wheel filename
+new_wheel="${wheel/linux/$manylinux_version}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
 
 
@@ -328,6 +328,7 @@ steps:
     - pytest -v -s compile/test_sequence_parallelism.py
     - pytest -v -s compile/test_async_tp.py
     - pytest -v -s compile/test_fusion_all_reduce.py
+    - pytest -v -s compile/test_decorator.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min
   mirror_hardwares: [amdexperimental]
@@ -341,6 +342,7 @@ steps:
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
   - pytest -v -s compile/piecewise/test_full_cudagraph.py
+  - pytest -v -s compile/piecewise/test_multiple_graphs.py
 
 - label: PyTorch Fullgraph Test # 18min
   mirror_hardwares: [amdexperimental]
@@ -543,6 +545,15 @@ steps:
   commands:
     - pytest -v -s models/language/pooling -m 'not core_model'
 
+- label: Multi-Modal Processor Test
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+    - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+
 - label: Multi-Modal Models Test (Standard)
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
@@ -552,9 +563,7 @@ steps:
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pip freeze | grep -E 'torch'
-    - pytest -v -s models/multimodal/processing
-    - pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/test_tensor_schema.py models/multimodal -m core_model
-    - pytest -v -s models/multimodal/test_tensor_schema.py -m core_model  # Needs mp_method="spawn"
+    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
     - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 
 - label: Multi-Modal Models Test (Extended) 1
@@ -565,7 +574,7 @@ steps:
   - tests/models/multimodal
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
+    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
 
 - label: Multi-Modal Models Test (Extended) 2
   mirror_hardwares: [amdexperimental]
@@ -646,6 +655,7 @@ steps:
     - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
     - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
     - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
     # Fusion
     - pytest -v -s tests/compile/test_fusion_all_reduce.py
     - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern