lcskrishna · lcskrishna · May 7, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml
@@ -14,13 +14,15 @@ steps:
   - tests/kernels/moe/test_cpu_fused_moe.py
   - tests/kernels/test_onednn.py
   - tests/kernels/test_awq_int4_to_int8.py
+  - tests/kernels/quantization/test_cpu_fp8_scaled_mm.py
   commands:
     - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
       pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
       pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
       pytest -x -v -s tests/kernels/test_onednn.py
-      pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py"
+      pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py
+      pytest -x -v -s tests/kernels/quantization/test_cpu_fp8_scaled_mm.py"
 
 - label: CPU-Compatibility Tests
   depends_on: []
@@ -69,11 +71,11 @@ steps:
       pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
       pytest -x -v -s tests/quantization/test_cpu_wna16.py"
 
-- label: CPU-Distributed Tests
+- label: CPU-Distributed Tests (PP+TP)
   depends_on: []
   device: intel_cpu
   no_plugin: true
-  source_file_dependencies:
+  source_file_dependencies: &cpu_distributed_deps
   - csrc/cpu/shm.cpp
   - vllm/v1/worker/cpu_worker.py
   - vllm/v1/worker/gpu_worker.py
@@ -82,10 +84,21 @@ steps:
   - vllm/platforms/cpu.py
   - vllm/distributed/parallel_state.py
   - vllm/distributed/device_communicators/cpu_communicator.py
+  - .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh tp_pp"
+
+- label: CPU-Distributed Tests (DP+TP)
+  depends_on: []
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies: *cpu_distributed_deps
   commands:
     - |
       bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
-      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
+      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh dp_tp"
 
 - label: CPU-Multi-Modal Model Tests %N
   depends_on: []

diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
@@ -192,6 +192,7 @@ export BUILDKITE_COMMIT
 export PARENT_COMMIT
 export IMAGE_TAG
 export IMAGE_TAG_LATEST
+export COMMIT="${COMMIT:-${BUILDKITE_COMMIT}}"
 export CACHE_FROM
 export CACHE_FROM_BASE_BRANCH
 export CACHE_FROM_MAIN

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
@@ -8,8 +8,6 @@ if [ -z "${RELEASE_VERSION}" ]; then
   RELEASE_VERSION="1.0.0.dev"
 fi
 
-ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
-
 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel (by commit):
 \`\`\`
@@ -25,95 +23,5 @@ aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-
 aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
 \`\`\`
 
-
-To download and upload the image:
-
-\`\`\`
-# Download images:
-
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu129
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu129
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
-docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
-docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
-
-# Tag and push images:
-
-## CUDA
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
-docker push vllm/vllm-openai:latest-x86_64
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu129 vllm/vllm-openai:x86_64-cu129
-docker tag vllm/vllm-openai:x86_64-cu129 vllm/vllm-openai:latest-x86_64-cu129
-docker tag vllm/vllm-openai:x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129
-docker push vllm/vllm-openai:latest-x86_64-cu129
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
-docker push vllm/vllm-openai:latest-aarch64
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu129 vllm/vllm-openai:aarch64-cu129
-docker tag vllm/vllm-openai:aarch64-cu129 vllm/vllm-openai:latest-aarch64-cu129
-docker tag vllm/vllm-openai:aarch64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
-docker push vllm/vllm-openai:latest-aarch64-cu129
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
-
-## ROCm
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-docker push vllm/vllm-openai-rocm:latest
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-docker push vllm/vllm-openai-rocm:latest-base
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-
-## CPU
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
-docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
-docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
-docker push vllm/vllm-openai-cpu:latest-x86_64
-docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
-docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
-docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-docker push vllm/vllm-openai-cpu:latest-arm64
-docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-
-# Create multi-arch manifest:
-
-docker manifest rm vllm/vllm-openai:latest
-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
-docker manifest push vllm/vllm-openai:latest
-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
-
-docker manifest rm vllm/vllm-openai:latest-cu129
-docker manifest create vllm/vllm-openai:latest-cu129 vllm/vllm-openai:latest-x86_64-cu129 vllm/vllm-openai:latest-aarch64-cu129
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
-docker manifest push vllm/vllm-openai:latest-cu129
-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129
-
-docker manifest rm vllm/vllm-openai-cpu:latest || true
-docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
-docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-docker manifest push vllm/vllm-openai-cpu:latest
-docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
-\`\`\`
+Docker images are published automatically by the "Publish release images to DockerHub" pipeline step.
 EOF
diff --git a/.buildkite/scripts/ci-fetch-log.sh b/.buildkite/scripts/ci-fetch-log.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Usage: ./ci-fetch-log.sh <buildkite_job_url> [output_file]
+#        ./ci-fetch-log.sh <build_number> <job_uuid> [output_file]
+#
+# Downloads the raw log for a Buildkite job from the public, unauthenticated
+# /organizations/<org>/pipelines/<pipeline>/builds/<n>/jobs/<uuid>/download
+# endpoint, then strips ANSI/timestamps via ci-clean-log.sh.
+#
+# Find <build_number> and <job_uuid> via:
+#   gh pr checks <PR> --repo vllm-project/vllm
+# Each failing row's URL is .../builds/<build_number>#<job_uuid>.
+
+set -euo pipefail
+
+ORG="vllm"
+PIPELINE="ci"
+
+usage() {  # Print usage to stderr (stdout is reserved for the output path) and exit 1.
+    echo "Usage: $0 <buildkite_job_url> [output_file]" >&2
+    echo "       $0 <build_number> <job_uuid> [output_file]" >&2
+    exit 1
+}
+
+if [ $# -lt 1 ]; then usage; fi
+
+if [[ "$1" == https://* ]]; then  # URL form: pull build number and job UUID out of the URL
+    BUILD=$(echo "$1" | sed -nE 's#.*/builds/([0-9]+).*#\1#p')
+    JOB=$(echo "$1" | grep -oE '[0-9a-f]{8}-[0-9a-f-]+' | head -n 1 || true)  # no match must not trip errexit/pipefail; the empty check below reports it
+    OUT="${2:-ci-${BUILD}-${JOB:0:8}.log}"
+else  # positional form: <build_number> <job_uuid> [output_file]
+    if [ $# -lt 2 ]; then usage; fi
+    BUILD="$1"
+    JOB="$2"
+    OUT="${3:-ci-${BUILD}-${JOB:0:8}.log}"
+fi
+
+if [ -z "$BUILD" ] || [ -z "$JOB" ]; then
+    echo "Could not parse build number or job UUID from: $1" >&2
+    usage
+fi
+
+COOKIES=$(mktemp)
+trap 'rm -f "$COOKIES"' EXIT
+
+# Buildkite issues a session cookie on first hit; subsequent /download needs it.
+curl -fsSL -c "$COOKIES" -A "vllm-ci-fetch-log" \
+    "https://buildkite.com/${ORG}/${PIPELINE}/builds/${BUILD}" -o /dev/null
+
+curl -fsSL -b "$COOKIES" -A "vllm-ci-fetch-log" \
+    "https://buildkite.com/organizations/${ORG}/pipelines/${PIPELINE}/builds/${BUILD}/jobs/${JOB}/download" \
+    -o "$OUT"
+
+bash "$(dirname "$0")/ci-clean-log.sh" "$OUT"
+
+echo "$OUT"
diff --git a/.buildkite/scripts/detect-manylinux-tag.py b/.buildkite/scripts/detect-manylinux-tag.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Detect the manylinux platform tag for a wheel and rename it in place.
+
+vLLM's build images produce wheels with the generic ``linux_<arch>`` platform
+tag, which installers like ``pip`` won't accept off PyPI/our index. We need to
+rewrite the platform tag to the appropriate ``manylinux_<major>_<minor>_<arch>``
+before uploading.
+
+Historically the tag was hard-coded per build (``manylinux_2_31`` for the
+Ubuntu 20.04-based image, ``manylinux_2_35`` for the Ubuntu 22.04-based
+images). That is brittle: bumping the base image silently produces wheels
+labelled with the wrong glibc requirement. This script asks ``auditwheel``
+to derive the tag from the symbol versions actually referenced by the
+binaries inside the wheel, so the label tracks reality.
+
+We can't simply call ``auditwheel repair`` -- it tries to graft external
+shared libraries into the wheel and fails on vLLM's CUDA/cuBLAS dependencies.
+Instead we use ``auditwheel.wheel_abi.analyze_wheel_abi`` directly, which is
+the same call that powers ``auditwheel show``, and read off
+``winfo.sym_policy.name``.
+
+Usage:
+    detect-manylinux-tag.py <wheel_path>
+
+The wheel is renamed in place; the new path is printed on stdout. All
+diagnostics go to stderr so callers can capture stdout safely.
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+from auditwheel.error import (
+    AuditwheelError,
+    NonPlatformWheelError,
+    WheelToolsError,
+)
+from auditwheel.wheel_abi import analyze_wheel_abi
+from auditwheel.wheeltools import get_wheel_architecture, get_wheel_libc
+
+
+def detect_platform_tag(wheel_path: Path) -> str:
+    """Return the most precise platform tag the wheel is consistent with.
+
+    Mirrors ``auditwheel show`` but returns ``sym_policy`` rather than
+    ``overall_policy``: we only care about the glibc symbol versions used,
+    not about other policy axes (ISA extensions, blacklist, etc.) that
+    ``overall_policy`` folds in.
+    """
+    fn = wheel_path.name
+
+    try:
+        arch = get_wheel_architecture(fn)
+    except (WheelToolsError, NonPlatformWheelError):
+        # Architecture isn't deducible from the filename; let auditwheel
+        # infer it from the ELF binaries inside the wheel.
+        arch = None
+
+    try:
+        libc = get_wheel_libc(fn)
+    except WheelToolsError:
+        # An unrepaired wheel uses ``linux_<arch>``, which doesn't encode
+        # libc. Let auditwheel infer it from the ELF binaries.
+        libc = None
+
+    winfo = analyze_wheel_abi(
+        libc,
+        arch,
+        wheel_path,
+        frozenset(),
+        disable_isa_ext_check=False,
+        allow_graft=False,
+    )
+    return winfo.sym_policy.name
+
+
+def rename_wheel(wheel_path: Path, new_platform_tag: str) -> Path:
+    """Rename the wheel in place, replacing only its platform tag."""
+    # Wheel filename per PEP 427:
+    #   {distribution}-{version}(-{build})?-{python}-{abi}-{platform}.whl
+    # The platform tag is always the last ``-``-separated token before
+    # ``.whl``. Compound tags like ``manylinux_2_31_x86_64`` use ``_`` as the
+    # internal separator, so ``-``-splitting is unambiguous.
+    parts = wheel_path.stem.split("-")
+    if len(parts) < 5:
+        raise ValueError(f"Unrecognised wheel filename: {wheel_path.name}")
+    parts[-1] = new_platform_tag
+    new_path = wheel_path.with_name("-".join(parts) + ".whl")
+    if new_path != wheel_path:
+        wheel_path.rename(new_path)
+    return new_path
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Detect a wheel's manylinux platform tag with "
+        "auditwheel and rename the wheel in place."
+    )
+    parser.add_argument(
+        "wheel",
+        type=Path,
+        help="Path to the wheel to inspect and rename.",
+    )
+    args = parser.parse_args()
+
+    wheel_path: Path = args.wheel
+    if not wheel_path.is_file():
+        print(f"error: {wheel_path} is not a file", file=sys.stderr)
+        return 1
+
+    # Catch the things that ``analyze_wheel_abi`` and ``rename_wheel`` can
+    # raise: any subclass of ``AuditwheelError`` (pure-Python wheels,
+    # invalid libc, malformed wheels), filesystem errors, or our own
+    # ``ValueError`` for an unrecognised wheel filename. Print a single
+    # ``ERROR_TYPE: message`` line to stderr instead of a Python
+    # traceback, which is much friendlier in CI logs.
+    try:
+        new_tag = detect_platform_tag(wheel_path)
+        print(f"detected platform tag: {new_tag}", file=sys.stderr)
+        new_path = rename_wheel(wheel_path, new_tag)
+    except (AuditwheelError, ValueError, OSError) as e:
+        print(
+            f"error: failed to retag {wheel_path.name}: {type(e).__name__}: {e}",
+            file=sys.stderr,
+        )
+        return 2
+
+    if new_path != wheel_path:
+        print(f"renamed {wheel_path.name} -> {new_path.name}", file=sys.stderr)
+    else:
+        print(f"wheel already tagged {new_tag}", file=sys.stderr)
+
+    print(new_path)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())