macrodata-labs · Hynek Kydlíček (hynky1999) · Jun 2, 2026 · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
@@ -8,6 +8,7 @@ run_all_patterns:
   - "CMakeLists.txt"
   - "requirements/common.txt"
   - "requirements/cuda.txt"
+  - "requirements/kv_connectors.txt"
   - "requirements/build/cuda.txt"
   - "requirements/test/cuda.txt"
   - "setup.py"

diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml
@@ -17,6 +17,26 @@ steps:
       --target test
       --no-cache
       --progress plain .
+    - |  # offline (--network=none) smoke test of the built image; runs before the docker push below
+      docker run --rm --network=none --entrypoint /bin/bash "rocm/vllm-ci:${BUILDKITE_COMMIT}" -ec '
+        if [ ! -d /vllm-workspace ]; then echo Missing directory: /vllm-workspace >&2; exit 1; fi
+        if [ ! -d /vllm-workspace/tests ]; then echo Missing directory: /vllm-workspace/tests >&2; exit 1; fi
+        if [ ! -d /vllm-workspace/src/vllm ]; then echo Missing directory: /vllm-workspace/src/vllm >&2; exit 1; fi
+        if [ ! -x /vllm-workspace/src/vllm/vllm-rs ]; then echo Missing executable: /vllm-workspace/src/vllm/vllm-rs >&2; exit 1; fi
+        command -v python3
+        command -v uv
+        command -v pytest
+        if ! command -v amd-smi >/dev/null 2>&1 && ! command -v rocminfo >/dev/null 2>&1; then
+          echo No ROCm CLI found in image >&2
+          exit 1
+        fi
+        python3 - <<PY
+      import torch, vllm
+      print(torch.__version__)
+      print(vllm.__version__)
+      PY
+        echo AMD image smoke OK
+      '
     - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
     env:
       DOCKER_BUILDKIT: "1"
diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml
@@ -12,15 +12,19 @@ steps:
   - vllm/_custom_ops.py
   - tests/kernels/attention/test_cpu_attn.py
   - tests/kernels/moe/test_cpu_fused_moe.py
+  - tests/kernels/moe/test_cpu_quant_fused_moe.py
   - tests/kernels/test_onednn.py
   - tests/kernels/test_awq_int4_to_int8.py
+  - tests/kernels/quantization/test_cpu_fp8_scaled_mm.py
   commands:
     - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
       pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
       pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
+      pytest -x -v -s tests/kernels/moe/test_cpu_quant_fused_moe.py
       pytest -x -v -s tests/kernels/test_onednn.py
-      pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py"
+      pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py
+      pytest -x -v -s tests/kernels/quantization/test_cpu_fp8_scaled_mm.py"
 
 - label: CPU-Compatibility Tests
   depends_on: []
@@ -50,30 +54,49 @@ steps:
       pytest -x -v -s tests/models/language/generation -m cpu_model
       pytest -x -v -s tests/models/language/pooling -m cpu_model"
 
+- label: CPU-ModelRunnerV2 Tests
+  depends_on: []
+  device: intel_cpu
+  no_plugin: true
+  soft_fail: true  # a non-zero exit is tolerated (Buildkite soft_fail); NOTE(review): reason not stated here
+  source_file_dependencies:
+  - vllm/v1/worker/cpu/
+  - vllm/v1/worker/gpu/
+  - vllm/v1/sample/ops/topk_topp_triton.py
+  - vllm/v1/sample/ops/topk_topp_sampler.py
+  - tests/v1/sample/test_topk_topp_sampler.py
+  commands:
+    - |  # installs triton-cpu from a pinned git commit before running the tests
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
+      uv pip install git+https://github.com/triton-lang/triton-cpu.git@270e696d
+      VLLM_USE_V2_MODEL_RUNNER=1 pytest -x -v -s tests/models/language/generation/test_granite.py -m cpu_model
+      # TODO: move to CPU-Kernel Tests once triton-cpu has a pre-built wheel
+      pytest -x -v -s tests/v1/sample/test_topk_topp_sampler.py::TestTritonTopkTopp"
+
 - label: CPU-Quantization Model Tests
   depends_on: []
   device: intel_cpu
   no_plugin: true
   source_file_dependencies:
   - csrc/cpu/
-  - vllm/model_executor/layers/quantization/cpu_wna16.py
-  - vllm/model_executor/layers/quantization/gptq_marlin.py
+  - vllm/model_executor/layers/quantization/auto_gptq.py
   - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
-  - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
-  - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
+  - vllm/model_executor/kernels/linear/mixed_precision/cpu.py
+  - vllm/model_executor/kernels/linear/scaled_mm/cpu.py
+  - vllm/model_executor/layers/fused_moe/experts/cpu_moe.py
   - tests/quantization/test_compressed_tensors.py
   - tests/quantization/test_cpu_wna16.py
   commands:
     - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
       pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
       pytest -x -v -s tests/quantization/test_cpu_wna16.py"
 
-- label: CPU-Distributed Tests
+- label: CPU-Distributed Tests (PP+TP)
   depends_on: []
   device: intel_cpu
   no_plugin: true
-  source_file_dependencies:
+  source_file_dependencies: &cpu_distributed_deps  # anchor: this list is reused by the DP+TP step below
   - csrc/cpu/shm.cpp
   - vllm/v1/worker/cpu_worker.py
   - vllm/v1/worker/gpu_worker.py
@@ -82,10 +105,21 @@ steps:
   - vllm/platforms/cpu.py
   - vllm/distributed/parallel_state.py
   - vllm/distributed/device_communicators/cpu_communicator.py
+  - .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh tp_pp"
+
+- label: CPU-Distributed Tests (DP+TP)
+  depends_on: []
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies: *cpu_distributed_deps  # alias: same dependency list as the PP+TP step above
   commands:
     - |
       bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
-      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
+      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh dp_tp"
 
 - label: CPU-Multi-Modal Model Tests %N
   depends_on: []

diff --git a/.buildkite/hardware_tests/intel.yaml b/.buildkite/hardware_tests/intel.yaml
@@ -8,10 +8,3 @@ steps:
     commands: 
     - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
 
-  - label: "Intel GPU Test"
-    depends_on: []
-    soft_fail: true
-    device: intel_gpu
-    no_plugin: true
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
@@ -92,8 +92,8 @@ check_and_skip_if_image_exists() {
 }
 
 ecr_login() {
-    aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
-    aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+    aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
+    aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true
 }
 
 prepare_cache_tags() {
@@ -192,6 +192,7 @@ export BUILDKITE_COMMIT
 export PARENT_COMMIT
 export IMAGE_TAG
 export IMAGE_TAG_LATEST
+export COMMIT="${COMMIT:-${BUILDKITE_COMMIT}}"
 export CACHE_FROM
 export CACHE_FROM_BASE_BRANCH
 export CACHE_FROM_MAIN

diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
@@ -13,6 +13,60 @@ steps:
         - exit_status: -10  # Agent was lost
           limit: 2
 
+  - label: ":docker: :smoking: Non-root smoke tests"
+    key: image-build-smoke-test
+    depends_on:
+      - image-build
+    commands:
+    # Smoke 1: the default (root) image must still be importable
+    # under a non-root UID via `--user 2000:0`. Validates the `vllm` passwd
+    # entry + group-0-writable /home/vllm + uv path cleanup from #31959.
+    # Uses `import vllm` rather than `vllm serve --help` because the latter
+    # instantiates `VllmConfig` which requires a GPU attached to the
+    # container.
+    - docker run --rm --user 2000:0 --entrypoint python3 "$IMAGE_TAG" -c "import vllm; print(vllm.__version__)"
+    # Smoke 2: assert the non-root enabling invariants are baked
+    # into the image. Runs as UID 2000:0 via a shell so we can verify
+    # filesystem perms + passwd/group file state + wrapper presence without
+    # triggering vLLM's GPU-requiring config-init path. The opt-in
+    # `vllm-openai-nonroot` target adds only `USER vllm`, `WORKDIR
+    # /home/vllm`, and an `ENTRYPOINT` override on top of these invariants;
+    # its build correctness is reviewed at the Dockerfile level. Wrapper
+    # logic is covered separately by the pre-commit hook
+    # `test-nonroot-entrypoint` (see .pre-commit-config.yaml).
+    - |  # NOTE: no bare "a && b" checks below; sh -e ignores a failing non-final command of an && list
+      docker run --rm --user 2000:0 --entrypoint /bin/sh "$IMAGE_TAG" -ec '
+        if ! getent passwd 2000 | grep -q ^vllm:; then
+          echo FAIL: UID 2000 != vllm
+          exit 1
+        fi
+        if ! id -gn 2>/dev/null | grep -qx root; then
+          echo FAIL: GID 0 not root group
+          exit 1
+        fi
+        { touch /home/vllm/.smoke && rm /home/vllm/.smoke; } || { echo FAIL: /home/vllm not writable; exit 1; }
+        { touch /opt/uv/cache/.smoke && rm /opt/uv/cache/.smoke; } || { echo FAIL: /opt/uv/cache not writable; exit 1; }
+        if ! test -x /usr/local/bin/vllm-nonroot-entrypoint.sh; then
+          echo FAIL: wrapper missing
+          exit 1
+        fi
+        if ! test -w /etc/passwd; then
+          echo FAIL: /etc/passwd not group-writable
+          exit 1
+        fi
+        if ! test -w /etc/group; then
+          echo FAIL: /etc/group not group-writable
+          exit 1
+        fi
+        echo non-root invariants OK
+      '
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
   - label: ":docker: Build CPU image"
     key: image-build-cpu
     depends_on: []
@@ -56,3 +110,21 @@ steps:
           limit: 2
         - exit_status: -10  # Agent was lost
           limit: 2
+
+  - label: ":docker: Build arm64 image"
+    key: arm64-image-build
+    depends_on: []
+    source_file_dependencies:
+      - ".buildkite/image_build/image_build.yaml"
+      - ".buildkite/image_build/image_build_arm64.sh"
+      - "docker/Dockerfile"
+    commands:
+    - .buildkite/image_build/image_build_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT  # positional args: <registry> <repo> <commit>
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
diff --git a/.buildkite/image_build/image_build_arm64.sh b/.buildkite/image_build/image_build_arm64.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# Log in to the registry via ECR Public; "|| true" keeps a failed login from aborting the script under set -e.
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
+
+# Skip the build when "docker manifest inspect" already returns a manifest for the <commit>-arm64 tag.
+if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# Build the "test" target for linux/arm64 (Grace/GH200 is the arm64 GPU target; sm_90, hence arch list 9.0).
+docker build --file docker/Dockerfile \
+  --platform linux/arm64 \
+  --build-arg max_jobs=16 \
+  --build-arg nvcc_threads=4 \
+  --build-arg torch_cuda_arch_list="9.0" \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
+  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64 \
+  --target test \
+  --progress plain .
+
+# Push the <commit>-arm64 tag that was just built.
+docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64
diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh
@@ -11,7 +11,7 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
 
 # skip build if image already exists
 if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then

diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -11,7 +11,7 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
 
 # skip build if image already exists
 if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then

diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh
@@ -11,7 +11,7 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
 
 # skip build if image already exists
 if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then

diff --git a/.buildkite/image_build/image_build_torch_nightly.sh b/.buildkite/image_build/image_build_torch_nightly.sh
@@ -46,7 +46,7 @@ echo "Image not found, proceeding with build..."
 
 # --- CUDA 13.0 for nightly builds ---
 # Nightly CI uses CUDA 13.0 while regular CI stays on CUDA 12.9
-NIGHTLY_CUDA_VERSION="13.0.0"
+NIGHTLY_CUDA_VERSION="13.0.2"
 NIGHTLY_BUILD_BASE_IMAGE="nvidia/cuda:${NIGHTLY_CUDA_VERSION}-devel-ubuntu22.04"
 NIGHTLY_FINAL_BASE_IMAGE="nvidia/cuda:${NIGHTLY_CUDA_VERSION}-base-ubuntu22.04"
 

diff --git a/.buildkite/image_build/image_build_xpu.sh b/.buildkite/image_build/image_build_xpu.sh
@@ -11,8 +11,8 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
-aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true
 
 # skip build if image already exists
 if ! docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu &> /dev/null; then

diff --git a/.buildkite/intel_jobs/engine_intel.yaml b/.buildkite/intel_jobs/engine_intel.yaml
@@ -0,0 +1,21 @@
+group: Engine Intel
+depends_on:  # group-level: wait for the step keyed image-build-xpu
+  - image-build-xpu
+steps:
+- label: Engine (1 GPU)
+  timeout_in_minutes: 30
+  device: intel_gpu
+  no_plugin: true
+  working_dir: "."
+  env:
+    REGISTRY: "public.ecr.aws/q9t5s3a7"
+    REPO: "vllm-ci-test-repo"
+    VLLM_TEST_DEVICE: "xpu"
+  source_file_dependencies:
+    - vllm/v1/engine/
+    - tests/v1/engine/
+  commands:
+    - >-  # folded scalar: the lines below are joined with spaces into one shell command
+      bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+      'cd tests &&
+      pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py'
diff --git a/.buildkite/intel_jobs/kernels_intel.yaml b/.buildkite/intel_jobs/kernels_intel.yaml
@@ -0,0 +1,22 @@
+group: Kernels Intel
+depends_on:  # group-level: wait for the step keyed image-build-xpu
+  - image-build-xpu
+steps:
+- label: vLLM IR Tests
+  timeout_in_minutes: 30
+  device: intel_gpu
+  no_plugin: true
+  working_dir: "."
+  env:
+    REGISTRY: "public.ecr.aws/q9t5s3a7"
+    REPO: "vllm-ci-test-repo"
+    VLLM_TEST_DEVICE: "xpu"
+  source_file_dependencies:
+    - vllm/ir
+    - vllm/kernels
+    - tests/kernels/ir  # the tests this step runs; edits to them should re-trigger it
+  commands:
+    - >-  # folded scalar: the lines below are joined with spaces into one shell command
+      bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+      'cd tests &&
+      pytest -v -s kernels/ir'