Merged
136 commits
e971780
[Bugfix][ROCm] Fix Unsupported attention metadata type for speculativ…
vllmellm Jan 6, 2026
7101e08
[Models]: Use `MMEncoderAttention` for MoonViT (#31738)
Isotr0py Jan 6, 2026
ee2e69d
[Bugfix][CI/Build] Fix failing pooling models test due to Triton kern…
Isotr0py Jan 6, 2026
97ca4c3
[Chore] Remove more V0 dead code from `sequence.py` (#31783)
DarkLight1337 Jan 6, 2026
799b572
[cpu][bench] Add CPU paged attention benchmarks (#31720)
fadara01 Jan 6, 2026
db31832
[Misc] Use `deprecated` for `seed_everything` (#31780)
DarkLight1337 Jan 6, 2026
43d384b
[CI] Increase the MTEB_EMBED_TOL threshold to 5e-4. (#31797)
noooop Jan 6, 2026
6ebb66c
[Doc] Fix format of multimodal_inputs.md (#31800)
BlankRH Jan 6, 2026
14df02b
[Chore] Cleanup `mem_utils.py` (#31793)
DarkLight1337 Jan 6, 2026
e0327c9
[Attention][1/n] Remove usage of deprecated `seq_lens_cpu` and `num_c…
LucasWilkinson Jan 6, 2026
bf0f3a4
[Bugfix] Fix torch.compile error for DP + MoE on CPU Backend (#31650)
kzwrime Jan 6, 2026
6444824
[Misc] Implement `TokenizerLike.convert_tokens_to_ids` (#31796)
DarkLight1337 Jan 6, 2026
2c1a4f2
[Bugfix]: avoid overriding audio/text kwargs (Qwen3-Omni) (#31790)
Jzz1943 Jan 6, 2026
0202971
[Frontend] Support GLM-4.5 / GLM-4.7 with enable_thinking: false (#31…
chaunceyjiang Jan 6, 2026
96860af
[Model] rename use_pad_token to use_sep_token (#31784)
noooop Jan 6, 2026
cbd4690
[LoRA]Disable linear LoRA kernel PDL (#31777)
jeejeelee Jan 6, 2026
02809af
[Bugfix]: Fix cross attention backend selection for Turing GPU (#31806)
Isotr0py Jan 6, 2026
d3e477c
[MoE Refactor] Add Temporary Integration Tests - H100/B200 (#31759)
robertgshaw2-redhat Jan 6, 2026
af8fd73
[MoE Refactor][14/N] Clean Up FI Quant Config Smuggling (#31593)
robertgshaw2-redhat Jan 6, 2026
28c9477
[NemotronH] Use ReplicatedLinear for fc1_latent_proj (#31807)
roikoren755 Jan 6, 2026
2f4bdee
[Quantization][MoE] remove unused ep logic from moe marlin (#31571)
jinzhen-lin Jan 6, 2026
4c73be1
[Attention][2/n] Remove usage of deprecated `seq_lens_cpu` and `num_c…
LucasWilkinson Jan 6, 2026
22dffca
[PERF] Speed-up of GDN attention decode part (Qwen3-Next) (#31722)
vadiklyutiy Jan 6, 2026
6f5e653
[Log] add log about gpu worker init snapshot and requested memory (#2…
andyxning Jan 6, 2026
142c4d1
make 500: InternalServerError more informative (#20610)
guicho271828 Jan 6, 2026
4e67a8f
[Bugfix] Fix GLM-4 MoE router logits dtype for data parallel chunking…
ReinforcedKnowledge Jan 6, 2026
f7008ce
[Perf] Async Scheduling + Speculative Decoding + Structured Outputs (…
benchislett Jan 6, 2026
c071636
[ROCm][CI] Fix tests/compile unit tests (#28895)
charlifu Jan 6, 2026
8becf14
[Quantization][Refactor] Move CPU GPTQ kernel into MP linear (#31801)
bigPYJ1151 Jan 6, 2026
ada6f91
Fix RecursionError in MediaWithBytes unpickling (#31191)
nrghosh Jan 6, 2026
dba9537
Report error log after vllm bench serve (#31808)
elvircrn Jan 6, 2026
d498997
[Spec Decode][UX] Add acceptance stats to `vllm bench serve` report (…
MatthewBonanni Jan 6, 2026
2a42ae7
[ROCm][CI] Fix ModernBERT token classification test numerical accurac…
AndreasKaratzas Jan 6, 2026
e5d427e
[ROCm][CI] Pinning timm lib version to fix ImportError in Multi-Modal…
AndreasKaratzas Jan 6, 2026
309a8f6
[Bugfix] Handle mistral tokenizer in get_hf_processor (#31817)
DarkLight1337 Jan 6, 2026
9a1d20a
[CI] Add warmup run in test_fusion_attn (#31183)
angelayi Jan 7, 2026
364a8bc
[ROCm][CI] Fix plugin tests (2 GPUs) failures on ROCm and removing `V…
AndreasKaratzas Jan 7, 2026
6f35154
[Frontend] Implement robust video frame recovery for corrupted videos…
vSeamar Jan 7, 2026
873480d
[Misc][BE] Type coverage for vllm/compilation [1/3] (#31554)
Lucaskabela Jan 7, 2026
5b833be
[1/2][lmcache connector] clean up lmcache multi-process adapter (#31…
ApostaC Jan 7, 2026
a051525
[Model] Enable LoRA support for PaliGemma (#31656)
A1c0r-Z Jan 7, 2026
1b8af95
[Doc] Update release docs (#31799)
DarkLight1337 Jan 7, 2026
f09c5fe
Change warning in get_current_vllm_config to report caller's line num…
tlrmchlsmth Jan 7, 2026
0a2c2dc
fixed mypy warnings for files vllm/v1/attention with TEMPORARY workar…
MrIceCreamMan Jan 7, 2026
aafd4d2
[Chore] Try remove `init_cached_hf_modules` (#31786)
DarkLight1337 Jan 7, 2026
6409004
[ROCm][AITER] bugfix accuracy regression in ROCM_AITER_TRITON_MLA bac…
vllmellm Jan 7, 2026
c7a79d4
[Attention][3/n] Remove usage of deprecated `seq_lens_cpu` and `num_c…
LucasWilkinson Jan 7, 2026
55caa60
refactor: find_loaded_library (#31866)
tom-zju Jan 7, 2026
efeaac9
[Bugfix] Fix race condition in async-scheduling for vlm model (#31841)
tianshu-Michael-yu Jan 7, 2026
4829148
[BugFix] LoRA: Support loading base_layer of experts (#31104)
HollowMan6 Jan 7, 2026
4614c5a
[Bugfix][Hardware][AMD] Consolidate FP8 min/max values helper functio…
c0de128 Jan 7, 2026
0dd5dee
[Bugfix][Kernel] fix bias adding in triton kernel implemented fused m…
xuebwang-amd Jan 7, 2026
e759637
[Refactor][TPU] Remove torch_xla path and use tpu-inference (#30808)
weiyu0824 Jan 7, 2026
59fe6f2
[XPU]fallback to TRITON_ATTN on xpu when use float32 dtype (#31762)
1643661061leo Jan 7, 2026
1f33e38
[Model] Cleanup: Remove redundant manual definition of `make_empty_in…
maang-h Jan 7, 2026
0790f07
[Misc] Improve error messages for unsupported types and parameters (#…
BlankRH Jan 7, 2026
d111bc5
[Bugfix][MTP] Fix GLM4 MoE fp8 loading with MTP on (#31757)
andyl98 Jan 7, 2026
41cfa50
[ROCm][AITER] fix wrong argument passed to AITER `flash_attn_varlen_…
vllmellm Jan 7, 2026
9741387
[Refactor] GLM-ASR Modeling (#31779)
JaredforReal Jan 7, 2026
b665bbc
[Chore] Migrate V0 attention utils (#31891)
DarkLight1337 Jan 7, 2026
1ab055e
[OpenAI] Extend VLLMValidationError to additional validation paramete…
R3hankhan123 Jan 7, 2026
cc6dafa
[Perf][Kernels] Enable FlashInfer DeepGEMM swapAB on SM90 (for W8A8 L…
katec846 Jan 7, 2026
b7036c8
[Refactor] Clean up pooler modules (#31897)
DarkLight1337 Jan 7, 2026
1d9e9ae
[Bugfix]: prevent leaking tokens in crash log (#30751)
dr75 Jan 7, 2026
b89443b
[KVConnector]: Enable Cross-layers KV cache layout for MultiConnector…
kfirtoledo Jan 7, 2026
30399cc
UX: add vLLM env info in '/server_info' (#31899)
jeejeelee Jan 7, 2026
bf184a6
Enable quantized attention in NemotronH models (#31898)
roikoren755 Jan 7, 2026
05f47bd
[Doc] Fix: Correct vLLM announcing blog post link in docs (#31868)
Ayobami-00 Jan 7, 2026
f347ac6
[Perf] Fuse stride preparation for NVFP4 cutlass_moe (#31837)
mgoin Jan 7, 2026
c907d22
[refactor] refactor memory constants usage (#31865)
andyxning Jan 7, 2026
0ada960
[Kernel] Support bias type in grouped_topk kernel (#31781)
xyang16 Jan 7, 2026
6170d47
[EPLB] Optimize EPLB with numpy (#29499)
ilmarkov Jan 7, 2026
10ef65e
[BugFix] Fix bad words with speculative decoding (#31908)
njhill Jan 7, 2026
ffc0a27
Add back missing DeepEP LL params (#31911)
elvircrn Jan 7, 2026
5dcd7ef
[MoE Refactor][15/N] Apply Refactor to Fp8 (#31415)
robertgshaw2-redhat Jan 8, 2026
0d76674
[0/N][Attention] Fix miscellaneous pre-commit issues (#31924)
MatthewBonanni Jan 8, 2026
25eef3d
feat(moe): Add is_act_and_mul=False support for Triton MoE kernels (#…
rabi Jan 8, 2026
39d8200
fix(rocm): add early return in get_flash_attn_version for ROCm (#31286)
rabi Jan 8, 2026
8dd2419
[CI] Skip Qwen-VL in multimodal processing tests due to flaky externa…
AndreasKaratzas Jan 8, 2026
9f6dcb7
[MoE Refactor][16/N] Apply Refactor to NVFP4 (#31692)
robertgshaw2-redhat Jan 8, 2026
a79079f
[BugFix] Fix flakiness in test_eagle_dp for PyTorch 2.10 (#31915)
zou3519 Jan 8, 2026
c4041f3
[ROCm][LoRA] Fix MoE accuracy regression by preserving float32 router…
AndreasKaratzas Jan 8, 2026
087a138
[ROCm][CI] Fix attention backend test flakiness from uninitialized KV…
AndreasKaratzas Jan 8, 2026
cddbc2b
[ROCm][CI] Add rocm support for run-multi-node-test.sh (#31922)
charlifu Jan 8, 2026
f1b1bea
[CI][BugFix][AMD] Actually skip tests marked @pytest.mark.skip_v1 (#3…
rasmith Jan 8, 2026
6b2a672
[Doc] Add Claude code usage example (#31188)
mgoin Jan 8, 2026
5f2a473
[ROCm][CI] v1 cpu offloading attention backend fix (#31833)
AndreasKaratzas Jan 8, 2026
9572f74
[Model] Enable LoRA support for tower and connector in DotsOCR (#31825)
ShaanveerS Jan 8, 2026
2ab441b
[platform] add dp_metadata arg to set_additional_forward_context (#31…
Ronald1995 Jan 8, 2026
be6a81f
[chore] Update FA commit (#30460)
LucasWilkinson Jan 8, 2026
791b2fc
[grpc] Support gRPC server entrypoint (#30190)
CatherineSue Jan 8, 2026
287b37c
[BugFix] Fix spec decoding edge case bugs (#31944)
njhill Jan 8, 2026
d3235cb
[Fix] Enable mm_processor_cache with vision LoRA (#31927)
prashanth058 Jan 8, 2026
e5173d3
[Bugfix] Remove the num_hidden_layers override for glm4_moe (#31745)
andyl98 Jan 8, 2026
63baa28
[Model] Enable LoRA support for tower and connector in GLM4-V (#31652)
Zyyeric Jan 8, 2026
107cf8e
fix(rocm): Add get_supported_kernel_block_sizes() to ROCM_ATTN (#31712)
rabi Jan 8, 2026
33156f5
[docker] A follow-up patch to fix #30913: `[docker] install cuda13 ve…
wangshangsam Jan 8, 2026
573a1d1
[ROCm]Skip test_torchao.py::test_pre_quantized_model on CDNA3 arch (#…
ZhiweiYan-96 Jan 8, 2026
eac3b96
[Models] Allow converting Qwen3-VL into Reranker model (#31890)
Isotr0py Jan 8, 2026
b634e61
Decouple page_size_bytes calculation in AttentionSpec for TPU/RPA Com…
Lumosis Jan 8, 2026
8cbdc7e
[CI/Build] Enable test_kv_cache_events_dp for AMD (#31834)
rjrock Jan 8, 2026
1f21429
fix(compile): apply partition wrapper when loading AOT cached functio…
devbyteai Jan 8, 2026
96fcd3c
[Misc] Support qwen3-next lora (#31719)
BJWang-ant Jan 8, 2026
04a4966
RayLLM Bugfix - Preserve obj store URL for multi engine_config creati…
omer-dayan Jan 8, 2026
d1b6fe0
[Chore] Further cleanup pooler (#31951)
DarkLight1337 Jan 8, 2026
5576227
[Model] Standardize common vision encoders (#31947)
DarkLight1337 Jan 8, 2026
2972a05
[MM Encoder]: Make MMEncoderAttention's `scale` takes effect properly…
Isotr0py Jan 8, 2026
18d4e48
[Voxtral] Fix speech transcription api (#31388)
patrickvonplaten Jan 8, 2026
59d260f
[Model] Add Grok-2 (#31847)
dangoldbj Jan 8, 2026
03fd76c
[Model] Add LFM2-VL model support (#31758)
tianshu-Michael-yu Jan 8, 2026
1123a87
[Model] Enable LoRA support for Pixtral (#31724)
A1c0r-Z Jan 8, 2026
7645bc5
[OpenAI] Fix tool_choice=required streaming when output has trailing …
maylikenoother Jan 8, 2026
72c068b
[CI] [Bugfix] Fix unbounded variable in `run-multi-node-test.sh` (#31…
tjtanaa Jan 8, 2026
1da3a54
[Docs]: update claude code url (#31971)
chaunceyjiang Jan 8, 2026
fe86be6
[Model] Support IQuestCoder model (#31575)
yxing-bj Jan 8, 2026
eaba8ec
[Bugfix]: Fix Step3ReasoningParser missing is_reasoning_end_streaming…
chaunceyjiang Jan 8, 2026
b8112c1
[Bugfix] Fix vllm serve failure with Nemotron Nano V3 FP8 (#31960)
danisereb Jan 8, 2026
49568d5
[Doc] Improve MM models LoRA notes (#31979)
jeejeelee Jan 8, 2026
a3d909a
[Misc] Tidy up some spec decode logic in GPUModelRunner (#31591)
njhill Jan 8, 2026
a563866
Fix ijson build for Power. (#31702)
npanpaliya Jan 8, 2026
83e1c76
[CI][ROCm] Fix NIXL tests on ROCm (#31728)
NickLucche Jan 8, 2026
7508243
[Model Runner V2] Simplify BlockTables with UVA (#31965)
WoosukKwon Jan 8, 2026
87e07a6
Revert "feat(moe): Add is_act_and_mul=False support for Triton MoE ke…
mgoin Jan 8, 2026
f16bfbe
[Documentation][torch.compile] Add documentation for torch.compile + …
Lucaskabela Jan 8, 2026
aa125ec
[Frontend] Improve error message (#31987)
DarkLight1337 Jan 8, 2026
e74698c
[Misc][Refactor] Add FusedMoERouter object (#30519)
bnellnm Jan 8, 2026
5d3b609
[Compressed-Tensors] Simplify NVFP4 Conditions, enable marlin support…
dsikka Jan 8, 2026
6cdf015
[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoi…
LucasWilkinson Jan 8, 2026
d62cfe5
[MoE Refactoring][Bugfix]Wrap WNA16 Triton kernel into mk and change …
zyongye Jan 9, 2026
5825bbc
[Quantization] Deprecate Long Tail of Schemes (#31688)
robertgshaw2-redhat Jan 9, 2026
11cec29
[BugFix] Add spec-decode-incompatible request param validation (#31982)
njhill Jan 9, 2026
6ebe34d
[Feature] Add iteration level logging and enhance nvtx marker (#31193)
maxyanghu Jan 9, 2026
0fa8dd2
[Bugfix] Fix Typo from NVFP4 Refactor (#31977)
robertgshaw2-redhat Jan 9, 2026
a4ec0c5
[Frontend] Add MCP tool streaming support to Responses API (#31761)
daniel-salib Jan 9, 2026
6e41966
Resolve merge conflict
Ri0S Jan 9, 2026
a35d727
Merge remote-tracking branch 'origin/main' into bugfix/responses_stre…
Ri0S Jan 9, 2026
24 changes: 21 additions & 3 deletions .buildkite/scripts/run-multi-node-test.sh
@@ -2,6 +2,17 @@

set -euox pipefail

# To detect ROCm
# Check multiple indicators:
if [ -e /dev/kfd ] || \
[ -d /opt/rocm ] || \
command -v rocm-smi &> /dev/null || \
[ -n "${ROCM_HOME:-}" ]; then
IS_ROCM=1
else
IS_ROCM=0
fi

if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
@@ -26,21 +37,28 @@ for command in "${COMMANDS[@]}"; do
echo "$command"
done


start_network() {
docker network create --subnet=192.168.10.0/24 docker-net
}

start_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
GPU_DEVICES='"device='
if [ "$IS_ROCM" -eq 1 ]; then
GPU_DEVICES='--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
else
GPU_DEVICES='--gpus "device='
fi
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
if [ "$IS_ROCM" -eq 0 ]; then
GPU_DEVICES+='"'
fi

# start the container in detached mode
# things to note:
@@ -49,7 +67,7 @@ start_nodes() {
# 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
docker run -d $GPU_DEVICES --shm-size=10.24gb -e HF_TOKEN \
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
/bin/bash -c "tail -f /dev/null"
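The device-flag logic this hunk adds can be exercised in isolation. The sketch below extracts it into a standalone helper (the `build_gpu_devices` function name is illustrative, not part of the PR): on ROCm, containers receive the `/dev/kfd` and `/dev/dri` device nodes plus a `HIP_VISIBLE_DEVICES` list, while on CUDA the `--gpus "device=…"` form is built, including its closing quote.

```shell
# Sketch of the ROCm-aware device-flag construction from run-multi-node-test.sh.
# Prints the docker-run fragment selecting one node's GPUs.
build_gpu_devices() {
    is_rocm=$1; node=$2; num_gpus=$3
    if [ "$is_rocm" -eq 1 ]; then
        # ROCm path: pass through KFD/DRI device nodes, select GPUs via env var
        devices='--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
    else
        # CUDA path: use docker's --gpus flag with a quoted device list
        devices='--gpus "device='
    fi
    node_gpu=0
    while [ "$node_gpu" -lt "$num_gpus" ]; do
        # Global GPU index = node * GPUs-per-node + local index
        devices="$devices$((node * num_gpus + node_gpu))"
        if [ "$node_gpu" -lt "$((num_gpus - 1))" ]; then
            devices="$devices,"
        fi
        node_gpu=$((node_gpu + 1))
    done
    # Only the CUDA form needs the closing quote appended
    if [ "$is_rocm" -eq 0 ]; then
        devices="$devices\""
    fi
    printf '%s\n' "$devices"
}

build_gpu_devices 0 1 2   # node 1, 2 GPUs per node, CUDA  -> --gpus "device=2,3"
build_gpu_devices 1 1 2   # same node on ROCm             -> ... HIP_VISIBLE_DEVICES=2,3
```

In the script itself the fragment is then expanded unquoted into `docker run -d $GPU_DEVICES …`, which is why the flags (rather than just the device list) are baked into the variable.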
31 changes: 19 additions & 12 deletions .buildkite/test-amd.yaml
@@ -163,9 +163,7 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
# TODO: Remove after next torch update
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py
- pytest -v -s entrypoints/openai/test_vision_embeds.py
- pytest -v -s entrypoints/test_chat_utils.py

- label: Entrypoints Integration Test (API Server 2)
@@ -519,8 +517,7 @@ steps:
- tests/samplers
- tests/conftest.py
commands:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- pytest -v -s -m 'not skip_v1' samplers

- label: LoRA Test %N # 20min each
timeout_in_minutes: 30
@@ -989,9 +986,7 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
# TODO: Remove after next torch update
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
- pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Accuracy Eval (Small Models) # 5min
@@ -1356,9 +1351,7 @@ steps:
# end platform plugin tests
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
# TODO: Remove after next torch update
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test
# begin stat_logger plugins test
@@ -1455,7 +1448,21 @@ steps:
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
- VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

##### multi gpus test #####
##### A100 test #####
32 changes: 28 additions & 4 deletions .buildkite/test-pipeline.yaml
@@ -1104,6 +1104,7 @@ steps:
- vllm/model_executor/models/
- tests/distributed/
- tests/examples/offline_inference/data_parallel.py
- .buildkite/scripts/run-multi-node-test.sh
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1266,8 +1267,8 @@ steps:
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

- label: NixlConnector PD accuracy tests (Distributed) # 30min
timeout_in_minutes: 30
- label: NixlConnector PD accuracy tests (Distributed) # 40min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -1277,8 +1278,8 @@
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

- label: DP EP NixlConnector PD accuracy tests (Distributed)
timeout_in_minutes: 30
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -1406,3 +1407,26 @@ steps:
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

##### MoE Refactor (Temporary) Tests #####

- label: MoE Refactor Integration Test (H100 - TEMPORARY) # optional
gpu: h100
optional: true
num_gpus: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt

- label: MoE Refactor Integration Test (B200 - TEMPORARY) # optional
gpu: b200
optional: true
num_gpus: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt

- label: MoE Refactor Integration Test (B200 DP - TEMPORARY) # optional
gpu: b200
optional: true
num_gpus: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
2 changes: 1 addition & 1 deletion .buildkite/test_areas/distributed.yaml
@@ -182,7 +182,7 @@ steps:
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

- label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 60
5 changes: 5 additions & 0 deletions .gitignore
@@ -227,3 +227,8 @@ ep_kernels_workspace/

# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
!vllm/benchmarks/lib/

# Generated gRPC protobuf files (compiled at build time from vllm_engine.proto)
vllm/grpc/vllm_engine_pb2.py
vllm/grpc/vllm_engine_pb2_grpc.py
vllm/grpc/vllm_engine_pb2.pyi
47 changes: 15 additions & 32 deletions RELEASE.md
@@ -1,47 +1,30 @@
# Releasing vLLM

vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via [PyPI](https://pypi.org/project/vllm). These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.

## Release Versioning
## Release Cadence and Versioning

vLLM uses a “right-shifted” versioning scheme where a new patch release is out every 2 weeks. And patch releases contain features and bug fixes (as opposed to semver where patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, special release post1 is released.
We aim to have a regular release every 2 weeks. Since v0.12.0, regular releases increment the minor version rather than patch version. The list of past releases can be found [here](https://vllm.ai/releases).

* _major_ major architectural milestone and when incompatible API changes are made, similar to PyTorch 2.0.
* _minor_ major features
* _patch_ features and backwards-compatible bug fixes
* _post1_ or _patch-1_ backwards-compatible bug fixes, either explicit or implicit post release
Our version numbers are expressed in the form `vX.Y.Z`, where `X` is the major version, `Y` is the minor version, and `Z` is the patch version. They are incremented according to the following rules:

## Release Cadence
* _Major_ releases are reserved for architectural milestones involving sweeping API changes, similar to PyTorch 2.0.
* _Minor_ releases correspond to regular releases, which include new features, bug fixes and other backwards-compatible changes.
* _Patch_ releases correspond to special releases for new models, as well as emergency patches for critical performance, functionality and security issues.

Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.
This versioning scheme is similar to [SemVer](https://semver.org/) for compatibility purposes, except that backwards compatibility is only guaranteed for a limited number of minor releases (see our [deprecation policy](https://docs.vllm.ai/en/latest/contributing/deprecation_policy) for details).

| Release Date | Patch release versions | Post Release versions |
| --- | --- | --- |
| Jan 2025 | 0.7.0 | --- |
| Feb 2025 | 0.7.1, 0.7.2, 0.7.3 | --- |
| Mar 2025 | 0.7.4, 0.7.5 | --- |
| Apr 2025 | 0.7.6, 0.7.7 | --- |
| May 2025 | 0.7.8, 0.7.9 | --- |
| Jun 2025 | 0.7.10, 0.7.11 | --- |
| Jul 2025 | 0.7.12, 0.7.13 | --- |
| Aug 2025 | 0.7.14, 0.7.15 | --- |
| Sep 2025 | 0.7.16, 0.7.17 | --- |
| Oct 2025 | 0.7.18, 0.7.19 | --- |
| Nov 2025 | 0.7.20, 0.7.21 | --- |
| Dec 2025 | 0.7.22, 0.7.23 | --- |

## Release branch
## Release Branch

Each release is built from a dedicated release branch.

* For _major_, _minor_, _patch_ releases, the release branch cut is performed 1-2 days before release is live.
* For post releases, previously cut release branch is reused
* Release builds are triggered via push to RC tag like vX.Y.Z-rc1 . This enables us to build and test multiple RCs for each release.
* Final tag : vX.Y.Z does not trigger the build but used for Release notes and assets.
* After branch cut is created we monitor the main branch for any reverts and apply these reverts to a release branch.
* For _major_ and _minor_ releases, the release branch cut is performed 1-2 days before release is live.
* For _patch_ releases, previously cut release branch is reused.
* Release builds are triggered via push to RC tag like `vX.Y.Z-rc1`. This enables us to build and test multiple RCs for each release.
* Final tag: `vX.Y.Z` does not trigger the build but used for Release notes and assets.
* After branch cut is created, we monitor the main branch for any reverts and apply these reverts to a release branch.

## Release Cherry-Pick Criteria
### Cherry-Pick Criteria

After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base.

4 changes: 3 additions & 1 deletion benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -343,7 +343,9 @@ def bench(
return bench_int8(dtype, m, k, n, label, sub_label)
if dtype == torch.float8_e4m3fn:
return bench_fp8(dtype, m, k, n, label, sub_label)
raise ValueError("unsupported type")
raise ValueError(
f"Unsupported dtype {dtype}: should be one of torch.int8, torch.float8_e4m3fn."
)


# runner
5 changes: 2 additions & 3 deletions benchmarks/kernels/benchmark_activation.py
@@ -8,10 +8,9 @@

import vllm.model_executor.layers.activation # noqa F401
from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed

batch_size_range = [1, 16, 128]
seq_len_range = [1, 16, 64, 1024, 4096]
@@ -30,7 +29,7 @@ def benchmark_activation(
device = "cuda"
num_tokens = batch_size * seq_len
dim = intermediate_size
current_platform.seed_everything(42)
set_random_seed(42)
torch.set_default_device(device)

if func_name == "gelu_and_mul":