diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml
index 9b1044443780..77abb5532dbe 100644
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -14,13 +14,15 @@ steps:
   - tests/kernels/moe/test_cpu_fused_moe.py
   - tests/kernels/test_onednn.py
   - tests/kernels/test_awq_int4_to_int8.py
+  - tests/kernels/quantization/test_cpu_fp8_scaled_mm.py
   commands:
     - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
       pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
       pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
       pytest -x -v -s tests/kernels/test_onednn.py
-      pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py"
+      pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py
+      pytest -x -v -s tests/kernels/quantization/test_cpu_fp8_scaled_mm.py"
 
 - label: CPU-Compatibility Tests
   depends_on: []
@@ -69,11 +71,11 @@ steps:
       pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
       pytest -x -v -s tests/quantization/test_cpu_wna16.py"
       
-- label: CPU-Distributed Tests
+- label: CPU-Distributed Tests (PP+TP)
   depends_on: []
   device: intel_cpu
   no_plugin: true
-  source_file_dependencies:
+  source_file_dependencies: &cpu_distributed_deps
   - csrc/cpu/shm.cpp
   - vllm/v1/worker/cpu_worker.py
   - vllm/v1/worker/gpu_worker.py
@@ -82,10 +84,21 @@ steps:
   - vllm/platforms/cpu.py
   - vllm/distributed/parallel_state.py
   - vllm/distributed/device_communicators/cpu_communicator.py
+  - .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh tp_pp"
+
+- label: CPU-Distributed Tests (DP+TP)
+  depends_on: []
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies: *cpu_distributed_deps
   commands:
     - |
       bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
-      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
+      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh dp_tp"
 
 - label: CPU-Multi-Modal Model Tests %N
   depends_on: []
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
index 00ae34bba6d7..10c03c3e1773 100755
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -192,6 +192,7 @@ export BUILDKITE_COMMIT
 export PARENT_COMMIT
 export IMAGE_TAG
 export IMAGE_TAG_LATEST
+export COMMIT="${COMMIT:-${BUILDKITE_COMMIT}}"
 export CACHE_FROM
 export CACHE_FROM_BASE_BRANCH
 export CACHE_FROM_MAIN
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 8fce15680173..8a900c0bf862 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -27,7 +27,7 @@ steps:
           - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64_CU129}\" --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
           - "mkdir artifacts"
           - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh"
         env:
           DOCKER_BUILDKIT: "1"
 
@@ -37,10 +37,10 @@ steps:
         agents:
           queue: arm64_cpu_queue_release
         commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinuxaarch64-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
           - "mkdir artifacts"
           - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh"
         env:
           DOCKER_BUILDKIT: "1"
 
@@ -53,7 +53,7 @@ steps:
           - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
           - "mkdir artifacts"
           - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh"
         env:
           DOCKER_BUILDKIT: "1"
 
@@ -66,7 +66,7 @@ steps:
           - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86_CU129}\" --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
           - "mkdir artifacts"
           - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh"
         env:
           DOCKER_BUILDKIT: "1"
 
@@ -76,10 +76,10 @@ steps:
         agents:
           queue: cpu_queue_release
         commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinux2_28-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
           - "mkdir artifacts"
           - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh"
         env:
           DOCKER_BUILDKIT: "1"
 
@@ -92,7 +92,7 @@ steps:
           - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
           - "mkdir artifacts"
           - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh"
         env:
           DOCKER_BUILDKIT: "1"
 
@@ -121,7 +121,19 @@ steps:
           queue: cpu_queue_release
         commands:
           - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+          - |
+            DOCKER_BUILDKIT=1 docker build \
+              $(bash .buildkite/scripts/docker-build-metadata-args.sh) \
+              --build-arg max_jobs=16 \
+              --build-arg USE_SCCACHE=1 \
+              --build-arg GIT_REPO_CHECK=1 \
+              --build-arg CUDA_VERSION=13.0.2 \
+              --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86}" \
+              --build-arg INSTALL_KV_CONNECTORS=true \
+              --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \
+              --target vllm-openai \
+              --progress plain \
+              -f docker/Dockerfile .
           - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
           # re-tag to default image tag and push, just in case arm64 build fails
           - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
@@ -134,7 +146,19 @@ steps:
           queue: arm64_cpu_queue_release
         commands:
           - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+          - |
+            DOCKER_BUILDKIT=1 docker build \
+              $(bash .buildkite/scripts/docker-build-metadata-args.sh) \
+              --build-arg max_jobs=16 \
+              --build-arg USE_SCCACHE=1 \
+              --build-arg GIT_REPO_CHECK=1 \
+              --build-arg CUDA_VERSION=13.0.2 \
+              --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64}" \
+              --build-arg INSTALL_KV_CONNECTORS=true \
+              --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \
+              --target vllm-openai \
+              --progress plain \
+              -f docker/Dockerfile .
           - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
 
       - label: "Build release image - x86_64 - CUDA 12.9"
@@ -144,7 +168,18 @@ steps:
           queue: cpu_queue_release
         commands:
           - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86_CU129}\" --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129 --target vllm-openai --progress plain -f docker/Dockerfile ."
+          - |
+            DOCKER_BUILDKIT=1 docker build \
+              $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129) \
+              --build-arg max_jobs=16 \
+              --build-arg USE_SCCACHE=1 \
+              --build-arg GIT_REPO_CHECK=1 \
+              --build-arg CUDA_VERSION=12.9.1 \
+              --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86_CU129}" \
+              --build-arg INSTALL_KV_CONNECTORS=true \
+              --target vllm-openai \
+              --progress plain \
+              -f docker/Dockerfile .
           - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"
           # re-tag to default image tag and push, just in case arm64 build fails
           - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129"
@@ -157,7 +192,18 @@ steps:
           queue: arm64_cpu_queue_release
         commands:
           - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64_CU129}\" --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129 --target vllm-openai --progress plain -f docker/Dockerfile ."
+          - |
+            DOCKER_BUILDKIT=1 docker build \
+              $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129) \
+              --build-arg max_jobs=16 \
+              --build-arg USE_SCCACHE=1 \
+              --build-arg GIT_REPO_CHECK=1 \
+              --build-arg CUDA_VERSION=12.9.1 \
+              --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64_CU129}" \
+              --build-arg INSTALL_KV_CONNECTORS=true \
+              --target vllm-openai \
+              --progress plain \
+              -f docker/Dockerfile .
           - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"
 
       - label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04"
@@ -167,7 +213,21 @@ steps:
           queue: cpu_queue_release
         commands:
           - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+          - |
+            DOCKER_BUILDKIT=1 docker build \
+              $(bash .buildkite/scripts/docker-build-metadata-args.sh ubuntu2404) \
+              --build-arg max_jobs=16 \
+              --build-arg USE_SCCACHE=1 \
+              --build-arg GIT_REPO_CHECK=1 \
+              --build-arg CUDA_VERSION=13.0.2 \
+              --build-arg UBUNTU_VERSION=24.04 \
+              --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
+              --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86}" \
+              --build-arg INSTALL_KV_CONNECTORS=true \
+              --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 \
+              --target vllm-openai \
+              --progress plain \
+              -f docker/Dockerfile .
           - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
           - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
           - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
@@ -179,7 +239,21 @@ steps:
           queue: arm64_cpu_queue_release
         commands:
           - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+          - |
+            DOCKER_BUILDKIT=1 docker build \
+              $(bash .buildkite/scripts/docker-build-metadata-args.sh ubuntu2404) \
+              --build-arg max_jobs=16 \
+              --build-arg USE_SCCACHE=1 \
+              --build-arg GIT_REPO_CHECK=1 \
+              --build-arg CUDA_VERSION=13.0.2 \
+              --build-arg UBUNTU_VERSION=24.04 \
+              --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
+              --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64}" \
+              --build-arg INSTALL_KV_CONNECTORS=true \
+              --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 \
+              --target vllm-openai \
+              --progress plain \
+              -f docker/Dockerfile .
           - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
 
       - label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04"
@@ -189,7 +263,20 @@ steps:
           queue: cpu_queue_release
         commands:
           - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86_CU129}\" --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+          - |
+            DOCKER_BUILDKIT=1 docker build \
+              $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129-ubuntu2404) \
+              --build-arg max_jobs=16 \
+              --build-arg USE_SCCACHE=1 \
+              --build-arg GIT_REPO_CHECK=1 \
+              --build-arg CUDA_VERSION=12.9.1 \
+              --build-arg UBUNTU_VERSION=24.04 \
+              --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
+              --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86_CU129}" \
+              --build-arg INSTALL_KV_CONNECTORS=true \
+              --target vllm-openai \
+              --progress plain \
+              -f docker/Dockerfile .
           - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"
           - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404"
           - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404"
@@ -201,7 +288,20 @@ steps:
           queue: arm64_cpu_queue_release
         commands:
           - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64_CU129}\" --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+          - |
+            DOCKER_BUILDKIT=1 docker build \
+              $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129-ubuntu2404) \
+              --build-arg max_jobs=16 \
+              --build-arg USE_SCCACHE=1 \
+              --build-arg GIT_REPO_CHECK=1 \
+              --build-arg CUDA_VERSION=12.9.1 \
+              --build-arg UBUNTU_VERSION=24.04 \
+              --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
+              --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64_CU129}" \
+              --build-arg INSTALL_KV_CONNECTORS=true \
+              --target vllm-openai \
+              --progress plain \
+              -f docker/Dockerfile .
           - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"
 
       - block: "Build release image for x86_64 CPU"
@@ -209,6 +309,7 @@ steps:
         depends_on: ~
 
       - label: "Build release image - x86_64 - CPU"
+        key: build-cpu-release-image-x86
         depends_on:
           - block-cpu-release-image-build
           - input-release-version
@@ -227,7 +328,8 @@ steps:
         depends_on: ~
 
       - label: "Build release image - arm64 - CPU"
-        depends_on: 
+        key: build-cpu-release-image-arm64
+        depends_on:
           - block-arm64-cpu-release-image-build
           - input-release-version
         agents:
@@ -336,6 +438,41 @@ steps:
           DOCKER_BUILDKIT: "1"
           DOCKERHUB_USERNAME: "vllmbot"
 
+      - block: "Publish release images to DockerHub"
+        key: block-publish-release-images
+        depends_on:
+          - create-multi-arch-manifest
+          - create-multi-arch-manifest-cuda-12-9
+          - create-multi-arch-manifest-ubuntu2404
+          - create-multi-arch-manifest-cuda-12-9-ubuntu2404
+          - build-rocm-release-image
+          - input-release-version
+          # Wait for CPU builds if their block steps were unblocked, so publish
+          # doesn't race the in-progress CPU build. allow_failure lets publish
+          # proceed when the operator legitimately leaves the CPU block steps
+          # unblocked or the CPU build fails.
+          - step: build-cpu-release-image-x86
+            allow_failure: true
+          - step: build-cpu-release-image-arm64
+            allow_failure: true
+        if: build.env("NIGHTLY") != "1"
+
+      - label: "Publish release images to DockerHub"
+        depends_on:
+          - block-publish-release-images
+        key: publish-release-images-dockerhub
+        agents:
+          queue: small_cpu_queue_release
+        commands:
+          - "bash .buildkite/scripts/publish-release-images.sh"
+        plugins:
+          - docker-login#v3.0.0:
+              username: vllmbot
+              password-env: DOCKERHUB_TOKEN
+        env:
+          DOCKER_BUILDKIT: "1"
+          DOCKERHUB_USERNAME: "vllmbot"
+
   - group: "Publish wheels"
     key: "publish-wheels"
     steps:
@@ -623,7 +760,7 @@ steps:
       - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
     env:
       S3_BUCKET: "vllm-wheels"
-      VARIANT: "rocm721"
+      VARIANT: "rocm722"
 
   # ROCm Job 6: Build ROCm Release Docker Image
   - label: ":docker: Build release image - x86_64 - ROCm"
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
index 6f41d1cdda47..afa884fba46b 100755
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -8,8 +8,6 @@ if [ -z "${RELEASE_VERSION}" ]; then
   RELEASE_VERSION="1.0.0.dev"
 fi
 
-ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
-
 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel (by commit):
 \`\`\`
@@ -25,95 +23,5 @@ aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-
 aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
 \`\`\`
 
-
-To download and upload the image:
-
-\`\`\`
-# Download images:
-
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu129
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu129
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
-docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
-docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
-
-# Tag and push images:
-
-## CUDA
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
-docker push vllm/vllm-openai:latest-x86_64
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu129 vllm/vllm-openai:x86_64-cu129
-docker tag vllm/vllm-openai:x86_64-cu129 vllm/vllm-openai:latest-x86_64-cu129
-docker tag vllm/vllm-openai:x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129
-docker push vllm/vllm-openai:latest-x86_64-cu129
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
-docker push vllm/vllm-openai:latest-aarch64
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu129 vllm/vllm-openai:aarch64-cu129
-docker tag vllm/vllm-openai:aarch64-cu129 vllm/vllm-openai:latest-aarch64-cu129
-docker tag vllm/vllm-openai:aarch64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
-docker push vllm/vllm-openai:latest-aarch64-cu129
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
-
-## ROCm
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-docker push vllm/vllm-openai-rocm:latest
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-docker push vllm/vllm-openai-rocm:latest-base
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-
-## CPU
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
-docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
-docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
-docker push vllm/vllm-openai-cpu:latest-x86_64
-docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
-docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
-docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-docker push vllm/vllm-openai-cpu:latest-arm64
-docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-
-# Create multi-arch manifest:
-
-docker manifest rm vllm/vllm-openai:latest
-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
-docker manifest push vllm/vllm-openai:latest
-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
-
-docker manifest rm vllm/vllm-openai:latest-cu129
-docker manifest create vllm/vllm-openai:latest-cu129 vllm/vllm-openai:latest-x86_64-cu129 vllm/vllm-openai:latest-aarch64-cu129
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
-docker manifest push vllm/vllm-openai:latest-cu129
-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129
-
-docker manifest rm vllm/vllm-openai-cpu:latest || true
-docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
-docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-docker manifest push vllm/vllm-openai-cpu:latest
-docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
-\`\`\`
+Docker images are published automatically by the "Publish release images to DockerHub" pipeline step.
 EOF
diff --git a/.buildkite/scripts/ci-fetch-log.sh b/.buildkite/scripts/ci-fetch-log.sh
new file mode 100755
index 000000000000..02798b56f4a9
--- /dev/null
+++ b/.buildkite/scripts/ci-fetch-log.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Usage: ./ci-fetch-log.sh <buildkite_job_url> [output_file]
+#        ./ci-fetch-log.sh <build_number> <job_uuid> [output_file]
+#
+# Downloads the raw log for a Buildkite job from the public, unauthenticated
+# /organizations/<org>/pipelines/<pipeline>/builds/<n>/jobs/<uuid>/download
+# endpoint, then strips ANSI/timestamps via ci-clean-log.sh.
+#
+# Find <build_number> and <job_uuid> via:
+#   gh pr checks <PR> --repo vllm-project/vllm
+# Each failing row's URL is .../builds/<build_number>#<job_uuid>.
+
+set -euo pipefail
+
+ORG="vllm"
+PIPELINE="ci"
+
+usage() {
+    echo "Usage: $0 <buildkite_job_url> [output_file]"
+    echo "       $0 <build_number> <job_uuid> [output_file]"
+    exit 1
+}
+
+if [ $# -lt 1 ]; then usage; fi
+
+if [[ "$1" == https://* ]]; then
+    BUILD=$(echo "$1" | sed -nE 's#.*/builds/([0-9]+).*#\1#p')
+    JOB=$(echo "$1" | grep -oE '[0-9a-f]{8}-[0-9a-f-]+' | head -n 1)
+    OUT="${2:-ci-${BUILD}-${JOB:0:8}.log}"
+else
+    if [ $# -lt 2 ]; then usage; fi
+    BUILD="$1"
+    JOB="$2"
+    OUT="${3:-ci-${BUILD}-${JOB:0:8}.log}"
+fi
+
+if [ -z "$BUILD" ] || [ -z "$JOB" ]; then
+    echo "Could not parse build number or job UUID from: $1" >&2
+    usage
+fi
+
+COOKIES=$(mktemp)
+trap 'rm -f "$COOKIES"' EXIT
+
+# Buildkite issues a session cookie on first hit; subsequent /download needs it.
+curl -fsSL -c "$COOKIES" -A "vllm-ci-fetch-log" \
+    "https://buildkite.com/${ORG}/${PIPELINE}/builds/${BUILD}" -o /dev/null
+
+curl -fsSL -b "$COOKIES" -A "vllm-ci-fetch-log" \
+    "https://buildkite.com/organizations/${ORG}/pipelines/${PIPELINE}/builds/${BUILD}/jobs/${JOB}/download" \
+    -o "$OUT"
+
+bash "$(dirname "$0")/ci-clean-log.sh" "$OUT"
+
+echo "$OUT"
diff --git a/.buildkite/scripts/detect-manylinux-tag.py b/.buildkite/scripts/detect-manylinux-tag.py
new file mode 100644
index 000000000000..40fa6c6ffbb7
--- /dev/null
+++ b/.buildkite/scripts/detect-manylinux-tag.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Detect the manylinux platform tag for a wheel and rename it in place.
+
+vLLM's build images produce wheels with the generic ``linux_<arch>`` platform
+tag, which installers like ``pip`` won't accept off PyPI/our index. We need to
+rewrite the platform tag to the appropriate ``manylinux_<major>_<minor>_<arch>``
+before uploading.
+
+Historically the tag was hard-coded per build (``manylinux_2_31`` for the
+Ubuntu 20.04-based image, ``manylinux_2_35`` for the Ubuntu 22.04-based
+images). That is brittle: bumping the base image silently produces wheels
+labelled with the wrong glibc requirement. This script asks ``auditwheel``
+to derive the tag from the symbol versions actually referenced by the
+binaries inside the wheel, so the label tracks reality.
+
+We can't simply call ``auditwheel repair`` -- it tries to graft external
+shared libraries into the wheel and fails on vLLM's CUDA/cuBLAS dependencies.
+Instead we use ``auditwheel.wheel_abi.analyze_wheel_abi`` directly, which is
+the same call that powers ``auditwheel show``, and read off
+``winfo.sym_policy.name``.
+
+Usage:
+    detect-manylinux-tag.py <wheel_path>
+
+The wheel is renamed in place; the new path is printed on stdout. All
+diagnostics go to stderr so callers can capture stdout safely.
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+from auditwheel.error import (
+    AuditwheelError,
+    NonPlatformWheelError,
+    WheelToolsError,
+)
+from auditwheel.wheel_abi import analyze_wheel_abi
+from auditwheel.wheeltools import get_wheel_architecture, get_wheel_libc
+
+
+def detect_platform_tag(wheel_path: Path) -> str:
+    """Return the most precise platform tag the wheel is consistent with.
+
+    Mirrors ``auditwheel show`` but returns ``sym_policy`` rather than
+    ``overall_policy``: we only care about the glibc symbol versions used,
+    not about other policy axes (ISA extensions, blacklist, etc.) that
+    ``overall_policy`` folds in.
+    """
+    fn = wheel_path.name
+
+    try:
+        arch = get_wheel_architecture(fn)
+    except (WheelToolsError, NonPlatformWheelError):
+        # Architecture isn't deducible from the filename; let auditwheel
+        # infer it from the ELF binaries inside the wheel.
+        arch = None
+
+    try:
+        libc = get_wheel_libc(fn)
+    except WheelToolsError:
+        # An unrepaired wheel uses ``linux_<arch>``, which doesn't encode
+        # libc. Let auditwheel infer it from the ELF binaries.
+        libc = None
+
+    winfo = analyze_wheel_abi(
+        libc,
+        arch,
+        wheel_path,
+        frozenset(),
+        disable_isa_ext_check=False,
+        allow_graft=False,
+    )
+    return winfo.sym_policy.name
+
+
+def rename_wheel(wheel_path: Path, new_platform_tag: str) -> Path:
+    """Rename the wheel in place, replacing only its platform tag."""
+    # Wheel filename per PEP 427:
+    #   {distribution}-{version}(-{build})?-{python}-{abi}-{platform}.whl
+    # The platform tag is always the last ``-``-separated token before
+    # ``.whl``. Compound tags like ``manylinux_2_31_x86_64`` use ``_`` as the
+    # internal separator, so ``-``-splitting is unambiguous.
+    parts = wheel_path.stem.split("-")
+    if len(parts) < 5:
+        raise ValueError(f"Unrecognised wheel filename: {wheel_path.name}")
+    parts[-1] = new_platform_tag
+    new_path = wheel_path.with_name("-".join(parts) + ".whl")
+    if new_path != wheel_path:
+        wheel_path.rename(new_path)
+    return new_path
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Detect a wheel's manylinux platform tag with "
+        "auditwheel and rename the wheel in place."
+    )
+    parser.add_argument(
+        "wheel",
+        type=Path,
+        help="Path to the wheel to inspect and rename.",
+    )
+    args = parser.parse_args()
+
+    wheel_path: Path = args.wheel
+    if not wheel_path.is_file():
+        print(f"error: {wheel_path} is not a file", file=sys.stderr)
+        return 1
+
+    # Catch the things that ``analyze_wheel_abi`` and ``rename_wheel`` can
+    # raise: any subclass of ``AuditwheelError`` (pure-Python wheels,
+    # invalid libc, malformed wheels), filesystem errors, or our own
+    # ``ValueError`` for an unrecognised wheel filename. Print a single
+    # ``ERROR_TYPE: message`` line to stderr instead of a Python
+    # traceback, which is much friendlier in CI logs.
+    try:
+        new_tag = detect_platform_tag(wheel_path)
+        print(f"detected platform tag: {new_tag}", file=sys.stderr)
+        new_path = rename_wheel(wheel_path, new_tag)
+    except (AuditwheelError, ValueError, OSError) as e:
+        print(
+            f"error: failed to retag {wheel_path.name}: {type(e).__name__}: {e}",
+            file=sys.stderr,
+        )
+        return 2
+
+    if new_path != wheel_path:
+        print(f"renamed {wheel_path.name} -> {new_path.name}", file=sys.stderr)
+    else:
+        print(f"wheel already tagged {new_tag}", file=sys.stderr)
+
+    print(new_path)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/.buildkite/scripts/docker-build-metadata-args.sh b/.buildkite/scripts/docker-build-metadata-args.sh
new file mode 100644
index 000000000000..9aa6fa9314f7
--- /dev/null
+++ b/.buildkite/scripts/docker-build-metadata-args.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Emit docker build flags for release image provenance metadata.
+# Keep this helper best-effort: missing Buildkite metadata should fall back to
+# local/default values instead of blocking the Docker build.
+
+# Variant examples: "", "cu129", "ubuntu2404", "cu129-ubuntu2404".
+variant="${1:-}"
+variant_suffix="${variant:+-${variant}}"
+
+image_name="${VLLM_DOCKER_IMAGE_NAME:-vllm/vllm-openai}"
+staging_repo="${VLLM_STAGING_IMAGE_REPO:-public.ecr.aws/q9t5s3a7/vllm-release-repo}"
+build_commit="${VLLM_BUILD_COMMIT:-${BUILDKITE_COMMIT:-unknown}}"
+build_pipeline="${VLLM_BUILD_PIPELINE:-${BUILDKITE_PIPELINE_ID:-${BUILDKITE_PIPELINE_SLUG:-local}}}"
+build_url="${VLLM_BUILD_URL:-${BUILDKITE_BUILD_URL:-}}"
+tag_commit="${BUILDKITE_COMMIT:-${build_commit}}"
+
+if [[ -n "${BUILDKITE:-}" || -n "${BUILDKITE_COMMIT:-}" ]]; then
+  release_version="${RELEASE_VERSION:-}"
+  if command -v buildkite-agent >/dev/null 2>&1; then
+    release_version="${release_version:-$(buildkite-agent meta-data get release-version 2>/dev/null)}"
+  fi
+  release_version="${release_version#v}"
+  release_version="${release_version:-${tag_commit}}"
+
+  staging_image_ref="${staging_repo}:${tag_commit}-$(uname -m)${variant_suffix}"
+
+  if [[ "${NIGHTLY:-}" == "1" ]]; then
+    if [[ -z "${variant}" ]]; then
+      image_tag="${image_name}:nightly-${tag_commit}"
+    elif [[ "${variant}" == cu* ]]; then
+      cuda_variant="${variant%%-*}"
+      remaining_variant="${variant#${cuda_variant}}"
+      image_tag="${image_name}:${cuda_variant}-nightly-${tag_commit}${remaining_variant}"
+    else
+      image_tag="${image_name}:nightly-${tag_commit}${variant_suffix}"
+    fi
+  else
+    image_tag="${image_name}:v${release_version}${variant_suffix}"
+  fi
+else
+  image_tag="${VLLM_IMAGE_TAG:-local/vllm-openai:dev}"
+  staging_image_ref="${image_tag}"
+fi
+
+emit_arg() {
+  printf -- "--build-arg %s=%s " "$1" "$2"
+}
+
+emit_arg VLLM_BUILD_COMMIT "${build_commit}"
+emit_arg VLLM_BUILD_PIPELINE "${build_pipeline}"
+emit_arg VLLM_BUILD_URL "${build_url}"
+# This is the intended public tag. The final digest is only known after push.
+emit_arg VLLM_IMAGE_TAG "${image_tag}"
+printf -- "--tag %s " "${staging_image_ref}"
diff --git a/.buildkite/scripts/generate-and-upload-nightly-index.sh b/.buildkite/scripts/generate-and-upload-nightly-index.sh
index 88c4f5173139..502ed0609310 100755
--- a/.buildkite/scripts/generate-and-upload-nightly-index.sh
+++ b/.buildkite/scripts/generate-and-upload-nightly-index.sh
@@ -10,20 +10,13 @@ set -ex
 BUCKET="vllm-wheels"
 INDICES_OUTPUT_DIR="indices"
 DEFAULT_VARIANT_ALIAS="cu130" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
-PYTHON="${PYTHON_PROG:-python3}" # try to read from env var, otherwise use python3
 SUBPATH=$BUILDKITE_COMMIT
 S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
 
-# detect if python3.12+ is available
-has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
-if [[ "$has_new_python" -eq 0 ]]; then
-    # use new python from docker
-    docker pull python:3-slim
-    PYTHON="docker run --rm -u $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
-fi
-
-echo "Using python interpreter: $PYTHON"
-echo "Python version: $($PYTHON --version)"
+# Select python3 (>= 3.12) -- local if available, else a docker fallback.
+# shellcheck source=lib/select-python.sh
+source .buildkite/scripts/lib/select-python.sh
+select_python
 
 # ======== generate and upload indices ========
 
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 703a7d753220..7e8ddb12ec98 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -378,9 +378,11 @@ HF_MOUNT="/root/.cache/huggingface"
 # double-quotes will have been stripped by the calling shell.
 if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
   commands="${VLLM_TEST_COMMANDS}"
+  commands_source="env"
   echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
 else
   commands="$*"
+  commands_source="argv"
   if [[ -z "$commands" ]]; then
     echo "Error: No test commands provided." >&2
     echo "Usage:" >&2
@@ -397,9 +399,15 @@ fi
 
 echo "Raw commands: $commands"
 
-# Fix quoting before ROCm overrides (so overrides see correct structure)
-commands=$(re_quote_pytest_markers "$commands")
-echo "After re-quoting: $commands"
+# Only try to repair stripped pytest -m/-k quoting in legacy argv mode.
+# VLLM_TEST_COMMANDS preserves inner quoting already, and re-quoting that path
+# can corrupt embedded echo strings or otherwise well-formed shell fragments.
+if [[ "$commands_source" == "argv" ]]; then
+  commands=$(re_quote_pytest_markers "$commands")
+  echo "After re-quoting: $commands"
+else
+  echo "Skipping re-quoting for VLLM_TEST_COMMANDS input"
+fi
 
 commands=$(apply_rocm_test_overrides "$commands")
 echo "Final commands: $commands"
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
index f12bb524d4cb..8ac27ed6583a 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
@@ -3,42 +3,37 @@ set -euox pipefail
 export VLLM_CPU_CI_ENV=0
 export VLLM_CPU_KVCACHE_SPACE=1 # avoid OOM
 
-echo "--- PP+TP"
-vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 --max-model-len=4096 &
-server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
-vllm bench serve \
-    --backend vllm \
-    --dataset-name random \
-    --model meta-llama/Llama-3.2-3B-Instruct \
-    --num-prompts 20 \
-    --result-dir ./test_results \
-    --result-filename tp_pp.json \
-    --save-result \
-    --endpoint /v1/completions
-kill -s SIGTERM $server_pid; wait $server_pid || true
-failed_req=$(jq '.failed' ./test_results/tp_pp.json)
-if [ "$failed_req" -ne 0 ]; then
-  echo "Some requests were failed!"
-  exit 1
-fi
+MODE=${1:-all}
 
-echo "--- DP+TP"
-vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 &
-server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
-vllm bench serve \
-   --backend vllm \
-   --dataset-name random \
-   --model meta-llama/Llama-3.2-3B-Instruct \
-   --num-prompts 20 \
-   --result-dir ./test_results \
-   --result-filename dp_pp.json \
-   --save-result \
-   --endpoint /v1/completions
-kill -s SIGTERM $server_pid; wait $server_pid || true
-failed_req=$(jq '.failed' ./test_results/dp_pp.json)
-if [ "$failed_req" -ne 0 ]; then
- echo "Some requests were failed!"
- exit 1
-fi
+run_scenario() {
+    local label="$1" result_file="$2"
+    shift 2
+    echo "--- $label"
+    vllm serve meta-llama/Llama-3.2-3B-Instruct "$@" --max-model-len=4096 &
+    local server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
+    vllm bench serve \
+        --backend vllm \
+        --dataset-name random \
+        --model meta-llama/Llama-3.2-3B-Instruct \
+        --num-prompts 20 \
+        --result-dir ./test_results \
+        --result-filename "$result_file" \
+        --save-result \
+        --endpoint /v1/completions
+    kill -s SIGTERM "$server_pid"; wait "$server_pid" || true
+    if [ "$(jq '.failed' "./test_results/$result_file")" -ne 0 ]; then
+        echo "Some requests were failed in $label!"
+        exit 1
+    fi
+}
+
+case "$MODE" in
+    tp_pp) run_scenario "PP+TP" tp_pp.json -tp=2 -pp=2 ;;
+    dp_tp) run_scenario "DP+TP" dp_tp.json -tp=2 -dp=2 ;;
+    all)
+        run_scenario "PP+TP" tp_pp.json -tp=2 -pp=2
+        run_scenario "DP+TP" dp_tp.json -tp=2 -dp=2
+        ;;
+    *) echo "ERROR: unknown mode '$MODE' (expected: tp_pp | dp_tp | all)" >&2; exit 1 ;;
+esac
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
index 7166435ac1e9..0322397394d1 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -67,6 +67,21 @@ function cpu_tests() {
       --num-prompts 20 \
       --endpoint /v1/completions
     kill -s SIGTERM $server_pid &'
+
+  # smoke test for Gated DeltaNet
+  docker exec cpu-test bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3.5-0.8B --max-model-len 2048 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model Qwen/Qwen3.5-0.8B \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
+
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
diff --git a/.buildkite/scripts/lib/manylinux.sh b/.buildkite/scripts/lib/manylinux.sh
new file mode 100644
index 000000000000..bde2dfe0a3dc
--- /dev/null
+++ b/.buildkite/scripts/lib/manylinux.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Shared helper for rewriting a wheel's platform tag from the generic
+# ``linux_<arch>`` to the correct ``manylinux_<major>_<minor>_<arch>``.
+# After sourcing, call ``apply_manylinux_tag <wheel>`` on each wheel
+# that still carries the generic tag; the renamed path is printed on
+# stdout (logs go to stderr).
+#
+# Why a pinned Docker container instead of using whatever Python
+# happens to be on the agent:
+#   - vLLM's release agents are heterogeneous -- they don't agree on
+#     a Python minor version, and we can't rely on a particular
+#     ``auditwheel`` being installed.
+#   - ``detect-manylinux-tag.py`` reads ``auditwheel.wheel_abi`` and
+#     ``Policy.sym_policy``, which are *internal* APIs without a
+#     stability promise. Pinning both Python and auditwheel makes the
+#     detected tag a function of the inputs alone, and shifts version
+#     bumps from "implicit drift" to "deliberate, retested change".
+#   - Other release scripts (``generate-and-upload-nightly-index.sh``,
+#     ``upload-rocm-wheels.sh``) already use the python:3-slim image
+#     when the agent's interpreter is too old; this is the same idea
+#     made stricter.
+#
+# To keep the per-wheel cost down (the ROCm upload retags ~10 wheels
+# each run), we install auditwheel into a long-lived helper container
+# once on source, then ``docker exec`` into it for each call.
+#
+# Trap behaviour:
+# - Sourcing installs an EXIT trap that calls ``manylinux_cleanup`` to
+#   tear down the helper container. Any EXIT trap that was already in
+#   place when this file was sourced is captured and run AFTER our
+#   cleanup, so we don't silently clobber it.
+# - If a caller sets a new EXIT trap *after* sourcing, that trap will
+#   replace ours; in that case the caller should call
+#   ``manylinux_cleanup`` from their own handler.
+
+if [[ -n "${_MANYLINUX_LIB_SOURCED:-}" ]]; then
+    return 0
+fi
+_MANYLINUX_LIB_SOURCED=1
+
+# Pin both sides. Bump these deliberately and re-run a representative
+# wheel from each build target through the detection.
+_MANYLINUX_PYTHON_IMAGE="python:3.12-slim"
+_MANYLINUX_AUDITWHEEL_VERSION="6.6.0"
+
+# Resolve our own directory (and the sibling detect script) using the
+# canonical, symlink-resolved path. The container mounts cwd at the
+# same absolute path on both sides, so all paths we hand to it -- the
+# script, the wheel -- must canonicalise to a location under cwd.
+_MANYLINUX_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
+_MANYLINUX_DETECT_SCRIPT="$(cd "${_MANYLINUX_LIB_DIR}/.." && pwd -P)/detect-manylinux-tag.py"
+_MANYLINUX_CWD="$(pwd -P)"
+
+docker pull --quiet "$_MANYLINUX_PYTHON_IMAGE" >/dev/null
+
+# Spin up a long-lived helper container so we install auditwheel once
+# and then ``docker exec`` into it for each wheel.
+#
+# The container runs as root so ``pip install`` can write into the
+# system site-packages; individual ``docker exec`` calls below pin
+# themselves to the host UID so any file rename happens with host
+# ownership, not root.
+_MANYLINUX_CONTAINER="$(docker run -d --rm \
+    -v "$_MANYLINUX_CWD:$_MANYLINUX_CWD" \
+    -w "$_MANYLINUX_CWD" \
+    "$_MANYLINUX_PYTHON_IMAGE" \
+    sleep infinity)"
+docker exec "$_MANYLINUX_CONTAINER" \
+    pip install --quiet --disable-pip-version-check \
+    --root-user-action=ignore \
+    "auditwheel==${_MANYLINUX_AUDITWHEEL_VERSION}"
+
+# Public cleanup -- safe to call multiple times.
+manylinux_cleanup() {
+    if [[ -n "${_MANYLINUX_CONTAINER:-}" ]]; then
+        docker rm -f "$_MANYLINUX_CONTAINER" >/dev/null 2>&1 || true
+        _MANYLINUX_CONTAINER=""
+    fi
+}
+
+# Capture any EXIT trap that was already in place so we can chain to
+# it rather than overwrite it. ``trap -p EXIT`` prints the handler in
+# eval-able form (``trap -- 'CMD' EXIT``) or nothing if unset; we
+# strip the wrapper to recover ``CMD``. Handles the common case --
+# CMDs without embedded single quotes -- and degrades gracefully (we
+# still run our own cleanup) for the pathological case.
+_manylinux_prev_exit_trap_cmd=""
+_manylinux_existing_exit_trap="$(trap -p EXIT)"
+if [[ -n "$_manylinux_existing_exit_trap" ]]; then
+    _tmp="${_manylinux_existing_exit_trap#trap -- \'}"
+    _manylinux_prev_exit_trap_cmd="${_tmp%\' EXIT}"
+    unset _tmp
+fi
+unset _manylinux_existing_exit_trap
+
+_manylinux_run_exit_chain() {
+    manylinux_cleanup
+    if [[ -n "$_manylinux_prev_exit_trap_cmd" ]]; then
+        eval "$_manylinux_prev_exit_trap_cmd"
+    fi
+}
+trap _manylinux_run_exit_chain EXIT
+
+# Detect the manylinux platform tag for a single wheel and rename it
+# in place, printing the renamed wheel path on stdout. Returns
+# non-zero on failure (which under ``set -e`` propagates to caller).
+#
+# The wheel must be reachable via a path under the host cwd so it's
+# visible inside the helper container; in CI the wheels always live
+# under ``artifacts/`` so this is fine.
+apply_manylinux_tag() {
+    local wheel="$1"
+    local abs_wheel
+    abs_wheel="$(realpath "$wheel")"
+    local new_wheel
+    new_wheel="$(docker exec -u "$(id -u):$(id -g)" \
+        "$_MANYLINUX_CONTAINER" \
+        python "$_MANYLINUX_DETECT_SCRIPT" "$abs_wheel")"
+    if [[ -z "$new_wheel" || ! -f "$new_wheel" ]]; then
+        echo "apply_manylinux_tag: detect-manylinux-tag.py did not produce a valid wheel path for $wheel" >&2
+        return 1
+    fi
+    printf '%s\n' "$new_wheel"
+}
diff --git a/.buildkite/scripts/lib/select-python.sh b/.buildkite/scripts/lib/select-python.sh
new file mode 100644
index 000000000000..bc53030a2b50
--- /dev/null
+++ b/.buildkite/scripts/lib/select-python.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Pick a Python interpreter for buildkite scripts: prefer a local
+# ``python3`` if it is recent enough (>= 3.12), otherwise fall back to
+# a one-shot Docker container running ``python:3-slim``. After
+# ``select_python`` returns, ``$PYTHON`` is set in the caller's shell
+# and is safe to use as a command (e.g. ``$PYTHON some_script.py``).
+#
+# The 3.12 threshold matches what the existing nightly-index work
+# expects -- typing features used by ``generate-nightly-index.py``.
+# This helper does not pin the *minor* version; if you need stricter
+# reproducibility (e.g. relying on auditwheel internals), invoke
+# Docker yourself with a pinned tag rather than calling this.
+
+if [[ -n "${_SELECT_PYTHON_LIB_SOURCED:-}" ]]; then
+    return 0
+fi
+_SELECT_PYTHON_LIB_SOURCED=1
+
+# Sets ``PYTHON`` in the caller's shell and exports it. Idempotent --
+# calling twice is safe and the second call simply re-runs the probe.
+select_python() {
+    local py="${PYTHON_PROG:-python3}"
+    local has_new_python
+    has_new_python=$("$py" -c \
+        "print(1 if __import__('sys').version_info >= (3,12) else 0)" \
+        2>/dev/null || echo 0)
+    if [[ "$has_new_python" -eq 0 ]]; then
+        # ``-u $(id -u):$(id -g)`` so files created via the container
+        # end up owned by the host user, not root.
+        docker pull python:3-slim
+        PYTHON="docker run --rm -u $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
+    else
+        PYTHON="$py"
+    fi
+    export PYTHON
+    echo "Using python interpreter: $PYTHON"
+    echo "Python version: $($PYTHON --version)"
+}
diff --git a/.buildkite/scripts/publish-release-images.sh b/.buildkite/scripts/publish-release-images.sh
new file mode 100755
index 000000000000..ec319aa76006
--- /dev/null
+++ b/.buildkite/scripts/publish-release-images.sh
@@ -0,0 +1,180 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Publish release Docker images from ECR to DockerHub.
+# Pulls per-arch images, tags with latest and versioned tags, pushes them,
+# then creates and pushes multi-arch manifests.
+
+set -euo pipefail
+
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version --default "" | sed 's/^v//')
+if [ -z "${RELEASE_VERSION}" ]; then
+  echo "ERROR: release-version metadata not set"
+  exit 1
+fi
+
+COMMIT="$BUILDKITE_COMMIT"
+ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
+
+echo "========================================"
+echo "Publishing release images v${RELEASE_VERSION}"
+echo "  Commit: ${COMMIT}"
+echo "  ROCm base cache key: ${ROCM_BASE_CACHE_KEY}"
+echo "========================================"
+
+# Login to ECR to pull staging images
+aws ecr-public get-login-password --region us-east-1 | \
+  docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
+
+# ---- CUDA (default: 13.0) ----
+
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64 vllm/vllm-openai:latest-x86_64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64 vllm/vllm-openai:latest-aarch64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest rm vllm/vllm-openai:latest || true
+docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION} || true
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
+
+# ---- CUDA 12.9 ----
+
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129 vllm/vllm-openai:latest-x86_64-cu129
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129
+docker push vllm/vllm-openai:latest-x86_64-cu129
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129 vllm/vllm-openai:latest-aarch64-cu129
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
+docker push vllm/vllm-openai:latest-aarch64-cu129
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
+
+docker manifest rm vllm/vllm-openai:latest-cu129 || true
+docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION}-cu129 || true
+docker manifest create vllm/vllm-openai:latest-cu129 vllm/vllm-openai:latest-x86_64-cu129 vllm/vllm-openai:latest-aarch64-cu129
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
+docker manifest push vllm/vllm-openai:latest-cu129
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129
+
+# ---- Ubuntu 24.04 (CUDA 13.0) ----
+
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-ubuntu2404
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-ubuntu2404
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-ubuntu2404 vllm/vllm-openai:latest-x86_64-ubuntu2404
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-ubuntu2404
+docker push vllm/vllm-openai:latest-x86_64-ubuntu2404
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-ubuntu2404
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-ubuntu2404 vllm/vllm-openai:latest-aarch64-ubuntu2404
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-ubuntu2404
+docker push vllm/vllm-openai:latest-aarch64-ubuntu2404
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-ubuntu2404
+
+docker manifest rm vllm/vllm-openai:latest-ubuntu2404 || true
+docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION}-ubuntu2404 || true
+docker manifest create vllm/vllm-openai:latest-ubuntu2404 vllm/vllm-openai:latest-x86_64-ubuntu2404 vllm/vllm-openai:latest-aarch64-ubuntu2404
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-ubuntu2404
+docker manifest push vllm/vllm-openai:latest-ubuntu2404
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-ubuntu2404
+
+# ---- Ubuntu 24.04 (CUDA 12.9) ----
+
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129-ubuntu2404
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129-ubuntu2404
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129-ubuntu2404 vllm/vllm-openai:latest-x86_64-cu129-ubuntu2404
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129-ubuntu2404
+docker push vllm/vllm-openai:latest-x86_64-cu129-ubuntu2404
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129-ubuntu2404
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129-ubuntu2404 vllm/vllm-openai:latest-aarch64-cu129-ubuntu2404
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129-ubuntu2404
+docker push vllm/vllm-openai:latest-aarch64-cu129-ubuntu2404
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129-ubuntu2404
+
+docker manifest rm vllm/vllm-openai:latest-cu129-ubuntu2404 || true
+docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION}-cu129-ubuntu2404 || true
+docker manifest create vllm/vllm-openai:latest-cu129-ubuntu2404 vllm/vllm-openai:latest-x86_64-cu129-ubuntu2404 vllm/vllm-openai:latest-aarch64-cu129-ubuntu2404
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129-ubuntu2404
+docker manifest push vllm/vllm-openai:latest-cu129-ubuntu2404
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129-ubuntu2404
+
+# ---- ROCm ----
+
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-rocm
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-rocm vllm/vllm-openai-rocm:latest
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}
+docker push vllm/vllm-openai-rocm:latest
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:latest-base
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
+docker push vllm/vllm-openai-rocm:latest-base
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
+
+# ---- CPU ----
+# CPU images are behind separate block steps and may not have been built.
+# All-or-nothing: inspect both arches first, then either publish everything
+# (per-arch + multi-arch manifest) or skip everything. Publishing only one
+# arch would leave `:latest-x86_64` pointing at the new release while the
+# `:latest` multi-arch manifest still resolves to the previous release.
+
+CPU_X86_TAG=public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
+CPU_ARM_TAG=public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
+
+CPU_X86_AVAILABLE=false
+CPU_ARM_AVAILABLE=false
+docker manifest inspect "${CPU_X86_TAG}" >/dev/null 2>&1 && CPU_X86_AVAILABLE=true
+docker manifest inspect "${CPU_ARM_TAG}" >/dev/null 2>&1 && CPU_ARM_AVAILABLE=true
+
+if [ "$CPU_X86_AVAILABLE" = "true" ] && [ "$CPU_ARM_AVAILABLE" = "true" ]; then
+  docker pull "${CPU_X86_TAG}"
+  docker tag "${CPU_X86_TAG}" vllm/vllm-openai-cpu:latest-x86_64
+  docker tag "${CPU_X86_TAG}" vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
+  docker push vllm/vllm-openai-cpu:latest-x86_64
+  docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
+
+  docker pull "${CPU_ARM_TAG}"
+  docker tag "${CPU_ARM_TAG}" vllm/vllm-openai-cpu:latest-arm64
+  docker tag "${CPU_ARM_TAG}" vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+  docker push vllm/vllm-openai-cpu:latest-arm64
+  docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+
+  docker manifest rm vllm/vllm-openai-cpu:latest || true
+  docker manifest rm vllm/vllm-openai-cpu:v${RELEASE_VERSION} || true
+  docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
+  docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+  docker manifest push vllm/vllm-openai-cpu:latest
+  docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
+elif [ "$CPU_X86_AVAILABLE" = "false" ] && [ "$CPU_ARM_AVAILABLE" = "false" ]; then
+  echo "WARNING: Neither CPU image found in ECR, skipping CPU publish (ensure block-cpu-release-image-build and block-arm64-cpu-release-image-build were unblocked and the builds finished pushing)"
+else
+  # Partial state: one arch built, the other did not. Fail loudly rather than
+  # ship a Docker Hub state where `:latest-${arch}` and `:latest` (multi-arch)
+  # disagree on which release they point at.
+  echo "ERROR: Partial CPU build detected (x86_64=${CPU_X86_AVAILABLE}, arm64=${CPU_ARM_AVAILABLE})."
+  echo "       Refusing to publish to avoid split-tag drift between per-arch and multi-arch tags."
+  echo "       Re-run the missing CPU build and retry, or manually publish if a single-arch release is intended."
+  exit 1
+fi
+
+echo ""
+echo "Successfully published release images for v${RELEASE_VERSION}"
diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
index de48eb282a65..0eadfa1f80b4 100755
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
@@ -51,6 +51,7 @@ vllm serve "$MODEL" \
   --offload-num-in-group 2 \
   --offload-prefetch-step 1 \
   --offload-params w13_weight w2_weight \
+  --generation-config vllm \
   --port "$PORT" \
   ${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} &
 SERVER_PID=$!
diff --git a/.buildkite/scripts/tool_call/run-bfcl-eval.sh b/.buildkite/scripts/tool_call/run-bfcl-eval.sh
index f3e5009e6fe3..3748cab62c7c 100755
--- a/.buildkite/scripts/tool_call/run-bfcl-eval.sh
+++ b/.buildkite/scripts/tool_call/run-bfcl-eval.sh
@@ -28,6 +28,7 @@
 #   BFCL_MAX_MODEL_LEN  - Max model length (default: 4096)
 #   BFCL_PORT           - Server port (default: 8000)
 #   BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
+#   BFCL_TEMPERATURE    - Temperature (default: 0.0)
 #   BFCL_EXTRA_ARGS     - Additional vLLM server args
 
 set -euo pipefail
@@ -43,6 +44,7 @@ TP_SIZE="${BFCL_TP_SIZE:-1}"
 MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
 PORT="${BFCL_PORT:-8000}"
 REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
+TEMPERATURE="${BFCL_TEMPERATURE:-0.0}"
 EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
 
 # Set up output directory
@@ -139,7 +141,7 @@ echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
 # be patched in-process so BFCL knows to use the OpenAI-compatible handler
 # against our local vLLM server.
 bfcl_exit_code=0
-python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
+python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$TEMPERATURE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
 import os
 import sys
 
@@ -148,7 +150,8 @@ test_category = sys.argv[2]
 num_threads = int(sys.argv[3])
 port = sys.argv[4]
 api_type = sys.argv[5]
-output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
+temperature = float(sys.argv[6])
+output_dir = sys.argv[7] if len(sys.argv) > 7 and sys.argv[7] else os.getcwd()
 
 os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
 os.environ["OPENAI_API_KEY"] = "dummy"
@@ -204,6 +207,7 @@ gen_kwargs["model"] = [model]
 gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
 gen_kwargs["skip_server_setup"] = True
 gen_kwargs["num_threads"] = num_threads
+gen_kwargs["temperature"] = temperature
 generate(**gen_kwargs)
 
 # ---- evaluate ----
diff --git a/.buildkite/scripts/upload-nightly-wheels.sh b/.buildkite/scripts/upload-nightly-wheels.sh
index cc72cda7d505..8cef31908809 100644
--- a/.buildkite/scripts/upload-nightly-wheels.sh
+++ b/.buildkite/scripts/upload-nightly-wheels.sh
@@ -2,14 +2,18 @@
 
 set -ex
 
-# Upload a single wheel to S3 (rename linux -> manylinux).
+# Upload a single wheel to S3, after detecting and applying the appropriate
+# manylinux platform tag with auditwheel.
 # Index generation is handled separately by generate-and-upload-nightly-index.sh.
 
+# shellcheck source=lib/manylinux.sh
+source .buildkite/scripts/lib/manylinux.sh
+
 BUCKET="vllm-wheels"
 SUBPATH=$BUILDKITE_COMMIT
 S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
 
-# ========= collect, rename & upload the wheel ==========
+# ========= locate the wheel ==========
 
 # Assume wheels are in artifacts/dist/*.whl
 wheel_files=(artifacts/dist/*.whl)
@@ -21,19 +25,9 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
 fi
 wheel="${wheel_files[0]}"
 
-# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
-# we also accept params as manylinux tag
-# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
-manylinux_version="${1:-manylinux_2_31}"
+# ========= detect manylinux tag and rename ==========
 
-# Rename 'linux' to the appropriate manylinux version in the wheel filename
-if [[ "$wheel" != *"linux"* ]]; then
-  echo "Error: Wheel filename does not contain 'linux': $wheel"
-  exit 1
-fi
-new_wheel="${wheel/linux/$manylinux_version}"
-mv -- "$wheel" "$new_wheel"
-wheel="$new_wheel"
+wheel="$(apply_manylinux_tag "$wheel")"
 echo "Renamed wheel to: $wheel"
 
 # Extract the version from the wheel
diff --git a/.buildkite/scripts/upload-release-wheels-pypi.sh b/.buildkite/scripts/upload-release-wheels-pypi.sh
index 058e5bbe4f4c..7e2077a2692c 100644
--- a/.buildkite/scripts/upload-release-wheels-pypi.sh
+++ b/.buildkite/scripts/upload-release-wheels-pypi.sh
@@ -39,10 +39,11 @@ fi
 
 set -x # avoid printing secrets above
 
-# install twine from pypi
+# install twine and sdist build prerequisites from pypi
 python3 -m venv /tmp/vllm-release-env
 source /tmp/vllm-release-env/bin/activate
 pip install twine
+pip install -r requirements/build/cuda.txt
 python3 -m twine --version
 
 # copy release wheels to local directory
diff --git a/.buildkite/scripts/upload-rocm-wheels.sh b/.buildkite/scripts/upload-rocm-wheels.sh
index a42848a16ffe..1f3655631204 100755
--- a/.buildkite/scripts/upload-rocm-wheels.sh
+++ b/.buildkite/scripts/upload-rocm-wheels.sh
@@ -20,10 +20,6 @@ BUCKET="${S3_BUCKET:-vllm-wheels}"
 ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
 S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
 INDICES_OUTPUT_DIR="rocm-indices"
-PYTHON="${PYTHON_PROG:-python3}"
-
-# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
-MANYLINUX_VERSION="manylinux_2_35"
 
 echo "========================================"
 echo "ROCm Wheel Upload Configuration"
@@ -34,19 +30,21 @@ echo "Commit: $BUILDKITE_COMMIT"
 echo "Branch: $BUILDKITE_BRANCH"
 echo "========================================"
 
-# ======== Part 0: Setup Python ========
+# ======== Part 0: Setup Python and helpers ========
 
-# Detect if python3.12+ is available
-has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
-if [[ "$has_new_python" -eq 0 ]]; then
-    # Use new python from docker
-    # Use --user to ensure files are created with correct ownership (not root)
-    docker pull python:3-slim
-    PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
-fi
+# Pick a Python interpreter for index generation -- local if recent
+# enough, else a one-shot docker fallback.
+# shellcheck source=lib/select-python.sh
+source .buildkite/scripts/lib/select-python.sh
+select_python
 
-echo "Using python interpreter: $PYTHON"
-echo "Python version: $($PYTHON --version)"
+# Set up auditwheel-in-a-container for the manylinux retagging step.
+# Distinct from select_python: ``manylinux.sh`` deliberately pins both
+# the Python and auditwheel versions (the script reads auditwheel
+# internals) and so always runs in a known-good container regardless
+# of what's on the agent.
+# shellcheck source=lib/manylinux.sh
+source .buildkite/scripts/lib/manylinux.sh
 
 # ======== Part 1: Collect and prepare wheels ========
 
@@ -63,11 +61,18 @@ if [ "$WHEEL_COUNT" -eq 0 ]; then
     exit 1
 fi
 
-# Rename linux to manylinux in wheel filenames
+# Detect the appropriate manylinux platform tag for any wheel that still
+# carries the generic ``linux_<arch>`` tag, and rename it in place. We use
+# auditwheel via ``apply_manylinux_tag`` (see lib/manylinux.sh) rather than
+# a hard-coded ``manylinux_2_35`` string so that the label tracks the actual
+# glibc symbol versions used by the binaries (and stays correct if the
+# rocm_base image is rebased).
+#
+# The ``linux``/``manylinux`` filter below skips both pre-tagged wheels
+# (e.g. upstream torch) and pure-Python ``-any.whl`` wheels.
 for wheel in all-rocm-wheels/*.whl; do
     if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
-        new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
-        mv -- "$wheel" "$new_wheel"
+        new_wheel="$(apply_manylinux_tag "$wheel")"
         echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
     fi
 done
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 68179dcb68cd..b3c77dcac7c8 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -230,7 +230,6 @@ steps:
   - tests/entrypoints/llm/test_collective_rpc.py
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
@@ -272,7 +271,6 @@ steps:
   - tests/v1/worker/test_worker_memory_snapshot.py
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
@@ -395,11 +393,11 @@ steps:
     # Pooling models
     - python3 pooling/embed/vision_embedding_offline.py --seed 0
     # Features demo
-    - python3 offline_inference/prefix_caching.py
+    - python3 features/automatic_prefix_caching/prefix_caching_offline.py
     - python3 offline_inference/llm_engine_example.py
     - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+    - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 
 #----------------------------------------------------------  mi250 · kernels  ----------------------------------------------------------#
 
@@ -590,7 +588,6 @@ steps:
   - vllm/platforms/rocm.py
   commands:
   - pip freeze | grep -E 'torch'
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
 
 - label: Multi-Modal Models (Extended Generation 2) # TBD
@@ -621,6 +618,7 @@ steps:
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
   agent_pool: mi250_1
   torch_nightly: true
+  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
@@ -864,7 +862,6 @@ steps:
   - tests/entrypoints/openai/test_multi_api_servers.py
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@@ -930,6 +927,7 @@ steps:
   - tests/renderers
   - tests/standalone_tests/lazy_imports.py
   - tests/tokenizers_
+  - tests/reasoning
   - tests/tool_parsers
   - tests/transformers_utils
   - tests/config
@@ -942,7 +940,7 @@ steps:
   - pytest -v -s -m 'cpu_test' multimodal
   - pytest -v -s renderers
   - pytest -v -s tokenizers_
-  - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py --ignore=reasoning/test_gemma4_reasoning_parser.py
+  - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py
   - pytest -v -s tool_parsers
   - pytest -v -s transformers_utils
   - pytest -v -s config
@@ -1108,6 +1106,7 @@ steps:
   - export VLLM_TEST_CLEAN_GPU_MEMORY=1
   - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
   - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/passes/distributed/test_tp2_ar_rms.py::test_tp2_ar_rms_fusions
 
 #-----------------------------------------------------------  mi300 · cuda  ------------------------------------------------------------#
 
@@ -1168,13 +1167,12 @@ steps:
   - vllm/v1/attention/backends/
   - vllm/v1/attention/selector.py
   - tests/distributed/test_context_parallel.py
-  - examples/offline_inference/data_parallel.py
+  - examples/features/data_parallel/data_parallel_offline.py
   - vllm/_aiter_ops.py
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - pytest -v -s tests/distributed/test_context_parallel.py
-  - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
+  - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
 
 - label: Distributed Tests (4xA100-4xMI300) # TBD
   timeout_in_minutes: 180
@@ -1186,7 +1184,6 @@ steps:
   source_file_dependencies:
   - vllm/
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - pytest -v -s distributed/test_custom_all_reduce.py
   - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
   - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
@@ -1203,17 +1200,16 @@ steps:
   - tests/distributed/test_torchrun_example.py
   - tests/distributed/test_torchrun_example_moe.py
   - examples/rl/
-  - tests/examples/offline_inference/data_parallel.py
+  - tests/examples/features/data_parallel/data_parallel_offline.py
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
   - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
   - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
   - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+  - python3 ../examples/features/data_parallel/data_parallel_offline.py --enforce-eager
   # rlhf examples
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_nccl.py
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_ipc.py
@@ -1252,7 +1248,6 @@ steps:
   - vllm/platforms/rocm.py
   commands:
   - export VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - pytest -v -s distributed/test_ray_v2_executor.py
   - pytest -v -s distributed/test_ray_v2_executor_e2e.py
   - pytest -v -s distributed/test_pipeline_parallel.py -k "ray"
@@ -1266,7 +1261,7 @@ steps:
   optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
-  - examples/offline_inference/torchrun_dp_example.py
+  - examples/features/torchrun/torchrun_dp_example_offline.py
   - vllm/config/parallel.py
   - vllm/distributed/
   - vllm/v1/engine/llm_engine.py
@@ -1274,8 +1269,7 @@ steps:
   - vllm/v1/worker/gpu_worker.py
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
-  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+  - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 
 #--------------------------------------------------------  mi300 · entrypoints  --------------------------------------------------------#
 
@@ -1654,11 +1648,11 @@ steps:
     # Pooling models
     - python3 pooling/embed/vision_embedding_offline.py --seed 0
     # Features demo
-    - python3 offline_inference/prefix_caching.py
+    - python3 features/automatic_prefix_caching/prefix_caching_offline.py
     - python3 offline_inference/llm_engine_example.py
     - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+    - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 
 #----------------------------------------------------------  mi300 · kernels  ----------------------------------------------------------#
 
@@ -1802,6 +1796,7 @@ steps:
   timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
   agent_pool: mi300_1
+  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
@@ -1843,6 +1838,7 @@ steps:
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
   agent_pool: mi300_1
   torch_nightly: true
+  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
@@ -2203,7 +2199,6 @@ steps:
   timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
   agent_pool: mi300_1
-  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
@@ -2280,7 +2275,6 @@ steps:
   - tests/entrypoints/openai/test_multi_api_servers.py
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@@ -2300,9 +2294,8 @@ steps:
   - vllm/_aiter_ops.py
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py
-  - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+  - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
   - pytest -v -s tests/v1/distributed/test_dbo.py
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py
   - pytest -v -s tests/distributed/test_packed_tensor.py
@@ -2363,7 +2356,6 @@ steps:
   - tests/distributed/test_utils
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@@ -2493,7 +2485,6 @@ steps:
   - tests/entrypoints/llm/test_collective_rpc.py
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
@@ -2518,7 +2509,6 @@ steps:
   - tests/v1/worker/test_worker_memory_snapshot.py
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
@@ -2539,7 +2529,6 @@ steps:
   - tests/distributed/test_multiproc_executor.py
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - pytest -v -s compile/fullgraph/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s distributed/test_events.py
@@ -2627,6 +2616,7 @@ steps:
   agent_pool: mi325_1
   torch_nightly: true
   parallelism: 2
+  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
@@ -2652,6 +2642,7 @@ steps:
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
   agent_pool: mi325_1
   torch_nightly: true
+  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
@@ -2713,11 +2704,10 @@ steps:
   - vllm/v1/attention/selector.py
   - tests/distributed/test_context_parallel.py
   - tests/v1/distributed/test_dbo.py
-  - examples/offline_inference/data_parallel.py
+  - examples/features/data_parallel/data_parallel_offline.py
   - vllm/_aiter_ops.py
   - vllm/platforms/rocm.py
   commands:
-  - export TORCH_NCCL_BLOCKING_WAIT=1
   - pytest -v -s tests/distributed/test_context_parallel.py
   - pytest -v -s tests/v1/distributed/test_dbo.py
 
@@ -2748,6 +2738,7 @@ steps:
   agent_pool: mi355_1
   fast_check: true
   torch_nightly: true
+  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
@@ -2763,6 +2754,7 @@ steps:
   agent_pool: mi355_1
   fast_check: true
   torch_nightly: true
+  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
@@ -2937,11 +2929,11 @@ steps:
   # Pooling models
   - python3 pooling/embed/vision_embedding_offline.py --seed 0
   # Features demo
-  - python3 offline_inference/prefix_caching.py
+  - python3 features/automatic_prefix_caching/prefix_caching_offline.py
   - python3 offline_inference/llm_engine_example.py
   - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-  - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+  - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+  - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 
 #----------------------------------------------------------  mi355 · kernels  ----------------------------------------------------------#
 
@@ -3059,6 +3051,7 @@ steps:
   timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
   agent_pool: mi355_1
+  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
@@ -3258,6 +3251,7 @@ steps:
   timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
   agent_pool: mi355_1
+  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
@@ -3283,6 +3277,7 @@ steps:
   timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
   agent_pool: mi355_1
+  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml
index 4bcf116f2756..d3947a03162b 100644
--- a/.buildkite/test_areas/attention.yaml
+++ b/.buildkite/test_areas/attention.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: V1 attention (H100)
+  key: v1-attention-h100
   timeout_in_minutes: 30
   device: h100
   source_file_dependencies:
@@ -14,8 +15,9 @@ steps:
     - pytest -v -s v1/attention
 
 - label: V1 attention (B200)
+  key: v1-attention-b200
   timeout_in_minutes: 30
-  device: b200
+  device: b200-k8s
   source_file_dependencies:
     - vllm/config/attention.py
     - vllm/model_executor/layers/attention
diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml
index 042734e8433b..5d547cd48637 100644
--- a/.buildkite/test_areas/basic_correctness.yaml
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Basic Correctness
+  key: basic-correctness
   timeout_in_minutes: 30
   device: h200_18gb
   source_file_dependencies:
diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml
index 4cda6fff1443..85f804780179 100644
--- a/.buildkite/test_areas/benchmarks.yaml
+++ b/.buildkite/test_areas/benchmarks.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Benchmarks CLI Test
+  key: benchmarks-cli-test
   timeout_in_minutes: 20
   device: h200_18gb
   source_file_dependencies:
@@ -12,7 +13,8 @@ steps:
   - pytest -v -s benchmarks/
 
 - label: Attention Benchmarks Smoke Test (B200)
-  device: b200
+  key: attention-benchmarks-smoke-test-b200
+  device: b200-k8s
   num_gpus: 2
   optional: true
   working_dir: "/vllm-workspace/"
diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml
index aa46447c24af..01248738d519 100644
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Sequence Parallel Correctness Tests (2 GPUs)
+  key: sequence-parallel-correctness-tests-2-gpus
   timeout_in_minutes: 50
   working_dir: "/vllm-workspace/"
   num_devices: 2
@@ -17,6 +18,7 @@ steps:
   - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
 
 - label: Sequence Parallel Correctness Tests (2xH100)
+  key: sequence-parallel-correctness-tests-2xh100
   timeout_in_minutes: 50
   working_dir: "/vllm-workspace/"
   device: h100
@@ -27,6 +29,7 @@ steps:
   - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
 
 - label: AsyncTP Correctness Tests (2xH100)
+  key: asynctp-correctness-tests-2xh100
   timeout_in_minutes: 50
   working_dir: "/vllm-workspace/"
   device: h100
@@ -37,9 +40,10 @@ steps:
   - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
 
 - label: AsyncTP Correctness Tests (B200)
+  key: asynctp-correctness-tests-b200
   timeout_in_minutes: 50
   working_dir: "/vllm-workspace/"
-  device: b200
+  device: b200-k8s
   optional: true
   num_devices: 2
   commands:
@@ -47,6 +51,7 @@ steps:
   - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
 
 - label: Distributed Compile Unit Tests (2xH100)
+  key: distributed-compile-unit-tests-2xh100
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/"
   device: h100
@@ -60,9 +65,10 @@ steps:
   - pytest -s -v tests/compile/passes/distributed
 
 - label: Fusion and Compile Unit Tests (2xB200)
+  key: fusion-and-compile-unit-tests-2xb200
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/"
-  device: b200
+  device: b200-k8s
   source_file_dependencies:
   - csrc/quantization/fp4/
   - vllm/model_executor/layers/quantization/
@@ -89,6 +95,7 @@ steps:
     - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
 
 - label: Fusion E2E Quick (H100)
+  key: fusion-e2e-quick-h100
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/"
   device: h100
@@ -107,6 +114,7 @@ steps:
     - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"
 
 - label: Fusion E2E Config Sweep (H100)
+  key: fusion-e2e-config-sweep-h100
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
   device: h100
@@ -126,9 +134,10 @@ steps:
     - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
 
 - label: Fusion E2E Config Sweep (B200)
+  key: fusion-e2e-config-sweep-b200
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
-  device: b200
+  device: b200-k8s
   num_devices: 1
   optional: true
   commands:
@@ -139,6 +148,7 @@ steps:
     - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"
 
 - label: Fusion E2E TP2 Quick (H100)
+  key: fusion-e2e-tp2-quick-h100
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/"
   device: h100
@@ -156,6 +166,7 @@ steps:
     - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
 
 - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
+  key: fusion-e2e-tp2-ar-rms-config-sweep-h100
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   device: h100
@@ -175,6 +186,7 @@ steps:
     - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
 
 - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
+  key: fusion-e2e-tp2-asynctp-config-sweep-h100
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   device: h100
@@ -194,9 +206,10 @@ steps:
     - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
 
 - label: Fusion E2E TP2 (B200)
+  key: fusion-e2e-tp2-b200
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/"
-  device: b200
+  device: b200-k8s
   num_devices: 2
   source_file_dependencies:
     - csrc/quantization/
diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml
index 4d1efdb13c88..9cb19ac40801 100644
--- a/.buildkite/test_areas/cuda.yaml
+++ b/.buildkite/test_areas/cuda.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Platform Tests (CUDA)
+  key: platform-tests-cuda
   timeout_in_minutes: 15
   device: h200_18gb
   source_file_dependencies:
@@ -13,6 +14,7 @@ steps:
     - pytest -v -s cuda/test_platform_no_cuda_init.py
 
 - label: Cudagraph
+  key: cudagraph
   timeout_in_minutes: 20
   source_file_dependencies:
   - tests/v1/cudagraph
diff --git a/.buildkite/test_areas/disaggregated.yaml b/.buildkite/test_areas/disaggregated.yaml
index a10fda41ef0d..e68b9e1add8b 100644
--- a/.buildkite/test_areas/disaggregated.yaml
+++ b/.buildkite/test_areas/disaggregated.yaml
@@ -3,65 +3,71 @@ depends_on:
   - image-build
 steps:
 - label: Distributed NixlConnector PD accuracy (4 GPUs)
+  key: distributed-nixlconnector-pd-accuracy-4-gpus
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_devices: 4
   source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
     - tests/v1/kv_connector/nixl_integration/
   commands:
     - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
     - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 - label: Distributed FlashInfer NixlConnector PD accuracy (4 GPUs)
+  key: distributed-flashinfer-nixlconnector-pd-accuracy-4-gpus
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_devices: 4
   source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
     - tests/v1/kv_connector/nixl_integration/
   commands:
     - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
     - FLASHINFER=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
 - label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
+  key: dp-ep-distributed-nixlconnector-pd-accuracy-tests-4-gpus
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_devices: 4
   source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
     - tests/v1/kv_connector/nixl_integration/
   commands:
     - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
     - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
 - label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
+  key: crosslayer-kv-layout-distributed-nixlconnector-pd-accuracy-tests-4-gpus
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_devices: 4
   source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
     - tests/v1/kv_connector/nixl_integration/
   commands:
     - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
     - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
 - label: Hybrid SSM NixlConnector PD accuracy tests (4 GPUs)
+  key: hybrid-ssm-nixlconnector-pd-accuracy-tests-4-gpus
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_devices: 4
   source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
     - tests/v1/kv_connector/nixl_integration/
   commands:
     - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
     - HYBRID_SSM=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
 - label: MultiConnector (Nixl+Offloading) PD accuracy (2 GPUs)
+  key: multiconnector-nixl-offloading-pd-accuracy-2-gpus
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_devices: 2
   source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
     - vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
     - vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
     - vllm/distributed/kv_transfer/kv_connector/v1/offloading/
@@ -71,12 +77,13 @@ steps:
     - bash v1/kv_connector/nixl_integration/run_multi_connector_accuracy_test.sh
 
 - label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
+  key: nixlconnector-pd-spec-decode-acceptance-2-gpus
   timeout_in_minutes: 30
   device: a100
   working_dir: "/vllm-workspace/tests"
   num_devices: 2
   source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
     - vllm/v1/worker/kv_connector_model_runner_mixin.py
     - tests/v1/kv_connector/nixl_integration/
   commands:
@@ -84,11 +91,12 @@ steps:
     - bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
 
 - label: MultiConnector (Nixl+Offloading) PD edge cases (2 GPUs)
+  key: multiconnector-nixl-offloading-pd-edge-cases-2-gpus
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_devices: 2
   source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
     - vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
     - vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
     - vllm/distributed/kv_transfer/kv_connector/v1/offloading/
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index 093f3ab4fe1f..8aa41a9a26ab 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Distributed Comm Ops
+  key: distributed-comm-ops
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_devices: 2
@@ -16,6 +17,7 @@ steps:
   - pytest -v -s distributed/test_shm_storage.py
 
 - label: Distributed DP Tests (2 GPUs)
+  key: distributed-dp-tests-2-gpus
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_devices: 2
@@ -37,6 +39,7 @@ steps:
   - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py
 
 - label: Distributed Compile + RPC Tests (2 GPUs)
+  key: distributed-compile-rpc-tests-2-gpus
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_devices: 2
@@ -59,6 +62,7 @@ steps:
   - pytest -v -s ./compile/test_wrapper.py
 
 - label: Distributed Torchrun + Shutdown Tests (2 GPUs)
+  key: distributed-torchrun-shutdown-tests-2-gpus
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_devices: 2
@@ -81,6 +85,7 @@ steps:
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py
 
 - label: Distributed Torchrun + Examples (4 GPUs)
+  key: distributed-torchrun-examples-4-gpus
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace"
   num_devices: 4
@@ -88,9 +93,8 @@ steps:
   - vllm/distributed/
   - tests/distributed/test_torchrun_example.py
   - tests/distributed/test_torchrun_example_moe.py
-  - examples/offline_inference/rlhf_colocate.py
   - examples/rl/
-  - tests/examples/offline_inference/data_parallel.py
+  - tests/examples/features/data_parallel/data_parallel_offline.py
   commands:
   # https://github.com/NVIDIA/nccl/issues/1838
   - export NCCL_CUMEM_HOST_ENABLE=0
@@ -107,12 +111,13 @@ steps:
   # test with torchrun tp=2 and dp=2 with ep
   - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
   # test with internal dp
-  - python3 examples/offline_inference/data_parallel.py --enforce-eager
+  - python3 examples/features/data_parallel/data_parallel_offline.py --enforce-eager
   # rlhf examples
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py
 
 - label: Distributed DP Tests (4 GPUs)
+  key: distributed-dp-tests-4-gpus
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_devices: 4
@@ -133,6 +138,7 @@ steps:
   - pytest -v -s distributed/test_utils.py
 
 - label: Distributed Compile + Comm (4 GPUs)
+  key: distributed-compile-comm-4-gpus
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_devices: 4
@@ -154,24 +160,28 @@ steps:
   - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
 
 - label: Distributed Tests (8 GPUs)(H100)
+  key: distributed-tests-8-gpus-h100
   timeout_in_minutes: 10
   device: h100
   num_devices: 8
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
-  - examples/offline_inference/torchrun_dp_example.py
+  - examples/features/torchrun/torchrun_dp_example_offline.py
   - vllm/config/parallel.py
   - vllm/distributed/
   - vllm/v1/engine/llm_engine.py
   - vllm/v1/executor/uniproc_executor.py
   - vllm/v1/worker/gpu_worker.py
+  - tests/distributed/test_mnnvl_alltoall.py
+
   commands:
   # https://github.com/NVIDIA/nccl/issues/1838
   - export NCCL_CUMEM_HOST_ENABLE=0
   # test with torchrun tp=2 and dp=4 with ep
-  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+  - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 
 - label: Distributed Tests (4 GPUs)(A100)
+  key: distributed-tests-4-gpus-a100
   device: a100
   optional: true
   num_devices: 4
@@ -186,6 +196,7 @@ steps:
   - pytest -v -s -x lora/test_mixtral.py
 
 - label: Distributed Tests (2 GPUs)(H100)
+  key: distributed-tests-2-gpus-h100
   timeout_in_minutes: 15
   device: h100
   optional: true
@@ -194,13 +205,14 @@ steps:
   commands:
     - pytest -v -s tests/distributed/test_context_parallel.py
     - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py
-    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
     - pytest -v -s tests/v1/distributed/test_dbo.py
     - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py
     - pytest -v -s tests/distributed/test_packed_tensor.py
 
 - label: Distributed Tests (2 GPUs)(B200)
-  device: b200
+  key: distributed-tests-2-gpus-b200
+  device: b200-k8s
   optional: true
   working_dir: "/vllm-workspace/"
   num_devices: 2
@@ -208,8 +220,12 @@ steps:
     - pytest -v -s tests/distributed/test_context_parallel.py
     - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
     - pytest -v -s tests/v1/distributed/test_dbo.py
+    - pytest -v -s tests/distributed/test_mnnvl_alltoall.py
+
+    
 
 - label: 2 Node Test (4 GPUs)
+  key: 2-node-test-4-gpus
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_devices: 2
@@ -222,11 +238,12 @@ steps:
   - vllm/executor/
   - vllm/model_executor/models/
   - tests/distributed/
-  - tests/examples/offline_inference/data_parallel.py
+  - tests/examples/features/data_parallel/data_parallel_offline.py
   commands:
-    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
+    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/features/data_parallel/data_parallel_offline.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/features/data_parallel/data_parallel_offline.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
 
 - label: Pipeline + Context Parallelism (4 GPUs)
+  key: pipeline-context-parallelism-4-gpus
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/tests"
   num_devices: 4
@@ -241,6 +258,7 @@ steps:
   - pytest -v -s distributed/test_pipeline_parallel.py
 
 - label: RayExecutorV2 (4 GPUs)
+  key: rayexecutorv2-4-gpus
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/tests"
   num_devices: 4
diff --git a/.buildkite/test_areas/docker.yaml b/.buildkite/test_areas/docker.yaml
new file mode 100644
index 000000000000..9bf96221abe0
--- /dev/null
+++ b/.buildkite/test_areas/docker.yaml
@@ -0,0 +1,16 @@
+group: Docker
+depends_on:
+  - image-build-cpu
+steps:
+- label: Docker Build Metadata
+  timeout_in_minutes: 10
+  device: cpu-small
+  source_file_dependencies:
+    - .buildkite/release-pipeline.yaml
+    - .buildkite/scripts/docker-build-metadata-args.sh
+    - docker/Dockerfile
+    - docker/Dockerfile.cpu
+    - docker/docker-bake.hcl
+    - tests/tools/test_docker_build_metadata_args.py
+  commands:
+    - pytest -v -s tools/test_docker_build_metadata_args.py
diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
index 857fefd268a4..bb8aa14eac18 100644
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: DeepSeek V2-Lite Accuracy
+  key: deepseek-v2-lite-accuracy
   timeout_in_minutes: 60
   device: h100
   optional: true
@@ -12,6 +13,7 @@ steps:
   - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
 
 - label: Qwen3-30B-A3B-FP8-block Accuracy
+  key: qwen3-30b-a3b-fp8-block-accuracy
   timeout_in_minutes: 60
   device: h100
   optional: true
@@ -21,8 +23,9 @@ steps:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
 
 - label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  key: qwen3-30b-a3b-fp8-block-accuracy-b200
   timeout_in_minutes: 60
-  device: b200
+  device: b200-k8s
   optional: true
   num_devices: 2
   working_dir: "/vllm-workspace"
@@ -30,6 +33,7 @@ steps:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
 
 - label: Qwen3-30B-A3B-FP8 DP4 Async EPLB Accuracy
+  key: qwen3-30b-a3b-fp8-dp4-async-eplb-accuracy
   timeout_in_minutes: 60
   device: h100
   optional: true
@@ -39,6 +43,7 @@ steps:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_dp4_async_eplb.sh 0.8 200 8050
 
 - label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
+  key: deepseek-v2-lite-prefetch-offload-accuracy-h100
   timeout_in_minutes: 60
   device: h100
   optional: true
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
index 5e4361ec9ad6..cf0f028255d2 100644
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Engine
+  key: engine
   timeout_in_minutes: 15
   device: h200_18gb
   source_file_dependencies:
@@ -12,10 +13,12 @@ steps:
   - tests/test_config
   - tests/test_logger
   - tests/test_vllm_port
+  - tests/test_jit_monitor.py
   commands:
-  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py test_jit_monitor.py
 
 - label: Engine (1 GPU)
+  key: engine-1-gpu
   timeout_in_minutes: 30
   source_file_dependencies:
     - vllm/v1/engine/
@@ -25,6 +28,7 @@ steps:
     - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
 
 - label: e2e Scheduling (1 GPU)
+  key: e2e-scheduling-1-gpu
   timeout_in_minutes: 30
   device: h200_18gb
   source_file_dependencies:
@@ -34,6 +38,7 @@ steps:
     - pytest -v -s v1/e2e/general/test_async_scheduling.py
 
 - label: e2e Core (1 GPU)
+  key: e2e-core-1-gpu
   timeout_in_minutes: 30
   source_file_dependencies:
     - vllm/v1/
@@ -42,6 +47,7 @@ steps:
     - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
 
 - label: V1 e2e (2 GPUs)
+  key: v1-e2e-2-gpus
   timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
   optional: true
   num_devices: 2
@@ -51,13 +57,9 @@ steps:
   commands:
     # Only run tests that need exactly 2 GPUs
     - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
-  mirror:
-    amd:
-      device: mi325_2
-      depends_on:
-      - image-build-amd
 
 - label: V1 e2e (4 GPUs)
+  key: v1-e2e-4-gpus
   timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
   optional: true
   num_devices: 4
@@ -67,13 +69,9 @@ steps:
   commands:
     # Only run tests that need 4 GPUs
     - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
-  mirror:
-    amd:
-      device: mi325_4
-      depends_on:
-      - image-build-amd
 
 - label: V1 e2e (4xH100)
+  key: v1-e2e-4xh100
   timeout_in_minutes: 60
   device: h100
   num_devices: 4
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
index 8c2b529a8068..ba92d3a3aec0 100644
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -2,7 +2,8 @@ group: Entrypoints
 depends_on: 
   - image-build
 steps:
-- label: Entrypoints Unit Tests  
+- label: Entrypoints Unit Tests
+  key: entrypoints-unit-tests
   timeout_in_minutes: 10
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
@@ -13,6 +14,7 @@ steps:
   - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
 
 - label: Entrypoints Integration (LLM)
+  key: entrypoints-integration-llm
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
@@ -24,8 +26,14 @@ steps:
   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  mirror:
+    amd:
+      device: mi300_1
+      depends_on:
+      - image-build-amd
 
 - label: Entrypoints Integration (API Server openai - Part 1)
+  key: entrypoints-integration-api-server-openai-part-1
   timeout_in_minutes: 50
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
@@ -35,14 +43,10 @@ steps:
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
 
 
 - label: Entrypoints Integration (API Server openai - Part 2)
+  key: entrypoints-integration-api-server-openai-part-2
   timeout_in_minutes: 50
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
@@ -53,13 +57,9 @@ steps:
   - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py
   - pytest -v -s entrypoints/openai/speech_to_text/
   - pytest -v -s entrypoints/test_chat_utils.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
 
 - label: Entrypoints Integration (API Server openai - Part 3)
+  key: entrypoints-integration-api-server-openai-part-3
   timeout_in_minutes: 50
   device: h200_18gb
   working_dir: "/vllm-workspace/tests"
@@ -72,6 +72,7 @@ steps:
   - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
 
 - label: Entrypoints Integration (API Server 2)
+  key: entrypoints-integration-api-server-2
   timeout_in_minutes: 130
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
@@ -86,6 +87,7 @@ steps:
   - pytest -v -s tool_use
 
 - label: Entrypoints Integration (Pooling)
+  key: entrypoints-integration-pooling
   timeout_in_minutes: 50
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
@@ -96,6 +98,7 @@ steps:
   - pytest -v -s entrypoints/pooling
 
 - label: Entrypoints Integration (Responses API)
+  key: entrypoints-integration-responses-api
   timeout_in_minutes: 50
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
@@ -105,6 +108,7 @@ steps:
   - pytest -v -s entrypoints/openai/responses
 
 - label: OpenAI API Correctness
+  key: openai-api-correctness
   timeout_in_minutes: 30
   device: h200_18gb
   source_file_dependencies:
diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml
index c2adf52a2d57..0f7ab0d7157c 100644
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: EPLB Algorithm
+  key: eplb-algorithm
   timeout_in_minutes: 15
   device: h200_18gb
   working_dir: "/vllm-workspace/tests"
@@ -15,6 +16,7 @@ steps:
   - pytest -v -s distributed/test_eplb_utils.py
 
 - label: EPLB Execution # 17min
+  key: eplb-execution
   timeout_in_minutes: 27
   working_dir: "/vllm-workspace/tests"
   num_devices: 4
@@ -26,6 +28,7 @@ steps:
   - pytest -v -s distributed/test_eplb_spec_decode.py
 
 - label: Elastic EP Scaling Test
+  key: elastic-ep-scaling-test
   timeout_in_minutes: 20
   device: h100
   working_dir: "/vllm-workspace/tests"
diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml
index 86e09f3de4b5..34e1e4832d9d 100644
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: vLLM IR Tests
+  key: vllm-ir-tests
   timeout_in_minutes: 10
   device: h200_18gb
   working_dir: "/vllm-workspace/"
@@ -14,6 +15,7 @@ steps:
     - pytest -v -s tests/kernels/ir
 
 - label: Kernels Core Operation Test
+  key: kernels-core-operation-test
   timeout_in_minutes: 75
   source_file_dependencies:
   - csrc/
@@ -23,6 +25,7 @@ steps:
     - pytest -v -s kernels/core --ignore=kernels/core/test_minimax_reduce_rms.py  kernels/test_concat_mla_q.py
 
 - label: Kernels MiniMax Reduce RMS Test (2 GPUs)
+  key: kernels-minimax-reduce-rms-test-2-gpus
   timeout_in_minutes: 15
   num_devices: 2
   device: h100
@@ -36,6 +39,7 @@ steps:
     - pytest -v -s kernels/core/test_minimax_reduce_rms.py
 
 - label: Kernels Attention Test %N
+  key: kernels-attention-test
   timeout_in_minutes: 35
   source_file_dependencies:
   - csrc/attention/
@@ -49,6 +53,7 @@ steps:
   parallelism: 2
 
 - label: Kernels Quantization Test %N
+  key: kernels-quantization-test
   timeout_in_minutes: 90
   source_file_dependencies:
   - csrc/quantization/
@@ -59,6 +64,7 @@ steps:
   parallelism: 2
 
 - label: Kernels MoE Test %N
+  key: kernels-moe-test
   timeout_in_minutes: 25
   source_file_dependencies:
   - csrc/quantization/cutlass_w8a8/moe/
@@ -74,6 +80,7 @@ steps:
   parallelism: 5
 
 - label: Kernels Mamba Test
+  key: kernels-mamba-test
   timeout_in_minutes: 45
   source_file_dependencies:
   - csrc/mamba/
@@ -82,7 +89,18 @@ steps:
   commands:
     - pytest -v -s kernels/mamba
 
+- label: Kernels KDA Test
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - vllm/model_executor/layers/fla/ops/kda.py
+  - vllm/model_executor/layers/fla/ops/chunk_delta_h.py
+  - vllm/model_executor/layers/fla/ops/l2norm.py
+  - tests/kernels/test_kda.py
+  commands:
+    - pytest -v -s kernels/test_kda.py
+
 - label: Kernels DeepGEMM Test (H100)
+  key: kernels-deepgemm-test-h100
   timeout_in_minutes: 45
   device: h100
   num_devices: 1
@@ -104,9 +122,10 @@ steps:
     - pytest -v -s quantization/test_cutlass_w4a16.py
 
 - label: Kernels (B200)
+  key: kernels-b200
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
-  device: b200
+  device: b200-k8s
   # optional: true
   source_file_dependencies:
   - csrc/quantization/fp4/
@@ -152,6 +171,7 @@ steps:
     - pytest -v -s tests/models/quantization/test_nvfp4.py
 
 - label: Kernels Helion Test
+  key: kernels-helion-test
   timeout_in_minutes: 30
   device: h100
   source_file_dependencies:
@@ -163,6 +183,7 @@ steps:
 
  
 - label: Kernels FP8 MoE Test (1 H100)
+  key: kernels-fp8-moe-test-1-h100
   timeout_in_minutes: 90
   device: h100
   num_devices: 1
@@ -179,6 +200,7 @@ steps:
     - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
 
 - label: Kernels FP8 MoE Test (2 H100s)
+  key: kernels-fp8-moe-test-2-h100s
   timeout_in_minutes: 90
   device: h100
   num_devices: 2
@@ -188,8 +210,9 @@ steps:
     - pytest -v -s kernels/moe/test_deepep_moe.py
 
 - label: Kernels Fp4 MoE Test (B200)
+  key: kernels-fp4-moe-test-b200
   timeout_in_minutes: 60
-  device: b200
+  device: b200-k8s
   num_devices: 1
   optional: true
   commands:
@@ -200,6 +223,7 @@ steps:
 
 
 - label: Kernels FusedMoE Layer Test (2 H100s)
+  key: kernels-fusedmoe-layer-test-2-h100s
   timeout_in_minutes: 90
   device: h100
   num_devices: 2
@@ -216,6 +240,7 @@ steps:
 
 
 - label: Kernels FusedMoE Layer Test (2 B200s)
+  key: kernels-fusedmoe-layer-test-2-b200s
   timeout_in_minutes: 90
   device: b200
   num_devices: 2
diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml
index a07d702cf3ce..e5a163d17c7e 100644
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: LM Eval Small Models
+  key: lm-eval-small-models
   timeout_in_minutes: 75
   source_file_dependencies:
   - csrc/
@@ -24,6 +25,7 @@ steps:
 #   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
 - label: LM Eval Large Models (4 GPUs)(H100)
+  key: lm-eval-large-models-4-gpus-h100
   device: h100
   optional: true
   num_devices: 4
@@ -36,6 +38,7 @@ steps:
     - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
 
 - label: LM Eval Small Models (B200)
+  key: lm-eval-small-models-b200
   timeout_in_minutes: 120
   device: b200
   optional: true
@@ -46,8 +49,9 @@ steps:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
 
 - label: LM Eval Qwen3.5 Models (B200)
+  key: lm-eval-qwen3-5-models-b200
   timeout_in_minutes: 120
-  device: b200
+  device: b200-k8s
   optional: true
   num_devices: 2
   source_file_dependencies:
@@ -62,6 +66,7 @@ steps:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
 
 - label: LM Eval Large Models (H200)
+  key: lm-eval-large-models-h200
   timeout_in_minutes: 60
   device: h200
   optional: true
@@ -70,6 +75,7 @@ steps:
     - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
 
 - label: MoE Refactor Integration Test (H100 - TEMPORARY)
+  key: moe-refactor-integration-test-h100-temporary
   device: h100
   optional: true
   num_devices: 2
@@ -77,13 +83,15 @@ steps:
     - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
   
 - label: MoE Refactor Integration Test (B200 - TEMPORARY)
-  device: b200
+  key: moe-refactor-integration-test-b200-temporary
+  device: b200-k8s
   optional: true
   num_devices: 2
   commands:
     - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
 
 - label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
+  key: moe-refactor-integration-test-b200-dp-temporary
   device: b200
   optional: true
   num_devices: 2
@@ -92,6 +100,7 @@ steps:
 
 
 - label: LM Eval TurboQuant KV Cache
+  key: lm-eval-turboquant-kv-cache
   timeout_in_minutes: 75
   source_file_dependencies:
   - vllm/model_executor/layers/quantization/turboquant/
@@ -102,6 +111,7 @@ steps:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/models-turboquant.txt
 
 - label: GPQA Eval (GPT-OSS) (H100)
+  key: gpqa-eval-gpt-oss-h100
   timeout_in_minutes: 120
   device: h100
   optional: true
@@ -115,6 +125,7 @@ steps:
     - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
 
 - label: GPQA Eval (GPT-OSS) (B200)
+  key: gpqa-eval-gpt-oss-b200
   timeout_in_minutes: 120
   device: b200
   optional: true
@@ -126,3 +137,10 @@ steps:
   commands:
     - uv pip install --system 'gpt-oss[eval]==0.0.5'
     - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt
+
+- label: MRCR Eval Small Models
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - tests/evals/mrcr/
+  commands:
+    - pytest -s -v evals/mrcr/test_mrcr_correctness.py --config-list-file=evals/mrcr/configs/models-small.txt
diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
index 21f392ff737b..f540eb2fcc2a 100644
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: LoRA %N
+  key: lora
   timeout_in_minutes: 30
   source_file_dependencies:
   - vllm/lora
@@ -13,6 +14,7 @@ steps:
 
 
 - label: LoRA TP (Distributed)
+  key: lora-tp-distributed
   timeout_in_minutes: 30
   num_devices: 4
   source_file_dependencies:
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index d0930be156d2..c34d4c10b49a 100644
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: V1 Spec Decode
+  key: v1-spec-decode
   timeout_in_minutes: 30
   source_file_dependencies:
     - vllm/
@@ -11,13 +12,9 @@ steps:
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     # TODO: create another `optional` test group for slow tests
     - pytest -v -s -m 'not slow_test' v1/spec_decode
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
 
 - label: V1 Sample + Logits
+  key: v1-sample-logits
   timeout_in_minutes: 30
   device: h200_18gb
   source_file_dependencies:
@@ -36,11 +33,12 @@ steps:
     - pytest -v -s v1/test_outputs.py
   mirror:
     amd:
-      device: mi325_1
+      device: mi300_1
       depends_on:
       - image-build-amd
 
 - label: V1 Core + KV + Metrics
+  key: v1-core-kv-metrics
   timeout_in_minutes: 30
   source_file_dependencies:
     - vllm/
@@ -64,13 +62,9 @@ steps:
     # Integration test for streaming correctness (requires special branch).
     - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
     - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
 
 - label: V1 Others (CPU)
+  key: v1-others-cpu
   depends_on:
     - image-build-cpu
   source_file_dependencies:
@@ -86,6 +80,7 @@ steps:
     - pytest -v -s -m 'cpu_test' v1/metrics
 
 - label: Regression
+  key: regression
   timeout_in_minutes: 20
   device: h200_18gb
   source_file_dependencies:
@@ -97,6 +92,7 @@ steps:
   working_dir: "/vllm-workspace/tests" # optional
 
 - label: Examples
+  key: examples
   timeout_in_minutes: 45
   working_dir: "/vllm-workspace/examples"
   source_file_dependencies:
@@ -120,14 +116,15 @@ steps:
      # for pooling models
     - python3 pooling/embed/vision_embedding_offline.py --seed 0
     # for features demo
-    - python3 offline_inference/prefix_caching.py
+    - python3 features/automatic_prefix_caching/prefix_caching_offline.py
     - python3 offline_inference/llm_engine_example.py
     - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
     # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+    - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 
 - label: Metrics, Tracing (2 GPUs)
+  key: metrics-tracing-2-gpus
   timeout_in_minutes: 20
   num_devices: 2
   source_file_dependencies:
@@ -142,6 +139,7 @@ steps:
   - pytest -v -s v1/tracing
 
 - label: Python-only Installation
+  key: python-only-installation
   depends_on: ~
   timeout_in_minutes: 20
   source_file_dependencies:
@@ -151,6 +149,7 @@ steps:
   - bash standalone_tests/python_only_compile.sh
 
 - label: Async Engine, Inputs, Utils, Worker
+  key: async-engine-inputs-utils-worker
   timeout_in_minutes: 50
   source_file_dependencies:
   - vllm/
@@ -163,7 +162,8 @@ steps:
   - pytest -v -s utils_
 
 - label: Async Engine, Inputs, Utils, Worker, Config (CPU)
-  depends_on: 
+  key: async-engine-inputs-utils-worker-config-cpu
+  depends_on:
   - image-build-cpu
   timeout_in_minutes: 30
   source_file_dependencies:
@@ -190,12 +190,13 @@ steps:
   - pytest -v -s -m 'cpu_test' multimodal
   - pytest -v -s renderers
   - pytest -v -s tokenizers_
-  - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py --ignore=reasoning/test_gemma4_reasoning_parser.py
+  - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py
   - pytest -v -s tool_parsers
   - pytest -v -s transformers_utils
   - pytest -v -s config
 
 - label: Batch Invariance (H100)
+  key: batch-invariance-h100
   timeout_in_minutes: 30
   device: h100
   source_file_dependencies:
@@ -211,8 +212,9 @@ steps:
     - VLLM_TEST_MODEL=Qwen/Qwen3-30B-A3B-Thinking-2507-FP8 pytest -v -s v1/determinism/test_batch_invariance.py::test_v1_generation_is_deterministic_across_batch_sizes_with_needle[FLASH_ATTN]
 
 - label: Batch Invariance (B200)
+  key: batch-invariance-b200
   timeout_in_minutes: 30
-  device: b200
+  device: b200-k8s
   source_file_dependencies:
     - vllm/v1/attention
     - vllm/model_executor/layers
@@ -227,6 +229,7 @@ steps:
     - pytest -v -s v1/determinism/test_nvfp4_batch_invariant.py
   
 - label: Acceptance Length Test (Large Models) # optional
+  key: acceptance-length-test-large-models
   timeout_in_minutes: 25
   gpu: h100
   optional: true
diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml
index 212abfdbb906..c41ef8a7110d 100644
--- a/.buildkite/test_areas/model_executor.yaml
+++ b/.buildkite/test_areas/model_executor.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Model Executor
+  key: model-executor
   timeout_in_minutes: 35
   source_file_dependencies:
   - vllm/engine/arg_utils.py
diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml
index 2b88c00d6b77..6a4338a5e40a 100644
--- a/.buildkite/test_areas/model_runner_v2.yaml
+++ b/.buildkite/test_areas/model_runner_v2.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Model Runner V2 Core Tests
+  key: model-runner-v2-core-tests
   timeout_in_minutes: 45
   source_file_dependencies:
   - vllm/v1/worker/gpu/
@@ -25,14 +26,16 @@ steps:
   - pytest -v -s entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
 
 - label: Model Runner V2 Examples
+  key: model-runner-v2-examples
   timeout_in_minutes: 45
   working_dir: "/vllm-workspace/examples"
   source_file_dependencies:
     - vllm/v1/worker/gpu/
     - vllm/v1/core/sched/
     - vllm/v1/worker/gpu_worker.py
-    - examples/offline_inference/
     - examples/basic/offline_inference/
+    - examples/generate/multimodal/
+    - examples/features/
     - examples/pooling/embed/vision_embedding_offline.py
     - examples/others/tensorize_vllm_model.py
   commands:
@@ -51,14 +54,15 @@ steps:
     # for pooling models
     - python3 pooling/embed/vision_embedding_offline.py --seed 0
     # for features demo
-    - python3 offline_inference/prefix_caching.py
+    - python3 features/automatic_prefix_caching/prefix_caching_offline.py
     - python3 offline_inference/llm_engine_example.py
     - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
     # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+    - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 
 - label: Model Runner V2 Distributed (2 GPUs)
+  key: model-runner-v2-distributed-2-gpus
   timeout_in_minutes: 45
   working_dir: "/vllm-workspace/tests"
   num_devices: 2
@@ -79,6 +83,7 @@ steps:
     - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
 
 - label: Model Runner V2 Pipeline Parallelism (4 GPUs)
+  key: model-runner-v2-pipeline-parallelism-4-gpus
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/tests"
   num_devices: 4
@@ -94,6 +99,7 @@ steps:
     - pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
 
 - label: Model Runner V2 Spec Decode
+  key: model-runner-v2-spec-decode
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
index 73cf8c53bc92..8fca203de44f 100644
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Basic Models Tests (Initialization)
+  key: basic-models-tests-initialization
   timeout_in_minutes: 45
   torch_nightly: true
   source_file_dependencies:
@@ -16,6 +17,7 @@ steps:
     torch_nightly: {}
 
 - label: Basic Models Tests (Extra Initialization) %N
+  key: basic-models-tests-extra-initialization
   timeout_in_minutes: 45
   source_file_dependencies:
   - vllm/model_executor/models/
@@ -31,6 +33,7 @@ steps:
     torch_nightly: {}
 
 - label: Basic Models Tests (Other)
+  key: basic-models-tests-other
   timeout_in_minutes: 45
   source_file_dependencies:
   - vllm/
@@ -39,14 +42,9 @@ steps:
   - tests/models/test_registry.py
   commands:
     - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-
 
 - label: Basic Models Test (Other CPU) # 5min
+  key: basic-models-test-other-cpu
   depends_on:
   - image-build-cpu
   timeout_in_minutes: 10
@@ -59,6 +57,7 @@ steps:
     - pytest -v -s models/test_utils.py models/test_vision.py
 
 - label: Transformers Nightly Models
+  key: transformers-nightly-models
   working_dir: "/vllm-workspace/"
   optional: true
   soft_fail: true
@@ -74,6 +73,7 @@ steps:
     - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/generate/multimodal/audio_language_offline.py --model-type whisper
 
 - label: Transformers Backward Compatibility Models Test
+  key: transformers-backward-compatibility-models-test
   working_dir: "/vllm-workspace/"
   optional: true
   soft_fail: true
diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml
index 55e7410b8af4..b5758c55affa 100644
--- a/.buildkite/test_areas/models_distributed.yaml
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Distributed Model Tests (2 GPUs)
+  key: distributed-model-tests-2-gpus
   timeout_in_minutes: 50
   working_dir: "/vllm-workspace/tests"
   num_devices: 2
diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml
index c13371e25f1d..b560c5a4769a 100644
--- a/.buildkite/test_areas/models_language.yaml
+++ b/.buildkite/test_areas/models_language.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Language Models Tests (Standard)
+  key: language-models-tests-standard
   timeout_in_minutes: 25
   source_file_dependencies:
   - vllm/
@@ -15,6 +16,7 @@ steps:
     torch_nightly: {}
 
 - label: Language Models Tests (Extra Standard) %N
+  key: language-models-tests-extra-standard
   timeout_in_minutes: 45
   source_file_dependencies:
   - vllm/model_executor/models/
@@ -31,6 +33,7 @@ steps:
     torch_nightly: {}
 
 - label: Language Models Tests (Hybrid) %N
+  key: language-models-tests-hybrid
   timeout_in_minutes: 75
   source_file_dependencies:
   - vllm/
@@ -45,8 +48,17 @@ steps:
   parallelism: 2
   mirror:
     torch_nightly: {}
+    amd:
+      device: mi300_1
+      depends_on:
+      - image-build-amd
+      commands:
+      - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
+      - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0'
+      - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
 
 - label: Language Models Test (Extended Generation) # 80min
+  key: language-models-test-extended-generation
   timeout_in_minutes: 110
   optional: true
   source_file_dependencies:
@@ -58,17 +70,9 @@ steps:
     - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
     - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0'
     - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-      commands:
-      - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
-      - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0'
-      - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 
 - label: Language Models Test (PPL)
+  key: language-models-test-ppl
   timeout_in_minutes: 110
   device: h200_18gb
   optional: true
@@ -79,6 +83,7 @@ steps:
     - pytest -v -s models/language/generation_ppl_test
 
 - label: Language Models Test (Extended Pooling)  # 36min
+  key: language-models-test-extended-pooling
   timeout_in_minutes: 50
   optional: true
   source_file_dependencies:
@@ -86,13 +91,9 @@ steps:
   - tests/models/language/pooling
   commands:
     - pytest -v -s models/language/pooling -m 'not core_model'
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
 
 - label: Language Models Test (MTEB)
+  key: language-models-test-mteb
   timeout_in_minutes: 110
   device: h200_18gb
   optional: true
diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml
index 245ef24026d2..1f66393df818 100644
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: "Multi-Modal Models (Standard) 1: qwen2"
+  key: multi-modal-models-standard-1-qwen2
   timeout_in_minutes: 45
   device: h200_18gb
   source_file_dependencies:
@@ -14,11 +15,12 @@ steps:
     - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model
   mirror:
     amd:
-      device: mi325_1
+      device: mi300_1
       depends_on:
       - image-build-amd
 
 - label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
+  key: multi-modal-models-standard-2-qwen3-gemma
   timeout_in_minutes: 45
   device: h200_18gb
   source_file_dependencies:
@@ -31,11 +33,12 @@ steps:
     - pytest -v -s models/multimodal/generation/test_vit_cudagraph.py -m core_model
   mirror:
     amd:
-      device: mi325_1
+      device: mi300_1
       depends_on:
       - image-build-amd
 
 - label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
+  key: multi-modal-models-standard-3-llava-qwen2-vl
   timeout_in_minutes: 45
   source_file_dependencies:
   - vllm/
@@ -46,11 +49,12 @@ steps:
     - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
   mirror:
     amd:
-      device: mi325_1
+      device: mi300_1
       depends_on:
       - image-build-amd
 
 - label: "Multi-Modal Models (Standard) 4: other + whisper"
+  key: multi-modal-models-standard-4-other-whisper
   timeout_in_minutes: 45
   source_file_dependencies:
   - vllm/
@@ -60,14 +64,10 @@ steps:
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py  --ignore models/multimodal/generation/test_memory_leak.py --ignore models/multimodal/processing
     - pytest models/multimodal/generation/test_memory_leak.py -m core_model
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
 
 - label: Multi-Modal Processor (CPU)
-  depends_on: 
+  key: multi-modal-processor-cpu
+  depends_on:
   - image-build-cpu
   timeout_in_minutes: 60
   source_file_dependencies:
@@ -80,6 +80,7 @@ steps:
     - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
 
 - label: Multi-Modal Processor # 44min
+  key: multi-modal-processor
   timeout_in_minutes: 60
   device: h200_18gb
   source_file_dependencies:
@@ -91,6 +92,7 @@ steps:
     - pytest -v -s models/multimodal/processing/test_tensor_schema.py
 
 - label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  key: multi-modal-accuracy-eval-small-models
   timeout_in_minutes: 70
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
@@ -101,6 +103,7 @@ steps:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
 
 - label: Multi-Modal Models (Extended Generation 1)
+  key: multi-modal-models-extended-generation-1
   optional: true
   source_file_dependencies:
   - vllm/
@@ -112,11 +115,12 @@ steps:
     - pytest -v -s models/multimodal/test_mapping.py
   mirror:
     amd:
-      device: mi325_1
+      device: mi300_1
       depends_on:
       - image-build-amd
 
 - label: Multi-Modal Models (Extended Generation 2)
+  key: multi-modal-models-extended-generation-2
   optional: true
   source_file_dependencies:
   - vllm/
@@ -126,6 +130,7 @@ steps:
     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
 
 - label: Multi-Modal Models (Extended Generation 3)
+  key: multi-modal-models-extended-generation-3
   optional: true
   source_file_dependencies:
   - vllm/
@@ -135,6 +140,7 @@ steps:
     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
 
 - label: Multi-Modal Models (Extended Pooling)
+  key: multi-modal-models-extended-pooling
   optional: true
   device: h200_18gb
   source_file_dependencies:
diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml
index 8e0eb0284019..0d23180f3ef7 100644
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Plugin Tests (2 GPUs)
+  key: plugin-tests-2-gpus
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/tests"
   num_devices: 2
diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml
index a3648219d89d..a470cc60a3e5 100644
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: PyTorch Compilation Unit Tests
+  key: pytorch-compilation-unit-tests
   timeout_in_minutes: 10
   source_file_dependencies:
     - vllm/
@@ -18,6 +19,7 @@ steps:
   - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
 
 - label: PyTorch Compilation Unit Tests (H100)
+  key: pytorch-compilation-unit-tests-h100
   timeout_in_minutes: 30
   device: h100
   num_devices: 1
@@ -28,6 +30,7 @@ steps:
   - "find compile/h100/ -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
 
 - label: PyTorch Compilation Passes Unit Tests
+  key: pytorch-compilation-passes-unit-tests
   timeout_in_minutes: 20
   source_file_dependencies:
     - vllm/
@@ -36,6 +39,7 @@ steps:
   - pytest -s -v compile/passes --ignore compile/passes/distributed
 
 - label: PyTorch Fullgraph Smoke Test
+  key: pytorch-fullgraph-smoke-test
   timeout_in_minutes: 35
   source_file_dependencies:
   - vllm/
@@ -48,6 +52,7 @@ steps:
   - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
 
 - label: PyTorch Fullgraph
+  key: pytorch-fullgraph
   timeout_in_minutes: 30
   device: h200_18gb
   source_file_dependencies:
@@ -58,6 +63,7 @@ steps:
   - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
 
 - label: Pytorch Nightly Dependency Override Check # 2min
+  key: pytorch-nightly-dependency-override-check
   # if this test fails, it means the nightly torch version is not compatible with some
   # of the dependencies. Please check the error message and add the package to whitelist
   # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml
index a42d59b021c6..8a9a36da4481 100644
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Quantization
+  key: quantization
   timeout_in_minutes: 90
   source_file_dependencies:
   - csrc/
@@ -21,9 +22,10 @@ steps:
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: Quantized MoE Test (B200)
+  key: quantized-moe-test-b200
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
-  device: b200
+  device: b200-k8s
   source_file_dependencies:
   - tests/quantization/test_blackwell_moe.py
   - vllm/model_executor/models/deepseek_v2.py
@@ -38,6 +40,7 @@ steps:
     - pytest -s -v tests/quantization/test_blackwell_moe.py
 
 - label: Quantized Models Test
+  key: quantized-models-test
   timeout_in_minutes: 60
   source_file_dependencies:
   - vllm/model_executor/layers/quantization
diff --git a/.buildkite/test_areas/ray_compat.yaml b/.buildkite/test_areas/ray_compat.yaml
index 3485e346532c..9207621a5830 100644
--- a/.buildkite/test_areas/ray_compat.yaml
+++ b/.buildkite/test_areas/ray_compat.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Ray Dependency Compatibility Check
+  key: ray-dependency-compatibility-check
   # Informational only — does not block the pipeline.
   # If this fails, it means the PR introduces a dependency that
   # conflicts with Ray's dependency constraints.
diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml
index 2052a379827a..48e9f55571e4 100644
--- a/.buildkite/test_areas/samplers.yaml
+++ b/.buildkite/test_areas/samplers.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Samplers Test
+  key: samplers-test
   timeout_in_minutes: 75
   source_file_dependencies:
   - vllm/model_executor/layers
@@ -10,11 +11,13 @@ steps:
   - tests/samplers
   - tests/conftest.py
   commands:
-    - pytest -v -s samplers
+    # VLLM_USE_FLASHINFER_SAMPLER defaults to 1 now, so we need to pin both
+    # values explicitly to still cover the PyTorch-native (Triton) path.
+    - VLLM_USE_FLASHINFER_SAMPLER=0 pytest -v -s samplers
     - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1
       depends_on:
       - image-build-amd
       commands:
diff --git a/.buildkite/test_areas/spec_decode.yaml b/.buildkite/test_areas/spec_decode.yaml
index 05925da0da01..5253f54735aa 100644
--- a/.buildkite/test_areas/spec_decode.yaml
+++ b/.buildkite/test_areas/spec_decode.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Spec Decode Eagle
+  key: spec-decode-eagle
   timeout_in_minutes: 30
   device: h200_18gb
   source_file_dependencies:
@@ -13,6 +14,7 @@ steps:
     - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
 
 - label: Spec Decode Eagle Nightly B200
+  key: spec-decode-eagle-nightly-b200
   timeout_in_minutes: 30
   device: b200
   optional: true
@@ -24,6 +26,7 @@ steps:
     - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
 
 - label: Spec Decode Speculators + MTP
+  key: spec-decode-speculators-mtp
   timeout_in_minutes: 30
   device: h200_18gb
   source_file_dependencies:
@@ -35,6 +38,7 @@ steps:
     - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
 
 - label: Spec Decode Speculators + MTP Nightly B200
+  key: spec-decode-speculators-mtp-nightly-b200
   timeout_in_minutes: 30
   device: b200
   optional: true
@@ -47,6 +51,7 @@ steps:
     - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
   
 - label: Spec Decode Ngram + Suffix
+  key: spec-decode-ngram-suffix
   timeout_in_minutes: 30
   device: h200_18gb
   source_file_dependencies:
@@ -57,6 +62,7 @@ steps:
     - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
 
 - label: Spec Decode Draft Model
+  key: spec-decode-draft-model
   timeout_in_minutes: 30
   device: h200_18gb
   source_file_dependencies:
@@ -67,8 +73,9 @@ steps:
     - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"
 
 - label: Spec Decode Draft Model Nightly B200
+  key: spec-decode-draft-model-nightly-b200
   timeout_in_minutes: 30
-  device: b200
+  device: b200-k8s
   optional: true
   source_file_dependencies:
     - vllm/v1/spec_decode/
@@ -78,6 +85,7 @@ steps:
     - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"
 
 - label: DFlash Speculators Correctness
+  key: dflash-speculators-correctness
   timeout_in_minutes: 30
   device: h100
   optional: true
@@ -89,3 +97,16 @@ steps:
   commands:
     - export VLLM_ALLOW_INSECURE_SERIALIZATION=1
     - pytest -v -s v1/spec_decode/test_speculators_dflash.py -m slow_test
+
+- label: Spec Decode MTP hybrid (B200)
+  timeout_in_minutes: 30
+  device: b200
+  optional: true
+  source_file_dependencies:
+    - vllm/v1/spec_decode/
+    - vllm/v1/worker/gpu/spec_decode/
+    - vllm/model_executor/models/qwen3_5.py
+    - vllm/model_executor/models/qwen3_5_mtp.py
+    - tests/v1/e2e/spec_decode/
+  commands:
+    - pytest -v -s v1/e2e/spec_decode -k "qwen3_5-hybrid"
diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml
index 8e86374a8ad0..01c6bb7809bc 100644
--- a/.buildkite/test_areas/weight_loading.yaml
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -3,6 +3,7 @@ depends_on:
   - image-build
 steps:
 - label: Weight Loading Multiple GPU  # 33min
+  key: weight-loading-multiple-gpu
   timeout_in_minutes: 45
   working_dir: "/vllm-workspace/tests"
   num_devices: 2
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 8ca00d6e7d2d..2d36e3507028 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -308,8 +308,7 @@ pull_request_rules:
       - files=benchmarks/benchmark_serving_structured_output.py
       - files=benchmarks/run_structured_output_benchmark.sh
       - files=docs/features/structured_outputs.md
-      - files=examples/offline_inference/structured_outputs.py
-      - files=examples/online_serving/structured_outputs/structured_outputs.py
+      - files=^examples/features/structured_outputs/
       - files~=^tests/v1/structured_output/
       - files=tests/entrypoints/llm/test_struct_output_generate.py
       - files~=^vllm/v1/structured_output/
@@ -325,7 +324,7 @@ pull_request_rules:
     - or:
       - files~=^vllm/v1/spec_decode/
       - files~=^tests/v1/spec_decode/
-      - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
+      - files=^examples/features/speculative_decoding/
       - files~=^vllm/model_executor/models/.*eagle.*\.py
       - files=vllm/model_executor/models/mlp_speculator.py
       - files~=^vllm/transformers_utils/configs/(eagle|medusa|mlp_speculator)\.py
@@ -478,9 +477,7 @@ pull_request_rules:
   conditions:
     - label != stale
     - or:
-      - files~=^examples/online_serving/disaggregated[^/]*/.*
-      - files~=^examples/offline_inference/disaggregated[^/]*/.*
-      - files~=^examples/others/lmcache/
+      - files~=^examples/disaggregated/
       - files~=^tests/v1/kv_connector/
       - files~=^vllm/distributed/kv_transfer/
       - title~=(?i)\bP/?D\b
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 8ab8d3e7035f..1dd31b0e50f6 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,11 +16,7 @@ permissions:
 
 jobs:
   pre-run-check:
-    if: >-
-      github.event_name == 'pull_request' &&
-      (github.event.action != 'labeled' ||
-       github.event.label.name == 'ready' ||
-       github.event.label.name == 'verified')
+    if: github.event_name == 'pull_request'
     runs-on: ubuntu-latest
     steps:
     - name: Check PR label and author merge count
@@ -49,12 +45,7 @@ jobs:
 
   pre-commit:
     needs: pre-run-check
-    if: >-
-      always() &&
-      (github.event.action != 'labeled' ||
-       github.event.label.name == 'ready' ||
-       github.event.label.name == 'verified') &&
-      (needs.pre-run-check.result == 'success' || needs.pre-run-check.result == 'skipped')
+    if: always() && (needs.pre-run-check.result == 'success' || needs.pre-run-check.result == 'skipped')
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
diff --git a/.gitignore b/.gitignore
index 134bbc5cc893..e53d19b35340 100644
--- a/.gitignore
+++ b/.gitignore
@@ -237,6 +237,7 @@ ep_kernels_workspace/
 
 # Allow tracked library source folders under submodules (e.g., benchmarks/lib)
 !vllm/benchmarks/lib/
+!.buildkite/scripts/lib/
 
 # Generated gRPC protobuf files (compiled at build time from vllm_engine.proto)
 vllm/grpc/vllm_engine_pb2.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb8a1d7e1e14..13788fa87437 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -307,12 +307,12 @@ set(VLLM_EXT_SRC
   "csrc/quantization/activation_kernels.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/custom_all_reduce.cu"
-  "csrc/torch_bindings.cpp")
+  "csrc/torch_bindings.cpp"
+  "csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_EXT_SRC
-    "csrc/minimax_reduce_rms_kernel.cu"
-    "csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu")
+    "csrc/minimax_reduce_rms_kernel.cu")
 
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
@@ -1047,14 +1047,13 @@ endif()
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
-  "csrc/moe/topk_softmax_kernels.cu")
+  "csrc/moe/topk_softmax_kernels.cu"
+  "csrc/moe/topk_softplus_sqrt_kernels.cu")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_MOE_EXT_SRC
     "csrc/moe/moe_wna16.cu"
-    "csrc/moe/grouped_topk_kernels.cu"
-    "csrc/moe/router_gemm.cu"
-    "csrc/moe/topk_softplus_sqrt_kernels.cu")
+    "csrc/moe/grouped_topk_kernels.cu")
 endif()
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index 881039f43f07..2f56099c66fd 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -217,6 +217,7 @@ async def send_request(
     min_tokens: int | None = None,
     max_tokens: int | None = None,
     timeout_sec: int = 120,
+    conversation_id: str | None = None,
 ) -> ServerResponse:
     payload = {
         "model": model,
@@ -225,6 +226,9 @@ async def send_request(
         "temperature": 0.0,
     }
 
+    if conversation_id is not None:
+        payload["conversation_id"] = conversation_id
+
     if stream:
         payload["stream"] = True
         payload["stream_options"] = {"include_usage": False}
@@ -419,6 +423,7 @@ async def send_turn(
         min_tokens,
         max_tokens,
         req_args.timeout_sec,
+        conversation_id=conv_id,
     )
 
     if response.valid is False:
@@ -1468,6 +1473,12 @@ async def main() -> None:
         "(for example: --warmup-percentages=0%%,50%%)",
     )
 
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code when loading the tokenizer.",
+    )
+
     args = parser.parse_args()
 
     logger.info(args)
@@ -1510,7 +1521,9 @@ async def main() -> None:
     np.random.seed(args.seed)
 
     logger.info("Loading tokenizer")
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model, trust_remote_code=args.trust_remote_code
+    )
 
     await get_server_info(args.url)
 
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 8535186cc1ec..650dbf365169 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -32,18 +32,23 @@ else()
         "-DVLLM_CPU_EXTENSION")
 
     # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
-    # and create a local shim dir with it
+    # and create a local shim dir with it. When PyTorch is built from source or packaged
+    # by a distro (common on RISC-V, s390x, Fedora/RHEL aarch64), no vendored libgomp
+    # exists and the shim dir is empty; fall back to the system libgomp in that case.
     vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
 
-    find_library(OPEN_MP
-        NAMES gomp
-        PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
-        NO_DEFAULT_PATH
-        REQUIRED
-    )
-    # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
-    if (OPEN_MP)
+    if(VLLM_TORCH_GOMP_SHIM_DIR)
+        find_library(OPEN_MP
+            NAMES gomp
+            PATHS "${VLLM_TORCH_GOMP_SHIM_DIR}"
+            NO_DEFAULT_PATH
+            REQUIRED
+        )
+        # Use the same libgomp as PyTorch at runtime
         set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
+    else()
+        # Fall back to system / toolchain libgomp
+        find_library(OPEN_MP NAMES gomp REQUIRED)
     endif()
 endif()
 
@@ -321,14 +326,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
     set(ONEDNN_VERBOSE "ON")
     set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
 
-    # TODO: Refactor this
-    if (ENABLE_X86_ISA)
-        # Note: only enable oneDNN for AVX512
-        list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512})
-    else()
-        list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS})
-    endif()
-
     set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE})
     set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size
     FetchContent_MakeAvailable(oneDNN)
@@ -341,8 +338,14 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
         PRIVATE ${oneDNN_SOURCE_DIR}/src
     )
     target_link_libraries(dnnl_ext dnnl torch)
-    target_compile_options(dnnl_ext PRIVATE ${DNNL_COMPILE_FLAGS} -fPIC)
+    if (ENABLE_X86_ISA)
+        target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS_AVX2} -fPIC)
+    else()
+        target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
+    endif()
     list(APPEND LIBS dnnl_ext)
+
+
     set(USE_ONEDNN ON)
 else()
     set(USE_ONEDNN OFF)
@@ -430,10 +433,11 @@ if (ENABLE_X86_ISA)
         "csrc/cpu/pos_encoding.cpp"
         "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") 
 
-    set(VLLM_EXT_SRC_AVX2 
+    set(VLLM_EXT_SRC_AVX2
         "csrc/cpu/utils.cpp"
         "csrc/cpu/spec_decode_utils.cpp"
         "csrc/cpu/cpu_attn.cpp"
+        "csrc/cpu/dnnl_kernels.cpp"
         "csrc/cpu/torch_bindings.cpp"
         # TODO: Remove these files
         "csrc/cpu/activation.cpp"
@@ -448,7 +452,7 @@ if (ENABLE_X86_ISA)
 
     set(_C_LIBS numa dnnl_ext)
     set(_C_AVX512_LIBS numa dnnl_ext)
-    set(_C_AVX2_LIBS numa)
+    set(_C_AVX2_LIBS numa dnnl_ext)
 
     # AMX + AVX512F + AVX512BF16 + AVX512VNNI
     define_extension_target(
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 7e456d32598b..895490f45a79 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -97,13 +97,13 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
 
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  // Use cuMemcpyBatchAsync (CUDA 12.8+) to submit all copies in a single
-  // driver call, amortizing per-copy submission overhead.
-  // int64_t and CUdeviceptr/size_t are both 8 bytes on 64-bit platforms,
-  // so we reinterpret_cast the tensor data directly to avoid copies.
-  static_assert(sizeof(CUdeviceptr) == sizeof(int64_t));
+  // Use cuMemcpyBatchAsync / hipMemcpyBatchAsync to submit all copies in a
+  // single driver call, amortizing per-copy submission overhead. int64_t
+  // and CUdeviceptr/void*/size_t are all 8 bytes on 64-bit platforms, so we
+  // reinterpret_cast the tensor data directly to avoid copies.
   static_assert(sizeof(size_t) == sizeof(int64_t));
 #if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 12080
+  static_assert(sizeof(CUdeviceptr) == sizeof(int64_t));
   // Resolve cuMemcpyBatchAsync at runtime via cuGetProcAddress so that
   // binaries compiled with CUDA 12.8+ still work on older drivers, and
   // we avoid the CUDA 13.0 header remapping (#define to _v2 signature).
@@ -134,12 +134,30 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
                                &fail_idx, static_cast<CUstream>(stream));
     TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed at index ",
                 fail_idx, " with error ", result);
-  } else
+    return;
+  }
+#elif defined(USE_ROCM) && defined(HIP_VERSION) && HIP_VERSION >= 70100000
+  // ROCm 7.1+ exposes hipMemcpyBatchAsync. The 7.2.1 implementation early-
+  // returns hipErrorNotSupported whenever numAttrs > 0 (see ROCm/clr @
+  // rocm-7.2.1 hipamd/src/hip_memory.cpp:2819-2822), so call with
+  // numAttrs=0.
+  {
+    hipMemcpyAttributes attr = {};
+    size_t attrs_idx = 0;
+    size_t fail_idx = 0;
+    hipError_t result = hipMemcpyBatchAsync(
+        reinterpret_cast<void**>(dst_data), reinterpret_cast<void**>(src_data),
+        reinterpret_cast<size_t*>(size_data), static_cast<size_t>(n), &attr,
+        &attrs_idx, 0, &fail_idx, static_cast<hipStream_t>(stream));
+    TORCH_CHECK(result == hipSuccess, "hipMemcpyBatchAsync failed at index ",
+                fail_idx, " with error ", result);
+    return;
+  }
 #endif
   {
-    // Fallback for CUDA < 12.8, older drivers, and ROCm:
-    // individual async copies.
-    // cudaMemcpyDefault lets the driver infer direction from pointer types.
+    // Fallback for CUDA < 12.8, older CUDA drivers, and ROCm < 7.1:
+    // individual async copies. cudaMemcpyDefault lets the driver infer
+    // direction from pointer types.
     for (int64_t i = 0; i < n; i++) {
       cudaMemcpyAsync(reinterpret_cast<void*>(dst_data[i]),
                       reinterpret_cast<void*>(src_data[i]),
diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp
index a582b4b4d7cc..4750dd78838d 100644
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -1,5 +1,16 @@
 #include "cpu_attn_dispatch_generated.h"
 
+// Maps kv_cache_dtype string to Fp8KVCacheDataType enum.
+// "auto" -> kAuto(0); "fp8"/"fp8_e4m3" -> kFp8E4M3; "fp8_e5m2" -> kFp8E5M2.
+static inline cpu_attention::Fp8KVCacheDataType parse_fp8_kv_dtype(
+    const std::string& kv_cache_dtype) {
+  if (kv_cache_dtype == "fp8_e5m2")
+    return cpu_attention::Fp8KVCacheDataType::kFp8E5M2;
+  if (kv_cache_dtype == "fp8_e4m3" || kv_cache_dtype == "fp8")
+    return cpu_attention::Fp8KVCacheDataType::kFp8E4M3;
+  return cpu_attention::Fp8KVCacheDataType::kAuto;
+}
+
 torch::Tensor get_scheduler_metadata(
     const int64_t num_req, const int64_t num_heads_q,
     const int64_t num_heads_kv, const int64_t head_dim,
@@ -18,6 +29,8 @@ torch::Tensor get_scheduler_metadata(
     isa = cpu_attention::ISA::NEON;
   } else if (isa_hint == "vxe") {
     isa = cpu_attention::ISA::VXE;
+  } else if (isa_hint == "vsx") {
+    isa = cpu_attention::ISA::VSX;
   } else {
     TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
   }
@@ -49,7 +62,7 @@ torch::Tensor get_scheduler_metadata(
   input.enable_kv_split = enable_kv_split;
 
   VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
-    CPU_ATTN_DISPATCH(head_dim, isa, [&]() {
+    CPU_ATTN_DISPATCH(head_dim, isa, 0, [&]() {
       input.elem_size = sizeof(scalar_t);
       input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
       input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
@@ -72,7 +85,9 @@ void cpu_attn_reshape_and_cache(
         key_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
     torch::Tensor&
         value_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
-    const torch::Tensor& slot_mapping, const std::string& isa) {
+    const torch::Tensor& slot_mapping, const std::string& isa,
+    const double k_scale = 1.0, const double v_scale = 1.0,
+    const std::string& kv_cache_dtype = "auto") {
   TORCH_CHECK_EQ(key.dim(), 3);
   TORCH_CHECK_EQ(value.dim(), 3);
   TORCH_CHECK_EQ(key_cache.dim(), 4);
@@ -80,18 +95,30 @@ void cpu_attn_reshape_and_cache(
   TORCH_CHECK_EQ(key.stride(2), 1);
   TORCH_CHECK_EQ(value.stride(2), 1);
 
+  const int64_t kv_cache_idx =
+      static_cast<int64_t>(parse_fp8_kv_dtype(kv_cache_dtype));
+  const bool is_fp8 = (kv_cache_idx != 0);
+
+  if (is_fp8) {
+    TORCH_CHECK(key_cache.scalar_type() == at::ScalarType::Byte,
+                "key_cache must be uint8 for FP8 path");
+    TORCH_CHECK(value_cache.scalar_type() == at::ScalarType::Byte,
+                "value_cache must be uint8 for FP8 path");
+    TORCH_CHECK(k_scale > 0, "k_scale must be positive for FP8 path");
+    TORCH_CHECK(v_scale > 0, "v_scale must be positive for FP8 path");
+  }
+
+  const float k_inv = is_fp8 ? 1.0f / static_cast<float>(k_scale) : 0.0f;
+  const float v_inv = is_fp8 ? 1.0f / static_cast<float>(v_scale) : 0.0f;
+
   const int64_t token_num = key.size(0);
-  const int64_t key_token_num_stride = key.stride(0);
-  const int64_t value_token_num_stride = value.stride(0);
-  const int64_t head_num = value.size(1);
-  const int64_t key_head_num_stride = key.stride(1);
-  const int64_t value_head_num_stride = value.stride(1);
+  const int64_t head_num = key.size(1);
+  const int64_t head_dim = key.size(2);
   const int64_t num_blocks = key_cache.size(0);
   const int64_t num_blocks_stride = key_cache.stride(0);
   const int64_t cache_head_num_stride = key_cache.stride(1);
   const int64_t block_size = key_cache.size(2);
   const int64_t block_size_stride = key_cache.stride(2);
-  const int64_t head_dim = key.size(-1);
 
   cpu_attention::ISA isa_tag = [&]() {
     if (isa == "amx") {
@@ -104,21 +131,31 @@ void cpu_attn_reshape_and_cache(
       return cpu_attention::ISA::NEON;
     } else if (isa == "vxe") {
       return cpu_attention::ISA::VXE;
+    } else if (isa == "vsx") {
+      return cpu_attention::ISA::VSX;
     } else {
       TORCH_CHECK(false, "Invalid ISA type: " + isa);
     }
   }();
 
+  if (is_fp8) {
+    TORCH_CHECK(isa_tag == cpu_attention::ISA::AMX ||
+                    isa_tag == cpu_attention::ISA::VEC,
+                "FP8 KV cache is only supported on x86 (AMX/VEC) ISA");
+  }
+
   VLLM_DISPATCH_FLOATING_TYPES(
       key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
-        CPU_ATTN_DISPATCH(head_dim, isa_tag, [&]() {
+        CPU_ATTN_DISPATCH(head_dim, isa_tag, kv_cache_idx, [&]() {
+          using kv_t = typename attn_impl::kv_cache_t;
           attn_impl::reshape_and_cache(
               key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
-              key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
-              slot_mapping.data_ptr<int64_t>(), token_num, key_token_num_stride,
-              value_token_num_stride, head_num, key_head_num_stride,
-              value_head_num_stride, num_blocks, num_blocks_stride,
-              cache_head_num_stride, block_size, block_size_stride);
+              reinterpret_cast<kv_t*>(key_cache.data_ptr()),
+              reinterpret_cast<kv_t*>(value_cache.data_ptr()),
+              slot_mapping.data_ptr<int64_t>(), token_num, key.stride(0),
+              value.stride(0), head_num, key.stride(1), value.stride(1),
+              num_blocks, num_blocks_stride, cache_head_num_stride, block_size,
+              block_size_stride, k_inv, v_inv);
         });
       });
 }
@@ -137,13 +174,26 @@ void cpu_attention_with_kv_cache(
     const int64_t sliding_window_left, const int64_t sliding_window_right,
     const torch::Tensor& block_table,  // [num_tokens, max_block_num]
     const double softcap, const torch::Tensor& scheduler_metadata,
-    const std::optional<torch::Tensor>& s_aux  // [num_heads]
-) {
+    const std::optional<torch::Tensor>& s_aux,  // [num_heads]
+    const double k_scale = 1.0, const double v_scale = 1.0,
+    const std::string& kv_cache_dtype = "auto") {
   TORCH_CHECK_EQ(query.dim(), 3);
   TORCH_CHECK_EQ(query.stride(2), 1);
   TORCH_CHECK_EQ(key_cache.dim(), 4);
   TORCH_CHECK_EQ(value_cache.dim(), 4);
 
+  const int64_t kv_cache_idx =
+      static_cast<int64_t>(parse_fp8_kv_dtype(kv_cache_dtype));
+  const bool is_fp8 = (kv_cache_idx != 0);
+  if (is_fp8) {
+    TORCH_CHECK(key_cache.scalar_type() == at::ScalarType::Byte,
+                "key_cache must be uint8 for FP8 path");
+    TORCH_CHECK(value_cache.scalar_type() == at::ScalarType::Byte,
+                "value_cache must be uint8 for FP8 path");
+    TORCH_CHECK(k_scale > 0, "k_scale must be positive for FP8 path");
+    TORCH_CHECK(v_scale > 0, "v_scale must be positive for FP8 path");
+  }
+
   cpu_attention::AttentionInput input;
   input.metadata = reinterpret_cast<cpu_attention::AttentionMetadata*>(
       scheduler_metadata.data_ptr());
@@ -165,25 +215,32 @@ void cpu_attention_with_kv_cache(
   input.block_table = block_table.data_ptr<int32_t>();
   input.alibi_slopes =
       alibi_slopes.has_value() ? alibi_slopes->data_ptr<float>() : nullptr;
-  // For now sink must be bf16
   input.s_aux = s_aux.has_value() ? s_aux->data_ptr<c10::BFloat16>() : nullptr;
   input.scale = scale;
   input.causal = causal;
   input.sliding_window_left = sliding_window_left;
   input.sliding_window_right = sliding_window_right;
   if (input.causal) {
-    // to make boundary calculation easier
     input.sliding_window_right = 0;
   }
-  float softcap_fp32 = softcap;
-  input.softcap = softcap_fp32;
+  input.softcap = static_cast<float>(softcap);
+
+  if (is_fp8) {
+    input.k_scale_fp8 = static_cast<float>(k_scale);
+    input.v_scale_fp8 = static_cast<float>(v_scale);
+    TORCH_CHECK(input.metadata->isa == cpu_attention::ISA::AMX ||
+                    input.metadata->isa == cpu_attention::ISA::VEC,
+                "FP8 KV cache is only supported on x86 (AMX/VEC) ISA");
+  }
 
   VLLM_DISPATCH_FLOATING_TYPES(
       query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
-        CPU_ATTN_DISPATCH(query.size(2), input.metadata->isa, [&]() {
-          TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
-          cpu_attention::AttentionMainLoop<attn_impl> mainloop;
-          mainloop(&input);
-        });
+        CPU_ATTN_DISPATCH(
+            query.size(2), input.metadata->isa, kv_cache_idx, [&]() {
+              TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment,
+                             0);
+              cpu_attention::AttentionMainLoop<attn_impl> mainloop;
+              mainloop(&input);
+            });
       });
 }
diff --git a/csrc/cpu/cpu_attn_amx.hpp b/csrc/cpu/cpu_attn_amx.hpp
index 1c8644d52329..6a0341085dce 100644
--- a/csrc/cpu/cpu_attn_amx.hpp
+++ b/csrc/cpu/cpu_attn_amx.hpp
@@ -1,6 +1,7 @@
 #ifndef CPU_ATTN_AMX_HPP
 #define CPU_ATTN_AMX_HPP
 
+#include "cpu_attn_fp8.hpp"
 #include "cpu_attn_impl.hpp"
 
 namespace cpu_attention {
@@ -21,9 +22,10 @@ typedef struct __tile_config {
 // 2-2-4 pattern, for 16 < m <= 32
 // TILE 0, 1: load A matrix, row num should be 16, m - 16
 // TILE 2, 3: load B matrix, row num should be 16
-// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m
-// - 16
-template <typename kv_cache_t>
+// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16,
+// m - 16, m - 16
+// q_buffer_t: A (Q/P) tile type; kv_cache_t: B (K/V cache) tile type.
+template <typename q_buffer_t, typename kv_cache_t>
 class TileGemm224 {
  public:
   template <AttentionGemmPhase phase, int32_t k_size>
@@ -42,13 +44,56 @@ class TileGemm224 {
   }
 };
 
-template <>
-class TileGemm224<c10::BFloat16> {
+// Dequantize one FP8 tile (AMX_TILE_ROW_NUM rows x 32 cols) to BF16.
+template <typename kv_cache_t>
+FORCE_INLINE void deq_tile_amx(const uint8_t* src, c10::BFloat16* dst) {
+  for (int r = 0; r < AMX_TILE_ROW_NUM; ++r) {
+    if constexpr (std::is_same_v<kv_cache_t, c10::Float8_e4m3fn>) {
+      vec_op::BF16Vec32(src + r * 32, vec_op::fp8_bf16_e4m3_tag{})
+          .save(dst + r * 32);
+    } else {
+      vec_op::BF16Vec32(src + r * 32, vec_op::fp8_bf16_e5m2_tag{})
+          .save(dst + r * 32);
+    }
+  }
+}
+
+// For FP8: dequant src into scratch and return scratch.
+// For BF16: return src directly (scratch is unused; the compiler elides it).
+template <typename kv_cache_t>
+FORCE_INLINE const c10::BFloat16* prepare_b_tile(const kv_cache_t* src,
+                                                 c10::BFloat16* scratch) {
+  if constexpr (std::is_same_v<kv_cache_t, c10::Float8_e4m3fn> ||
+                std::is_same_v<kv_cache_t, c10::Float8_e5m2>) {
+    deq_tile_amx<kv_cache_t>(reinterpret_cast<const uint8_t*>(src), scratch);
+    return scratch;
+  } else {
+    return reinterpret_cast<const c10::BFloat16*>(src);
+  }
+}
+
+// Handles both BF16 and FP8 KV cache (2-2-4 pattern).
+template <typename kv_cache_t>
+class TileGemm224<c10::BFloat16, kv_cache_t> {
+  static_assert(std::is_same_v<kv_cache_t, c10::BFloat16> ||
+                    std::is_same_v<kv_cache_t, c10::Float8_e4m3fn> ||
+                    std::is_same_v<kv_cache_t, c10::Float8_e5m2>,
+                "kv_cache_t must be BFloat16, Float8_e4m3fn, or Float8_e5m2");
+
+  static constexpr bool fp8_kv =
+      std::is_same_v<kv_cache_t, c10::Float8_e4m3fn> ||
+      std::is_same_v<kv_cache_t, c10::Float8_e5m2>;
+
+  static constexpr int64_t tile_elems = AMX_TILE_BYTES / sizeof(c10::BFloat16);
+  // BF16 path: scratch_elems=1 so the scratch array is eliminated by the
+  // compiler.
+  static constexpr int64_t scratch_elems = fp8_kv ? tile_elems : 1;
+
  public:
   template <AttentionGemmPhase phase, int32_t k_size>
   FORCE_INLINE static void gemm(const int32_t m_size,
                                 c10::BFloat16* __restrict__ a_tile,
-                                c10::BFloat16* __restrict__ b_tile,
+                                kv_cache_t* __restrict__ b_tile,
                                 float* __restrict__ c_tile, const int64_t lda,
                                 const int64_t ldb, const int64_t ldc,
                                 const int32_t block_size,
@@ -56,6 +101,7 @@ class TileGemm224<c10::BFloat16> {
                                 const bool accum_c) {
     const int32_t k_times =
         dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
+
     c10::BFloat16* __restrict__ a_tile_0 = a_tile;
     c10::BFloat16* __restrict__ a_tile_1 = a_tile + lda * AMX_TILE_ROW_NUM;
     const int64_t a_tile_stride = [&]() {
@@ -70,8 +116,8 @@ class TileGemm224<c10::BFloat16> {
       }
     }();
 
-    c10::BFloat16* __restrict__ b_tile_2 = b_tile;
-    c10::BFloat16* __restrict__ b_tile_3 = [&]() {
+    kv_cache_t* __restrict__ b_tile_2 = b_tile;
+    kv_cache_t* __restrict__ b_tile_3 = [&]() {
       if constexpr (phase == AttentionGemmPhase::QK) {
         // k_cache is prepacked
         return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
@@ -106,11 +152,16 @@ class TileGemm224<c10::BFloat16> {
       _tile_zero(7);
     }
 
+    alignas(64) c10::BFloat16 scratch_2[scratch_elems];
+    alignas(64) c10::BFloat16 scratch_3[scratch_elems];
     for (int32_t k = 0; k < k_times; ++k) {
+      const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2);
+      const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3);
+
       _tile_loadd(0, a_tile_0, a_tile_stride);
-      _tile_stream_loadd(2, b_tile_2, b_tile_stride);
+      _tile_stream_loadd(2, const_cast<c10::BFloat16*>(load_2), b_tile_stride);
       _tile_dpbf16ps(4, 0, 2);
-      _tile_stream_loadd(3, b_tile_3, b_tile_stride);
+      _tile_stream_loadd(3, const_cast<c10::BFloat16*>(load_3), b_tile_stride);
       _tile_dpbf16ps(5, 0, 3);
       _tile_loadd(1, a_tile_1, a_tile_stride);
       _tile_dpbf16ps(6, 1, 2);
@@ -154,13 +205,13 @@ class TileGemm224<c10::BFloat16> {
 };
 
 // 1-2-2 pattern, for 0 < m <= 16
-// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be
-// m, m
-// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row
-// num should be 16
-// TILE 6, 7, (6, 7): store results C matrix, row num should be
-// m
-template <typename kv_cache_t>
+// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should
+// be m, m
+// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row num
+// should be 16
+// TILE 6, 7: store results C matrix, row num should be m
+// q_buffer_t: A (Q/P) tile type; kv_cache_t: B (K/V cache) tile type.
+template <typename q_buffer_t, typename kv_cache_t>
 class TileGemm122 {
  public:
   template <AttentionGemmPhase phase, int32_t k_size>
@@ -179,13 +230,26 @@ class TileGemm122 {
   }
 };
 
-template <>
-class TileGemm122<c10::BFloat16> {
+// Handles both BF16 and FP8 KV cache (1-2-2 pattern).
+template <typename kv_cache_t>
+class TileGemm122<c10::BFloat16, kv_cache_t> {
+  static_assert(std::is_same_v<kv_cache_t, c10::BFloat16> ||
+                    std::is_same_v<kv_cache_t, c10::Float8_e4m3fn> ||
+                    std::is_same_v<kv_cache_t, c10::Float8_e5m2>,
+                "kv_cache_t must be BFloat16, Float8_e4m3fn, or Float8_e5m2");
+
+  static constexpr bool fp8_kv =
+      std::is_same_v<kv_cache_t, c10::Float8_e4m3fn> ||
+      std::is_same_v<kv_cache_t, c10::Float8_e5m2>;
+
+  static constexpr int64_t tile_elems = AMX_TILE_BYTES / sizeof(c10::BFloat16);
+  static constexpr int64_t scratch_elems = fp8_kv ? tile_elems : 1;
+
  public:
   template <AttentionGemmPhase phase, int32_t k_size>
   FORCE_INLINE static void gemm(const int32_t m_size,
                                 c10::BFloat16* __restrict__ a_tile,
-                                c10::BFloat16* __restrict__ b_tile,
+                                kv_cache_t* __restrict__ b_tile,
                                 float* __restrict__ c_tile, const int64_t lda,
                                 const int64_t ldb, const int64_t ldc,
                                 const int32_t block_size,
@@ -215,21 +279,19 @@ class TileGemm122<c10::BFloat16> {
       }
     }();
 
-    c10::BFloat16* __restrict__ b_tile_2 = b_tile;
-    c10::BFloat16* __restrict__ b_tile_3 = [&]() {
+    kv_cache_t* __restrict__ b_tile_2 = b_tile;
+    kv_cache_t* __restrict__ b_tile_3 = [&]() {
       if constexpr (phase == AttentionGemmPhase::QK) {
-        // k_cache is prepacked
         return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
       } else if constexpr (phase == AttentionGemmPhase::PV) {
-        // v_cache is prepacked
         return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4);
       } else {
         TORCH_CHECK(false, "Unreachable");
       }
     }();
-    c10::BFloat16* __restrict__ b_tile_4 =
+    kv_cache_t* __restrict__ b_tile_4 =
         b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
-    c10::BFloat16* __restrict__ b_tile_5 =
+    kv_cache_t* __restrict__ b_tile_5 =
         b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
     int64_t b_stride = AMX_TILE_ROW_BYTES;
 
@@ -250,16 +312,25 @@ class TileGemm122<c10::BFloat16> {
       _tile_zero(7);
     }
 
+    alignas(64) c10::BFloat16 scratch_2[scratch_elems];
+    alignas(64) c10::BFloat16 scratch_3[scratch_elems];
+    alignas(64) c10::BFloat16 scratch_4[scratch_elems];
+    alignas(64) c10::BFloat16 scratch_5[scratch_elems];
     for (int32_t k = 0; k < k_group_times; ++k) {
+      const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2);
+      const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3);
+      const c10::BFloat16* load_4 = prepare_b_tile(b_tile_4, scratch_4);
+      const c10::BFloat16* load_5 = prepare_b_tile(b_tile_5, scratch_5);
+
       _tile_loadd(0, a_tile_0, a_tile_stride);
-      _tile_stream_loadd(2, b_tile_2, b_stride);
+      _tile_stream_loadd(2, const_cast<c10::BFloat16*>(load_2), b_stride);
       _tile_dpbf16ps(6, 0, 2);
-      _tile_stream_loadd(3, b_tile_3, b_stride);
+      _tile_stream_loadd(3, const_cast<c10::BFloat16*>(load_3), b_stride);
       _tile_dpbf16ps(7, 0, 3);
       _tile_loadd(1, a_tile_1, a_tile_stride);
-      _tile_stream_loadd(4, b_tile_4, b_stride);
+      _tile_stream_loadd(4, const_cast<c10::BFloat16*>(load_4), b_stride);
       _tile_dpbf16ps(6, 1, 4);
-      _tile_stream_loadd(5, b_tile_5, b_stride);
+      _tile_stream_loadd(5, const_cast<c10::BFloat16*>(load_5), b_stride);
       _tile_dpbf16ps(7, 1, 5);
 
       // update ptrs
@@ -279,10 +350,13 @@ class TileGemm122<c10::BFloat16> {
     }
 
     if (has_tail) {
+      const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2);
+      const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3);
+
       _tile_loadd(0, a_tile_0, a_tile_stride);
-      _tile_stream_loadd(2, b_tile_2, b_stride);
+      _tile_stream_loadd(2, const_cast<c10::BFloat16*>(load_2), b_stride);
       _tile_dpbf16ps(6, 0, 2);
-      _tile_stream_loadd(3, b_tile_3, b_stride);
+      _tile_stream_loadd(3, const_cast<c10::BFloat16*>(load_3), b_stride);
       _tile_dpbf16ps(7, 0, 3);
     }
 
@@ -302,21 +376,25 @@ class TileGemm122<c10::BFloat16> {
     _tile_loadconfig(&config);
   }
 };
+
 }  // namespace
 
-template <typename scalar_t, int64_t head_dim>
-class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
+template <typename scalar_t, int64_t head_dim, typename kv_cache_scalar_t>
+class AttentionImpl<ISA::AMX, scalar_t, head_dim, kv_cache_scalar_t> {
+  static constexpr bool fp8_kv =
+      std::is_same_v<kv_cache_scalar_t, c10::Float8_e4m3fn> ||
+      std::is_same_v<kv_cache_scalar_t, c10::Float8_e5m2>;
+
  public:
   using query_t = scalar_t;
   using q_buffer_t = scalar_t;
-  using kv_cache_t = scalar_t;
+  using kv_cache_t = kv_cache_scalar_t;
   using logits_buffer_t = float;
   using partial_output_buffer_t = float;
   using prob_buffer_t = scalar_t;
 
   constexpr static int64_t BlockSizeAlignment =
-      AMX_TILE_ROW_BYTES /
-      sizeof(kv_cache_t);  // KV token num unit of QK and PV phases
+      32;  // AMX_TILE_ROW_NUM = 16 tokens/tile; 32 = 2 tiles
   constexpr static int64_t HeadDimAlignment =
       2 * (AMX_TILE_ROW_BYTES / 4);  // headdim num unit of PV phase
   constexpr static int64_t MaxQHeadNumPerIteration = 32;
@@ -324,6 +402,9 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
   constexpr static ISA ISAType = ISA::AMX;
   constexpr static bool scale_on_logits = true;
 
+  float k_scale = 1.0f;
+  float v_scale = 1.0f;
+
  public:
   AttentionImpl() : current_q_head_num_(0) {
     // Use all columns in AMX tiles
@@ -332,21 +413,50 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
 
   ~AttentionImpl() { _tile_release(); }
 
+  void init_from_input(const AttentionInput* input) {
+    if constexpr (fp8_kv) {
+      k_scale = input->k_scale_fp8;
+      v_scale = input->v_scale_fp8;
+    }
+  }
+
+  float get_output_v_scale() const noexcept {
+    if constexpr (fp8_kv) {
+      // AMX dequant places FP8 payload into a BF16 field (exponent bias 127).
+      // Correction = 2^(127 - FP8_bias): E4M3 bias=7 → 2^120, E5M2 bias=15 →
+      // 2^112.
+      constexpr float bias =
+          std::is_same_v<kv_cache_t, c10::Float8_e5m2> ? 0x1p112f : 0x1p120f;
+      return v_scale * bias;
+    }
+    return 1.0f;
+  }
+
   template <template <typename tile_gemm_t> typename attention>
   FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    if constexpr (fp8_kv) {
+      // Same bias correction as get_output_v_scale: AMX FP8→BF16 dequant
+      // shifts the exponent bias from FP8 to BF16 (127), so we multiply by
+      // 2^(127-FP8_bias) to recover the true value. E4M3: 2^120, E5M2: 2^112.
+      const float bias =
+          std::is_same_v<kv_cache_t, c10::Float8_e5m2> ? 0x1p112f : 0x1p120f;
+      scale *= k_scale * bias;
+    }
     if (q_head_num > AMX_TILE_ROW_NUM) {
       if (q_head_num != current_q_head_num_) {
         current_q_head_num_ = q_head_num;
-        TileGemm224<kv_cache_t>::init_tile_config(q_head_num, amx_tile_config_);
+        TileGemm224<q_buffer_t, kv_cache_t>::init_tile_config(q_head_num,
+                                                              amx_tile_config_);
       }
-      attention<TileGemm224<kv_cache_t>> attention_iteration;
+      attention<TileGemm224<q_buffer_t, kv_cache_t>> attention_iteration;
       attention_iteration(CPU_ATTENTION_PARAMS);
     } else {
       if (q_head_num != current_q_head_num_) {
         current_q_head_num_ = q_head_num;
-        TileGemm122<kv_cache_t>::init_tile_config(q_head_num, amx_tile_config_);
+        TileGemm122<q_buffer_t, kv_cache_t>::init_tile_config(q_head_num,
+                                                              amx_tile_config_);
       }
-      attention<TileGemm122<kv_cache_t>> attention_iteration;
+      attention<TileGemm122<q_buffer_t, kv_cache_t>> attention_iteration;
       attention_iteration(CPU_ATTENTION_PARAMS);
     }
   }
@@ -411,13 +521,26 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
   // reshape KV to AMX friendly layout
   static void reshape_and_cache(
       const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
-      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+      kv_cache_t* __restrict__ key_cache, kv_cache_t* __restrict__ value_cache,
       const int64_t* __restrict__ slot_mapping, const int64_t token_num,
       const int64_t key_token_num_stride, const int64_t value_token_num_stride,
       const int64_t head_num, const int64_t key_head_num_stride,
       const int64_t value_head_num_stride, const int64_t num_blocks,
       const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
-      const int64_t block_size, const int64_t block_size_stride) {
+      const int64_t block_size, const int64_t block_size_stride,
+      const float k_inv = 0.0f, const float v_inv = 0.0f) {
+    if constexpr (fp8_kv) {
+      constexpr auto qfn = select_fp8_quant_fn<kv_cache_t>();
+      reshape_and_cache_fp8_amx_impl<scalar_t, qfn>(
+          key, value, reinterpret_cast<uint8_t*>(key_cache),
+          reinterpret_cast<uint8_t*>(value_cache), slot_mapping, token_num,
+          head_num, head_dim, block_size, key_token_num_stride,
+          key_head_num_stride, value_token_num_stride, value_head_num_stride,
+          num_blocks_stride, cache_head_num_stride, num_blocks_stride,
+          cache_head_num_stride, k_inv, v_inv);
+      return;
+    }
+
     // For AMX 2D tiles, size of each line is 64 bytes
     constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
     // For AMX B matrix, N always is 16
@@ -426,6 +549,9 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
     // For now suppose block_size is divisible by amx_tile_column_num
     TORCH_CHECK_EQ(block_size % amx_b_tile_k_size, 0);
 
+    scalar_t* __restrict__ kc = reinterpret_cast<scalar_t*>(key_cache);
+    scalar_t* __restrict__ vc = reinterpret_cast<scalar_t*>(value_cache);
+
 #pragma omp parallel for collapse(2)
     for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
       for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
@@ -453,8 +579,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
           constexpr int64_t quadword_num_per_group =
               token_num_per_group * quadword_num;
           int32_t* key_cache_start_ptr =
-              reinterpret_cast<int32_t*>(key_cache +
-                                         block_idx * num_blocks_stride +
+              reinterpret_cast<int32_t*>(kc + block_idx * num_blocks_stride +
                                          head_idx * cache_head_num_stride) +
               group_idx * quadword_num_per_group + group_offset;
 
@@ -483,7 +608,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
                                             token_idx * value_token_num_stride +
                                             head_idx * value_head_num_stride;
           scalar_t* value_cache_start_ptr =
-              value_cache + block_idx * num_blocks_stride +
+              vc + block_idx * num_blocks_stride +
               head_idx * cache_head_num_stride +
               sub_group_idx * token_num_per_sub_group * amx_b_tile_n_size +
               sub_group_offset;
diff --git a/csrc/cpu/cpu_attn_fp8.hpp b/csrc/cpu/cpu_attn_fp8.hpp
new file mode 100644
index 000000000000..764b6ed7f84a
--- /dev/null
+++ b/csrc/cpu/cpu_attn_fp8.hpp
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+#include "cpu/utils.hpp"
+
+typedef uint32_t __attribute__((__may_alias__)) u32_alias_t;
+typedef uint16_t __attribute__((__may_alias__)) u16_alias_t;
+typedef float __attribute__((__may_alias__)) f32_alias_t;
+
+// Reference scalar dequant — used to verify vectorized AMX dequant.
+inline float fp8e4m3_to_float_scalar(uint8_t b, float scale) noexcept {
+  // NaN encoding in E4M3
+  if ((b & 0x7F) == 0x7F) return std::numeric_limits<float>::quiet_NaN();
+  uint32_t b_u32 = static_cast<uint32_t>(b);
+  uint32_t sign = (b_u32 & 0x80) << 24;
+  uint32_t payload = (b_u32 & 0x7F) << 20;
+  uint32_t bits = sign | payload;
+  float b_f32_unscaled = *reinterpret_cast<const f32_alias_t*>(&bits);
+  float b_f32_scaled = b_f32_unscaled * scale * 0x1p120f;
+  return b_f32_scaled;
+}
+
+inline uint8_t float_to_fp8e4m3_scalar(float v, float inv_scale) noexcept {
+  v *= inv_scale;
+  constexpr float fp8_max = 448.0f;
+  v = std::max(-fp8_max, std::min(fp8_max, v));
+  if (v == 0.0f) return 0;
+
+  // Inverse mapping of fp8e4m3_to_float_scalar: shift the effective exponent
+  // bias from fp32 (127) back to fp8 e4m3 (7), then pack sign|payload.
+  float v_f32_unscaled = v * 0x1p-120f;
+  uint32_t bits = *reinterpret_cast<const u32_alias_t*>(&v_f32_unscaled);
+  uint8_t sign = static_cast<uint8_t>((bits >> 24) & 0x80);
+  uint8_t payload = static_cast<uint8_t>((bits >> 20) & 0x7F);
+  if (payload == 0) return sign;
+  payload = std::min<uint8_t>(payload, 0x7E);  // keep 0x7F as NaN encoding
+  return static_cast<uint8_t>(sign | payload);
+}
+
+// ---------------------------------------------------------------------------
+// AMX reshape impl — parameterised on the quantisation function.
+// Writes key/value into uint8 FP8 KV cache using the AMX tile-friendly layout.
+// K: halfword-packed (2 FP8 per uint16, token_num_per_group=16).
+// V: sub-group packing (token_num_per_sub_group=2, head_elems_per_group=16).
+// block_size must be divisible by 32.
+// ---------------------------------------------------------------------------
+template <typename scalar_t, uint8_t (*quant_fn)(float, float)>
+inline void reshape_and_cache_fp8_amx_impl(
+    const scalar_t* key_ptr, const scalar_t* value_ptr, uint8_t* key_cache_ptr,
+    uint8_t* value_cache_ptr, const int64_t* slot_ptr, int64_t token_num,
+    int64_t head_num, int64_t head_dim, int64_t block_size, int64_t k_stride0,
+    int64_t k_stride1, int64_t v_stride0, int64_t v_stride1, int64_t kc_stride0,
+    int64_t kc_stride1, int64_t vc_stride0, int64_t vc_stride1, float k_inv,
+    float v_inv) {
+  constexpr int64_t token_num_per_group = 16;  // AMX_TILE_ROW_NUM
+  const int64_t halfword_num = head_dim / 2;   // 2 FP8 per uint16
+  const int64_t halfword_num_per_group = token_num_per_group * halfword_num;
+  constexpr int64_t head_elems_per_group = 16;
+  constexpr int64_t token_num_per_sub_group = 2;  // = 4 / sizeof(BF16)
+  const int64_t group_num = head_dim / head_elems_per_group;
+  const int64_t group_size = block_size * head_elems_per_group;
+
+#pragma omp parallel for collapse(2) schedule(static)
+  for (int64_t tok = 0; tok < token_num; ++tok) {
+    for (int64_t h = 0; h < head_num; ++h) {
+      const int64_t slot = slot_ptr[tok];
+      if (slot < 0) continue;
+      const int64_t block_idx = slot / block_size;
+      const int64_t block_offset = slot % block_size;
+
+      // Key: halfword-packed, 2 FP8 per uint16
+      {
+        const scalar_t* ksrc = key_ptr + tok * k_stride0 + h * k_stride1;
+        const int64_t group_idx = block_offset / token_num_per_group;
+        const int64_t group_offset = block_offset % token_num_per_group;
+        uint16_t* kdst =
+            reinterpret_cast<uint16_t*>(key_cache_ptr + block_idx * kc_stride0 +
+                                        h * kc_stride1) +
+            group_idx * halfword_num_per_group + group_offset;
+        for (int64_t j = 0; j < halfword_num; ++j) {
+          uint8_t fp8_0 = quant_fn(static_cast<float>(ksrc[j * 2]), k_inv);
+          uint8_t fp8_1 = quant_fn(static_cast<float>(ksrc[j * 2 + 1]), k_inv);
+          uint8_t bytes[2] = {fp8_0, fp8_1};
+          uint16_t hw = *reinterpret_cast<const u16_alias_t*>(bytes);
+          kdst[j * token_num_per_group] = hw;
+        }
+      }
+
+      // Value: sub-group packing (token_num_per_sub_group = 2)
+      {
+        const scalar_t* vsrc = value_ptr + tok * v_stride0 + h * v_stride1;
+        const int64_t sub_group_idx = block_offset / token_num_per_sub_group;
+        const int64_t sub_group_offset = block_offset % token_num_per_sub_group;
+        uint8_t* vdst =
+            value_cache_ptr + block_idx * vc_stride0 + h * vc_stride1 +
+            sub_group_idx * token_num_per_sub_group * head_elems_per_group +
+            sub_group_offset;
+        for (int64_t i = 0; i < group_num; ++i) {
+          for (int64_t j = 0; j < head_elems_per_group; ++j)
+            vdst[j * token_num_per_sub_group] =
+                quant_fn(static_cast<float>(vsrc[j]), v_inv);
+          vsrc += head_elems_per_group;
+          vdst += group_size;
+        }
+      }
+    }
+  }
+}
+
+// ---------------------------------------------------------------------------
+// FP8 E5M2 scalar helpers
+// ---------------------------------------------------------------------------
+
+// Reference scalar dequant — used to verify vectorized AMX dequant.
+// FP8 E5M2: s[7] e[6:2] m[1:0], exponent bias = 15 (same as FP16).
+// Byte b → FP16 bits = b << 8 (no bias correction needed).
+inline float fp8e5m2_to_float_scalar(uint8_t b, float scale) noexcept {
+  const uint8_t exp_bits = (b >> 2) & 0x1F;
+  const uint8_t mant_bits = b & 0x03;
+  // NaN: exp=11111, mant!=00
+  if (exp_bits == 0x1F && mant_bits != 0)
+    return std::numeric_limits<float>::quiet_NaN();
+  const uint32_t sign = static_cast<uint32_t>(b & 0x80) << 24;
+  if (exp_bits == 0x1F)
+    return sign ? -std::numeric_limits<float>::infinity()
+                : std::numeric_limits<float>::infinity();
+  if (exp_bits == 0) {  // subnormal: (-1)^s * 2^-14 * mant/4
+    if (mant_bits == 0) return 0.0f;
+    float v = mant_bits * 0x1p-16f;
+    return (sign ? -v : v) * scale;
+  }
+  // Normal: FP32 exp = exp5 - 15 + 127, mantissa top 2 bits
+  uint32_t fp32_bits = sign |
+                       ((static_cast<uint32_t>(exp_bits) - 15 + 127) << 23) |
+                       (static_cast<uint32_t>(mant_bits) << 21);
+  float val = *reinterpret_cast<const f32_alias_t*>(&fp32_bits);
+  return val * scale;
+}
+
+inline uint8_t float_to_fp8e5m2_scalar(float v, float inv_scale) noexcept {
+  v *= inv_scale;
+  constexpr float fp8_e5m2_max = 57344.0f;
+  v = std::max(-fp8_e5m2_max, std::min(fp8_e5m2_max, v));
+  if (v == 0.0f) return 0;
+  uint32_t bits = *reinterpret_cast<const u32_alias_t*>(&v);
+  const uint8_t sign = static_cast<uint8_t>((bits >> 24) & 0x80);
+  const int32_t exp_fp32 = static_cast<int32_t>((bits >> 23) & 0xFF) - 127;
+  const uint8_t mant2 = static_cast<uint8_t>((bits >> 21) & 0x03);
+  if (exp_fp32 < -14) {  // subnormal in E5M2
+    const int shift = -14 - exp_fp32;
+    if (shift + 21 >= 32)
+      return sign;  // underflow: too small for E5M2 subnormal
+    const uint32_t m = (0x800000u | (bits & 0x7FFFFFu)) >> (shift + 21);
+    return sign | static_cast<uint8_t>(std::min<uint32_t>(m, 3u));
+  }
+  const uint8_t exp5 = static_cast<uint8_t>(exp_fp32 + 15);
+  return sign | (exp5 << 2) | mant2;
+}
+
+// ---------------------------------------------------------------------------
+// Select the FP8 quant function at compile time based on kv_cache_t.
+// ---------------------------------------------------------------------------
+template <typename kv_cache_t>
+constexpr auto select_fp8_quant_fn() {
+  if constexpr (std::is_same_v<kv_cache_t, c10::Float8_e5m2>)
+    return float_to_fp8e5m2_scalar;
+  else
+    return float_to_fp8e4m3_scalar;
+}
+
+// ---------------------------------------------------------------------------
+// VEC reshape impl — parameterised on the quantisation function.
+// Writes key (column-major) and value (row-major) into uint8 FP8 KV cache.
+// The pragma omp must live outside VLLM_DISPATCH_FLOATING_TYPES because
+// #pragma cannot appear inside variadic macro arguments.
+// ---------------------------------------------------------------------------
+template <typename scalar_t, uint8_t (*quant_fn)(float, float)>
+inline void reshape_and_cache_fp8_vec_impl(
+    const scalar_t* key_ptr, const scalar_t* value_ptr, uint8_t* key_cache_ptr,
+    uint8_t* value_cache_ptr, const int64_t* slot_ptr, int64_t token_num,
+    int64_t head_num, int64_t head_dim, int64_t block_size, int64_t k_stride0,
+    int64_t k_stride1, int64_t v_stride0, int64_t v_stride1, int64_t kc_stride0,
+    int64_t kc_stride1, int64_t vc_stride0, int64_t vc_stride1, float k_inv,
+    float v_inv) {
+#pragma omp parallel for collapse(2) schedule(static)
+  for (int64_t tok = 0; tok < token_num; ++tok) {
+    for (int64_t h = 0; h < head_num; ++h) {
+      const int64_t slot = slot_ptr[tok];
+      if (slot < 0) continue;
+      const int64_t block_idx = slot / block_size;
+      const int64_t block_offset = slot % block_size;
+
+      // Key layout: column-major within block
+      const scalar_t* ksrc = key_ptr + tok * k_stride0 + h * k_stride1;
+      uint8_t* kdst = key_cache_ptr + block_idx * kc_stride0 + h * kc_stride1 +
+                      block_offset;
+      for (int64_t i = 0; i < head_dim; ++i)
+        kdst[i * block_size] = quant_fn(static_cast<float>(ksrc[i]), k_inv);
+
+      // Value layout: row-major within block (contiguous head_dim bytes)
+      const scalar_t* vsrc = value_ptr + tok * v_stride0 + h * v_stride1;
+      uint8_t* vdst = value_cache_ptr + block_idx * vc_stride0 +
+                      h * vc_stride1 + block_offset * head_dim;
+      for (int64_t i = 0; i < head_dim; ++i)
+        vdst[i] = quant_fn(static_cast<float>(vsrc[i]), v_inv);
+    }
+  }
+}
diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp
index c1974bfd0a51..b9987fb26c19 100644
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -12,10 +12,24 @@
 #include "cpu/utils.hpp"
 
 namespace cpu_attention {
-enum class ISA { AMX, VEC, VEC16, NEON, VXE };
+enum class ISA { AMX, VEC, VEC16, NEON, VXE, VSX };
 
-template <ISA isa, typename scalar_t, int64_t head_dim>
-class AttentionImpl {};
+// Mirrors csrc/attention/dtype_fp8.cuh Fp8KVCacheDataType exactly.
+enum class Fp8KVCacheDataType {
+  kAuto = 0,
+  kFp8E4M3 = 1,
+  kFp8E5M2 = 2,
+};
+
+struct AttentionInput;
+
+template <ISA isa, typename scalar_t, int64_t head_dim,
+          typename kv_cache_scalar_t = scalar_t>
+class AttentionImpl {
+ public:
+  void init_from_input(const AttentionInput*) {}
+  float get_output_v_scale() const noexcept { return 1.0f; }
+};
 
 struct AttentionWorkItemGroup {
   int32_t req_id;
@@ -150,6 +164,9 @@ struct AttentionMetadata {
       case ISA::VXE:
         ss << "VXE, ";
         break;
+      case ISA::VSX:
+        ss << "VSX, ";
+        break;
     }
     ss << "workitem_group_num: " << workitem_group_num
        << ", reduction_item_num: " << reduction_item_num
@@ -780,6 +797,9 @@ struct AttentionInput {
   int32_t sliding_window_left;
   int32_t sliding_window_right;
   float softcap;
+  // FP8 KV cache scales (used by FP8 attention implementations)
+  float k_scale_fp8 = 1.0f;
+  float v_scale_fp8 = 1.0f;
 };
 
 #define DEFINE_CPU_ATTENTION_PARAMS                                         \
@@ -1374,6 +1394,13 @@ class AttentionMainLoop {
       }
 
       attention_impl_t attn_impl;
+      constexpr bool fp8_kv = std::is_same_v<kv_cache_t, c10::Float8_e4m3fn> ||
+                              std::is_same_v<kv_cache_t, c10::Float8_e5m2>;
+      float output_v_scale = 1.0f;
+      if constexpr (fp8_kv) {
+        attn_impl.init_from_input(input);
+        output_v_scale = attn_impl.get_output_v_scale();
+      }
 
       // general information
       const int32_t q_head_num = input->num_heads;
@@ -1753,7 +1780,7 @@ class AttentionMainLoop {
                                reinterpret_cast<query_t*>(input->output) +
                                    output_buffer_offset,
                                sum_buffer, actual_q_heads_per_kv,
-                               actual_q_token_num, q_head_num);
+                               actual_q_token_num, q_head_num, output_v_scale);
                 } else {
                   const int32_t stride =
                       actual_q_heads_per_kv * split_kv_q_token_num_threshold;
@@ -1823,7 +1850,7 @@ class AttentionMainLoop {
               split_output_buffer,
               reinterpret_cast<query_t*>(input->output) + output_buffer_offset,
               split_sum_buffer, actual_q_heads_per_kv, curr_output_token_num,
-              q_head_num);
+              q_head_num, output_v_scale);
         }
       }
     }
@@ -1947,8 +1974,8 @@ class AttentionMainLoop {
                     query_t* __restrict__ curr_output_buffer,
                     float* __restrict__ sum_buffer,
                     const int32_t q_heads_per_kv,
-                    const int32_t actual_q_token_num,
-                    const int32_t q_head_num) {
+                    const int32_t actual_q_token_num, const int32_t q_head_num,
+                    const float v_scale = 1.0f) {
     // final output
     using output_vec_t = typename VecTypeTrait<query_t>::vec_t;
 
@@ -1962,7 +1989,7 @@ class AttentionMainLoop {
           curr_partial_output_buffer;
       query_t* __restrict__ curr_output_buffer_iter = curr_output_buffer;
       for (int32_t head_idx = 0; head_idx < q_heads_per_kv; ++head_idx) {
-        vec_op::FP32Vec16 inv_sum_scale_vec(1.0 / *curr_sum_buffer);
+        vec_op::FP32Vec16 inv_sum_scale_vec(v_scale / *curr_sum_buffer);
 
         for (int32_t i = 0; i < group_num_per_head; ++i) {
           vec_op::FP32Vec16 vec(curr_partial_output_buffer_iter);
diff --git a/csrc/cpu/cpu_attn_neon.hpp b/csrc/cpu/cpu_attn_neon.hpp
index 3523893c38c5..db4c5df2e88d 100644
--- a/csrc/cpu/cpu_attn_neon.hpp
+++ b/csrc/cpu/cpu_attn_neon.hpp
@@ -248,8 +248,8 @@ class TileGemmNeonFMLA {
 }  // namespace
 
 // this is similar to "ISA::VEC" at the moment
-template <typename scalar_t, int64_t head_dim>
-class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
+template <typename scalar_t, int64_t head_dim, typename kv_cache_scalar_t>
+class AttentionImpl<ISA::NEON, scalar_t, head_dim, kv_cache_scalar_t> {
  public:
   using query_t = scalar_t;
   using q_buffer_t = float;
@@ -343,7 +343,8 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
       const int64_t head_num, const int64_t key_head_num_stride,
       const int64_t value_head_num_stride, const int64_t num_blocks,
       const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
-      const int64_t block_size, const int64_t block_size_stride) {
+      const int64_t block_size, const int64_t block_size_stride,
+      const float /*k_inv*/ = 0.0f, const float /*v_inv*/ = 0.0f) {
 #pragma omp parallel for collapse(2)
     for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
       for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
@@ -388,7 +389,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
 #ifdef ARM_BF16_SUPPORT
 // For BF16 on Arm, reuse the BFMMLA kernels with 32-token alignment.
 template <int64_t head_dim>
-class AttentionImpl<ISA::NEON, c10::BFloat16, head_dim>
+class AttentionImpl<ISA::NEON, c10::BFloat16, head_dim, c10::BFloat16>
     : public AttentionImplNEONBFMMLA<BLOCK_SIZE_ALIGNMENT, ISA::NEON,
                                      head_dim> {};
 #endif
diff --git a/csrc/cpu/cpu_attn_neon_bfmmla.hpp b/csrc/cpu/cpu_attn_neon_bfmmla.hpp
index fb133aa13098..4e4578a74f5b 100644
--- a/csrc/cpu/cpu_attn_neon_bfmmla.hpp
+++ b/csrc/cpu/cpu_attn_neon_bfmmla.hpp
@@ -602,7 +602,8 @@ class AttentionImplNEONBFMMLA {
       [[maybe_unused]] const int64_t num_blocks,
       const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
       const int64_t block_size,
-      [[maybe_unused]] const int64_t block_size_stride) {
+      [[maybe_unused]] const int64_t block_size_stride,
+      const float /*k_inv*/ = 0.0f, const float /*v_inv*/ = 0.0f) {
     const int64_t k_block_stride = (head_dim / TILE_K) * K_INNER_STRIDE;
     const int64_t v_pair_stride =
         (block_size / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
diff --git a/csrc/cpu/cpu_attn_vec.hpp b/csrc/cpu/cpu_attn_vec.hpp
index f51a232ba955..c3983e0578a5 100644
--- a/csrc/cpu/cpu_attn_vec.hpp
+++ b/csrc/cpu/cpu_attn_vec.hpp
@@ -1,11 +1,37 @@
 #ifndef CPU_ATTN_VEC_HPP
 #define CPU_ATTN_VEC_HPP
 
+#include "cpu_attn_fp8.hpp"
 #include "cpu_attn_impl.hpp"
 
 namespace cpu_attention {
 
 namespace {
+
+// Load 32 kv_cache_t elements starting at ptr and return them as two FP32Vec16s
+// covering the lower 16 and upper 16 positions.
+// For FP8: both halves come from a single BF16Vec32 dequant of 32 bytes.
+// For BF16/FP16/FP32: two separate vector loads at ptr and ptr+16.
+template <typename kv_cache_t>
+FORCE_INLINE std::pair<vec_op::FP32Vec16, vec_op::FP32Vec16> load_b_pair_vec(
+    const kv_cache_t* ptr) {
+  if constexpr (std::is_same_v<kv_cache_t, c10::Float8_e4m3fn>) {
+    // BF16 container, but values are in the FP16 exponent range (bias 15 not
+    // 127).
+    vec_op::BF16Vec32 bf16_b_reg(reinterpret_cast<const uint8_t*>(ptr),
+                                 vec_op::fp8_e4m3_tag{});
+    return {vec_op::FP32Vec16(bf16_b_reg, 0), vec_op::FP32Vec16(bf16_b_reg, 1)};
+  } else if constexpr (std::is_same_v<kv_cache_t, c10::Float8_e5m2>) {
+    vec_op::BF16Vec32 bf16_b_reg(reinterpret_cast<const uint8_t*>(ptr),
+                                 vec_op::fp8_e5m2_tag{});
+    return {vec_op::FP32Vec16(bf16_b_reg, 0), vec_op::FP32Vec16(bf16_b_reg, 1)};
+  } else {
+    using load_vec_t = typename VecTypeTrait<kv_cache_t>::vec_t;
+    return std::make_pair(vec_op::FP32Vec16(load_vec_t(ptr)),
+                          vec_op::FP32Vec16(load_vec_t(ptr + 16)));
+  }
+}
+
 // 8-2-16 pattern, 8 regs for A, 2 regs for B, 16 regs for C, [8, K] @ [k, 32]
 template <typename kv_cache_t>
 class TileGemm82 {
@@ -54,10 +80,7 @@ class TileGemm82 {
                          const int32_t block_size, const int32_t dynamic_k_size,
                          const bool accum_c) {
     static_assert(0 < M && M <= 8);
-    using load_vec_t = typename VecTypeTrait<kv_cache_t>::vec_t;
 
-    kv_cache_t* __restrict__ curr_b_0 = b_tile;
-    kv_cache_t* __restrict__ curr_b_1 = b_tile + 16;
     float* __restrict__ curr_c_0 = c_tile;
     float* __restrict__ curr_c_1 = c_tile + 16;
 
@@ -76,16 +99,14 @@ class TileGemm82 {
     }
 
     float* __restrict__ curr_a = a_tile;
+    kv_cache_t* __restrict__ curr_b = b_tile;
+
     for (int32_t k = 0; k < dynamic_k_size; ++k) {
-      load_vec_t b_0_reg(curr_b_0);
-      vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg);
-      load_vec_t b_1_reg(curr_b_1);
-      vec_op::FP32Vec16 fp32_b_1_reg(b_1_reg);
+      auto [fp32_b_0_reg, fp32_b_1_reg] = load_b_pair_vec(curr_b);
 
       float* __restrict__ curr_m_a = curr_a;
       vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
-        float v = *curr_m_a;
-        vec_op::FP32Vec16 a_reg(v);
+        vec_op::FP32Vec16 a_reg(*curr_m_a);
         c_regs[i * 2] = c_regs[i * 2] + a_reg * fp32_b_0_reg;
         c_regs[i * 2 + 1] = c_regs[i * 2 + 1] + a_reg * fp32_b_1_reg;
 
@@ -95,8 +116,7 @@ class TileGemm82 {
 
       // update
       curr_a += 1;
-      curr_b_0 += ldb;
-      curr_b_1 += ldb;
+      curr_b += ldb;
     }
 
     vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
@@ -109,15 +129,20 @@ class TileGemm82 {
     });
   }
 };
+
 }  // namespace
 
 // This is a general but naive implementation based on vector instructions
-template <typename scalar_t, int64_t head_dim>
-class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
+template <typename scalar_t, int64_t head_dim, typename kv_cache_scalar_t>
+class AttentionImpl<ISA::VEC, scalar_t, head_dim, kv_cache_scalar_t> {
+  static constexpr bool fp8_kv =
+      std::is_same_v<kv_cache_scalar_t, c10::Float8_e4m3fn> ||
+      std::is_same_v<kv_cache_scalar_t, c10::Float8_e5m2>;
+
  public:
   using query_t = scalar_t;
   using q_buffer_t = float;
-  using kv_cache_t = scalar_t;
+  using kv_cache_t = kv_cache_scalar_t;
   using logits_buffer_t = float;
   using partial_output_buffer_t = float;
   using prob_buffer_t = float;
@@ -129,11 +154,45 @@ class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
   constexpr static int64_t MaxQHeadNumPerIteration = 8;
   constexpr static int64_t HeadDim = head_dim;
   constexpr static ISA ISAType = ISA::VEC;
-  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer
+  constexpr static bool scale_on_logits = fp8_kv;
+
+  float k_scale = 1.0f;
+  float v_scale = 1.0f;
 
  public:
+  void init_from_input(const AttentionInput* input) {
+    if constexpr (fp8_kv) {
+      k_scale = input->k_scale_fp8;
+      v_scale = input->v_scale_fp8;
+    }
+  }
+
+  float get_output_v_scale() const noexcept {
+    if constexpr (fp8_kv) {
+      // VEC dequant unpacks FP8 into a pseudo-FP16 layout (exponent bias 15).
+      // E4M3 (bias=7) needs correction 2^(15-7) = 2^8; E5M2 bias matches FP16
+      // so no correction.
+      if constexpr (std::is_same_v<kv_cache_t, c10::Float8_e5m2>) {
+        return v_scale;
+      } else {
+        return v_scale * 0x1p8f;
+      }
+    }
+    return 1.0f;
+  }
+
   template <template <typename tile_gemm_t> typename attention>
   FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    if constexpr (fp8_kv) {
+      // Same bias correction as get_output_v_scale: VEC FP8→pseudo-FP16 dequant
+      // uses bias 15; E4M3 (bias=7) needs ×2^8, E5M2 (bias=15) needs no
+      // correction.
+      if constexpr (std::is_same_v<kv_cache_t, c10::Float8_e5m2>) {
+        scale *= k_scale;
+      } else {
+        scale *= k_scale * 0x1p8f;
+      }
+    }
     attention<TileGemm82<kv_cache_t>> attention_iteration;
     attention_iteration(CPU_ATTENTION_PARAMS);
   }
@@ -161,17 +220,19 @@ class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
                               // row-major
   }
 
-  // Copy q to q_buffer and cast it to fp32
-  static void copy_q_heads_tile(
-      scalar_t* __restrict__ src,  // [q_num, q_heads_per_kv, head_size]
-      float* __restrict__ q_buffer, const int32_t q_num,
-      const int32_t q_heads_per_kv, const int64_t q_num_stride,
-      const int64_t q_head_stride, float scale) {
+  // Copy q to q_buffer and cast it to fp32.
+  // FP8: QK scale is folded into execute_attention; copy Q unscaled here.
+  void copy_q_heads_tile(scalar_t* __restrict__ src,
+                         float* __restrict__ q_buffer, const int32_t q_num,
+                         const int32_t q_heads_per_kv,
+                         const int64_t q_num_stride,
+                         const int64_t q_head_stride, float scale) {
     static_assert(head_dim % 16 == 0);
     constexpr int32_t unroll_size = head_dim / 16;
     using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
 
-    vec_op::FP32Vec16 scale_vec(scale);
+    const float effective_scale = fp8_kv ? 1.0f : scale;
+    vec_op::FP32Vec16 scale_vec(effective_scale);
     for (int32_t q_num_idx = 0; q_num_idx < q_num; ++q_num_idx) {
       for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv; ++q_head_idx) {
         scalar_t* __restrict__ curr_q =
@@ -196,13 +257,26 @@ class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
   // reshape K as column-major and V as row-major
   static void reshape_and_cache(
       const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
-      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+      kv_cache_t* __restrict__ key_cache, kv_cache_t* __restrict__ value_cache,
       const int64_t* __restrict__ slot_mapping, const int64_t token_num,
       const int64_t key_token_num_stride, const int64_t value_token_num_stride,
       const int64_t head_num, const int64_t key_head_num_stride,
       const int64_t value_head_num_stride, const int64_t num_blocks,
       const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
-      const int64_t block_size, const int64_t block_size_stride) {
+      const int64_t block_size, const int64_t block_size_stride,
+      const float k_inv = 0.0f, const float v_inv = 0.0f) {
+    if constexpr (fp8_kv) {
+      constexpr auto qfn = select_fp8_quant_fn<kv_cache_t>();
+      reshape_and_cache_fp8_vec_impl<scalar_t, qfn>(
+          key, value, reinterpret_cast<uint8_t*>(key_cache),
+          reinterpret_cast<uint8_t*>(value_cache), slot_mapping, token_num,
+          head_num, head_dim, block_size, key_token_num_stride,
+          key_head_num_stride, value_token_num_stride, value_head_num_stride,
+          num_blocks_stride, cache_head_num_stride, num_blocks_stride,
+          cache_head_num_stride, k_inv, v_inv);
+      return;
+    }
+
 #pragma omp parallel for collapse(2)
     for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
       for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
@@ -220,8 +294,9 @@ class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
                                           token_idx * key_token_num_stride +
                                           head_idx * key_head_num_stride;
           scalar_t* key_cache_start_ptr =
-              key_cache + block_idx * num_blocks_stride +
-              head_idx * cache_head_num_stride + block_offset;
+              reinterpret_cast<scalar_t*>(key_cache) +
+              block_idx * num_blocks_stride + head_idx * cache_head_num_stride +
+              block_offset;
 
 #pragma GCC unroll 8
           for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
@@ -234,8 +309,9 @@ class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
                                             token_idx * value_token_num_stride +
                                             head_idx * value_head_num_stride;
           scalar_t* value_cache_start_ptr =
-              value_cache + block_idx * num_blocks_stride +
-              head_idx * cache_head_num_stride + block_offset * head_dim;
+              reinterpret_cast<scalar_t*>(value_cache) +
+              block_idx * num_blocks_stride + head_idx * cache_head_num_stride +
+              block_offset * head_dim;
           std::memcpy(value_cache_start_ptr, value_start_ptr,
                       sizeof(scalar_t) * head_dim);
         }
@@ -243,6 +319,7 @@ class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
     }
   }
 };
+
 }  // namespace cpu_attention
 
 #endif
diff --git a/csrc/cpu/cpu_attn_vec16.hpp b/csrc/cpu/cpu_attn_vec16.hpp
index 06e4ad7624e9..bc15d614a7ed 100644
--- a/csrc/cpu/cpu_attn_vec16.hpp
+++ b/csrc/cpu/cpu_attn_vec16.hpp
@@ -116,9 +116,9 @@ class TileGemm161 {
 }  // namespace
 
 // This is a general but naive implementation based on vector instructions
-template <typename scalar_t, int64_t head_dim>
-class AttentionImpl<ISA::VEC16, scalar_t, head_dim>
-    : public AttentionImpl<ISA::VEC, scalar_t, head_dim> {
+template <typename scalar_t, int64_t head_dim, typename kv_cache_scalar_t>
+class AttentionImpl<ISA::VEC16, scalar_t, head_dim, kv_cache_scalar_t>
+    : public AttentionImpl<ISA::VEC, scalar_t, head_dim, kv_cache_scalar_t> {
  public:
   using query_t = scalar_t;
   using q_buffer_t = float;
diff --git a/csrc/cpu/cpu_attn_vsx.hpp b/csrc/cpu/cpu_attn_vsx.hpp
new file mode 100644
index 000000000000..c7e1502bcb05
--- /dev/null
+++ b/csrc/cpu/cpu_attn_vsx.hpp
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#ifndef CPU_ATTN_VSX_HPP
+#define CPU_ATTN_VSX_HPP
+
+#include "cpu_attn_impl.hpp"
+#include <altivec.h>
+#include <type_traits>
+
+namespace cpu_attention {
+
+namespace {
+
+// ppc64le Vector = 16 bytes (128 bits)
+#define BLOCK_SIZE_ALIGNMENT 32
+#define HEAD_SIZE_ALIGNMENT 32
+#define MAX_Q_HEAD_NUM_PER_ITER 16
+
+template <typename kv_cache_t>
+FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, __vector float& b0,
+                                     __vector float& b1);
+
+// [1] Float Specialization
+template <>
+FORCE_INLINE void load_row8_B_as_f32<float>(const float* p, __vector float& b0,
+                                            __vector float& b1) {
+  b0 = vec_xl(0, const_cast<float*>(p));
+  b1 = vec_xl(0, const_cast<float*>(p + 4));
+}
+
+// [2] BFloat16 Specialization (Little Endian ppc64le)
+// On ppc64le (LE): BF16 bits should land in the HIGH 16 bits of each float32.
+// Byte layout of float32 on LE: [byte0(LSB), byte1, byte2, byte3(MSB)]
+// We need BF16 in bytes2-3 (high half) with bytes0-1 zeroed.
+// vec_mergeh on LE interleaves elements 0..3: result_i = {a[i], b[i]}
+// So vec_mergeh(zeros_u16, raw_u16) gives for each uint16 pair:
+//   uint16[2i]   = zeros[i]  -> low 16 bits of uint32  -> zeroed mantissa LSBs
+//   uint16[2i+1] = raw[i]    -> high 16 bits of uint32 -> BF16 bits
+// Cast to float32 gives exactly (bf16_bits << 16) per element.
+template <>
+FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
+                                                    __vector float& b0,
+                                                    __vector float& b1) {
+  __vector unsigned short raw = vec_xl(
+      0, reinterpret_cast<unsigned short*>(const_cast<c10::BFloat16*>(p)));
+  __vector unsigned short zeros = vec_splat_u16(0);
+
+  // LE: zeros in low 16 bits, raw in high 16 bits → bf16 << 16 == float32
+  b0 = (__vector float)vec_mergeh(zeros, raw);
+  b1 = (__vector float)vec_mergel(zeros, raw);
+}
+
+// Note: c10::Half (FP16) is not supported on PowerPC architecture
+
+template <int32_t M, typename kv_cache_t>
+FORCE_INLINE void gemm_micro_ppc64le_Mx8_Ku4(
+    const float* __restrict A,       // [M x K]
+    const kv_cache_t* __restrict B,  // [K x 8]
+    float* __restrict C,             // [M x 8]
+    int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
+  static_assert(1 <= M && M <= 8, "M must be in [1,8]");
+
+#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7)
+#define IF_M(i) if constexpr (M > (i))
+
+  // 1. Define A pointers
+#define DECL_A(i) const float* a##i = A + (i) * lda;
+  ROWS_APPLY(DECL_A)
+#undef DECL_A
+
+  // 2. Define Accumulators (2 vectors covers 8 columns)
+#define DECL_ACC(i) __vector float acc##i##_0, acc##i##_1;
+  ROWS_APPLY(DECL_ACC)
+#undef DECL_ACC
+
+  // 3. Initialize Accumulators (Load C or Zero)
+#define INIT_ACC(i)                                                  \
+  IF_M(i) {                                                          \
+    if (accumulate) {                                                \
+      acc##i##_0 = vec_xl(0, const_cast<float*>(C + (i) * ldc + 0)); \
+      acc##i##_1 = vec_xl(0, const_cast<float*>(C + (i) * ldc + 4)); \
+    } else {                                                         \
+      acc##i##_0 = vec_splats(0.0f);                                 \
+      acc##i##_1 = vec_splats(0.0f);                                 \
+    }                                                                \
+  }
+  ROWS_APPLY(INIT_ACC)
+#undef INIT_ACC
+
+  int32_t k = 0;
+
+  for (; k + 3 < K; k += 4) {
+    // Load 4 values of A for each Row M: A[k...k+3]
+#define LOAD_A4(i)        \
+  __vector float a##i##v; \
+  IF_M(i) a##i##v = vec_xl(0, const_cast<float*>(a##i + k));
+    ROWS_APPLY(LOAD_A4)
+#undef LOAD_A4
+
+    // FMA for specific lane L of A
+    // ppc64le: vec_madd(b, vec_splat(a, lane), acc)
+#define FMAS_LANE(i, aiv, L)                        \
+  IF_M(i) {                                         \
+    __vector float a_broad = vec_splat(aiv, L);     \
+    acc##i##_0 = vec_madd(b0, a_broad, acc##i##_0); \
+    acc##i##_1 = vec_madd(b1, a_broad, acc##i##_1); \
+  }
+
+    // Unroll K=0..3
+    {
+      __vector float b0, b1;
+      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 0) * ldb, b0, b1);
+#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0)
+      ROWS_APPLY(STEP_K0)
+#undef STEP_K0
+    }
+    {
+      __vector float b0, b1;
+      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 1) * ldb, b0, b1);
+#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1)
+      ROWS_APPLY(STEP_K1)
+#undef STEP_K1
+    }
+    {
+      __vector float b0, b1;
+      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 2) * ldb, b0, b1);
+#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2)
+      ROWS_APPLY(STEP_K2)
+#undef STEP_K2
+    }
+    {
+      __vector float b0, b1;
+      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 3) * ldb, b0, b1);
+#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3)
+      ROWS_APPLY(STEP_K3)
+#undef STEP_K3
+    }
+#undef FMAS_LANE
+  }
+
+  for (; k < K; ++k) {
+    __vector float b0, b1;
+    load_row8_B_as_f32<kv_cache_t>(B + (int64_t)k * ldb, b0, b1);
+#define TAIL_ROW(i)                              \
+  IF_M(i) {                                      \
+    __vector float ai = vec_splats(*(a##i + k)); \
+    acc##i##_0 = vec_madd(b0, ai, acc##i##_0);   \
+    acc##i##_1 = vec_madd(b1, ai, acc##i##_1);   \
+  }
+    ROWS_APPLY(TAIL_ROW)
+#undef TAIL_ROW
+  }
+
+#define STORE_ROW(i)                           \
+  IF_M(i) {                                    \
+    vec_xst(acc##i##_0, 0, C + (i) * ldc + 0); \
+    vec_xst(acc##i##_1, 0, C + (i) * ldc + 4); \
+  }
+  ROWS_APPLY(STORE_ROW)
+#undef STORE_ROW
+
+#undef ROWS_APPLY
+#undef IF_M
+}
+
+template <int32_t N, typename kv_cache_t>
+FORCE_INLINE void gemm_macro_ppc64le_Mx8_Ku4(const float* __restrict A,
+                                             const kv_cache_t* __restrict B,
+                                             float* __restrict C, int32_t M,
+                                             int32_t K, int64_t lda,
+                                             int64_t ldb, int64_t ldc,
+                                             bool accumulate) {
+  static_assert(N % 8 == 0, "N must be a multiple of 8");
+  for (int32_t m = 0; m < M;) {
+    int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1;
+    const float* Ab = A + m * lda;
+    float* Cb = C + m * ldc;
+
+    for (int32_t n = 0; n < N; n += 8) {
+      const kv_cache_t* Bn = B + n;
+      float* Cn = Cb + n;
+      switch (mb) {
+        case 8:
+          gemm_micro_ppc64le_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                    K, accumulate);
+          break;
+        case 4:
+          gemm_micro_ppc64le_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                    K, accumulate);
+          break;
+        case 2:
+          gemm_micro_ppc64le_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                    K, accumulate);
+          break;
+        default:
+          gemm_micro_ppc64le_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                    K, accumulate);
+          break;
+      }
+    }
+    m += mb;
+  }
+}
+
+template <typename kv_cache_t>
+class TileGemmPPC64 {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                float* __restrict__ a_tile,
+                                kv_cache_t* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    if constexpr (phase == AttentionGemmPhase::QK) {
+      gemm_macro_ppc64le_Mx8_Ku4<BLOCK_SIZE_ALIGNMENT, kv_cache_t>(
+          a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c);
+    } else {
+      gemm_macro_ppc64le_Mx8_Ku4<HEAD_SIZE_ALIGNMENT, kv_cache_t>(
+          a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc,
+          accum_c);
+    }
+  }
+};
+
+}  // namespace
+
+template <typename scalar_t, int64_t head_dim>
+class AttentionImpl<ISA::VSX, scalar_t, head_dim> {
+ public:
+  using query_t = scalar_t;
+  using q_buffer_t = float;
+  using kv_cache_t = scalar_t;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = float;
+
+  constexpr static int64_t BlockSizeAlignment = BLOCK_SIZE_ALIGNMENT;
+  constexpr static int64_t HeadDimAlignment = HEAD_SIZE_ALIGNMENT;
+  constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER;
+  constexpr static int64_t HeadDim = head_dim;
+  constexpr static ISA ISAType = ISA::VSX;
+  constexpr static bool scale_on_logits =
+      false;  // Scale is applied to Q during copy
+
+ public:
+  AttentionImpl() {}
+
+  template <template <typename tile_gemm_t> typename attention>
+  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    attention<TileGemmPPC64<kv_cache_t>> attention_iteration;
+    attention_iteration(CPU_ATTENTION_PARAMS);
+  }
+
+  // Strides for Memory Layout
+  constexpr static int64_t k_cache_token_group_stride(
+      const int32_t block_size) {
+    return BlockSizeAlignment;  // [head_dim, block_size] layout
+  }
+
+  constexpr static int64_t v_cache_token_group_stride(
+      const int32_t block_size) {
+    return head_dim * BlockSizeAlignment;
+  }
+
+  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
+    return HeadDimAlignment;
+  }
+
+  static void copy_q_heads_tile(scalar_t* __restrict__ src,
+                                float* __restrict__ q_buffer,
+                                const int32_t q_num,
+                                const int32_t q_heads_per_kv,
+                                const int64_t q_num_stride,
+                                const int64_t q_head_stride, float scale) {
+    __vector float scale_vec = vec_splats(scale);
+    constexpr bool is_bf16 = std::is_same<scalar_t, c10::BFloat16>::value;
+
+    for (int32_t i = 0; i < q_num; ++i) {
+      for (int32_t h = 0; h < q_heads_per_kv; ++h) {
+        scalar_t* curr_src = src + i * q_num_stride + h * q_head_stride;
+        float* curr_dst =
+            q_buffer + i * q_heads_per_kv * head_dim + h * head_dim;
+
+        int32_t d = 0;
+        for (; d <= head_dim - 8; d += 8) {
+          __vector float v0, v1;
+          load_row8_B_as_f32<scalar_t>(curr_src + d, v0, v1);
+
+          v0 = vec_mul(v0, scale_vec);
+          v1 = vec_mul(v1, scale_vec);
+
+          vec_xst(v0, 0, curr_dst + d);
+          vec_xst(v1, 0, curr_dst + d + 4);
+        }
+
+        for (; d < head_dim; ++d) {
+          float val = static_cast<float>(curr_src[d]);
+          curr_dst[d] = val * scale;
+        }
+      }
+    }
+  }
+
+  static void reshape_and_cache(
+      const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
+      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
+      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
+      const int64_t head_num, const int64_t key_head_num_stride,
+      const int64_t value_head_num_stride, const int64_t num_blocks,
+      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
+      const int64_t block_size, const int64_t block_size_stride,
+      const float k_inv = 0.0f, const float v_inv = 0.0f) {
+    // k_inv and v_inv are unused on VSX: FP8 KV cache is not supported on
+    // PowerPC. The parameters are present to match the common interface.
+#pragma omp parallel for collapse(2)
+    for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
+      for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
+        const int64_t pos = slot_mapping[token_idx];
+        if (pos < 0) continue;
+
+        const int64_t block_idx = pos / block_size;
+        const int64_t block_offset = pos % block_size;
+
+        {
+          const scalar_t* key_src = key + token_idx * key_token_num_stride +
+                                    head_idx * key_head_num_stride;
+          scalar_t* key_dst = key_cache + block_idx * num_blocks_stride +
+                              head_idx * cache_head_num_stride + block_offset;
+
+          for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
+            key_dst[j] = key_src[i];
+          }
+        }
+
+        {
+          const scalar_t* val_src = value + token_idx * value_token_num_stride +
+                                    head_idx * value_head_num_stride;
+          scalar_t* val_dst = value_cache + block_idx * num_blocks_stride +
+                              head_idx * cache_head_num_stride +
+                              block_offset * head_dim;
+
+          std::memcpy(val_dst, val_src, sizeof(scalar_t) * head_dim);
+        }
+      }
+    }
+  }
+};
+
+}  // namespace cpu_attention
+
+#undef BLOCK_SIZE_ALIGNMENT
+#undef HEAD_SIZE_ALIGNMENT
+#undef MAX_Q_HEAD_NUM_PER_ITER
+
+#endif  // CPU_ATTN_VSX_HPP
diff --git a/csrc/cpu/cpu_attn_vxe.hpp b/csrc/cpu/cpu_attn_vxe.hpp
index 45db4ebd7396..cbfda4cf7842 100644
--- a/csrc/cpu/cpu_attn_vxe.hpp
+++ b/csrc/cpu/cpu_attn_vxe.hpp
@@ -244,8 +244,8 @@ class TileGemmS390X {
 
 }  // namespace
 
-template <typename scalar_t, int64_t head_dim>
-class AttentionImpl<ISA::VXE, scalar_t, head_dim> {
+template <typename scalar_t, int64_t head_dim, typename kv_cache_scalar_t>
+class AttentionImpl<ISA::VXE, scalar_t, head_dim, kv_cache_scalar_t> {
  public:
   using query_t = scalar_t;
   using q_buffer_t = float;
@@ -342,7 +342,8 @@ class AttentionImpl<ISA::VXE, scalar_t, head_dim> {
       const int64_t head_num, const int64_t key_head_num_stride,
       const int64_t value_head_num_stride, const int64_t num_blocks,
       const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
-      const int64_t block_size, const int64_t block_size_stride) {
+      const int64_t block_size, const int64_t block_size_stride,
+      const float /*k_inv*/ = 0.0f, const float /*v_inv*/ = 0.0f) {
 #pragma omp parallel for collapse(2)
     for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
       for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp
index f9975b4e29cd..b408731f40d1 100644
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -15,6 +15,9 @@ using namespace at::vec;
 
 namespace vec_op {
 
+struct fp8_e4m3_tag {};
+struct fp8_e5m2_tag {};
+
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)  \
@@ -322,6 +325,9 @@ struct BF16Vec32 : public VectorizedRegWrapper<BF16Vec32, 4, c10::BFloat16> {
     reg.val[2] = vec8_data.reg.val[0];
     reg.val[3] = vec8_data.reg.val[0];
   };
+
+  explicit BF16Vec32(const uint8_t*, fp8_e4m3_tag) : Base() {}
+  explicit BF16Vec32(const uint8_t*, fp8_e5m2_tag) : Base() {}
 };
 
 struct FP32Vec4 : public VectorizedRegWrapper<FP32Vec4, 1, float> {
@@ -480,6 +486,10 @@ struct FP32Vec16 : public VectorizedRegWrapper<FP32Vec16, 4, float> {
 
   explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
 
+  // FP8 stub: dead code on ARM (fp8 KV cache is x86-only), needed for
+  // load_b_pair_vec template to compile on all platforms.
+  explicit FP32Vec16(const BF16Vec32&, int) : Base() {}
+
   explicit FP32Vec16(const FP16Vec16& v) {
     reg.val[0] = Vectorized<float>(vcvt_f32_f16(vget_low_f16(v.reg.val[0])));
     reg.val[1] = Vectorized<float>(vcvt_f32_f16(vget_high_f16(v.reg.val[0])));
diff --git a/csrc/cpu/cpu_types_scalar.hpp b/csrc/cpu/cpu_types_scalar.hpp
index f9da78283da5..d1c2fc85933a 100644
--- a/csrc/cpu/cpu_types_scalar.hpp
+++ b/csrc/cpu/cpu_types_scalar.hpp
@@ -6,6 +6,9 @@
 
 namespace vec_op {
 
+struct fp8_e4m3_tag {};
+struct fp8_e5m2_tag {};
+
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)            \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
@@ -145,6 +148,9 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
   }
 
   void save(void* ptr) const { *reinterpret_cast<f16x32_t*>(ptr) = reg; }
+
+  explicit BF16Vec32(const uint8_t*, fp8_e4m3_tag) : reg{} {}
+  explicit BF16Vec32(const uint8_t*, fp8_e5m2_tag) : reg{} {}
 };
 
 struct FP32Vec4 : public Vec<FP32Vec4> {
@@ -302,6 +308,10 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
 
+  // FP8 stub: dead code on scalar path (fp8 KV cache is x86-only), needed for
+  // load_b_pair_vec template to compile on all platforms.
+  explicit FP32Vec16(const BF16Vec32&, int) : reg{} {}
+
   FP32Vec16 operator*(const FP32Vec16& b) const {
     f32x16_t ret;
     unroll_loop<int, VEC_ELEM_NUM>(
diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp
index 089b9840ea2e..87c7a9dd51f4 100644
--- a/csrc/cpu/cpu_types_vsx.hpp
+++ b/csrc/cpu/cpu_types_vsx.hpp
@@ -9,6 +9,10 @@
 
 namespace vec_op {
 
+// FP8 tag types for tag dispatch (see cpu_attn_vec.hpp)
+struct fp8_e4m3_tag {};
+struct fp8_e5m2_tag {};
+
 // FIXME: FP16 is not fully supported in Torch-CPU
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
@@ -142,6 +146,9 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
       : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}
 
   void save(void* ptr) const { *reinterpret_cast<ss16x8x4_t*>(ptr) = reg; }
+
+  explicit BF16Vec32(const uint8_t*, fp8_e4m3_tag) : reg{} {}
+  explicit BF16Vec32(const uint8_t*, fp8_e5m2_tag) : reg{} {}
 };
 
 struct FP32Vec4 : public Vec<FP32Vec4> {
@@ -404,6 +411,10 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
 
+  // FP8 stub: dead code on PowerPC (fp8 KV cache is x86-only), needed for
+  // load_b_pair_vec template to compile on all platforms.
+  explicit FP32Vec16(const BF16Vec32&, int) : reg{} {}
+
   explicit FP32Vec16(const INT32Vec16& v) {
     reg.val[0] = vec_ctf(v.reg.val[0], 0);
     reg.val[1] = vec_ctf(v.reg.val[1], 0);
diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp
index 700ba0306239..2e0af466b649 100644
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@@ -8,6 +8,9 @@
 #include <torch/all.h>
 namespace vec_op {
 
+struct fp8_e4m3_tag {};
+struct fp8_e5m2_tag {};
+
 #define vec_neg(a) (-(a))
 #define vec_add(a, b) ((a) + (b))
 #define vec_sub(a, b) ((a) - (b))
@@ -241,6 +244,9 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
   explicit BF16Vec32(const BF16Vec8& vec8_data)
       : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}
 
+  explicit BF16Vec32(const uint8_t*, fp8_e4m3_tag) : reg{} {}
+  explicit BF16Vec32(const uint8_t*, fp8_e5m2_tag) : reg{} {}
+
   void save(void* ptr) const { *reinterpret_cast<ss16x8x4_t*>(ptr) = reg; }
 };
 
@@ -682,6 +688,10 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
 
+  // FP8 stub: dead code on s390x (fp8 KV cache is x86-only), needed for
+  // load_b_pair_vec template to compile on all platforms.
+  explicit FP32Vec16(const BF16Vec32&, int) : reg{} {}
+
   FP32Vec16 operator*(const FP32Vec16& b) const {
     return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]),
                                 vec_mul(reg.val[1], b.reg.val[1]),
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index d94af338ac1c..396b9b7e041f 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -11,6 +11,17 @@ static_assert(false, "AVX2 must be supported for the current implementation.");
 
 namespace vec_op {
 
+// Tags for FP8 BF16Vec32 constructors (avoid overload collision with
+// BF16Vec32(void*)).
+// VEC path (FP8 → pseudo-FP16 layout, scale correction applied later):
+struct fp8_e4m3_tag {};  // E4M3 → pseudo-FP16; BF16 value = true_E4M3 * 2^-8
+struct fp8_e5m2_tag {};  // E5M2 → FP16 bits directly (same exponent bias=15)
+// AMX path (FP8 → unscaled BF16, no FP32 round-trip):
+// BF16 value = true_E4M3 * 2^-120 (E4M3) or true_E5M2 * 2^-112 (E5M2).
+// Exponent rebiasing is folded into k/v scales by the caller.
+struct fp8_bf16_e4m3_tag {};
+struct fp8_bf16_e5m2_tag {};
+
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)            \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
@@ -111,9 +122,17 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
   void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
 
   void save(void* ptr, const int elem_num) const {
+#ifdef __AVX512BW__
     constexpr uint32_t M = 0xFFFFFFFF;
     __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
     _mm256_mask_storeu_epi16(ptr, mask, reg);
+#else
+    // Fallback for lack of 16-bit masked store
+    int16_t tmp[VEC_ELEM_NUM];
+    _mm256_storeu_si256((__m256i*)tmp, reg);
+    for (int i = 0; i < elem_num; ++i)
+      reinterpret_cast<int16_t*>(ptr)[i] = tmp[i];
+#endif
   }
 };
 
@@ -150,9 +169,17 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
   void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
 
   void save(void* ptr, const int elem_num) const {
+#ifdef __AVX512BW__
     constexpr uint32_t M = 0xFFFFFFFF;
     __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
     _mm256_mask_storeu_epi16(ptr, mask, reg);
+#else
+    // Fallback for lack of 16-bit masked store
+    int16_t tmp[VEC_ELEM_NUM];
+    _mm256_storeu_si256((__m256i*)tmp, reg);
+    for (int i = 0; i < elem_num; ++i)
+      reinterpret_cast<int16_t*>(ptr)[i] = tmp[i];
+#endif
   }
 };
 
@@ -176,6 +203,50 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
                                (__m128i)vec8_data.reg, 2),
             (__m128i)vec8_data.reg, 3)) {}
 
+  // Decode 32 FP8-E4M3 bytes to pseudo-FP16 layout (stored in the BF16
+  // register).  Result = true_E4M3 * 2^-8; caller applies scale * 2^8.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_e4m3_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m512i b16 = _mm512_cvtepu8_epi16(b8);
+    __m512i sign =
+        _mm512_slli_epi16(_mm512_and_si512(b16, _mm512_set1_epi16(0x80)), 8);
+    __m512i payload =
+        _mm512_slli_epi16(_mm512_and_si512(b16, _mm512_set1_epi16(0x7F)), 7);
+    reg = _mm512_or_si512(sign, payload);
+  }
+
+  // Decode 32 FP8-E5M2 bytes to FP16 layout.
+  // E5M2 and FP16 share the same 5-bit exponent bias (15), so FP8 byte b maps
+  // directly to FP16 bits by shifting left 8 — no sign/payload reconstruction.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_e5m2_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    reg = _mm512_slli_epi16(_mm512_cvtepu8_epi16(b8), 8);
+  }
+
+  // Direct FP8-E4M3 → unscaled BF16 for AMX (no FP32 round-trip).
+  // BF16 value = true_E4M3 * 2^-120; exponent rebiasing folded into k/v scales.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_bf16_e4m3_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m512i b16 = _mm512_cvtepu8_epi16(b8);
+    __m512i sign =
+        _mm512_slli_epi16(_mm512_and_si512(b16, _mm512_set1_epi16(0x80)), 8);
+    __m512i payload =
+        _mm512_slli_epi16(_mm512_and_si512(b16, _mm512_set1_epi16(0x7F)), 4);
+    reg = _mm512_or_si512(sign, payload);
+  }
+
+  // Direct FP8-E5M2 → unscaled BF16 for AMX (no FP32 round-trip).
+  // BF16 value = true_E5M2 * 2^-112; exponent rebiasing folded into k/v scales.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_bf16_e5m2_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m512i b16 = _mm512_cvtepu8_epi16(b8);
+    __m512i sign =
+        _mm512_slli_epi16(_mm512_and_si512(b16, _mm512_set1_epi16(0x80)), 8);
+    __m512i payload =
+        _mm512_slli_epi16(_mm512_and_si512(b16, _mm512_set1_epi16(0x7F)), 5);
+    reg = _mm512_or_si512(sign, payload);
+  }
+
   void save(void* ptr) const { *reinterpret_cast<__m512i*>(ptr) = reg; }
 };
 #else
@@ -192,13 +263,83 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
   explicit BF16Vec32(__m256i low, __m256i high)
       : reg_low(low), reg_high(high) {}
 
+  explicit BF16Vec32()
+      : reg_low(_mm256_setzero_si256()), reg_high(_mm256_setzero_si256()) {}
+
   explicit BF16Vec32(BF16Vec8& vec8_data)
-      : reg_low((__m256i)_mm256_inserti32x4(
-            _mm256_castsi128_si256((__m128i)vec8_data.reg),
-            (__m128i)vec8_data.reg, 1)),
-        reg_high((__m256i)_mm256_inserti32x4(
-            _mm256_castsi128_si256((__m128i)vec8_data.reg),
-            (__m128i)vec8_data.reg, 1)) {}
+      : reg_low(_mm256_broadcastsi128_si256((__m128i)vec8_data.reg)),
+        reg_high(_mm256_broadcastsi128_si256((__m128i)vec8_data.reg)) {}
+
+  // E4M3 decode (AVX2 path) — same bit-layout trick as the AVX512 variant
+  // above.  Result = true_E4M3 * 2^-8; caller applies scale * 2^8.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_e4m3_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m128i b8_low = _mm256_extracti128_si256(b8, 0);
+    __m128i b8_high = _mm256_extracti128_si256(b8, 1);
+    __m256i b16_low = _mm256_cvtepu8_epi16(b8_low);
+    __m256i b16_high = _mm256_cvtepu8_epi16(b8_high);
+
+    __m256i sign_low = _mm256_slli_epi16(
+        _mm256_and_si256(b16_low, _mm256_set1_epi16(0x80)), 8);
+    __m256i payload_low = _mm256_slli_epi16(
+        _mm256_and_si256(b16_low, _mm256_set1_epi16(0x7F)), 7);
+    __m256i sign_high = _mm256_slli_epi16(
+        _mm256_and_si256(b16_high, _mm256_set1_epi16(0x80)), 8);
+    __m256i payload_high = _mm256_slli_epi16(
+        _mm256_and_si256(b16_high, _mm256_set1_epi16(0x7F)), 7);
+    reg_low = _mm256_or_si256(sign_low, payload_low);
+    reg_high = _mm256_or_si256(sign_high, payload_high);
+  }
+
+  // E5M2 decode (AVX2 path) — b << 8 maps to FP16 bits; see AVX512 variant
+  // above.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_e5m2_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m128i b8_low = _mm256_extracti128_si256(b8, 0);
+    __m128i b8_high = _mm256_extracti128_si256(b8, 1);
+    reg_low = _mm256_slli_epi16(_mm256_cvtepu8_epi16(b8_low), 8);
+    reg_high = _mm256_slli_epi16(_mm256_cvtepu8_epi16(b8_high), 8);
+  }
+
+  // Direct FP8-E4M3 → unscaled BF16 for AMX (AVX2 path, no FP32 round-trip).
+  // BF16 value = true_E4M3 * 2^-120; exponent rebiasing folded into k/v scales.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_bf16_e4m3_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m128i b8_low = _mm256_extracti128_si256(b8, 0);
+    __m128i b8_high = _mm256_extracti128_si256(b8, 1);
+    __m256i b16_low = _mm256_cvtepu8_epi16(b8_low);
+    __m256i b16_high = _mm256_cvtepu8_epi16(b8_high);
+    reg_low = _mm256_or_si256(
+        _mm256_slli_epi16(_mm256_and_si256(b16_low, _mm256_set1_epi16(0x80)),
+                          8),
+        _mm256_slli_epi16(_mm256_and_si256(b16_low, _mm256_set1_epi16(0x7F)),
+                          4));
+    reg_high = _mm256_or_si256(
+        _mm256_slli_epi16(_mm256_and_si256(b16_high, _mm256_set1_epi16(0x80)),
+                          8),
+        _mm256_slli_epi16(_mm256_and_si256(b16_high, _mm256_set1_epi16(0x7F)),
+                          4));
+  }
+
+  // Direct FP8-E5M2 → unscaled BF16 for AMX (AVX2 path, no FP32 round-trip).
+  // BF16 value = true_E5M2 * 2^-112; exponent rebiasing folded into k/v scales.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_bf16_e5m2_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m128i b8_low = _mm256_extracti128_si256(b8, 0);
+    __m128i b8_high = _mm256_extracti128_si256(b8, 1);
+    __m256i b16_low = _mm256_cvtepu8_epi16(b8_low);
+    __m256i b16_high = _mm256_cvtepu8_epi16(b8_high);
+    reg_low = _mm256_or_si256(
+        _mm256_slli_epi16(_mm256_and_si256(b16_low, _mm256_set1_epi16(0x80)),
+                          8),
+        _mm256_slli_epi16(_mm256_and_si256(b16_low, _mm256_set1_epi16(0x7F)),
+                          5));
+    reg_high = _mm256_or_si256(
+        _mm256_slli_epi16(_mm256_and_si256(b16_high, _mm256_set1_epi16(0x80)),
+                          8),
+        _mm256_slli_epi16(_mm256_and_si256(b16_high, _mm256_set1_epi16(0x7F)),
+                          5));
+  }
 
   void save(void* ptr) const {
     _mm256_storeu_si256((__m256i*)ptr, reg_low);
@@ -390,6 +531,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
       : reg(_mm512_castsi512_ps(
             _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
 
+  explicit FP32Vec16(const BF16Vec32& v, int upper) {
+    __m256i v_half_i = _mm512_extracti32x8_epi32(v.reg, upper);
+    reg = _mm512_cvtph_ps(v_half_i);
+  }
+
   explicit FP32Vec16(const FP16Vec16& v) : reg(_mm512_cvtph_ps(v.reg)) {}
 
   explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
@@ -494,6 +640,14 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
   explicit FP32Vec16(const FP32Vec8& data)
       : reg_low(data.reg), reg_high(data.reg) {}
 
+  explicit FP32Vec16(const BF16Vec32& v, int upper) {
+    const __m256i& half = upper ? v.reg_high : v.reg_low;
+    __m128i lo = _mm256_extractf128_si256(half, 0);
+    __m128i hi = _mm256_extractf128_si256(half, 1);
+    reg_low = _mm256_cvtph_ps(lo);
+    reg_high = _mm256_cvtph_ps(hi);
+  }
+
   explicit FP32Vec16(const FP16Vec16& v) {
     __m128i low = _mm256_extractf128_si256(v.reg, 0);
     __m128i high = _mm256_extractf128_si256(v.reg, 1);
@@ -535,6 +689,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
                      _mm256_sub_ps(reg_high, b.reg_high));
   }
 
+  FP32Vec16 operator-() const {
+    const __m256 neg = _mm256_set1_ps(-0.0f);
+    return FP32Vec16(_mm256_xor_ps(reg_low, neg), _mm256_xor_ps(reg_high, neg));
+  }
+
   FP32Vec16 operator/(const FP32Vec16& b) const {
     return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low),
                      _mm256_div_ps(reg_high, b.reg_high));
@@ -600,6 +759,85 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     _mm256_storeu_ps(ptr, reg_low);
     _mm256_storeu_ps(ptr + 8, reg_high);
   }
+
+  void save(float* ptr, const int elem_num) const {
+    // Partial store: cmpgt produces a sign-bit mask (0xFFFFFFFF/0 per lane)
+    // for the first elem_num lanes, applied across the two 8-wide halves.
+    if (elem_num <= 8) {
+      __m256i mask =
+          _mm256_cmpgt_epi32(_mm256_set1_epi32(elem_num),
+                             _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7));
+      _mm256_maskstore_ps(ptr, mask, reg_low);
+    } else {
+      _mm256_storeu_ps(ptr, reg_low);
+      __m256i mask =
+          _mm256_cmpgt_epi32(_mm256_set1_epi32(elem_num - 8),
+                             _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7));
+      _mm256_maskstore_ps(ptr + 8, mask, reg_high);
+    }
+  }
+
+  FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const {
+    return FP32Vec16(
+        _mm256_min_ps(max.reg_low, _mm256_max_ps(min.reg_low, reg_low)),
+        _mm256_min_ps(max.reg_high, _mm256_max_ps(min.reg_high, reg_high)));
+  }
+
+  FP32Vec16 abs() const {
+    const __m256 sign_mask = _mm256_set1_ps(-0.0f);
+    return FP32Vec16(_mm256_andnot_ps(sign_mask, reg_low),
+                     _mm256_andnot_ps(sign_mask, reg_high));
+  }
+
+  FP32Vec16 min(const FP32Vec16& b) const {
+    return FP32Vec16(_mm256_min_ps(reg_low, b.reg_low),
+                     _mm256_min_ps(reg_high, b.reg_high));
+  }
+
+  // Partial element-wise min over the first elem_num lanes only (tail path).
+  // Scalar via AliasReg: AVX2 has no masked vminps, so we spill, loop, reload.
+  FP32Vec16 min(const FP32Vec16& b, const int elem_num) const {
+    AliasReg ar_this_low, ar_this_high, ar_b_low, ar_b_high;
+    ar_this_low.reg = reg_low;
+    ar_this_high.reg = reg_high;
+    ar_b_low.reg = b.reg_low;
+    ar_b_high.reg = b.reg_high;
+    for (int i = 0; i < elem_num && i < 8; ++i)
+      ar_this_low.values[i] =
+          std::min(ar_this_low.values[i], ar_b_low.values[i]);
+    for (int i = 0; i < elem_num - 8 && i < 8; ++i)
+      ar_this_high.values[i] =
+          std::min(ar_this_high.values[i], ar_b_high.values[i]);
+    return FP32Vec16(ar_this_low.reg, ar_this_high.reg);
+  }
+
+  // Partial element-wise max over the first elem_num lanes only (tail path).
+  // Scalar via AliasReg: AVX2 has no masked vmaxps, so we spill, loop, reload.
+  FP32Vec16 max(const FP32Vec16& b, const int elem_num) const {
+    AliasReg ar_this_low, ar_this_high, ar_b_low, ar_b_high;
+    ar_this_low.reg = reg_low;
+    ar_this_high.reg = reg_high;
+    ar_b_low.reg = b.reg_low;
+    ar_b_high.reg = b.reg_high;
+    for (int i = 0; i < elem_num && i < 8; ++i)
+      ar_this_low.values[i] =
+          std::max(ar_this_low.values[i], ar_b_low.values[i]);
+    for (int i = 0; i < elem_num - 8 && i < 8; ++i)
+      ar_this_high.values[i] =
+          std::max(ar_this_high.values[i], ar_b_high.values[i]);
+    return FP32Vec16(ar_this_low.reg, ar_this_high.reg);
+  }
+
+  float reduce_min() const {
+    __m256 v = _mm256_min_ps(reg_low, reg_high);
+    __m256 v_shuffled = _mm256_permute_ps(v, 0b00001011);
+    __m256 v_min = _mm256_min_ps(v, v_shuffled);
+    v_shuffled = _mm256_permute_ps(v_min, 0b00000001);
+    v_min = _mm256_min_ps(v_min, v_shuffled);
+    v_shuffled = _mm256_permute2f128_ps(v_min, v_min, 0b00000001);
+    v_min = _mm256_min_ps(v_min, v_shuffled);
+    return _mm256_cvtss_f32(v_min);
+  }
 };
 #endif
 
@@ -652,6 +890,34 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
   // non-temporal save
   void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
 };
+#else
+struct INT8Vec16 : public Vec<INT8Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+  union AliasReg {
+    __m128i reg;
+    int8_t values[VEC_ELEM_NUM];
+  };
+
+  __m128i reg;
+
+  explicit INT8Vec16(const FP32Vec16& vec) {
+    __m256i lo_i32 = _mm256_cvtps_epi32(vec.reg_low);
+    __m256i hi_i32 = _mm256_cvtps_epi32(vec.reg_high);
+    __m256i packed16 = _mm256_packs_epi32(lo_i32, hi_i32);
+    packed16 = _mm256_permute4x64_epi64(packed16, 0xD8);
+    __m256i packed8 = _mm256_packs_epi16(packed16, _mm256_setzero_si256());
+    packed8 = _mm256_permute4x64_epi64(packed8, 0xD8);
+    reg = _mm256_castsi256_si128(packed8);
+  }
+
+  void save(int8_t* ptr) const { _mm_storeu_si128((__m128i*)ptr, reg); }
+
+  void save(int8_t* ptr, const int elem_num) const {
+    AliasReg ar;
+    ar.reg = reg;
+    for (int i = 0; i < elem_num; ++i) ptr[i] = ar.values[i];
+  }
+};
 #endif
 
 template <typename T>
diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp
index 80be42bb7639..058fe25b0e26 100644
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@@ -215,7 +215,7 @@ void dynamic_quant_epilogue(const float* input, scalar_t* output,
         float zp_scale_val = a_scale[i] * static_cast<float>(azp[i]);
         token_zp_scale_vec = cvt_vec_t(zp_scale_val);
       }
-      for (; j < hidden_size - vec_elem_num; ++j) {
+      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
         cvt_vec_t elems_fp32(input_ptr + j);
         elems_fp32 = elems_fp32 * token_scale_vec;
         if constexpr (AZP) {
diff --git a/csrc/cpu/generate_cpu_attn_dispatch.py b/csrc/cpu/generate_cpu_attn_dispatch.py
index bbcd6d85b1d1..2cfd599f09cd 100644
--- a/csrc/cpu/generate_cpu_attn_dispatch.py
+++ b/csrc/cpu/generate_cpu_attn_dispatch.py
@@ -20,73 +20,98 @@
     "VEC16": 2,
     "NEON": 3,
     "VXE": 4,
+    "VSX": 5,
+}
+
+# KV cache index: 0 = auto (same as scalar_t), 1 = fp8_e4m3, 2 = fp8_e5m2
+KV_CACHE_IDX = {
+    "auto": 0,
+    "fp8_e4m3": 1,
+    "fp8_e5m2": 2,
+}
+
+# C++ type for each kv_cache index
+KV_CACHE_CPP_TYPES = {
+    "auto": "scalar_t",
+    "fp8_e4m3": "c10::Float8_e4m3fn",
+    "fp8_e5m2": "c10::Float8_e5m2",
 }
 
 # ISAs supported for head_dims divisible by 32
-ISA_FOR_32 = ["AMX", "NEON", "VEC", "VEC16", "VXE"]
+ISA_FOR_32 = ["AMX", "NEON", "VEC", "VEC16", "VXE", "VSX"]
 
 # ISAs supported for head_dims divisible by 16 only
 ISA_FOR_16 = ["VEC16"]
 
+# ISAs that support FP8 KV cache (x86 AVX2/AVX-512 required)
+ISA_FOR_FP8 = ["AMX", "VEC"]
+
 
-def encode_params(head_dim: int, isa_type: str) -> int:
-    """Encode head_dim and ISA type into a single int64_t."""
+def encode_params(head_dim: int, isa_type: str, kv_cache: str = "auto") -> int:
+    """Encode head_dim, ISA type, and KV cache type into a single int64_t."""
     isa_val = ISA_TYPES[isa_type]
-    # Encoding: (head_dim << 8) | isa_type
-    # This allows head_dim up to 2^56 - 1 and 256 ISA types
-    return (head_dim << 8) | isa_val
+    kv_val = KV_CACHE_IDX[kv_cache]
+    # Encoding: (head_dim << 16) | (kv_cache_idx << 8) | isa_type
+    # This allows head_dim up to 2^48 - 1, 256 KV cache types, and 256 ISA types
+    return (head_dim << 16) | (kv_val << 8) | isa_val
+
+
+def _make_case(
+    head_dim: int, isa: str, kv_cache: str = "auto", isa_override: str | None = None
+) -> str:
+    """Generate a single switch case line."""
+    encoded = encode_params(head_dim, isa, kv_cache)
+    actual_isa = isa_override if isa_override else isa
+    cpp_type = KV_CACHE_CPP_TYPES[kv_cache]
+    attn_impl = (
+        f"cpu_attention::AttentionImpl<"
+        f"cpu_attention::ISA::{actual_isa}, \\\n"
+        f"                                                       "
+        f"scalar_t, head_dim, {cpp_type}>"
+    )
+    comment = (
+        f"head_dim={head_dim}, isa={isa}"
+        if kv_cache == "auto"
+        else f"head_dim={head_dim}, isa={isa}, kv_cache={kv_cache}"
+    )
+    return (
+        f"""      case {encoded}LL: {{ """
+        f"""/* {comment} */ \\"""
+        f"""
+        constexpr size_t head_dim = {head_dim}; \\"""
+        f"""
+        using attn_impl = {attn_impl}; \\"""
+        f"""
+        return __VA_ARGS__(); \\"""
+        f"""
+      }} \\"""
+    )
 
 
-def generate_cases_for_isa_group(isa_list: list[str]) -> str:
+def generate_cases_for_isa_group(isa_list: list[str], include_fp8: bool = False) -> str:
     """Generate switch cases for a specific ISA group."""
     cases = []
 
-    # Generate cases for head_dims divisible by 32
+    # Non-FP8 cases for head_dims divisible by 32
     for head_dim in HEAD_DIMS_32:
         for isa in isa_list:
             if isa not in ISA_FOR_32:
                 continue
-            encoded = encode_params(head_dim, isa)
-            case_str = (
-                f"""      case {encoded}LL: {{ """
-                f"""/* head_dim={head_dim}, isa={isa} */ \\"""
-                f"""
-        constexpr size_t head_dim = {head_dim}; \\"""
-                f"""
-        using attn_impl = cpu_attention::AttentionImpl<"""
-                f"""cpu_attention::ISA::{isa}, \\"""
-                f"""
-                                                       """
-                f"""scalar_t, head_dim>; \\"""
-                f"""
-        return __VA_ARGS__(); \\"""
-                f"""
-      }} \\"""
-            )
-            cases.append(case_str)
+            cases.append(_make_case(head_dim, isa, "auto"))
 
-    # Generate cases for head_dims divisible by 16 only
+    # Non-FP8 cases for head_dims divisible by 16 only
     for head_dim in HEAD_DIMS_16:
         for isa in isa_list:
-            encoded = encode_params(head_dim, isa)
-            case_str = (
-                f"""      case {encoded}LL: {{ """
-                f"""/* head_dim={head_dim}, isa={isa} """
-                f"""(using VEC16) */ \\"""
-                f"""
-        constexpr size_t head_dim = {head_dim}; \\"""
-                f"""
-        using attn_impl = cpu_attention::AttentionImpl<"""
-                f"""cpu_attention::ISA::VEC16, \\"""
-                f"""
-                                                       """
-                f"""scalar_t, head_dim>; \\"""
-                f"""
-        return __VA_ARGS__(); \\"""
-                f"""
-      }} \\"""
-            )
-            cases.append(case_str)
+            cases.append(_make_case(head_dim, isa, "auto", isa_override="VEC16"))
+
+    # FP8 cases: only AMX and VEC, only head_dims divisible by 32
+    if include_fp8:
+        for fp8_type in ("fp8_e4m3", "fp8_e5m2"):
+            for head_dim in HEAD_DIMS_32:
+                for isa in isa_list:
+                    if isa not in ISA_FOR_FP8:
+                        continue
+                    cases.append(_make_case(head_dim, isa, fp8_type))
 
     return "\n".join(cases)
 
@@ -94,8 +119,9 @@ def generate_cases_for_isa_group(isa_list: list[str]) -> str:
 def generate_helper_function() -> str:
     """Generate helper function to encode parameters."""
     return """
-inline int64_t encode_cpu_attn_params(int64_t head_dim, cpu_attention::ISA isa) {
-  return (head_dim << 8) | static_cast<int64_t>(isa);
+inline int64_t encode_cpu_attn_params(int64_t head_dim, cpu_attention::ISA isa,
+                                      int64_t kv_cache_idx = 0) {
+  return (head_dim << 16) | (kv_cache_idx << 8) | static_cast<int64_t>(isa);
 }
 """
 
@@ -123,93 +149,94 @@ def generate_header_file() -> str:
   #include "cpu_attn_vxe.hpp"
 #endif
 
+#ifdef __powerpc__
+  #include "cpu_attn_vsx.hpp"
+#endif
+
 """
 
     header += generate_helper_function()
 
     # Generate dispatch macro with conditional compilation for different ISA sets
     header += """
-// Dispatch macro using encoded parameters
-"""
-
-    # x86_64 with AMX
-    header += """#if defined(CPU_CAPABILITY_AMXBF16)
-#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
-  [&] { \\
-    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
-    switch (encoded_params) { \\
-"""
-    header += generate_cases_for_isa_group(["AMX", "VEC", "VEC16"])
-    header += """
-      default: { \\
-        TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
-                    std::to_string(HEAD_DIM) + " isa=" + \\
-                    std::to_string(static_cast<int>(ISA_TYPE))); \\
-      } \\
-    } \\
-  }()
-
-"""
-
-    # ARM64 with NEON
-    header += """#elif defined(__aarch64__)
-#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
-  [&] { \\
-    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
-    switch (encoded_params) { \\
-"""
-    header += generate_cases_for_isa_group(["NEON", "VEC", "VEC16"])
-    header += """
-      default: { \\
-        TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
-                    std::to_string(HEAD_DIM) + " isa=" + \\
-                    std::to_string(static_cast<int>(ISA_TYPE))); \\
-      } \\
-    } \\
-  }()
-
-"""
-
-    # s390x with VXE
-    header += """#elif defined(__s390x__)
-#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
-  [&] { \\
-    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
-    switch (encoded_params) { \\
-"""
-    header += generate_cases_for_isa_group(["VXE", "VEC", "VEC16"])
-    header += """
-      default: { \\
-        TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
-                    std::to_string(HEAD_DIM) + " isa=" + \\
-                    std::to_string(static_cast<int>(ISA_TYPE))); \\
-      } \\
-    } \\
-  }()
-
-"""
-
-    # Fallback: VEC and VEC16 only
-    header += """#else
-#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
-  [&] { \\
-    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
-    switch (encoded_params) { \\
+// Dispatch macro using encoded parameters.
+// KV_CACHE_IDX: Fp8KVCacheDataType enum value (kAuto=0, kFp8E4M3=1, kFp8E5M2=2).
+// FP8 cases (kv_cache_idx != 0) are generated on x86 platforms with AVX2 or
+// AVX-512: BF16Vec32 FP8 constructors have both AVX-512 and AVX2 implementations
+// in cpu_types_x86.hpp. Non-x86 platforms (#else fallback) have fp8=False.
 """
-    header += generate_cases_for_isa_group(["VEC", "VEC16"])
-    header += """
-      default: { \\
-        TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
-                    std::to_string(HEAD_DIM) + " isa=" + \\
-                    std::to_string(static_cast<int>(ISA_TYPE))); \\
-      } \\
-    } \\
-  }()
-
-#endif  /* CPU_CAPABILITY_AMXBF16 / __aarch64__ / __s390x__ */
 
-#endif  // CPU_ATTN_DISPATCH_GENERATED_H
-"""
+    def _macro_block(guard: str, isa_list: list[str], fp8: bool) -> str:
+        """Return one CPU_ATTN_DISPATCH macro block for a given guard."""
+        enc = (
+            "    int64_t encoded_params = encode_cpu_attn_params("
+            "HEAD_DIM, ISA_TYPE, KV_CACHE_IDX); \\"
+        )
+        cases = generate_cases_for_isa_group(isa_list, include_fp8=fp8)
+        tail = (
+            "\n"
+            "      default: { \\\n"
+            "        TORCH_CHECK(false, "
+            '"Unsupported CPU attention configuration: head_dim=" + \\\n'
+            '                    std::to_string(HEAD_DIM) + " isa=" + \\\n'
+            "                    std::to_string(static_cast<int>(ISA_TYPE))"
+            " + \\\n"
+            '                    " kv_cache_idx=" + '
+            "std::to_string(KV_CACHE_IDX)); \\\n"
+            "      } \\\n"
+            "    } \\\n"
+            "  }()\n\n"
+        )
+        return (
+            f"{guard}\n"
+            "#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, KV_CACHE_IDX, ...) \\\n"
+            "  [&] { \\\n"
+            f"{enc}\n"
+            "    switch (encoded_params) { \\\n"
+            f"{cases}"
+            f"{tail}"
+        )
+
+    header += _macro_block(
+        "#if defined(CPU_CAPABILITY_AMXBF16)",
+        ["AMX", "VEC", "VEC16"],
+        fp8=True,
+    )
+    header += _macro_block(
+        "#elif defined(__aarch64__)",
+        ["NEON", "VEC", "VEC16"],
+        fp8=False,
+    )
+    header += _macro_block(
+        "#elif defined(__s390x__)",
+        ["VXE", "VEC", "VEC16"],
+        fp8=False,
+    )
+    header += _macro_block(
+        "#elif defined(__powerpc__)",
+        ["VSX", "VEC", "VEC16"],
+        fp8=False,
+    )
+    header += _macro_block(
+        "#elif defined(__AVX512F__)",
+        ["VEC", "VEC16"],
+        fp8=True,
+    )
+    header += _macro_block(
+        "#elif defined(__AVX2__)",
+        ["VEC", "VEC16"],
+        fp8=False,
+    )
+    header += _macro_block(
+        "#else",
+        ["VEC", "VEC16"],
+        fp8=False,
+    )
+    header += (
+        "#endif  /* CPU_CAPABILITY_AMXBF16 / __aarch64__ / "
+        "__s390x__ / __powerpc__ */\n\n"
+        "#endif  // CPU_ATTN_DISPATCH_GENERATED_H\n"
+    )
 
     return header
 
diff --git a/csrc/cpu/sgl-kernels/gemm_fp8.cpp b/csrc/cpu/sgl-kernels/gemm_fp8.cpp
index ef29181cee56..487a73d09497 100644
--- a/csrc/cpu/sgl-kernels/gemm_fp8.cpp
+++ b/csrc/cpu/sgl-kernels/gemm_fp8.cpp
@@ -447,7 +447,7 @@ INSTANTIATE_TINYGEMM_TEMPLATE(at::BFloat16);
 INSTANTIATE_TINYGEMM_TEMPLATE(at::Half);
 
 at::Tensor fp8_scaled_mm_cpu(at::Tensor& mat1, at::Tensor& mat2, at::Tensor& scales2,
-    std::vector<int64_t> block_size, std::optional<at::Tensor>& bias,
+    std::vector<int64_t> block_size, const std::optional<at::Tensor>& bias,
     at::ScalarType out_dtype, bool is_vnni) {
   RECORD_FUNCTION("sgl-kernel::fp8_scaled_mm_cpu", std::vector<c10::IValue>({mat1, mat2, scales2, block_size, bias}));
 
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index bd57c9188886..c1f04a0527c1 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -77,6 +77,13 @@ at::Tensor int8_scaled_mm_with_quant(at::Tensor& mat1, at::Tensor& mat2,
                                      const std::optional<at::Tensor>& bias,
                                      at::ScalarType out_dtype, bool is_vnni);
 
+// Adapted from sglang: FP8 W8A16 kernel
+at::Tensor fp8_scaled_mm_cpu(at::Tensor& mat1, at::Tensor& mat2,
+                             at::Tensor& scales2,
+                             std::vector<int64_t> block_size,
+                             const std::optional<at::Tensor>& bias,
+                             at::ScalarType out_dtype, bool is_vnni);
+
 // Adapted from sglang: INT4 W4A8 kernels
 std::tuple<at::Tensor, at::Tensor, at::Tensor> convert_weight_packed_scale_zp(
     at::Tensor qweight, at::Tensor qzeros, at::Tensor scales);
@@ -101,7 +108,9 @@ void cpu_attn_reshape_and_cache(const torch::Tensor& key,
                                 torch::Tensor& key_cache,
                                 torch::Tensor& value_cache,
                                 const torch::Tensor& slot_mapping,
-                                const std::string& isa);
+                                const std::string& isa, const double k_scale,
+                                const double v_scale,
+                                const std::string& kv_cache_dtype);
 
 void cpu_attention_with_kv_cache(
     const torch::Tensor& query, const torch::Tensor& key_cache,
@@ -112,7 +121,8 @@ void cpu_attention_with_kv_cache(
     const int64_t sliding_window_left, const int64_t sliding_window_right,
     const torch::Tensor& block_table, const double softcap,
     const torch::Tensor& scheduler_metadata,
-    const std::optional<torch::Tensor>& s_aux);
+    const std::optional<torch::Tensor>& s_aux, const double k_scale,
+    const double v_scale, const std::string& kv_cache_dtype);
 
 // Note: just for avoiding importing errors
 void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); }
@@ -268,8 +278,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
 
   // Quantization
-#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \
-    defined(__powerpc64__)
+#if defined(__AVX512F__) || defined(__AVX2__) || \
+    (defined(__aarch64__) && !defined(__APPLE__)) || defined(__powerpc64__)
   // Helper function to release oneDNN handlers
   ops.def("release_dnnl_matmul_handler(int handler) -> ()",
           &release_dnnl_matmul_handler);
@@ -373,6 +383,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "int4_scaled_mm_cpu(Tensor(a0!) x, Tensor(a1!) w, Tensor(a2!) w_zeros, "
       "Tensor(a3!) w_scales, Tensor? bias) -> Tensor");
   ops.impl("int4_scaled_mm_cpu", torch::kCPU, &int4_scaled_mm_cpu);
+
+  // Adapted from sglang: FP8 W8A16 kernel
+  ops.def(
+      "fp8_scaled_mm_cpu(Tensor(a0!) mat1, Tensor(a1!) mat2, Tensor(a2!) "
+      "scales2, SymInt[] block_size, Tensor? bias, ScalarType out_dtype, "
+      "bool is_vnni) -> Tensor");
+  ops.impl("fp8_scaled_mm_cpu", torch::kCPU, &fp8_scaled_mm_cpu);
 #endif
 
   // CPU attention kernels
@@ -384,15 +401,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       &get_scheduler_metadata);
   ops.def(
       "cpu_attn_reshape_and_cache(Tensor key, Tensor value, Tensor(a2!) "
-      "key_cache, Tensor(a3!) value_cache, Tensor slot_mapping, str "
-      "isa) -> ()",
+      "key_cache, Tensor(a3!) value_cache, Tensor slot_mapping, str isa, "
+      "float k_scale=1.0, float v_scale=1.0, str kv_cache_dtype=\"auto\") -> "
+      "()",
       &cpu_attn_reshape_and_cache);
   ops.def(
       "cpu_attention_with_kv_cache(Tensor query, Tensor key_cache, Tensor "
       "value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor "
       "seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt "
       "sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
-      "float softcap, Tensor scheduler_metadata, Tensor? s_aux) -> ()",
+      "float softcap, Tensor scheduler_metadata, Tensor? s_aux, "
+      "float k_scale=1.0, float v_scale=1.0, str kv_cache_dtype=\"auto\") -> "
+      "()",
       &cpu_attention_with_kv_cache);
 
   // placeholders
diff --git a/csrc/cpu/utils.hpp b/csrc/cpu/utils.hpp
index 2c9e01c60f93..394e67e3a034 100644
--- a/csrc/cpu/utils.hpp
+++ b/csrc/cpu/utils.hpp
@@ -54,7 +54,7 @@ struct Counter {
 };
 
 inline int64_t get_available_l2_size() {
-#if defined(__s390x__)
+#if defined(__s390x__) || defined(__powerpc__)
   static int64_t size = []() {
     uint32_t l2_cache_size = 0;
     auto caps = at::cpu::get_cpu_capabilities();
diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp
index 9ef623bf7f1f..0b720d356e78 100644
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -232,28 +232,6 @@ void unmap_and_release(unsigned long long device, ssize_t size,
     }
   }
 
-  // ROCm workaround: hipMemRelease does not return physical VRAM to the
-  // free pool while the virtual-address reservation is still held.
-  // Cycling cuMemAddressFree → cuMemAddressReserve (at the same address)
-  // forces the driver to actually release the physical pages while keeping
-  // the same VA available for a later create_and_map.
-  if (first_error == no_error) {
-    first_error = cuMemAddressFree(d_mem, size);
-    if (first_error == no_error) {
-      CUdeviceptr d_mem_new = 0;
-      first_error = cuMemAddressReserve(&d_mem_new, size, 0, d_mem, 0);
-      if (first_error == no_error && d_mem_new != d_mem) {
-        cuMemAddressFree(d_mem_new, size);
-        snprintf(error_msg, sizeof(error_msg),
-                 "ROCm: VA re-reserve got %p instead of %p", (void*)d_mem_new,
-                 (void*)d_mem);
-        error_code = CUresult(1);
-        std::cerr << error_msg << std::endl;
-        return;
-      }
-    }
-  }
-
   if (first_error != no_error) {
     CUDA_CHECK(first_error);
   }
diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp
index c32d3a0606af..ca96b0ef3fef 100644
--- a/csrc/cutlass_extensions/common.hpp
+++ b/csrc/cutlass_extensions/common.hpp
@@ -96,44 +96,14 @@ struct enable_sm90_or_later : Kernel {
 };
 
 template <typename Kernel>
-struct enable_sm90_only : Kernel {
+struct enable_sm100_to_sm120 : Kernel {
   template <typename... Args>
   CUTLASS_DEVICE void operator()(Args&&... args) {
 #if defined __CUDA_ARCH__
-  #if __CUDA_ARCH__ == 900
+  #if (__CUDA_ARCH__ >= 1000 && __CUDA_ARCH__ < 1200)
     Kernel::operator()(std::forward<Args>(args)...);
   #else
-    printf("This kernel only supports sm90.\n");
-    asm("trap;");
-  #endif
-#endif
-  }
-};
-
-template <typename Kernel>
-struct enable_sm100f_only : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__
-  #if __CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030
-    Kernel::operator()(std::forward<Args>(args)...);
-  #else
-    printf("This kernel only supports sm100f.\n");
-    asm("trap;");
-  #endif
-#endif
-  }
-};
-
-template <typename Kernel>
-struct enable_sm100a_only : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__
-  #if __CUDA_ARCH__ == 1000
-    Kernel::operator()(std::forward<Args>(args)...);
-  #else
-    printf("This kernel only supports sm100a.\n");
+    printf("This kernel only supports sm[100, 120).\n");
     asm("trap;");
   #endif
 #endif
@@ -148,7 +118,7 @@ struct enable_sm120_only : Kernel {
   #if __CUDA_ARCH__ == 1200
     Kernel::operator()(std::forward<Args>(args)...);
   #else
-    printf("This kernel only supports sm120.\n");
+    printf("This kernel only supports sm120a.\n");
     asm("trap;");
   #endif
 #endif
@@ -160,8 +130,13 @@ template <typename Kernel>
 struct enable_sm120_family : Kernel {
   template <typename... Args>
   CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__ && (__CUDA_ARCH__ >= 1200 && __CUDA_ARCH__ < 1300)
+#if defined __CUDA_ARCH__
+  #if (__CUDA_ARCH__ >= 1200 && __CUDA_ARCH__ < 1300)
     Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm120f.\n");
+    asm("trap;");
+  #endif
 #endif
   }
 };
diff --git a/csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu b/csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu
index e96017d86dad..2f2e7ecc1829 100644
--- a/csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu
+++ b/csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu
@@ -29,7 +29,11 @@
  */
 
 #include <cmath>
-#include <cuda_fp8.h>
+#ifndef USE_ROCM
+  #include <cuda_fp8.h>
+#else
+  #include <hip/hip_fp8.h>
+#endif
 #include <cuda_runtime.h>
 #include <type_traits>
 
@@ -42,7 +46,23 @@
 #include "type_convert.cuh"
 
 #ifndef FINAL_MASK
-  #define FINAL_MASK 0xffffffffu
+  #ifdef USE_ROCM
+    #define FINAL_MASK 0xffffffffffffffffULL
+  #else
+    #define FINAL_MASK 0xffffffffu
+  #endif
+#endif
+
+#ifdef USE_ROCM
+// ROCm-compatible FP8 conversion helpers
+__device__ __forceinline__ uint8_t rocm_cvt_float_to_fp8_e4m3(float val) {
+  #if defined(HIP_FP8_TYPE_OCP)
+  __hip_fp8_e4m3 fp8_val(val);
+  #else
+  __hip_fp8_e4m3_fnuz fp8_val(val);
+  #endif
+  return reinterpret_cast<uint8_t&>(fp8_val);
+}
 #endif
 
 namespace vllm {
@@ -314,9 +334,13 @@ __global__ void fusedDeepseekV4QNormRopeKVRopeQuantInsertKernel(
       for (int i = 0; i < kElemsPerLane; i++) {
         float scaled = elements[i] * inv_scale;
         scaled = fminf(fmaxf(scaled, -kFp8Max), kFp8Max);
+#ifndef USE_ROCM
         __nv_fp8_storage_t s =
             __nv_cvt_float_to_fp8(scaled, __NV_SATFINITE, __NV_E4M3);
         out_bytes[i] = static_cast<uint8_t>(s);
+#else
+      out_bytes[i] = rocm_cvt_float_to_fp8_e4m3(scaled);
+#endif
       }
       // One 16-byte STG per lane.
       *reinterpret_cast<uint4*>(token_fp8_ptr + dim_base) =
@@ -384,6 +408,7 @@ void launchFusedDeepseekV4QNormRopeKVRopeQuantInsert(
   // PDL: enable programmatic stream serialization whenever the hardware
   // supports it (SM90+).  On pre-Hopper GPUs the attribute is unavailable,
   // so leave numAttrs = 0 and launch as a regular kernel.
+#ifndef USE_ROCM
   static int const sm_version = getSMVersion();
   // Host-side guard: the device kernel body is compiled as a no-op for
   // bf16 on pre-Ampere (sm_70/sm_75) because _typeConvert<BFloat16> is
@@ -410,6 +435,15 @@ void launchFusedDeepseekV4QNormRopeKVRopeQuantInsert(
       q_inout, kv_in, k_cache, slot_mapping, position_ids, cos_sin_cache, eps,
       num_tokens_full, num_tokens_insert, num_heads_q, cache_block_size,
       kv_block_stride);
+#else
+  // ROCm: use standard kernel launch syntax (no PDL/stream serialization)
+  // clang-format off
+  fusedDeepseekV4QNormRopeKVRopeQuantInsertKernel<scalar_t_in>
+      <<<grid, kBlockSize, 0, stream>>>(
+          q_inout, kv_in, k_cache, slot_mapping, position_ids, cos_sin_cache,
+          eps, num_tokens_full, num_tokens_insert, num_heads_q,
+          cache_block_size, kv_block_stride);
+#endif
 }
 
 }  // namespace deepseek_v4_fused_ops
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm.cuh b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
index 546e1eec64bb..e98433bed25e 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
@@ -141,7 +141,7 @@ struct cutlass_3x_gemm_sm100 {
               sizeof(typename CollectiveEpilogue::SharedStorage))>,
           KernelSchedule>::CollectiveOp;
 
-  using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
+  using GemmKernel = enable_sm100_to_sm120<cutlass::gemm::kernel::GemmUniversal<
       Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
 };
 
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
index cf84fc3a6de1..8f74f2991469 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@@ -125,7 +125,7 @@ struct cutlass_3x_gemm_fp8_blockwise {
           MainloopScheduler
       >::CollectiveOp>;
 
-  using KernelType = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
+  using KernelType = enable_sm100_to_sm120<cutlass::gemm::kernel::GemmUniversal<
       Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
 
   struct GemmKernel : public KernelType {};
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
index 46c36d13ece4..5cd55f0198c2 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -92,7 +92,7 @@ struct cutlass_3x_gemm_sm100_fp8 {
   // -----------------------------------------------------------
   // Kernel definition
   // -----------------------------------------------------------
-  using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
+  using GemmKernel = enable_sm100_to_sm120<cutlass::gemm::kernel::GemmUniversal<
       Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
 };
 
diff --git a/csrc/libtorch_stable/quantization/w8a8/fp8/per_token_group_quant.cu b/csrc/libtorch_stable/quantization/w8a8/fp8/per_token_group_quant.cu
index 8b0356815bb7..5029e3903752 100644
--- a/csrc/libtorch_stable/quantization/w8a8/fp8/per_token_group_quant.cu
+++ b/csrc/libtorch_stable/quantization/w8a8/fp8/per_token_group_quant.cu
@@ -236,17 +236,41 @@ void per_token_group_quant_8bit(const torch::stable::Tensor& input,
 #undef LAUNCH_KERNEL
 }
 
-template <typename T, typename DST_DTYPE>
-__global__ void per_token_group_quant_8bit_packed_kernel(
+// Register-resident fast path for group_size==128.
+//
+// Each thread holds 16 source elements (32 B = uint4 x 2) in registers across
+// the absmax reduce -> scale compute -> quantize pipeline. No shared memory.
+// UE8M0 scale extracted via bit math (bit-exact with exp2f(ceilf(log2f))).
+//
+// Loads two contiguous uint4s (16 B + 16 B = 32 B) per thread; on Blackwell
+// nvcc fuses these into a single 256-bit LDG.E.256.
+//
+// Constraints: GROUP_SIZE % (THREADS_PER_GROUP * VEC_SIZE) == 0; for
+// THREADS_PER_GROUP=8 and bf16/fp16 (VEC_SIZE=16), this means GROUP_SIZE=128.
+template <typename T, typename DST_DTYPE, int GROUP_SIZE>
+__global__ void per_token_group_quant_8bit_packed_register_kernel(
     const T* __restrict__ input, void* __restrict__ output_q,
-    unsigned int* __restrict__ output_s_packed, const int group_size,
-    const int num_groups_padded, const int groups_per_block,
-    const int padded_groups_per_row, const int groups_per_row, const int mn,
-    const int tma_aligned_mn, const int num_scale_elems, const float eps,
+    unsigned int* __restrict__ output_s_packed, const int64_t num_groups_padded,
+    const int groups_per_block, const int padded_groups_per_row,
+    const int groups_per_row, const int mn, const int output_q_mn_extent,
+    const int tma_aligned_mn, const int64_t num_scale_elems, const float eps,
     const float min_8bit, const float max_8bit) {
-  const int threads_per_group = 16;
-  const int64_t local_group_id = threadIdx.x / threads_per_group;
-  const int lane_id = threadIdx.x % threads_per_group;
+  static_assert(GROUP_SIZE == 128, "fast path supports GROUP_SIZE==128");
+  constexpr int THREADS_PER_GROUP = 8;
+  constexpr int VEC_SIZE = 32 / sizeof(T);  // 16 for bf16/fp16
+  static_assert(GROUP_SIZE == THREADS_PER_GROUP * VEC_SIZE,
+                "GROUP_SIZE must equal THREADS_PER_GROUP * VEC_SIZE");
+  // Each group's 8 threads must live in a single warp octet so the
+  // 0xffu << (threadIdx.x & 24u) shuffle mask selects exactly the lanes
+  // that share a group. Requires 32 % THREADS_PER_GROUP == 0 and the host
+  // to launch num_threads as a multiple of THREADS_PER_GROUP (which it does
+  // via num_threads = groups_per_block * THREADS_PER_GROUP).
+  static_assert(32 % THREADS_PER_GROUP == 0,
+                "THREADS_PER_GROUP must divide warp size for the shuffle "
+                "mask to be valid");
+
+  const int local_group_id = threadIdx.x / THREADS_PER_GROUP;
+  const int lane_id = threadIdx.x % THREADS_PER_GROUP;
 
   const int64_t block_group_id = blockIdx.x * groups_per_block;
   const int64_t global_group_id = block_group_id + local_group_id;
@@ -254,141 +278,207 @@ __global__ void per_token_group_quant_8bit_packed_kernel(
     return;
   }
 
-  // map flat group id to 2D indices (mn_idx, sf_k_idx)
   const int sf_k_idx =
       static_cast<int>(global_group_id % padded_groups_per_row);
   const int mn_idx = static_cast<int>(global_group_id / padded_groups_per_row);
-
-  // whether it is a valid group (not padding)
   const bool is_valid_group = (mn_idx < mn) && (sf_k_idx < groups_per_row);
 
-  // shared memory to cache each group's data to avoid double DRAM reads.
-  extern __shared__ __align__(16) char smem_raw[];
-  T* smem = reinterpret_cast<T*>(smem_raw);
-  T* smem_group = smem + local_group_id * group_size;
-
-  // compute scale for valid groups
-  float y_s = 0.f;
+  // Load 16 input elements (32 B) into registers as two adjacent uint4
+  // loads. nvcc keeps these as 2x LDG.E.128 on sm_100; the per-thread cost
+  // is dominated by HBM bandwidth at large MN, so a fused 256-bit load via
+  // inline PTX gave no measurable speedup.
+  // alignas(16) is required so the uint4* reinterpret_cast below is
+  // well-defined for T == bf16/fp16 (default alignof is 2).
+  alignas(16) T regs[VEC_SIZE];
+  float local_absmax = eps;
   if (is_valid_group) {
     const T* group_input =
-        input + static_cast<int64_t>(mn_idx) * groups_per_row * group_size +
-        sf_k_idx * group_size;
-    y_s = ComputeGroupScale<T, true>(group_input, smem_group, group_size,
-                                     lane_id, threads_per_group, eps, max_8bit);
+        input + static_cast<int64_t>(mn_idx) * groups_per_row * GROUP_SIZE +
+        sf_k_idx * GROUP_SIZE + lane_id * VEC_SIZE;
+    uint4* dst = reinterpret_cast<uint4*>(&regs[0]);
+    const uint4* src = reinterpret_cast<const uint4*>(group_input);
+    dst[0] = src[0];
+    dst[1] = src[1];
+#pragma unroll
+    for (int i = 0; i < VEC_SIZE; ++i) {
+      float v = fabsf(static_cast<float>(regs[i]));
+      local_absmax = fmaxf(local_absmax, v);
+    }
   }
 
-  // pack 4 scales into a uint32 exponent
+  // 8-lane subgroup shuffle reduce (octet of the warp). The mask selects the
+  // 8 lanes within the warp that share a group.
+  unsigned mask = 0xffu << (threadIdx.x & 24u);
+  local_absmax = fmaxf(local_absmax, __shfl_xor_sync(mask, local_absmax, 4));
+  local_absmax = fmaxf(local_absmax, __shfl_xor_sync(mask, local_absmax, 2));
+  local_absmax = fmaxf(local_absmax, __shfl_xor_sync(mask, local_absmax, 1));
+
+  float y_s = local_absmax / max_8bit;
+  y_s = fmaxf(y_s, 1e-10f);
+  uint32_t bits = __float_as_uint(y_s);
+  uint32_t exp_bits = (bits >> 23) & 0xffu;
+  uint32_t mant_bits = bits & 0x7fffffu;
+  uint8_t exp_byte =
+      static_cast<uint8_t>(exp_bits + (mant_bits != 0u ? 1u : 0u));
+
+  // Lane 0 writes the packed scale byte.
   if (lane_id == 0) {
-    // each uint32 in output_s_packed stores 4 packed scales
     const int sf_k_pack_idx = sf_k_idx / 4;
     const int pos = sf_k_idx % 4;
     const int out_idx = sf_k_pack_idx * tma_aligned_mn + mn_idx;
-
     if (is_valid_group) {
-      // reinterpret the UE8M0 scale y_s as IEEE bits, extract the 8-bit
-      // exponent, and place it into the correct byte of the 32-bit word.
-      const unsigned int bits = __float_as_uint(y_s);
-      const uint8_t exponent = static_cast<uint8_t>((bits >> 23u) & 0xffu);
-      reinterpret_cast<uint8_t*>(output_s_packed)[out_idx * 4 + pos] = exponent;
+      reinterpret_cast<uint8_t*>(output_s_packed)[out_idx * 4 + pos] = exp_byte;
     } else if (out_idx < num_scale_elems) {
-      // write zero for padding groups if within bounds of output_s_packed
       reinterpret_cast<uint8_t*>(output_s_packed)[out_idx * 4 + pos] = 0;
     }
   }
 
-  __syncthreads();
+  // For padded mn rows that fall within output_q's allocated extent, write
+  // a uint4 of zeros to keep the buffer clean for downstream TMA loads.
+  // Skip writes for sf_k padding (those positions don't exist in output_q).
+  if (!is_valid_group) {
+    if (sf_k_idx < groups_per_row && mn_idx >= mn &&
+        mn_idx < output_q_mn_extent) {
+      DST_DTYPE* group_output =
+          static_cast<DST_DTYPE*>(output_q) +
+          static_cast<int64_t>(mn_idx) * groups_per_row * GROUP_SIZE +
+          sf_k_idx * GROUP_SIZE + lane_id * VEC_SIZE;
+      *reinterpret_cast<uint4*>(group_output) = make_uint4(0, 0, 0, 0);
+    }
+    return;
+  }
 
-  if (is_valid_group) {
-    DST_DTYPE* group_output =
-        static_cast<DST_DTYPE*>(output_q) +
-        static_cast<int64_t>(mn_idx) * groups_per_row * group_size +
-        sf_k_idx * group_size;
-    QuantizeGroup<T, DST_DTYPE>(smem_group, group_output, group_size, lane_id,
-                                threads_per_group, y_s, min_8bit, max_8bit);
+  // Reconstruct y_s as a power-of-2 float and use its reciprocal.
+  float y_s_q = __uint_as_float(static_cast<uint32_t>(exp_byte) << 23);
+  float inv_y = 1.0f / y_s_q;
+
+  // Quantize and pack into 16 fp8/int8 bytes (= uint4). VEC_SIZE==16 so we
+  // fill four 32-bit words, four bytes each.
+  uint32_t packed_lo = 0;
+  uint32_t packed_lo_hi = 0;
+  uint32_t packed_hi_lo = 0;
+  uint32_t packed_hi = 0;
+#pragma unroll
+  for (int i = 0; i < VEC_SIZE; ++i) {
+    float q =
+        fminf(fmaxf(static_cast<float>(regs[i]) * inv_y, min_8bit), max_8bit);
+    DST_DTYPE qb = DST_DTYPE(q);
+    uint8_t byte = *reinterpret_cast<uint8_t*>(&qb);
+    const int shift = (i & 3) * 8;
+    if (i < 4) {
+      packed_lo |= static_cast<uint32_t>(byte) << shift;
+    } else if (i < 8) {
+      packed_lo_hi |= static_cast<uint32_t>(byte) << shift;
+    } else if (i < 12) {
+      packed_hi_lo |= static_cast<uint32_t>(byte) << shift;
+    } else {
+      packed_hi |= static_cast<uint32_t>(byte) << shift;
+    }
   }
+
+  uint4 packed_out =
+      make_uint4(packed_lo, packed_lo_hi, packed_hi_lo, packed_hi);
+  DST_DTYPE* group_output =
+      static_cast<DST_DTYPE*>(output_q) +
+      static_cast<int64_t>(mn_idx) * groups_per_row * GROUP_SIZE +
+      sf_k_idx * GROUP_SIZE + lane_id * VEC_SIZE;
+  *reinterpret_cast<uint4*>(group_output) = packed_out;
 }
 
+// Public entry point: register-resident packed quant kernel.
+// Constraints: group_size == 128 and bf16/fp16 input.
 void per_token_group_quant_8bit_packed(const torch::stable::Tensor& input,
                                        torch::stable::Tensor& output_q,
                                        torch::stable::Tensor& output_s_packed,
                                        int64_t group_size, double eps,
                                        double min_8bit, double max_8bit) {
+  STD_TORCH_CHECK(group_size == 128,
+                  "per_token_group_quant_8bit_packed only supports "
+                  "group_size==128, got ",
+                  group_size, ".");
+  const auto in_dtype = input.scalar_type();
+  STD_TORCH_CHECK(
+      in_dtype == torch::headeronly::ScalarType::Half ||
+          in_dtype == torch::headeronly::ScalarType::BFloat16,
+      "per_token_group_quant_8bit_packed only supports bf16/fp16 input.");
+
   STD_TORCH_CHECK(input.is_contiguous());
   STD_TORCH_CHECK(output_q.is_contiguous());
 
   const int64_t k = input.size(-1);
-  STD_TORCH_CHECK(k % group_size == 0, "Last dimension (", k,
-                  ") must be divisible by group_size (", group_size, ").");
+  STD_TORCH_CHECK(k % group_size == 0, "input last dim k=", k,
+                  " is not divisible by group_size=", group_size, ".");
 
   const int64_t mn = input.numel() / k;
   const int64_t groups_per_row = k / group_size;
-
-  STD_TORCH_CHECK(output_s_packed.dim() == 2,
-                  "output_s_packed must be 2D, got dim=", output_s_packed.dim(),
-                  ".");
-
   const int64_t k_num_packed_sfk = (groups_per_row + 3) / 4;
   const int64_t tma_aligned_mn = ((mn + 3) / 4) * 4;
 
+  // output_q may be allocated with extra padded mn rows (e.g.,
+  // (tma_aligned_mn, k)) so the kernel can zero-fill them in-line and the
+  // caller can use torch.empty instead of torch.zeros. The grid only covers
+  // up to tma_aligned_mn, so we cap the extent there.
+  const int64_t output_q_mn_actual = output_q.numel() / k;
+  STD_TORCH_CHECK(output_q_mn_actual >= mn,
+                  "output_q must have at least mn rows; got ",
+                  output_q_mn_actual, " rows for mn=", mn, ".");
+  const int64_t output_q_mn_extent =
+      output_q_mn_actual < tma_aligned_mn ? output_q_mn_actual : tma_aligned_mn;
+
   STD_TORCH_CHECK(
       output_s_packed.scalar_type() == torch::headeronly::ScalarType::Int,
-      "output_s_packed must have dtype int32 for UE8M0-packed scales.");
-  // DeepGEMM expects SFA scales in MN-major form with shape
-  // [mn, ceil_div(K, 128 * 4)] and TMA-aligned stride on the last
-  // dimension.
+      "output_s_packed must be int32 for UE8M0-packed scales.");
   STD_TORCH_CHECK(output_s_packed.size(0) == mn &&
                       output_s_packed.size(1) == k_num_packed_sfk,
                   "output_s_packed shape must be [", mn, ", ", k_num_packed_sfk,
-                  "], but got [", output_s_packed.size(0), ", ",
+                  "]; got [", output_s_packed.size(0), ", ",
                   output_s_packed.size(1), "].");
-  // Verify column-major TMA-aligned layout
   STD_TORCH_CHECK(output_s_packed.stride(0) == 1 &&
                       output_s_packed.stride(1) == tma_aligned_mn,
-                  "output_s_packed must have strides [1, ", tma_aligned_mn,
-                  "], but got [", output_s_packed.stride(0), ", ",
+                  "output_s_packed strides must be [1, ", tma_aligned_mn,
+                  "]; got [", output_s_packed.stride(0), ", ",
                   output_s_packed.stride(1), "].");
 
   cudaStream_t stream = get_current_cuda_stream();
 
-  constexpr int THREADS_PER_GROUP = 16;
-
-  // Expand the grid to cover MN and K padding so every byte in
-  // output_s_packed is written (padding bytes get zeroed by the kernel).
+  constexpr int THREADS_PER_GROUP = 8;
   const int64_t padded_groups_per_row = k_num_packed_sfk * 4;
   const int64_t num_groups_padded = tma_aligned_mn * padded_groups_per_row;
-  // Number of elements in output_s_packed.
   const int64_t num_scale_elems = mn + (k_num_packed_sfk - 1) * tma_aligned_mn;
-
   const int groups_per_block = GetGroupsPerBlock(num_groups_padded);
 
   auto dst_type = output_q.scalar_type();
-  const int num_blocks = num_groups_padded / groups_per_block;
+  const int64_t num_blocks = num_groups_padded / groups_per_block;
   const int num_threads = groups_per_block * THREADS_PER_GROUP;
-
-#define LAUNCH_PACKED_KERNEL(T, DST_DTYPE)                                     \
-  do {                                                                         \
-    dim3 grid(num_blocks);                                                     \
-    dim3 block(num_threads);                                                   \
-    size_t smem_bytes =                                                        \
-        static_cast<size_t>(groups_per_block) * group_size * sizeof(T);        \
-    per_token_group_quant_8bit_packed_kernel<T, DST_DTYPE>                     \
-        <<<grid, block, smem_bytes, stream>>>(                                 \
-            static_cast<const T*>(input.data_ptr()), output_q.data_ptr(),      \
-            reinterpret_cast<unsigned int*>(output_s_packed.data_ptr()),       \
-            static_cast<int>(group_size), static_cast<int>(num_groups_padded), \
-            groups_per_block, static_cast<int>(padded_groups_per_row),         \
-            static_cast<int>(groups_per_row), static_cast<int>(mn),            \
-            static_cast<int>(tma_aligned_mn),                                  \
-            static_cast<int>(num_scale_elems), static_cast<float>(eps),        \
-            static_cast<float>(min_8bit), static_cast<float>(max_8bit));       \
+  // CUDA caps grid.x at 2^31 - 1; this fits any realistic shape but guard
+  // against pathological inputs.
+  STD_TORCH_CHECK(num_blocks <= static_cast<int64_t>(INT32_MAX),
+                  "per_token_group_quant_8bit_packed grid too large: ",
+                  num_blocks, " blocks (max ", INT32_MAX, ").");
+
+#define LAUNCH_REG_KERNEL(T, DST_DTYPE)                                   \
+  do {                                                                    \
+    dim3 grid(static_cast<unsigned int>(num_blocks));                     \
+    dim3 block(num_threads);                                              \
+    per_token_group_quant_8bit_packed_register_kernel<T, DST_DTYPE, 128>  \
+        <<<grid, block, 0, stream>>>(                                     \
+            static_cast<const T*>(input.data_ptr()), output_q.data_ptr(), \
+            reinterpret_cast<unsigned int*>(output_s_packed.data_ptr()),  \
+            num_groups_padded, groups_per_block,                          \
+            static_cast<int>(padded_groups_per_row),                      \
+            static_cast<int>(groups_per_row), static_cast<int>(mn),       \
+            static_cast<int>(output_q_mn_extent),                         \
+            static_cast<int>(tma_aligned_mn), num_scale_elems,            \
+            static_cast<float>(eps), static_cast<float>(min_8bit),        \
+            static_cast<float>(max_8bit));                                \
   } while (0)
 
-  VLLM_STABLE_DISPATCH_FLOATING_TYPES(
-      input.scalar_type(), "per_token_group_quant_8bit_packed", ([&] {
+  VLLM_STABLE_DISPATCH_HALF_TYPES(
+      input.scalar_type(), "per_token_group_quant_8bit_packed_register", ([&] {
         if (dst_type == torch::headeronly::ScalarType::Float8_e4m3fn) {
-          LAUNCH_PACKED_KERNEL(scalar_t, __nv_fp8_e4m3);
+          LAUNCH_REG_KERNEL(scalar_t, __nv_fp8_e4m3);
         } else if (dst_type == torch::headeronly::ScalarType::Char) {
-          LAUNCH_PACKED_KERNEL(scalar_t, int8_t);
+          LAUNCH_REG_KERNEL(scalar_t, int8_t);
         } else {
           STD_TORCH_CHECK(
               false,
@@ -397,7 +487,7 @@ void per_token_group_quant_8bit_packed(const torch::stable::Tensor& input,
         }
       }));
 
-#undef LAUNCH_PACKED_KERNEL
+#undef LAUNCH_REG_KERNEL
 }
 
 void per_token_group_quant_fp8(const torch::stable::Tensor& input,
diff --git a/csrc/libtorch_stable/quantization/w8a8/per_token_group_quant_8bit.h b/csrc/libtorch_stable/quantization/w8a8/per_token_group_quant_8bit.h
index d67fd2b336ec..6630c0decee6 100644
--- a/csrc/libtorch_stable/quantization/w8a8/per_token_group_quant_8bit.h
+++ b/csrc/libtorch_stable/quantization/w8a8/per_token_group_quant_8bit.h
@@ -8,3 +8,13 @@ void per_token_group_quant_8bit(const torch::stable::Tensor& input,
                                 torch::stable::Tensor& output_s,
                                 int64_t group_size, double eps, double min_8bit,
                                 double max_8bit, bool scale_ue8m0 = false);
+
+// Public op: register-resident packed quant for the DeepGEMM Blackwell path.
+// Restricted to group_size == 128 and bf16/fp16 input; other configurations
+// raise STD_TORCH_CHECK. The legacy shared-memory fallback was removed because
+// no production caller (deep_gemm_moe / input_quant_fp8) uses other shapes.
+void per_token_group_quant_8bit_packed(const torch::stable::Tensor& input,
+                                       torch::stable::Tensor& output_q,
+                                       torch::stable::Tensor& output_s_packed,
+                                       int64_t group_size, double eps,
+                                       double min_8bit, double max_8bit);
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 973190935dfb..ac0e8d59f604 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -67,10 +67,6 @@ void shuffle_rows(const torch::Tensor& input_tensor,
                   torch::Tensor& output_tensor);
 
 #ifndef USE_ROCM
-// cuBLAS bf16 x bf16 -> fp32 router GEMM (fallback for non-SM90 / batch > 16)
-torch::Tensor router_gemm_bf16_fp32(torch::Tensor const& input,
-                                    torch::Tensor const& weight);
-
 // DeepSeek V3 optimized router GEMM kernel for SM90+
 // Computes output = mat_a @ mat_b.T where:
 //   mat_a: [num_tokens, hidden_dim] in bf16
diff --git a/csrc/moe/router_gemm.cu b/csrc/moe/router_gemm.cu
deleted file mode 100644
index a939f8846ff1..000000000000
--- a/csrc/moe/router_gemm.cu
+++ /dev/null
@@ -1,52 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-// bf16 x bf16 -> fp32 router GEMM via cuBLAS.
-// Uses CUBLAS_COMPUTE_32F so bf16 operands accumulate into fp32,
-// matching TRT-LLM's cuBLAS fallback behaviour in dsv3RouterGemmOp.
-
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <cublas_v2.h>
-
-// cuBLAS column-major math for row-major PyTorch tensors:
-//   weight[N,K]_row  lda=K  -> cuBLAS sees (K,N) col-major; CUBLAS_OP_T ->
-//   (N,K) input[M,K]_row   ldb=K  -> cuBLAS sees (K,M) col-major; CUBLAS_OP_N
-//   -> (K,M) out[M,N]_row     ldc=N  -> cuBLAS sees (N,M) col-major (written as
-//   output^T)
-// cuBLAS: C(N,M) = weight(N,K) @ input(K,M)  =>  C^T = output[M,N]
-// params: m=N, n=M, k=K, lda=K (weight), ldb=K (input), ldc=N (output)
-
-torch::Tensor router_gemm_bf16_fp32(torch::Tensor const& input,
-                                    torch::Tensor const& weight) {
-  TORCH_CHECK(input.dtype() == torch::kBFloat16,
-              "router_gemm_bf16_fp32: input must be bfloat16");
-  TORCH_CHECK(weight.dtype() == torch::kBFloat16,
-              "router_gemm_bf16_fp32: weight must be bfloat16");
-  TORCH_CHECK(input.dim() == 2 && weight.dim() == 2,
-              "router_gemm_bf16_fp32: input and weight must be 2-D");
-  TORCH_CHECK(input.size(1) == weight.size(1),
-              "router_gemm_bf16_fp32: inner dimensions must match");
-
-  int64_t const M = input.size(0);
-  int64_t const N = weight.size(0);
-  int64_t const K = input.size(1);
-
-  auto out = torch::empty({M, N}, input.options().dtype(torch::kFloat32));
-
-  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
-  TORCH_CUDABLAS_CHECK(
-      cublasSetStream(handle, at::cuda::getCurrentCUDAStream()));
-
-  float const alpha = 1.0f;
-  float const beta = 0.0f;
-
-  TORCH_CUDABLAS_CHECK(cublasGemmEx(
-      handle, CUBLAS_OP_T, CUBLAS_OP_N, static_cast<int>(N),
-      static_cast<int>(M), static_cast<int>(K), &alpha, weight.data_ptr(),
-      CUDA_R_16BF, static_cast<int>(K), input.data_ptr(), CUDA_R_16BF,
-      static_cast<int>(K), &beta, out.data_ptr(), CUDA_R_32F,
-      static_cast<int>(N), CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT));
-
-  return out;
-}
diff --git a/csrc/moe/topk_softplus_sqrt_kernels.cu b/csrc/moe/topk_softplus_sqrt_kernels.cu
index 50a8540a7374..43d461a0179a 100644
--- a/csrc/moe/topk_softplus_sqrt_kernels.cu
+++ b/csrc/moe/topk_softplus_sqrt_kernels.cu
@@ -60,15 +60,6 @@ __device__ __forceinline__ float toFloat(T value) {
   }
 }
 
-#define FINAL_MASK 0xffffffff
-template <typename T>
-__inline__ __device__ T warpReduceSum(T val) {
-#pragma unroll
-  for (int mask = 16; mask > 0; mask >>= 1)
-    val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
-  return val;
-}
-
 // ====================== TopK softplus_sqrt things
 // ===============================
 
@@ -272,8 +263,14 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
       }
     }
     // Compute per-thread scale (using warp reduction when renormalizing).
+    // THREADS_PER_ROW-parameterized butterfly works for both warp sizes (32
+    // on CUDA, 64 on ROCm CDNA) and any THREADS_PER_ROW the dispatch picks.
     if (renormalize) {
-      selected_sum = warpReduceSum(selected_sum);
+#pragma unroll
+      for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
+        selected_sum +=
+            VLLM_SHFL_XOR_SYNC_WIDTH(selected_sum, mask, THREADS_PER_ROW);
+      }
     }
     float scale = static_cast<float>(routed_scaling_factor);
     if (renormalize) {
@@ -544,7 +541,6 @@ void topkGatingSoftplusSqrtKernelLauncher(
     const IndType* tid2eid, cudaStream_t stream) {
   static constexpr int WARPS_PER_TB = 4;
   static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
-#ifndef USE_ROCM
   // for bfloat16 dtype, we need 4 bytes loading to make sure num_experts
   // elements can be loaded by a warp
   static constexpr int BYTES_PER_LDG_MULTIPLE_64 =
@@ -552,6 +548,19 @@ void topkGatingSoftplusSqrtKernelLauncher(
        std::is_same_v<InputType, __half>)
           ? 4
           : 8;
+  // Narrower LDG (ELTS_PER_LDG=1) used by 192/320/448/576 on ROCm WARP_SIZE=64
+  // where ELTS_PER_LDG=2 fails the EXPERTS%(ELTS_PER_LDG*WARP_SIZE)==0 check.
+  // On CUDA WARP_SIZE=32 the wider LDG already aligns, so the alias collapses
+  // back to BYTES_PER_LDG_MULTIPLE_64 — no behavioral change for CUDA.
+#ifdef USE_ROCM
+  static constexpr int BYTES_PER_LDG_MULTIPLE_64_NARROW =
+      (std::is_same_v<InputType, __nv_bfloat16> ||
+       std::is_same_v<InputType, __half>)
+          ? 2
+          : 4;
+#else
+  static constexpr int BYTES_PER_LDG_MULTIPLE_64_NARROW =
+      BYTES_PER_LDG_MULTIPLE_64;
 #endif
   switch (num_experts) {
     case 1:
@@ -584,27 +593,29 @@ void topkGatingSoftplusSqrtKernelLauncher(
     case 512:
       LAUNCH_SOFTPLUS_SQRT(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
       break;
-      // (CUDA only) support multiples of 64 when num_experts is not power of 2.
-      // ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of
-      // num_experts, alternatively we can test 4 bytes loading and enable it in
-      // future.
-#ifndef USE_ROCM
+      // Multiples of 64 that are not powers of 2. The kernel requires
+      // EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0. With ELTS_PER_LDG=2
+      // (BYTES_PER_LDG_MULTIPLE_64), this holds for all five values on CUDA
+      // WARP_SIZE=32 but only for 384 on ROCm WARP_SIZE=64. The other four
+      // use BYTES_PER_LDG_MULTIPLE_64_NARROW (ELTS_PER_LDG=1), which
+      // satisfies the assertion for any multiple of 64 on either backend;
+      // on CUDA the narrow alias collapses back to the wider load, so CUDA
+      // behavior is unchanged.
     case 192:
-      LAUNCH_SOFTPLUS_SQRT(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+      LAUNCH_SOFTPLUS_SQRT(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64_NARROW);
       break;
     case 320:
-      LAUNCH_SOFTPLUS_SQRT(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+      LAUNCH_SOFTPLUS_SQRT(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64_NARROW);
       break;
     case 384:
       LAUNCH_SOFTPLUS_SQRT(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
       break;
     case 448:
-      LAUNCH_SOFTPLUS_SQRT(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+      LAUNCH_SOFTPLUS_SQRT(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64_NARROW);
       break;
     case 576:
-      LAUNCH_SOFTPLUS_SQRT(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+      LAUNCH_SOFTPLUS_SQRT(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64_NARROW);
       break;
-#endif
     default: {
       TORCH_CHECK(false, "Unsupported expert number: ", num_experts);
     }
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 7bf56ba7a2cd..8940e341cd01 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -16,14 +16,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "bias) -> ()");
   m.impl("topk_sigmoid", torch::kCUDA, &topk_sigmoid);
 
-#ifndef USE_ROCM
   m.def(
       "topk_softplus_sqrt(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
       "token_expert_indices, Tensor gating_output, bool renormalize, float "
       "routed_scaling_factor, Tensor? "
       "bias, Tensor? input_ids, Tensor? tid2eid) -> ()");
   m.impl("topk_softplus_sqrt", torch::kCUDA, &topk_softplus_sqrt);
-#endif
+
   // Calculate the result of moe by summing up the partial results
   // from all selected experts.
   m.def("moe_sum(Tensor input, Tensor! output) -> ()");
@@ -133,10 +132,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "Tensor)");
   m.impl("grouped_topk", torch::kCUDA, &grouped_topk);
 
-  // cuBLAS bf16 x bf16 -> fp32 router GEMM (fallback for non-SM90 / batch > 16)
-  m.def("router_gemm_bf16_fp32(Tensor input, Tensor weight) -> Tensor");
-  m.impl("router_gemm_bf16_fp32", torch::kCUDA, &router_gemm_bf16_fp32);
-
   // DeepSeek V3 optimized router GEMM for SM90+
   m.def("dsv3_router_gemm(Tensor! output, Tensor mat_a, Tensor mat_b) -> ()");
   // conditionally compiled so impl registration is in source file
diff --git a/csrc/persistent_topk.cuh b/csrc/persistent_topk.cuh
index d6162d52998b..8b9d10ff83dd 100644
--- a/csrc/persistent_topk.cuh
+++ b/csrc/persistent_topk.cuh
@@ -887,27 +887,14 @@ __global__ void __launch_bounds__(kThreadsPerBlock, 2)
   uint32_t* shared_ordered =
       reinterpret_cast<uint32_t*>(smem_raw + kFixedSmemLarge);
 
-  // RadixRowState for multi-CTA cooperative radix
+  // RadixRowState for multi-CTA cooperative radix.
+  // Zero-initialization is done host-side via cudaMemsetAsync in topk.cu
+  // before launch — that gives a stream-ordered happens-before edge for all
+  // CTAs, which the previous in-kernel init (CTA-0 only + intra-CTA
+  // __syncthreads) did not provide and which manifested as a race against
+  // CTA-1+'s first red_release on arrival_counter.
   RadixRowState* state = &params.row_states[group_id];
 
-  // -- Initialize RadixRowState (only needed if large rows exist) --
-  if (params.max_seq_len > RADIX_THRESHOLD) {
-    if (cta_in_group == 0) {
-      for (uint32_t buf = 0; buf < 3; buf++) {
-        for (uint32_t i = tx; i < RADIX; i += kThreadsPerBlock) {
-          state->histogram[buf][i] = 0;
-        }
-      }
-      if (tx == 0) {
-        state->remaining_k = 0;
-        state->prefix = 0;
-        state->arrival_counter = 0;
-        state->output_counter = 0;
-      }
-    }
-    __syncthreads();
-  }
-
   int barrier_phase = 0;
   const uint32_t total_iters = (params.num_rows + num_groups - 1) / num_groups;
 
diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu
index c45ebd34729b..d03c6a5cf0dd 100644
--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@@ -7,23 +7,23 @@
 
 namespace vllm {
 
-template <typename scalar_t, bool IS_NEOX>
+template <typename scalar_t, typename cache_t, bool IS_NEOX>
 inline __device__ void apply_token_rotary_embedding(
-    scalar_t* __restrict__ arr, const float* __restrict__ cos_ptr,
-    const float* __restrict__ sin_ptr, int rot_offset, int embed_dim,
+    scalar_t* __restrict__ arr, const cache_t* __restrict__ cos_ptr,
+    const cache_t* __restrict__ sin_ptr, int rot_offset, int embed_dim,
     const bool inverse) {
   int x_index, y_index;
   float cos_f, sin_f;
   if (IS_NEOX) {
     x_index = rot_offset;
     y_index = embed_dim + rot_offset;
-    cos_f = VLLM_LDG(cos_ptr + x_index);
-    sin_f = VLLM_LDG(sin_ptr + x_index);
+    cos_f = static_cast<float>(VLLM_LDG(cos_ptr + x_index));
+    sin_f = static_cast<float>(VLLM_LDG(sin_ptr + x_index));
   } else {
     x_index = 2 * rot_offset;
     y_index = 2 * rot_offset + 1;
-    cos_f = VLLM_LDG(cos_ptr + x_index / 2);
-    sin_f = VLLM_LDG(sin_ptr + x_index / 2);
+    cos_f = static_cast<float>(VLLM_LDG(cos_ptr + x_index / 2));
+    sin_f = static_cast<float>(VLLM_LDG(sin_ptr + x_index / 2));
   }
   if (inverse) {
     sin_f = -sin_f;
@@ -34,7 +34,7 @@ inline __device__ void apply_token_rotary_embedding(
   arr[y_index] = static_cast<scalar_t>(y_f * cos_f + x_f * sin_f);
 }
 
-template <typename scalar_t, bool IS_NEOX>
+template <typename scalar_t, typename cache_t, bool IS_NEOX>
 inline __device__ void apply_rotary_embedding(
     scalar_t* __restrict__ query,  // [batch_size, seq_len, num_heads,
                                    // head_size] or [num_tokens, num_heads,
@@ -43,14 +43,14 @@ inline __device__ void apply_rotary_embedding(
                                    // [batch_size, seq_len, num_kv_heads,
                                    // head_size] or [num_tokens, num_kv_heads,
                                    // head_size]
-    const float* cache_ptr, const int head_size, const int num_heads,
+    const cache_t* cache_ptr, const int head_size, const int num_heads,
     const int num_kv_heads, const int rot_dim, const int token_idx,
     const int64_t query_stride, const int64_t key_stride,
     const int64_t head_stride, const int64_t rope_dim_offset,
     const bool inverse) {
   const int embed_dim = rot_dim / 2;
-  const float* cos_ptr = cache_ptr;
-  const float* sin_ptr = cache_ptr + embed_dim;
+  const cache_t* cos_ptr = cache_ptr;
+  const cache_t* sin_ptr = cache_ptr + embed_dim;
 
   const int nq = num_heads * embed_dim;
   for (int i = threadIdx.x; i < nq; i += blockDim.x) {
@@ -58,7 +58,7 @@ inline __device__ void apply_rotary_embedding(
     const int64_t token_head =
         token_idx * query_stride + head_idx * head_stride + rope_dim_offset;
     const int rot_offset = i % embed_dim;
-    apply_token_rotary_embedding<scalar_t, IS_NEOX>(
+    apply_token_rotary_embedding<scalar_t, cache_t, IS_NEOX>(
         query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim, inverse);
   }
 
@@ -69,13 +69,13 @@ inline __device__ void apply_rotary_embedding(
       const int64_t token_head =
           token_idx * key_stride + head_idx * head_stride + rope_dim_offset;
       const int rot_offset = i % embed_dim;
-      apply_token_rotary_embedding<scalar_t, IS_NEOX>(
+      apply_token_rotary_embedding<scalar_t, cache_t, IS_NEOX>(
           key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim, inverse);
     }
   }
 }
 
-template <typename scalar_t, bool IS_NEOX>
+template <typename scalar_t, typename cache_t, bool IS_NEOX>
 __global__ void rotary_embedding_kernel(
     const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
                                             // [num_tokens]
@@ -86,15 +86,15 @@ __global__ void rotary_embedding_kernel(
                                  // [batch_size, seq_len, num_kv_heads,
                                  // head_size] or [num_tokens, num_kv_heads,
                                  // head_size]
-    const float* __restrict__ cos_sin_cache,  // [max_position, rot_dim] fp32
+    const cache_t* __restrict__ cos_sin_cache,  // [max_position, rot_dim]
     const int rot_dim, const int64_t query_stride, const int64_t key_stride,
     const int64_t head_stride, const int num_heads, const int num_kv_heads,
     const int head_size, const int64_t rope_dim_offset, const bool inverse) {
   const int token_idx = blockIdx.x;
   int64_t pos = positions[token_idx];
-  const float* cache_ptr = cos_sin_cache + pos * rot_dim;
+  const cache_t* cache_ptr = cos_sin_cache + pos * rot_dim;
 
-  apply_rotary_embedding<scalar_t, IS_NEOX>(
+  apply_rotary_embedding<scalar_t, cache_t, IS_NEOX>(
       query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
       token_idx, query_stride, key_stride, head_stride, rope_dim_offset,
       inverse);
@@ -168,23 +168,28 @@ void rotary_embedding(
   dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  auto cache_f32 = cos_sin_cache.to(torch::kFloat32);
   VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
-    if (is_neox) {
-      vllm::rotary_embedding_kernel<scalar_t, true><<<grid, block, 0, stream>>>(
-          positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-          key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
-          cache_f32.data_ptr<float>(), rot_dim, query_stride, key_stride,
-          head_stride, num_heads, num_kv_heads, head_size, rope_dim_offset,
-          inverse);
-    } else {
-      vllm::rotary_embedding_kernel<scalar_t, false>
-          <<<grid, block, 0, stream>>>(
-              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
-              cache_f32.data_ptr<float>(), rot_dim, query_stride, key_stride,
-              head_stride, num_heads, num_kv_heads, head_size, rope_dim_offset,
-              inverse);
-    }
+    using query_t = scalar_t;
+    VLLM_DISPATCH_FLOATING_TYPES(
+        cos_sin_cache.scalar_type(), "rotary_embedding_cache", [&] {
+          using cache_t = scalar_t;
+          if (is_neox) {
+            vllm::rotary_embedding_kernel<query_t, cache_t, true>
+                <<<grid, block, 0, stream>>>(
+                    positions.data_ptr<int64_t>(), query.data_ptr<query_t>(),
+                    key.has_value() ? key->data_ptr<query_t>() : nullptr,
+                    cos_sin_cache.data_ptr<cache_t>(), rot_dim, query_stride,
+                    key_stride, head_stride, num_heads, num_kv_heads, head_size,
+                    rope_dim_offset, inverse);
+          } else {
+            vllm::rotary_embedding_kernel<query_t, cache_t, false>
+                <<<grid, block, 0, stream>>>(
+                    positions.data_ptr<int64_t>(), query.data_ptr<query_t>(),
+                    key.has_value() ? key->data_ptr<query_t>() : nullptr,
+                    cos_sin_cache.data_ptr<cache_t>(), rot_dim, query_stride,
+                    key_stride, head_stride, num_heads, num_kv_heads, head_size,
+                    rope_dim_offset, inverse);
+          }
+        });
   });
 }
diff --git a/csrc/topk.cu b/csrc/topk.cu
index 364ecc21e532..c5bffb32856d 100644
--- a/csrc/topk.cu
+++ b/csrc/topk.cu
@@ -82,22 +82,109 @@ void launch_persistent_topk(const torch::Tensor& logits,
     size_t smem_size = P::kFixedSmemLarge + chunk_size * sizeof(uint32_t);
     if (smem_size < P::kSmemMedium) smem_size = P::kSmemMedium;
 
+    // Query occupancy for the instantiation that will actually launch;
+    // overestimating it deadlocks the cooperative barrier.
     int occupancy = 1;
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &occupancy, P::persistent_topk_kernel<TopK, 4>, P::kThreadsPerBlock,
-        smem_size);
+    cudaError_t occ_err = cudaSuccess;
+    if (vec_size == 4) {
+      occ_err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &occupancy, P::persistent_topk_kernel<TopK, 4>, P::kThreadsPerBlock,
+          smem_size);
+    } else if (vec_size == 2) {
+      occ_err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &occupancy, P::persistent_topk_kernel<TopK, 2>, P::kThreadsPerBlock,
+          smem_size);
+    } else {
+      occ_err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &occupancy, P::persistent_topk_kernel<TopK, 1>, P::kThreadsPerBlock,
+          smem_size);
+    }
+    TORCH_CHECK(occ_err == cudaSuccess,
+                "persistent_topk occupancy query failed: ",
+                cudaGetErrorString(occ_err));
     if (occupancy < 1) occupancy = 1;
 
-    uint32_t max_resident_ctas = static_cast<uint32_t>(num_sms) * occupancy;
+    // The cooperative spin-wait barrier only runs when at least one row hits
+    // the radix path (seq_len > RADIX_THRESHOLD). Below that, non-CTA-0 CTAs
+    // early-exit, so oversubscription can't deadlock and headroom is wasted.
+    const bool needs_cooperative =
+        static_cast<uint32_t>(max_seq_len) > P::RADIX_THRESHOLD;
+
+    const uint32_t hw_resident_cap =
+        static_cast<uint32_t>(num_sms) * static_cast<uint32_t>(occupancy);
+    uint32_t max_resident_ctas = hw_resident_cap;
+    if (needs_cooperative) {
+      // Reserve one CTA per SM when occupancy allows; fall back to a single
+      // CTA when occupancy == 1 (the most deadlock-prone case — any straggler
+      // kernel that takes the only slot on one SM hangs the barrier). Never
+      // drop below one full group's worth.
+      uint32_t headroom = (occupancy > 1) ? static_cast<uint32_t>(num_sms) : 1u;
+      if (max_resident_ctas >= headroom + ctas_per_group) {
+        max_resident_ctas -= headroom;
+      }
+    }
     uint32_t num_groups = std::min(max_resident_ctas / ctas_per_group,
                                    static_cast<uint32_t>(num_rows));
     if (num_groups == 0) num_groups = 1;
     uint32_t total_ctas = num_groups * ctas_per_group;
 
+    // If the cooperative launch wouldn't fit, fall back to FilteredTopK
+    // instead of deadlocking. Only relevant when needs_cooperative.
+    if (needs_cooperative && total_ctas > hw_resident_cap) {
+      TORCH_CHECK(max_smem_per_block >= 128 * 1024,
+                  "persistent_topk would oversubscribe and the FilteredTopK "
+                  "fallback requires >=128KB smem per block (have ",
+                  max_smem_per_block, "). total_ctas=", total_ctas,
+                  " > num_sms*occupancy=", hw_resident_cap, " (TopK=", TopK,
+                  ", vec_size=", vec_size, ", ctas_per_group=", ctas_per_group,
+                  ", smem=", smem_size, ").");
+      cudaError_t status =
+          vllm::FilteredTopKRaggedTransform<float, int32_t, TopK>(
+              logits.data_ptr<float>(), output.data_ptr<int32_t>(),
+              lengths.data_ptr<int32_t>(), static_cast<uint32_t>(num_rows),
+              static_cast<uint32_t>(TopK), static_cast<uint32_t>(stride),
+              stream);
+      TORCH_CHECK(status == cudaSuccess,
+                  "FilteredTopK fallback failed: ", cudaGetErrorString(status));
+      return;
+    }
+
     size_t state_bytes = num_groups * sizeof(P::RadixRowState);
     TORCH_CHECK(workspace.size(0) >= static_cast<int64_t>(state_bytes),
                 "workspace too small, need ", state_bytes, " bytes");
 
+    // Zero the per-group RadixRowState region before launch.
+    //
+    // Issued UNCONDITIONALLY so the memset is captured as its own node in
+    // the cudagraph (a separate cudaMemsetAsync node, sequenced before the
+    // persistent_topk_kernel launch on the same stream). The previous
+    // host-side guard `if (needs_cooperative)` was evaluated at capture time;
+    // when capture-time max_seq_len <= RADIX_THRESHOLD (always true under
+    // FULL_DECODE_ONLY with max_model_len < 32 K) the memset would NOT be
+    // captured, leaving the workspace state to accumulate across replays.
+    // That's a latent correctness bug if the runtime data ever takes the
+    // radix path, and removes one variable while debugging hangs in the
+    // decode/medium paths.
+    //
+    // Cost is sub-microsecond: state_bytes = num_groups * sizeof(RadixRowState)
+    // is ~3 KB per group, ~100 KB for the largest grids on this hardware.
+    //
+    // Why the memset is required (regardless of which path the kernel takes):
+    //   1. arrival_counter accumulates within a launch and is never reset,
+    //      so a prior call leaves it at a large positive value. Without this
+    //      reset, the very first wait_ge in the next call sees counter >>
+    //      target and returns instantly, breaking the barrier.
+    //   2. The previous in-kernel init only ran in CTA-0 with intra-CTA
+    //      __syncthreads(), so it had no happens-before edge to CTA-1+'s
+    //      first red_release. cudaMemsetAsync is stream-ordered: the zero
+    //      is globally visible before any CTA runs.
+    {
+      cudaError_t mz_err = cudaMemsetAsync(workspace.data_ptr<uint8_t>(), 0,
+                                           state_bytes, stream);
+      TORCH_CHECK(mz_err == cudaSuccess,
+                  "row_states memset failed: ", cudaGetErrorString(mz_err));
+    }
+
     P::PersistentTopKParams params;
     params.input = logits.data_ptr<float>();
     params.output = output.data_ptr<int32_t>();
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 8d8f7bed0441..e695497fd88f 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -183,7 +183,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "int forced_token_heads_per_warp=-1) -> ()");
   ops.impl("fused_qk_norm_rope", torch::kCUDA, &fused_qk_norm_rope);
 
-#ifndef USE_ROCM
   // Horizontally-fused DeepseekV4-MLA: per-head RMSNorm + GPT-J RoPE for Q, and
   // GPT-J RoPE + UE8M0 FP8 quant + paged cache insert for KV, all in one
   // kernel launch.
@@ -194,7 +193,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "float eps, int cache_block_size) -> ()");
   ops.impl("fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert", torch::kCUDA,
            &fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert);
-#endif
 
   // Apply repetition penalties to logits in-place
   ops.def(
diff --git a/docker/Dockerfile b/docker/Dockerfile
index ca5e35f80c37..fd0622e2416a 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -41,6 +41,13 @@ ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 # Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
 ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION}
 
+# OS family of BUILD_BASE_IMAGE. Controls package manager (apt vs dnf) and
+# Python bootstrap. Set to "manylinux" alongside a manylinux build base such
+# as pytorch/manylinux2_28-builder:cuda13.0 to produce wheels with a glibc
+# 2.28 floor (matches PyTorch's own published wheels). Default stays on
+# Ubuntu for backwards compatibility.
+ARG BUILD_OS=ubuntu
+
 # By parameterizing the Deadsnakes repository URL, we allow third-party to use
 # their own mirror. When doing so, we don't benefit from the transparent
 # installation of the GPG key of the PPA, as done by add-apt-repository, so we
@@ -94,35 +101,64 @@ FROM ${BUILD_BASE_IMAGE} AS base
 
 ARG CUDA_VERSION
 ARG PYTHON_VERSION
+ARG BUILD_OS
 
 ENV DEBIAN_FRONTEND=noninteractive
 
-# Install system dependencies including build tools
-RUN apt-get update -y \
-    && apt-get install -y --no-install-recommends \
-        ccache \
-        software-properties-common \
-        git \
-        curl \
-        sudo \
-        python3-pip \
-        libibverbs-dev \
-        # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
-        # as it was causing spam when compiling the CUTLASS kernels
-        gcc-10 \
-        g++-10 \
-    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
-    # Install python dev headers if available (needed for cmake FindPython on Ubuntu 24.04
-    # which ships cmake 3.28 and requires Development.SABIModule; silently skipped on
-    # Ubuntu 20.04/22.04 where python3.x-dev is not available without a PPA)
-    && (apt-get install -y --no-install-recommends python${PYTHON_VERSION}-dev 2>/dev/null || true) \
-    && rm -rf /var/lib/apt/lists/* \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh \
-    && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
+# Install system dependencies including build tools.
+# The Ubuntu path uses apt + deadsnakes-via-uv for Python; the manylinux path
+# (AlmaLinux 8, e.g. pytorch/manylinux2_28-builder) uses dnf and the Python
+# interpreters pre-installed at /opt/python/cpXY-cpXY/.
+RUN if [ "${BUILD_OS}" = "manylinux" ]; then \
+        # rdma-core-devel provides libibverbs headers; ccache lives in EPEL,
+        # which the pytorch manylinux image already enables. git/curl/sudo
+        # are typically pre-installed but listed defensively.
+        dnf install -y --setopt=install_weak_deps=False \
+            ccache \
+            git \
+            curl \
+            sudo \
+            rdma-core-devel \
+        && dnf clean all \
+        && rm -rf /var/cache/dnf; \
+    else \
+        apt-get update -y \
+        && apt-get install -y --no-install-recommends \
+            ccache \
+            software-properties-common \
+            git \
+            curl \
+            sudo \
+            python3-pip \
+            libibverbs-dev \
+            # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
+            # as it was causing spam when compiling the CUTLASS kernels
+            gcc-10 \
+            g++-10 \
+        && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
+        # Install python dev headers if available (needed for cmake FindPython on Ubuntu 24.04
+        # which ships cmake 3.28 and requires Development.SABIModule; silently skipped on
+        # Ubuntu 20.04/22.04 where python3.x-dev is not available without a PPA)
+        && (apt-get install -y --no-install-recommends python${PYTHON_VERSION}-dev 2>/dev/null || true) \
+        && rm -rf /var/lib/apt/lists/*; \
+    fi
+
+# Install uv and bootstrap /opt/venv. Both paths converge on /opt/venv so all
+# downstream stages stay distro-agnostic.
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
+    && if [ "${BUILD_OS}" = "manylinux" ]; then \
+           # manylinux images ship Python at /opt/python/cpXY-cpXY/; point uv
+           # at the matching interpreter rather than letting it fetch one.
+           PYV_NODOT=$(echo ${PYTHON_VERSION} | tr -d '.') \
+           && MANYLINUX_PY=/opt/python/cp${PYV_NODOT}-cp${PYV_NODOT}/bin/python${PYTHON_VERSION} \
+           && $HOME/.local/bin/uv venv /opt/venv --python "$MANYLINUX_PY"; \
+       else \
+           $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION}; \
+       fi \
     && rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
-    && ln -s /opt/venv/bin/python3 /usr/bin/python3 \
-    && ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
-    && ln -s /opt/venv/bin/pip /usr/bin/pip \
+    && ln -sf /opt/venv/bin/python3 /usr/bin/python3 \
+    && ln -sf /opt/venv/bin/python3-config /usr/bin/python3-config \
+    && ln -sf /opt/venv/bin/pip /usr/bin/pip \
     && python3 --version && python3 -m pip --version
 
 # Activate virtual environment and add uv to PATH
@@ -433,6 +469,7 @@ FROM base AS dev
 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
 ARG PYTORCH_CUDA_INDEX_BASE_URL
+ARG BUILD_OS
 
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -442,7 +479,11 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE=copy
 
 # Install libnuma-dev, required by fastsafetensors (fixes #20384)
-RUN apt-get update && apt-get install -y --no-install-recommends libnuma-dev && rm -rf /var/lib/apt/lists/*
+RUN if [ "${BUILD_OS}" = "manylinux" ]; then \
+        dnf install -y numactl-devel && dnf clean all && rm -rf /var/cache/dnf; \
+    else \
+        apt-get update && apt-get install -y --no-install-recommends libnuma-dev && rm -rf /var/lib/apt/lists/*; \
+    fi
 
 
 # We can specify the standard or nightly build of PyTorch
@@ -585,9 +626,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 ARG FLASHINFER_VERSION=0.6.8.post1
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
-        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
-    && flashinfer show-config \
-    && flashinfer download-cubin
+        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 # ============================================================
 # OPENAI API SERVER DEPENDENCIES
@@ -669,6 +708,13 @@ RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm
     uv pip install --system ep_kernels/dist/*.whl --verbose \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
+# Download FlashInfer precompiled cubins AFTER all pip installs are done.
+# This must run after the vLLM wheel and EP kernels installs above, because
+# those can reinstall/touch flashinfer packages. Downloading cubins earlier
+# (in the flashinfer-jit-cache layer) causes ~2.5 GB of layer duplication
+# when a later pip install overwrites flashinfer package files.
+RUN flashinfer show-config && flashinfer download-cubin
+
 # CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
 # return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
 # consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
@@ -758,6 +804,10 @@ FROM vllm-base AS vllm-openai-base
 ARG TARGETPLATFORM
 ARG INSTALL_KV_CONNECTORS=false
 ARG CUDA_VERSION
+ARG VLLM_BUILD_COMMIT
+ARG VLLM_BUILD_PIPELINE
+ARG VLLM_BUILD_URL
+ARG VLLM_IMAGE_TAG
 
 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@@ -794,6 +844,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     fi
 
 ENV VLLM_USAGE_SOURCE production-docker-image
+ENV VLLM_BUILD_COMMIT=${VLLM_BUILD_COMMIT:-unknown} \
+    VLLM_BUILD_PIPELINE=${VLLM_BUILD_PIPELINE:-local} \
+    VLLM_BUILD_URL=${VLLM_BUILD_URL:-} \
+    VLLM_IMAGE_TAG=${VLLM_IMAGE_TAG:-local/vllm-openai:dev}
+LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm" \
+      org.opencontainers.image.revision="${VLLM_BUILD_COMMIT}" \
+      org.opencontainers.image.version="${VLLM_IMAGE_TAG}" \
+      org.opencontainers.image.url="${VLLM_BUILD_URL}" \
+      ai.vllm.build.commit="${VLLM_BUILD_COMMIT}" \
+      ai.vllm.build.pipeline="${VLLM_BUILD_PIPELINE}" \
+      ai.vllm.build.url="${VLLM_BUILD_URL}" \
+      ai.vllm.image.tag="${VLLM_IMAGE_TAG}"
 
 # define sagemaker first, so it is not default from `docker build`
 FROM vllm-openai-base AS vllm-sagemaker
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 72ee002d90a4..d15ced8e0111 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -192,6 +192,7 @@ ADD ./tests/ ./tests/
 ADD ./examples/ ./examples/
 ADD ./benchmarks/ ./benchmarks/
 ADD ./vllm/collect_env.py .
+ADD ./docker/ ./docker/
 ADD ./.buildkite/ ./.buildkite/
 
 # install development dependencies (for testing)
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 7a93823cd2d5..0ed12f11da94 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -124,9 +124,9 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
 
 # RIXL/UCX build stages
 FROM base AS build_rixl
-ARG RIXL_BRANCH="bf4a7214"
+ARG RIXL_BRANCH="39be1de8"
 ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
-ARG UCX_BRANCH="7009d7a1"
+ARG UCX_BRANCH="bfb51733"
 ARG UCX_REPO="https://github.com/openucx/ucx.git"
 ENV ROCM_PATH=/opt/rocm
 ENV UCX_HOME=/usr/local/ucx
@@ -192,6 +192,7 @@ RUN cd /opt/rixl && \
     sed -i "s/--exclude 'libamdhip64\*'/--exclude 'libamdhip64*' --exclude 'libcore*' --exclude 'libpull*'/" \
         contrib/build-wheel.sh && \
     mkdir -p /app/install && \
+    _ucx_install_dir=${UCX_HOME} \
     ./contrib/build-wheel.sh \
         --output-dir /app/install \
         --rocm-dir ${ROCM_PATH} \
@@ -200,9 +201,9 @@ RUN cd /opt/rixl && \
 
 # DeepEP build stage
 FROM base AS build_deep
-ARG ROCSHMEM_BRANCH="ba0bf0f3"
+ARG ROCSHMEM_BRANCH="f0acb0c6"
 ARG ROCSHMEM_REPO="https://github.com/ROCm/rocm-systems.git"
-ARG DEEPEP_BRANCH="5d90af8b"
+ARG DEEPEP_BRANCH="a9ea9774"
 ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git"
 ARG DEEPEP_NIC="cx7"
 ARG DEEPEP_ROCM_ARCH="gfx942;gfx950"
@@ -213,18 +214,15 @@ RUN git clone ${ROCSHMEM_REPO} \
  && git checkout ${ROCSHMEM_BRANCH} \
  && mkdir -p projects/rocshmem/build \
  && cd projects/rocshmem/build \
- && bash ../scripts/build_configs/all_backends \
-      -DCMAKE_INSTALL_PREFIX="${ROCSHMEM_DIR}" \
-      -DROCM_PATH=/opt/rocm \
-      -DGPU_TARGETS="${DEEPEP_ROCM_ARCH}" \
-      -DUSE_EXTERNAL_MPI=OFF
+ && INSTALL_PREFIX=${ROCSHMEM_DIR} \
+    ../scripts/build_configs/all_backends -DUSE_EXTERNAL_MPI=OFF
 
 # Build DeepEP wheel.
 # DeepEP looks for rocshmem at ROCSHMEM_DIR.
 RUN git clone ${DEEPEP_REPO} \
  && cd DeepEP \
  && git checkout ${DEEPEP_BRANCH} \
- && python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install
+ && python3 setup.py --variant rocm --rocm-explicit-ctx --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install
 
 # MoRI runtime dependencies live in Dockerfile.rocm so NIC backend changes do
 # not force users to rebuild the long-lived Dockerfile.rocm_base image.
@@ -388,6 +386,16 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
 # above are not available once that RUN step completes.
 COPY --from=export_vllm /*.whl /opt/vllm-wheels/
 
+# Update rdma-core to support latest rocshmem
+ARG DEEPEP_NIC
+RUN if [ "${DEEPEP_NIC}" = "cx7" ] || [ "${DEEPEP_NIC}" = "io" ]; then \
+    git clone --branch v62.0 --depth 1 https://github.com/linux-rdma/rdma-core.git /tmp/rdma-core && \
+    cd /tmp/rdma-core && \
+    mkdir -p build && cd build && \
+    cmake -GNinja -DCMAKE_INSTALL_PREFIX=/usr -DNO_MAN_PAGES=1 .. && \
+    ninja &&     ninja install &&     ldconfig &&     rm -rf /tmp/rdma-core; \
+fi
+
 # Install RIXL wheel
 RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
     uv pip install --system /rixl_install/*.whl
diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base
index 5940a4ee564d..a21916d0b531 100644
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.2.1-complete
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.2.2-complete
 ARG TRITON_BRANCH="ba5c1517"
 ARG TRITON_REPO="https://github.com/ROCm/triton.git"
 ARG PYTORCH_BRANCH="8514f051" # release/2.10 as of 3/17
@@ -9,7 +9,7 @@ ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
 ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
 ARG FA_BRANCH="0e60e394"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="v0.1.10.post3"
+ARG AITER_BRANCH="v0.1.12.post2"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 ARG MORI_BRANCH="v1.1.0"
 ARG MORI_REPO="https://github.com/ROCm/mori.git"
@@ -104,6 +104,28 @@ ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
 ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
 ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
 
+# torch profiler hotfix for 7.2.2: rebuild CLR with https://github.com/ROCm/rocm-systems/pull/5062
+# will be removed once we move to ROCm 7.2.3
+RUN apt-get update && apt-get install -y rocm-llvm-dev
+RUN pip install CppHeaderParser
+RUN git clone --no-checkout --filter=blob:none https://github.com/ROCm/rocm-systems /tmp/rocm-systems \
+    && cd /tmp/rocm-systems \
+    && git sparse-checkout init --cone \
+    && git sparse-checkout set projects/hip projects/clr \
+    && git checkout 35e8c7bf8911862e5389509800e65fdf125412b3 \
+    && export CLR_DIR=/tmp/rocm-systems/projects/clr \
+    && export HIP_DIR=/tmp/rocm-systems/projects/hip \
+    && mkdir -p $CLR_DIR/build && cd $CLR_DIR/build \
+    && cmake \
+        -DHIP_COMMON_DIR=$HIP_DIR \
+        -DCMAKE_PREFIX_PATH="/opt/rocm/" \
+        -DCLR_BUILD_HIP=ON \
+        -DCLR_BUILD_OCL=OFF \
+        -DHIP_PLATFORM=amd \
+        .. \
+    && make -j$(nproc) \
+    && make install \
+    && rm -rf /tmp/rocm-systems
 
 ###
 ### Triton Build
@@ -153,8 +175,6 @@ RUN git clone ${PYTORCH_REPO} pytorch
 RUN cd pytorch && git checkout ${PYTORCH_BRANCH}
 RUN cd pytorch \
     && pip install -r requirements.txt && git submodule update --init --recursive
-RUN cd pytorch/third_party/kineto \
-    && git remote add rocm https://github.com/ROCm/kineto && git fetch rocm && git checkout 2d73be3 
 RUN cd pytorch && python3 tools/amd_build/build_amd.py \
     && if [ "$USE_SCCACHE" = "1" ]; then \
            export HIP_CLANG_PATH=/opt/sccache-wrappers \
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index ab3f6f40d87b..e348562a47f7 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -5,9 +5,6 @@ WORKDIR /workspace/
 ARG PYTHON_VERSION=3.12
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/xpu"
 
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
-
 RUN apt clean && apt-get update -y && \
     apt-get install -y --no-install-recommends --fix-missing \
     curl \
@@ -26,8 +23,20 @@ RUN apt clean && apt-get update -y && \
     python3.12-dev \
     python3-pip
 
-RUN apt update && apt upgrade -y && \
-    apt install -y intel-oneapi-compiler-dpcpp-cpp-2025.3
+# Add oneAPI repo, pin oneAPI to 2025.3, then install pinned packages in one layer.
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
+    printf '%s\n' \
+    'Package: intel-oneapi-* intel-deep-learning-essentials* intel-pti*' \
+    'Pin: version 2025.3*' \
+    'Pin-Priority: 1001' \
+    > /etc/apt/preferences.d/oneapi-2025.3.pref && \
+    apt-get update -y && \
+    apt-get install -y --no-install-recommends \
+    intel-oneapi-compiler-dpcpp-cpp-2025.3 \
+    intel-oneapi-mkl-devel-2025.3 \
+    intel-oneapi-dnnl-devel-2025.3 && \
+    rm -rf /var/lib/apt/lists/*
 
 # Install UMD
 RUN mkdir neo && \
diff --git a/docker/docker-bake.hcl b/docker/docker-bake.hcl
index 785d598d6080..94ca8397561a 100644
--- a/docker/docker-bake.hcl
+++ b/docker/docker-bake.hcl
@@ -27,6 +27,22 @@ variable "COMMIT" {
   default = ""
 }
 
+variable "VLLM_BUILD_COMMIT" {
+  default = "unknown"
+}
+
+variable "VLLM_BUILD_PIPELINE" {
+  default = "local"
+}
+
+variable "VLLM_BUILD_URL" {
+  default = ""
+}
+
+variable "VLLM_IMAGE_TAG" {
+  default = "local/vllm-openai:dev"
+}
+
 # Groups
 
 group "default" {
@@ -46,6 +62,10 @@ target "_common" {
     max_jobs             = MAX_JOBS
     nvcc_threads         = NVCC_THREADS
     torch_cuda_arch_list = TORCH_CUDA_ARCH_LIST
+    VLLM_BUILD_COMMIT    = VLLM_BUILD_COMMIT != "unknown" ? VLLM_BUILD_COMMIT : (COMMIT != "" ? COMMIT : "unknown")
+    VLLM_BUILD_PIPELINE  = VLLM_BUILD_PIPELINE
+    VLLM_BUILD_URL       = VLLM_BUILD_URL
+    VLLM_IMAGE_TAG       = VLLM_IMAGE_TAG
   }
 }
 
@@ -56,10 +76,16 @@ target "_labels" {
     "org.opencontainers.image.title"       = "vLLM"
     "org.opencontainers.image.description" = "vLLM: A high-throughput and memory-efficient inference and serving engine for LLMs"
     "org.opencontainers.image.licenses"    = "Apache-2.0"
-    "org.opencontainers.image.revision"    = COMMIT
+    "org.opencontainers.image.revision"    = VLLM_BUILD_COMMIT != "unknown" ? VLLM_BUILD_COMMIT : (COMMIT != "" ? COMMIT : "unknown")
+    "org.opencontainers.image.version"     = VLLM_IMAGE_TAG
+    "org.opencontainers.image.url"         = VLLM_BUILD_URL
+    "ai.vllm.build.commit"                 = VLLM_BUILD_COMMIT != "unknown" ? VLLM_BUILD_COMMIT : (COMMIT != "" ? COMMIT : "unknown")
+    "ai.vllm.build.pipeline"               = VLLM_BUILD_PIPELINE
+    "ai.vllm.build.url"                    = VLLM_BUILD_URL
+    "ai.vllm.image.tag"                    = VLLM_IMAGE_TAG
   }
   annotations = [
-      "index,manifest:org.opencontainers.image.revision=${COMMIT}",
+    "index,manifest:org.opencontainers.image.revision=${VLLM_BUILD_COMMIT != "unknown" ? VLLM_BUILD_COMMIT : (COMMIT != "" ? COMMIT : "unknown")}",
   ]
 }
 
diff --git a/docker/versions.json b/docker/versions.json
index b6b555790d2a..75652823db0b 100644
--- a/docker/versions.json
+++ b/docker/versions.json
@@ -16,6 +16,9 @@
     "FINAL_BASE_IMAGE": {
       "default": "nvidia/cuda:13.0.2-base-ubuntu22.04"
     },
+    "BUILD_OS": {
+      "default": "ubuntu"
+    },
     "GET_PIP_URL": {
       "default": "https://bootstrap.pypa.io/get-pip.py"
     },
diff --git a/docs/assets/training/layerwise.png b/docs/assets/training/layerwise.png
new file mode 100644
index 000000000000..bc1e4f24d4a8
Binary files /dev/null and b/docs/assets/training/layerwise.png differ
diff --git a/docs/assets/training/layerwise_bad_loading.png b/docs/assets/training/layerwise_bad_loading.png
new file mode 100644
index 000000000000..c93de6c61ff1
Binary files /dev/null and b/docs/assets/training/layerwise_bad_loading.png differ
diff --git a/docs/assets/training/layerwise_good_loading.png b/docs/assets/training/layerwise_good_loading.png
new file mode 100644
index 000000000000..0d7c7a7a56e0
Binary files /dev/null and b/docs/assets/training/layerwise_good_loading.png differ
diff --git a/docs/cli/README.md b/docs/cli/README.md
index c708eb795898..b27bd3b647b5 100644
--- a/docs/cli/README.md
+++ b/docs/cli/README.md
@@ -163,7 +163,7 @@ Running with a local file:
 
 ```bash
 vllm run-batch \
-    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -i features/openai_batch/openai_example_batch.jsonl \
     -o results.jsonl \
     --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
@@ -172,7 +172,7 @@ Using remote file:
 
 ```bash
 vllm run-batch \
-    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl \
     -o results.jsonl \
     --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md
index 8ea241c582e5..2c098118dbb1 100644
--- a/docs/configuration/conserving_memory.md
+++ b/docs/configuration/conserving_memory.md
@@ -23,7 +23,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
 !!! note
     With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
 
-    You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
+    You can convert the model checkpoint to a sharded checkpoint using [examples/features/sharded_state/load_sharded_state_offline.py](../../examples/features/sharded_state/load_sharded_state_offline.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
 
 ## Quantization
 
diff --git a/docs/contributing/ci/failures.md b/docs/contributing/ci/failures.md
index dad04e75fbb6..a0038f461a04 100644
--- a/docs/contributing/ci/failures.md
+++ b/docs/contributing/ci/failures.md
@@ -60,9 +60,19 @@ the failure?
 
 ## Logs Wrangling
 
-Download the full log file from Buildkite locally.
+Download a job's log (no Buildkite login required):
 
-Strip timestamps and colorization:
+[.buildkite/scripts/ci-fetch-log.sh](../../../.buildkite/scripts/ci-fetch-log.sh)
+
+```bash
+# Find the failing job. Each row's URL is .../builds/<N>#<job_uuid>:
+gh pr checks <PR> --repo vllm-project/vllm
+
+# Download + strip timestamps/ANSI in one step:
+.buildkite/scripts/ci-fetch-log.sh "https://buildkite.com/vllm/ci/builds/<N>#<job_uuid>"
+```
+
+To clean an already-downloaded log:
 
 [.buildkite/scripts/ci-clean-log.sh](../../../.buildkite/scripts/ci-clean-log.sh)
 
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index addda300d020..91757c40e4f8 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -42,7 +42,7 @@ Traces can be visualized using <https://ui.perfetto.dev/>.
 
 #### Offline Inference
 
-Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline_inference/simple_profiling.py) for an example.
+Refer to [examples/features/profiling/simple_profiling_offline.py](../../examples/features/profiling/simple_profiling_offline.py) for an example.
 
 #### OpenAI Server
 
diff --git a/docs/deployment/frameworks/anyscale.md b/docs/deployment/frameworks/anyscale.md
index 965742ec0726..6888e4dbf0b6 100644
--- a/docs/deployment/frameworks/anyscale.md
+++ b/docs/deployment/frameworks/anyscale.md
@@ -3,7 +3,7 @@
 [Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
 
 Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
-without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like [examples/online_serving/run_cluster.sh](../../../examples/online_serving/run_cluster.sh).
+without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like [examples/ray_serving/run_cluster.sh](../../../examples/ray_serving/run_cluster.sh).
 
 When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).
 
diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md
index c7d171c165aa..e711694b3b9f 100644
--- a/docs/design/attention_backends.md
+++ b/docs/design/attention_backends.md
@@ -155,6 +155,7 @@ Priority is **1 = highest** (tried first).
 | **Block Sizes** | Supported KV cache block sizes (%N means multiples of N) |
 | **Head Sizes** | Supported attention head sizes |
 | **Sink** | Attention sink support (for StreamingLLM) |
+| **Non-Causal** | Non-causal (bidirectional) attention support for decoder models |
 | **Sparse** | Sparse attention support (MLA only) |
 | **MM Prefix** | Multimodal prefix full attention support |
 | **DCP** | Decode Context Parallelism support (`--decode-context-parallel-size`) |
@@ -165,22 +166,22 @@ Priority is **1 = highest** (tried first).
 
 ## Standard Attention (MHA, MQA, GQA) Backends
 
-| Backend | Version | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | MM Prefix | DCP | Attention Types | Compute Cap. |
-| ------- | ------- | ------ | --------- | ----------- | ---------- | ---- | --------- | --- | --------------- | ------------ |
-| `CPU_ATTN` | | fp16, bf16, fp32 | `auto` | Any | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256, 512 | ❌ | ❌ | ❌ | All | N/A |
-| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | ✅ | Decoder | 7.x-9.x |
-| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ✅ | Decoder | 10.x |
-| `FLASH_ATTN` | FA2* | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥8.0 |
-| `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ❌ | ✅ | All | 9.x |
-| `FLASH_ATTN` | FA4* | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ✅ | ❌ | ✅ | All | ≥10.0 |
-| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ✅ | Decoder | Any |
-| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
-| `ROCM_AITER_FA` | | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder | N/A |
-| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | %16 | Any | ✅ | ✅ | ❌ | All | N/A |
-| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ✅ | ❌ | Decoder, Encoder, Encoder Only | N/A |
-| `TREE_ATTN` | | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
-| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2`, `int8_per_token_head`, `fp8_per_token_head` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
-| `TURBOQUANT` | | fp16, bf16 | `turboquant_k8v4`, `turboquant_4bit_nc`, `turboquant_k3v4_nc`, `turboquant_3bit_nc` | 16, 32, 64, 128 | Any | ❌ | ❌ | ❌ | Decoder | Any |
+| Backend | Version | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | Non-Causal | MM Prefix | DCP | Attention Types | Compute Cap. |
+| ------- | ------- | ------ | --------- | ----------- | ---------- | ---- | ---------- | --------- | --- | --------------- | ------------ |
+| `CPU_ATTN` | | fp16, bf16, fp32 | `auto`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256, 512 | ❌ | ❌ | ❌ | ❌ | All | N/A |
+| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | ❌ | ✅ | Decoder | 7.x-9.x |
+| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2`, `nvfp4` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ❌ | ✅ | Decoder | 10.x |
+| `FLASH_ATTN` | FA2* | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ❌ | ✅ | ❌ | ✅ | All | ≥8.0 |
+| `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | ✅ | All | 9.x |
+| `FLASH_ATTN` | FA4* | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ✅ | ✅ | ❌ | ✅ | All | ≥10.0 |
+| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | ✅ | Decoder | Any |
+| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16` | %16 | Any | ❌ | ✅ | ✅ | ❌ | Decoder, Encoder Only | Any |
+| `ROCM_AITER_FA` | | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32 | 64, 128, 256 | ❌ | ✅ | ❌ | ❌ | Decoder | N/A |
+| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | %16 | Any | ✅ | ❌ | ✅ | ❌ | All | N/A |
+| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ✅ | ✅ | ❌ | Decoder, Encoder, Encoder Only | N/A |
+| `TREE_ATTN` | | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | ❌ | Decoder | Any |
+| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2`, `int8_per_token_head`, `fp8_per_token_head` | %16 | Any | ✅ | ❌ | ✅ | ❌ | All | Any |
+| `TURBOQUANT` | | fp16, bf16 | `turboquant_k8v4`, `turboquant_4bit_nc`, `turboquant_k3v4_nc`, `turboquant_3bit_nc` | 16, 32, 64, 128 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | Any |
 
 > **†** FlashInfer uses TRTLLM attention on Blackwell (SM100), which supports sinks. Disable via `--attention-config.use_trtllm_attention=0`.
 >
@@ -192,31 +193,35 @@ MLA uses separate backends for prefill and decode phases.
 
 ### Prefill Backends
 
-The prefill backend is selected at runtime based on hardware and
-configuration.
+To explicitly select a prefill backend, use
+`-ac.mla_prefill_backend=<BACKEND>` (e.g., `FLASH_ATTN`, `FLASHINFER`).
+Otherwise, the prefill backend is selected automatically at runtime based on
+hardware and configuration.
 
-| Backend | Description | Compute Cap. | Enable | Disable | Notes |
-| ------- | ----------- | ------------ | ------ | ------- | ----- |
-| TRT-LLM Ragged‡ | TensorRT-LLM ragged attention | 10.x | Default on SM100 | `-ac.use_trtllm_ragged_deepseek_prefill=0` | DeepSeek R1 dims only |
-| FlashInfer | FlashInfer CUTLASS backend | 10.x | `-ac.disable_flashinfer_prefill=0` | `-ac.disable_flashinfer_prefill=1` | DeepSeek R1 dims only |
-| cuDNN | cuDNN-based attention | 10.x | `-ac.use_cudnn_prefill=1` | `-ac.use_cudnn_prefill=0` | |
-| FlashAttention | FlashAttention varlen (FA2/FA3) | Any | Default fallback | Use other backends | FA3 on SM90, FA2 otherwise |
+| Backend | Description | Dtypes | Compute Cap. | Notes |
+| ------- | ----------- | ------ | ------------ | ----- |
+| `FLASH_ATTN`‡ | FlashAttention varlen (FA2/FA3/FA4) | fp16, bf16 | Any | FA4 on SM100+, FA3 on SM90, FA2 otherwise |
+| `TRTLLM_RAGGED` | TensorRT-LLM ragged attention | fp16, bf16 | 10.x | DeepSeek R1 dims only |
+| `FLASHINFER` | FlashInfer CUTLASS backend | fp16, bf16 | 10.x | DeepSeek R1 dims only |
 
 > **‡** TRT-LLM Ragged is the default on Blackwell (SM100).
 > On other GPUs, FlashAttention is used as the default.
 
 ### Decode Backends
 
-| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | Sparse | MM Prefix | DCP | Attention Types | Compute Cap. |
-| ------- | ------ | --------- | ----------- | ---------- | ---- | ------ | --------- | --- | --------------- | ------------ |
-| `CUTLASS_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x |
-| `FLASHINFER_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x |
-| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
-| `FLASHMLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x |
-| `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 512, 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x |
-| `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x |
-| `ROCM_AITER_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %1 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
-| `ROCM_AITER_MLA_SPARSE` | fp16, bf16 | `auto`, `float16`, `bfloat16` | 1 | Any | ❌ | ✅ | ❌ | ❌ | Decoder | N/A |
-| `ROCM_AITER_TRITON_MLA` | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
-| `TRITON_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | Any |
-| `XPU_MLA_SPARSE` | fp16, bf16 | `auto`, `float16`, `bfloat16` | Any | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | Any |
+MLA decode backends are selected using the standard
+`-ac.backend=<BACKEND>` argument (e.g., `FLASHMLA`, `TRITON_MLA`).
+
+| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | Non-Causal | Sparse | MM Prefix | DCP | Attention Types | Compute Cap. |
+| ------- | ------ | --------- | ----------- | ---------- | ---- | ---------- | ------ | --------- | --- | --------------- | ------------ |
+| `CUTLASS_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x |
+| `FLASHINFER_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x |
+| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | 576 | ❌ | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
+| `FLASHMLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x |
+| `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 512, 576 | ❌ | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x |
+| `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x |
+| `ROCM_AITER_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %1 | Any | ❌ | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
+| `ROCM_AITER_MLA_SPARSE` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 1, 64 | Any | ❌ | ❌ | ✅ | ❌ | ❌ | Decoder | N/A |
+| `ROCM_AITER_TRITON_MLA` | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
+| `TRITON_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | %16 | Any | ❌ | ❌ | ❌ | ❌ | ✅ | Decoder | Any |
+| `XPU_MLA_SPARSE` | fp16, bf16 | `auto`, `float16`, `bfloat16` | Any | 576 | ❌ | ❌ | ✅ | ❌ | ❌ | Decoder | Any |
diff --git a/docs/design/cuda_graphs_multimodal.md b/docs/design/cuda_graphs_multimodal.md
index e32010232ef0..f44ef359df38 100644
--- a/docs/design/cuda_graphs_multimodal.md
+++ b/docs/design/cuda_graphs_multimodal.md
@@ -86,9 +86,11 @@ Models opt-in to encoder CUDA Graphs by implementing the [SupportsEncoderCudaGra
 | Architecture | Models | CG for Image | CG for Video |
 | ------------ | ------ | ------------ | ------------ |
 | `Qwen3VLForConditionalGeneration` | `Qwen3-VL` | ✅︎ | ✅︎ |
+| `Qwen2_5_VLForConditionalGeneration` | `Qwen2.5-VL` | ✅︎ | ✅︎ |
 
 !!! note
     Encoder CUDA Graphs have currently been tested with `--mm-encoder-attn-backend=FLASH_ATTN` and `--mm-encoder-attn-backend=FLASHINFER` on Blackwell GPUs.
+    For Qwen2.5-VL only FA2 and FA3 has been tested.
 
 ## Configuration
 
diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md
index fbee9f4c3e3e..7edda6fa6476 100644
--- a/docs/design/debug_vllm_compile.md
+++ b/docs/design/debug_vllm_compile.md
@@ -5,12 +5,14 @@ TL;DR:
 - use tlparse to acquire torch.compile logs. Include these logs in bug reports and/or support asks.
 - The vLLM-torch.compile integration is multiple pieces. vLLM exposes flags to turn off each piece:
 
-| Online Flag | Offline Flag | Result |
-| ----------- | ------------ | ------ |
-| --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
-| -cc.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
-| -cc.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
-| -cc.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor |
+| Online Flag                    | Offline Flag                                                                   | Result                                               |
+|--------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------|
+| --enforce-eager                | enforce_eager=True                                                             | Turn off torch.compile and CUDAGraphs                |
+| -cc.mode=0                     | compilation_config=CompilationConfig(mode=CompilationMode.NONE)                | Turn off torch.compile only                          |
+| -cc.mode=1                     | compilation_config=CompilationConfig(mode=CompilationMode.STOCK_TORCH_COMPILE) | Turn off vLLM-compile modifications to torch.compile |
+| -cc.cudagraph_mode=NONE        | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE)        | Turn off CUDAGraphs only                             |
+| -cc.backend=eager              | compilation_config=CompilationConfig(backend='eager')                          | Turn off TorchInductor                               |
+| -cc.ir_enable_torch_wrap=False | compilation_config=CompilationConfig(ir_enable_torch_wrap=False)               | Turn off vLLM IR wrapping                            |
 
 ## vLLM-torch.compile overview
 
@@ -22,7 +24,7 @@ Most notably, vLLM-compile is NOT torch.compile, it is a custom compiler built u
 
 - Given a model, we do a full graph capture via TorchDynamo that is dynamic on the batch size (number of tokens)
 - vLLM then optionally splits and/or specializes this graph and then uses TorchInductor to compile each graph into a compiled artifact.
-This step may use vLLM custom Inductor passes to further optimize the graph.
+This step may use vLLM custom Inductor passes to further optimize the graph. This includes vLLM IR lowering to remove dispatch overhead.
 - The compiled artifact is saved to vLLM's compile cache so that it can be loaded in the future.
 - vLLM applies CUDAGraphs to reduce CPU overheads.
 
@@ -34,6 +36,7 @@ For more details on the design, please see the following resources:
 
 - [Introduction to vLLM-torch.compile blogpost](https://blog.vllm.ai/2025/08/20/torch-compile.html)
 - [vLLM-torch.compile integration design](./torch_compile.md)
+- [vLLM IR design](./vllm_ir.md)
 - [vLLM Office Hours #26](https://www.youtube.com/live/xLyxc7hxCJc?si=Xulo9pe53C6ywf0V&t=561)
 - [Talk at PyTorch Conference 2025](https://youtu.be/1wV1ESbGrVQ?si=s1GqymUfwiwOrDTg&t=725)
 
@@ -117,6 +120,21 @@ from vllm.config.compilation import CompilationConfig, CUDAGraphMode
 LLM(model, compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE))
 ```
 
+vLLM IR makes heavy use of the compilation pipeline, from functionalization, custom fusions, and lowering.
+To turn that off and capture eager-mode dispatching behavior of vLLM IR, run with `ir_enable_torch_wrap=False`.
+IR torch wrap is only enabled by default when using `mode=VLLM_COMPILE` and `backend="inductor"` (default).
+
+```sh
+# Online
+vllm serve -cc.ir_enable_torch_wrap=False
+```
+
+```py
+# Offline
+from vllm.config.compilation import CompilationConfig
+LLM(model, compilation_config=CompilationConfig(ir_enable_torch_wrap=False))
+```
+
 ## Debugging TorchDynamo
 
 vLLM requires model code be capturable into a full graph via TorchDynamo (torch.compile's frontend).
diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 4e3706645ef2..54b796fde3bf 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -36,7 +36,7 @@ th {
 | deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ht.DeepEPHTPrepareAndFinalize] |
 | deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll.DeepEPLLPrepareAndFinalize] |
 | flashinfer_nvlink_two_sided | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferNVLinkTwoSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_two_sided.FlashInferNVLinkTwoSidedPrepareAndFinalize] |
-| flashinfer_nvlink_one_sided | standard | nvfp4 | G,A,T | N | N | [`FlashInferNVLinkOneSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_one_sided.FlashInferNVLinkOneSidedPrepareAndFinalize] |
+| flashinfer_nvlink_one_sided | standard | nvfp4,bf16,mxfp8 | G,A,T | N | N | [`FlashInferNVLinkOneSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_one_sided.FlashInferNVLinkOneSidedPrepareAndFinalize] |
 
 !!! info "Table key"
     1. All types: mxfp4, nvfp4, int4, int8, fp8
diff --git a/docs/design/p2p_nccl_connector.md b/docs/design/p2p_nccl_connector.md
index 4674bef8d2b6..c1de955b6ffe 100644
--- a/docs/design/p2p_nccl_connector.md
+++ b/docs/design/p2p_nccl_connector.md
@@ -88,7 +88,7 @@ pip install "vllm>=0.9.2"
 #### Proxy (e.g. 10.0.1.1)
 
 ```shell
-cd {your vllm directory}/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/
+cd {your vllm directory}/examples/disaggregated/p2p_nccl_xpyd/
 python3 disagg_proxy_p2p_nccl_xpyd.py &
 ```
 
@@ -181,7 +181,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 #### Proxy (e.g. 10.0.1.1)
 
 ```shell
-cd {your vllm directory}/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/
+cd {your vllm directory}/examples/disaggregated/p2p_nccl_xpyd/
 python3 disagg_proxy_p2p_nccl_xpyd.py &
 ```
 
diff --git a/docs/design/vllm_ir.md b/docs/design/vllm_ir.md
new file mode 100644
index 000000000000..82628f3762fe
--- /dev/null
+++ b/docs/design/vllm_ir.md
@@ -0,0 +1,615 @@
+# vLLM IR: Functional Intermediate Representation
+
+## Motivation
+
+vLLM IR is a **functional intermediate representation (IR)** that fills the gap between
+low-level `torch` ops and vLLM layers like `RMSNorm` and quantization operators,
+By separating operator **semantics** from the **implementation** and **dispatching**,
+vLLM IR simplifies both compilation and kernel registration & dispatching simultaneously.
+It operates as a **dialect** in the torch FX representation, allowing full interoperability
+with “regular” torch ops & custom torch ops/kernels, as well as a piecewise migration from
+the previous `CustomOp` approach.
+
+Key design principles:
+
+- **Eager-compile consistency**: identical behavior (barring minor numerics) in eager and compiled modes
+- **Simple, transparent, yet powerful kernel selection**: good visibility and control allowing easy debugging
+- **Convention over configuration**: near-zero boilerplate required to register ops and implementations
+- **Extensibility**: ops and implementations can be registered anywhere, in-tree or out-of-tree
+- **Interoperability**: fully compatible with “regular” torch ops & custom torch ops/kernels,
+reducing developer friction and allowing piecewise migration
+
+The clean semantics/implementation separation enables a unified and extensible dispatching mechanism,
+allowing multiple kernels per-platform and powerful kernel selection. The separation also facilitates
+cleaner testing and benchmarking, removing much of the boilerplate standard for legacy approaches.
+
+By delaying kernel selection until late in the compilation process, the compiler can operate on
+a higher-level representation, which has the following main benefits:
+
+- Pattern matching in fusion/transformation passes only requires a single, simple pattern per op
+- OOT compiler backends can lower from the higher-level representation (in-progress)
+- The compiler can autotune over available implementations (future feature)
+
+## Quick Overview
+
+### Declaring an IR Operation
+
+IR operations are declared using the `@register_op` decorator with a native PyTorch implementation that defines the op's semantics:
+
+```python
+# vllm/ir/ops/layernorm.py
+from torch import Tensor
+from vllm.ir import register_op
+
+@register_op
+def rms_norm(x: Tensor, weight: Tensor | None, epsilon: float, variance_size: int | None = None) -> Tensor:
+    """Weighted root-mean-square layer normalization"""
+    orig_dtype = x.dtype
+    x = x.to(torch.float32)
+    x_var = x if variance_size is None else x[..., :variance_size]
+    variance = x_var.pow(2).mean(dim=-1, keepdim=True)
+    x = x * torch.rsqrt(variance + epsilon)
+    x = x.to(orig_dtype)
+    if weight is not None:
+        x = x * weight
+    return x
+```
+
+The native implementation serves three purposes:
+
+1. **Semantic definition**: Specifies the exact semantics of the operation, including shapes and strides
+2. **Default implementation**: Used when no other (better) implementation is available
+3. **Reference for testing**: Other implementations must match these semantics
+
+### Registering Implementations
+
+Kernel implementations are registered using the `register_impl` decorator on the IR op object:
+
+```python
+# vllm/kernels/vllm_c.py
+from vllm import ir
+
+rms_norm_no_var = lambda x, weight, epsilon, variance_size=None: variance_size is None
+
+@ir.ops.rms_norm.register_impl("vllm_c", supports_args=rms_norm_no_var, supported=current_platform.is_cuda_alike())
+def rms_norm(x: Tensor, weight: Tensor | None, epsilon: float, variance_size: int | None = None) -> Tensor:
+    output = torch.empty_like(x)
+    torch.ops._C.rms_norm(output, x, weight, epsilon)
+    return output
+```
+
+Implementations can specify:
+
+- `supported`: Static boolean indicating if this implementation is available
+- `supports_args`: Function checking if the implementation supports specific arguments
+- `inplace`: Whether this implementation reuses input memory for outputs
+
+### Using IR Operations in Models
+
+IR operations are imported and called directly in model code:
+
+```python
+# vllm/model_executor/layers/layernorm.py
+from vllm import ir
+
+class RMSNorm(nn.Module):
+    def __init__(self, hidden_size: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, x: Tensor, residual: Tensor | None = None):
+        if residual is None:
+            return ir.ops.rms_norm(x, self.weight, self.variance_epsilon)
+
+        # Use maybe_inplace overload to allow implementation to reuse input memory for outputs
+        # (using x or residual after this call is undefined behavior)
+        return ir.ops.fused_add_rms_norm.maybe_inplace(
+            x, residual, self.weight, self.variance_epsilon
+        )
+```
+
+### Configuring Kernel Selection
+
+Kernel selection is controlled via priority lists in the configuration.
+Priority lists specify the order in which implementations are considered,
+with the first supported implementation being selected.
+This includes the static support check (`supported=...`) and
+the dynamic arg support check (`supports_args=...`).
+
+#### Command Line Configuration
+
+Use `--ir-op-priority.<op_name>=<provider1>,<provider2>,...`:
+
+```bash
+# CUDA: Use vllm_c implementation for rms_norm
+vllm serve meta-llama/Llama-3.2-1B \
+  --ir-op-priority.rms_norm=vllm_c
+
+# ROCm: Try aiter first, fall back to vllm_c, then native
+vllm serve meta-llama/Llama-3.2-1B \
+  --ir-op-priority.rms_norm=aiter,vllm_c,native
+
+# Configure multiple operations
+vllm serve meta-llama/Llama-3.2-1B \
+  --ir-op-priority.rms_norm=vllm_c \
+  --ir-op-priority.fused_add_rms_norm=vllm_c
+```
+
+#### Python Configuration
+
+```python
+from vllm import LLM
+from vllm.config import VllmConfig, KernelConfig
+
+llm = LLM(
+    model="meta-llama/Llama-3.2-1B",
+    vllm_config=VllmConfig(
+        kernel_config=KernelConfig(
+            ir_op_priority={
+                "rms_norm": ["vllm_c", "native"],
+                "fused_add_rms_norm": ["vllm_c", "native"],
+            }
+        )
+    )
+)
+```
+
+#### Platform Defaults
+
+Each platform provides default priority lists that are automatically applied:
+
+```python
+# CUDA/XPU/ROCm platform defaults (when compiling with Inductor)
+{
+  "rms_norm": ["native"],  # Native torch is default
+  "fused_add_rms_norm": ["native"],
+}
+
+# CUDA platform defaults (eager or Dynamo-only)
+{
+  "rms_norm": ["vllm_c", "native"],
+  "fused_add_rms_norm": ["vllm_c", "native"],
+}
+
+# ROCm platform defaults (future - currently same as CUDA)
+{
+    "rms_norm": ["aiter", "vllm_c", "native"],
+    "fused_add_rms_norm": ["aiter", "vllm_c", "native"],
+}
+
+# XPU platform defaults (eager or Dynamo-only)
+{
+    "rms_norm": ["xpu_kernels", "native"],
+    "fused_add_rms_norm": ["xpu_kernels", "native"],
+}
+```
+
+User-specified priorities are prepended to platform defaults,
+so you only need to specify the out-of-order implementations,
+other implementations are appended automatically.
+
+## Compilation Pipeline
+
+vLLM IR heavily customizes the `torch.compile`-based compilation process to allow custom compile
+passes to operate on high-level IR while still producing efficient low-level code at the end.
+The compilation pipeline consists of several stages:
+
+### 1. Dynamo Tracing
+
+When `torch.compile` traces the model's forward pass, vLLM IR operations appear as custom operations
+in the `vllm_ir` torch library. These operations are opaque to Dynamo, meaning they appear directly
+in the FX graph without decomposition:
+
+```python
+# Python code (epsilon=1e-5)
+x1 = ir.ops.rms_norm(x, weight, epsilon)
+x2, residual_out = ir.ops.fused_add_rms_norm.maybe_inplace(x1, residual, weight, epsilon)
+
+# FX graph after Dynamo tracing
+x1 = torch.ops.vllm_ir.rms_norm.default(x, weight, 1e-5); x = None
+out = torch.ops.vllm_ir.fused_add_rms_norm.maybe_inplace(x1, residual, weight, 1e-5); x1 = residual = None
+x2 = out[0]
+residual_out = out[1]
+```
+
+### 2. AOTAutograd and Functionalization
+
+AOTAutograd functionalizes the graph, converting any mutating operations to functional equivalents.
+For vLLM IR operations with `maybe_inplace` overloads, we perform this manually before AOTAutograd,
+converting them to the functional `default` overload using the pre-grad custom pass hook.
+
+```python
+# After functionalization
+x1 = torch.ops.vllm_ir.rms_norm.default(x, weight, 1e-5); x = None
+out = torch.ops.vllm_ir.fused_add_rms_norm.default(x1, residual, weight, 1e-5); x1 = residual = None
+x2 = out[0]
+residual_out = out[1]
+```
+
+The pass also tracks which inputs were "donated" (passed to `maybe_inplace`),
+storing this information in vLLM's `PassContext` for later use in clone elimination.
+
+### 3. IR Fusion and Transformation Passes
+
+After functionalization, custom vLLM passes operate on the functional FX graph containing high-level IR operations.
+These passes can perform fusion, distribute operations for sequence parallelism, and other transformations:
+
+```python
+# Example: Sequence Parallelism (see SequenceParallelismPass)
+# Before SP pass
+
+all_reduce = torch.ops.vllm.all_reduce(x, "tp:0")
+rms_norm = torch.ops.vllm_ir.rms_norm(all_reduce, weight, 1e-5)
+
+# after SP pass
+reduce_scatter = torch.ops.vllm.reduce_scatter(x, "tp:0")
+rms_norm = torch.ops.vllm_ir.rms_norm(all_reduce, weight, 1e-5)
+all_gather = torch.ops.vllm.all_gather(x, "tp:0")
+```
+
+Fusion passes benefit from the high-level representation: they don't need to match against low-level PyTorch operations,
+handle different kernel implementations separately, or deal with functionalization of custom kernels.
+
+### 4. IR Lowering
+
+The lowering pass (`VllmIRLoweringPass`) replaces each vLLM IR operation with its selected implementation.
+The implementation is chosen based on the priority list and support predicates,
+using the **fake tensors** in the graph's metadata in place of op arguments:
+
+```python
+# Implementation selection, same in eager dispatch and compile lowering
+def dispatch(*args) -> IrOpImpl:
+  for provider in priority_list:  # e.g., ["vllm_c", "native"]
+    impl = ir_op.impls[provider]
+    if not impl.supported:
+      continue
+    if impl.supports_args and not impl.supports_args(*args):
+      continue
+    return impl
+
+# make_fx uses torch.fx.symbolic_trace
+impl_graph = make_fx(selected_impl.impl_fn)
+# Replace IR op node with impl_graph's nodes
+match.replace_by_example(selected_impl.impl_fn, node.args)
+```
+
+For example, lowering `rms_norm` with the `vllm_c` implementation:
+
+```python
+# Before lowering (IR op)
+rms_norm = torch.ops.vllm_ir.rms_norm.default(x, weight, 1e-5)
+
+# After lowering (vllm_c implementation traced)
+# Note: Lowering does not currently functionalize, this will likely change in the future.
+empty =  torch.ops.aten.empty.memory_format(x.shape, ...)
+rms_norm = torch.ops._C.rms_norm(empty, x, weight, 1e-5)
+```
+
+When lowering an implementation that mutates inputs (`inplace=True`),
+the lowering pass inserts clones to preserve functional semantics:
+
+```python
+# vllm_c implementation for fused_add_rms_norm mutates its first two arguments
+# Lowered with clones for safety
+clone_default = torch.ops.aten.clone.default(x)
+clone_default_1 = torch.ops.aten.clone.default(residual)
+fused_add_rms_norm = torch.ops._C.fused_add_rms_norm.default(clone_default, clone_default_1, weight, 1e-5)
+```
+
+### 5. Clone Cleanup
+
+After lowering, the clone elimination pass (`UnsafeCloneEliminationPass`) removes unnecessary clones introduced during lowering.
+This pass is essential for achieving zero-copy behavior when using in-place kernels with `maybe_inplace`.
+The pass removes a clone if:
+
+- the cloned input is created in the graph and not used again in the graph
+- the cloned input is a graph parameter, marked as donated
+
+```python
+# After cleanup (donated inputs, no subsequent uses)
+fused_add_rms_norm = torch.ops._C.fused_add_rms_norm.default(x, residual, weight, 1e-5)
+```
+
+The combination of inplace functionalization (tracking donated inputs) and clone cleanup enables the compiler to safely
+use in-place kernels without adding redundant copies or increasing the memory usage.
+
+### 6. Inductor Optimization and Codegen
+
+After IR lowering and cleanup, the graph contains only standard PyTorch operations and platform-specific custom ops.
+Inductor then performs its standard codegen:
+
+- **Inductor lowering and pointwise fusion**: Fusing element-wise operations, reductions, etc.
+- **Memory planning**: Determining buffer allocation and reuse
+- **Kernel generation**: Generating Triton or C++ code for fused operations
+- **Autotuning**: Selecting the best kernel configurations
+
+### Pipeline Summary
+
+```text
+Model Forward Pass
+    ↓
+[Dynamo Tracing] → FX Graph with vllm_ir.* ops
+    ↓
+[Pre-grad: Inplace Functionalization] → maybe_inplace → default, track donated inputs
+    ↓
+[AOTAutograd] → Functionalization
+    ↓
+[Post-grad: IR Fusion Passes] → Fuse high-level IR ops (e.g., rms_norm + quant)
+    ↓
+[Post-grad: IR Lowering] → vllm_ir.* ops → impl ops (with clones if needed)
+    ↓
+[Post-grad: Clone Cleanup] → Remove unnecessary clones using donated input info
+    ↓
+[Inductor] → Pattern matching, fusion, memory planning, codegen
+    ↓
+Compiled Code
+```
+
+## Core vLLM IR Concepts
+
+### Operation Declaration
+
+Operations are declared with the `@register_op` decorator, which creates an `IrOp` object:
+
+```python
+@register_op(
+    name=None,           # Operation name (defaults to function name)
+    activations=None,    # List of activation parameters (defaults to params starting with 'x')
+    allow_inplace=False, # Whether to create a maybe_inplace overload
+)
+def op_name(...):
+    ...
+```
+
+**Parameters:**
+
+- `activations`: List of parameter names considered "activations" (typically consumed by `maybe_inplace`). Defaults to parameters starting with `x`.
+- `allow_inplace`: Creates a `maybe_inplace` overload for memory-efficient execution (see below).
+
+### The `maybe_inplace` Overload
+
+The `maybe_inplace` overload is a critical feature for memory efficiency in LLM inference.
+It signals that the caller doesn't need to preserve the activation inputs after the operation,
+allowing in-place implementations to reuse input memory for outputs.
+
+#### Semantics and Usage
+
+```python
+# Standard usage: inputs are preserved
+out, res_out = ir.ops.fused_add_rms_norm(x, residual, weight, epsilon)
+# x and residual are unchanged, out and res_out are new tensors
+
+# maybe_inplace: inputs may be modified
+out, res_out = ir.ops.fused_add_rms_norm.maybe_inplace(x, residual, weight, epsilon)
+# x and residual may be modified (undefined behavior to use them after this)
+# out and res_out may alias x and residual
+```
+
+Using an activation input after passing it to `maybe_inplace` is **undefined behavior**:
+
+```python
+# WRONG: Using x after donating it
+out, res_out = ir.ops.fused_add_rms_norm.maybe_inplace(x, residual, weight, epsilon)
+result = out + x  # ERROR: x was donated!
+```
+
+If you need to preserve an input, either use the default overload or clone manually:
+
+```python
+# Option 1: Use default overload
+out, res_out = ir.ops.fused_add_rms_norm(x, residual, weight, epsilon)
+result = out + x  # OK: x is preserved
+
+# Option 2: Clone before maybe_inplace
+out, res_out = ir.ops.fused_add_rms_norm.maybe_inplace(x.clone(), residual, weight, epsilon)
+result = out + x  # OK: x is preserved, clone was donated
+```
+
+#### Compilation Behavior
+
+During compilation, the inplace functionalization pass validates that donated inputs are
+not used again and converts `maybe_inplace` to the functional `default` overload:
+
+```python
+# Inplace functionalization pass (pre-grad)
+for node in graph.nodes:
+    if node.target == torch.ops.vllm_ir.fused_add_rms_norm.maybe_inplace:
+        # Check that activation inputs aren't used after this node
+        for activation_arg in activation_inputs:
+            for user in activation_arg.users:
+                if user appears after node:
+                    raise ValueError(f"Input {activation_arg} donated but used again")
+
+        # Convert to default overload
+        node.target = torch.ops.vllm_ir.fused_add_rms_norm.default
+
+        # Track donated graph inputs for later clone elimination
+        for i, arg in enumerate(node.args):
+            if arg.op == "placeholder" and i in activation_indices:
+                pass_context.donated_input_ids.add(node_to_idx[arg])
+```
+
+The donated input information is then used by the clone cleanup pass to eliminate
+unnecessary copies when in-place kernels are lowered.
+
+#### Eager Mode Behavior
+
+In eager mode (without `torch.compile`), `maybe_inplace` enables **maximally memory-efficient**
+execution by allowing the IR operation to dispatch directly to in-place implementations:
+
+```python
+# Eager dispatch logic for maybe_inplace
+impl: IrOpImpl = ir_op.dispatch(*args)
+return impl.impl_fn(*args)
+
+# Eager dispatch logic for default:
+impl: IrOpImpl = ir_op.dispatch(*args)
+if impl.inplace:
+  args = [
+    arg.clone() if i in ir_op.activations else arg
+    for i, arg in enumerate(args)
+  ]
+return impl.impl_fn(*args)
+```
+
+The combination of `maybe_inplace` in model code and in-place kernel implementations provides optimal memory efficiency
+in both eager and compiled modes, with identical semantics in both cases.
+
+#### Memory Savings Example
+
+Consider a transformer layer with residual connections:
+
+```python
+# Without maybe_inplace (2 allocations per layer)
+hidden_states = self.attention(input)
+normed, residual = ir.ops.fused_add_rms_norm(hidden_states, input, weight, eps)
+# Memory: input (preserved), hidden_states (preserved), normed (new), residual (new)
+
+# With maybe_inplace (0 allocations per layer when using in-place kernel)
+hidden_states = self.attention(input)
+normed, residual = ir.ops.fused_add_rms_norm.maybe_inplace(hidden_states, input, weight, eps)
+# Memory: normed (reuses hidden_states), residual (reuses input)
+```
+
+### Implementation Registration
+
+Implementations are registered using the `register_impl` method:
+
+```python
+@ir.ops.op_name.register_impl(
+    provider="provider_name",  # Unique identifier (e.g., "vllm_c", "aiter", "triton")
+    supported=True,            # Static availability check
+    supports_args=None,        # Dynamic argument support check
+)
+def impl_fn(...):
+    ...
+```
+
+**Provider naming conventions:**
+
+- `native`: Reserved for the native torch implementation (declared with `@register_op`)
+- `vllm_c`: C++/CUDA kernels via `torch.ops._C`
+- `aiter`: AMD AITER library
+- `xpu_kernels`: SYCL/SYCLTLA kernels implemented in `vllm-xpu-kernels`
+- `triton_*`: Triton kernels
+- Platform/library names for other implementations
+
+**Support checking:**
+
+- `supported`: Static boolean, checked once at import time (e.g., `HAS_TRITON`, `is_cuda_alike()`)
+- `supports_args`: Function `(*args, **kwargs) -> bool` checking argument compatibility
+    - Called with **fake tensors** during compilation for zero-cost checking
+    - Called with **real tensors** during eager mode dispatch
+    - Should NOT check batch sizes or add guards based on values
+
+Example support predicate:
+
+```python
+def aiter_rms_norm_supports(x, weight, epsilon, variance_size=None):
+    # Check dtype (OK: doesn't depend on batch size)
+    if x.dtype not in [torch.float16, torch.bfloat16]:
+        return False
+    # Check optional parameter (OK: static check)
+    if variance_size is not None:
+        return False
+    return True
+
+@ir.ops.rms_norm.register_impl("aiter", supports_args=aiter_rms_norm_supports)
+def rms_norm(...):
+    ...
+```
+
+Batch-invariant kernels are automatically selected when `VLLM_BATCH_INVARIANT=1` is set.
+
+### Eager Mode vs Compile Mode
+
+vLLM IR operations behave identically in eager and compile modes:
+
+**Eager mode:**
+
+- Direct dispatch to implementation based on priority list
+- Support checked with real tensor arguments
+- Minimal overhead (can be optimized further if needed)
+
+**Compile mode:**
+
+- IR ops appear in FX graph as `torch.ops.vllm_ir.*` custom ops
+- Lowering selects implementation using fake tensors
+- Full integration with Inductor optimizations
+
+This consistency enables:
+
+- Prototyping in eager mode with confidence
+- Debugging by disabling compilation
+- Gradual migration from eager to compiled execution
+
+## Other Topics
+
+### Out-of-Tree Implementations
+
+External platforms can register implementations without modifying vLLM:
+
+```python
+# In external package
+from vllm import ir
+
+@ir.ops.rms_norm.register_impl("my_platform", supported=is_my_platform())
+def rms_norm(x, weight, epsilon, variance_size=None):
+    return my_platform.rms_norm(x, weight, epsilon)
+```
+
+Then configure priority to use your implementation:
+
+```python
+class MyPlatform(Platform):
+  def get_default_ir_op_priority(self):
+    return IrOpPriorityConfig(rms_norm=['my_platform', 'native'])
+
+# Users can still override priority in the same way
+llm = LLM(ir_op_priority=IrOpPriorityConfig(rms_norm=['custom_oot_kernel']))
+```
+
+### Debugging and Observability
+
+!!! note
+    Please let us know how observability can be improved for your use-case!
+
+Enable debug logging to see kernel selection:
+
+```bash
+VLLM_LOGGING_LEVEL=DEBUG vllm serve ...
+```
+
+This logs:
+
+- Which implementations are selected for each operation
+- Why implementations were rejected (unsupported, args not supported)
+- Compilation cache hits/misses
+- IR lowering statistics
+
+Check selected implementations in compiled graphs:
+
+```python
+# After compilation, inspect the lowering pass
+lowering_pass = backend.lowering_pass
+print(lowering_pass.selected_impls)
+# Output: {'rms_norm': {'node_123': 'vllm_c', 'node_456': 'vllm_c'}}
+```
+
+## Migration from CustomOp
+
+vLLM IR is designed to coexist with and gradually replace `CustomOp`:
+
+1. **Op declaration**: Convert `CustomOp` class `PluggableLayer` and move `forward_native` to `@register_op` function
+2. **Implementation registration**: Use `@ir.ops.op_name.register_impl` instead of overriding methods
+3. **Layer usage**: Replace `self.op(...)` with `ir.ops.op_name(...)`
+4. **Configuration**: Migrate `--compilation-config.custom-ops` to `--ir-op-priority`
+
+The migration can be done incrementally, one operation at a time.
+
+## See Also
+
+- [torch.compile Integration](torch_compile.md) - General compilation infrastructure
+- [Fusions](fusions.md) - Custom fusion and transformation passes in vLLM
+- [Custom Operations](custom_op.md) - Legacy custom op system
diff --git a/docs/features/README.md b/docs/features/README.md
index e62d9cddee76..28362f401477 100644
--- a/docs/features/README.md
+++ b/docs/features/README.md
@@ -52,10 +52,10 @@ th:not(:first-child) {
 | [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/pull/4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
 | best-of | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ✅ | ✅ | | |
 | beam-search | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
-| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
+| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ✅ | ❔ | ❔ | ✅ |
 
 \* Chunked prefill and prefix caching are only applicable to last-token or all pooling with causal attention.  
-<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
+<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.  
 
 ### Feature x Hardware
 
diff --git a/docs/features/automatic_prefix_caching.md b/docs/features/automatic_prefix_caching.md
index 3718a4b74eb2..fe7977ee23d0 100644
--- a/docs/features/automatic_prefix_caching.md
+++ b/docs/features/automatic_prefix_caching.md
@@ -11,7 +11,7 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries,
 
 Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example:
 
-[examples/offline_inference/automatic_prefix_caching.py](../../examples/offline_inference/automatic_prefix_caching.py)
+[examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py](../../examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py)
 
 ## Example workloads
 
diff --git a/docs/features/batch_invariance.md b/docs/features/batch_invariance.md
index 804cd905e3b1..b23631484508 100644
--- a/docs/features/batch_invariance.md
+++ b/docs/features/batch_invariance.md
@@ -105,7 +105,7 @@ Batch invariance has been tested and verified on the following models:
 
 - **DeepSeek series**: `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-V3-0324`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`
 - **Qwen3 (Dense)**: `Qwen/Qwen3-1.7B`, `Qwen/Qwen3-8B`, `Qwen/Qwen3-4B-AWQ`, `Qwen/Qwen3-8B-AWQ`
-- **Qwen3 (MoE)**: `Qwen/Qwen3-30B-A3B`, `Qwen/Qwen3-Next-80B-A3B-Instruct`
+- **Qwen3 (MoE)**: `Qwen/Qwen3-30B-A3B`, `Qwen/Qwen3-Next-80B-A3B-Instruct`, `Qwen/Qwen3-30B-A3B-Thinking-2507-FP8`
 - **Qwen2.5**: `Qwen/Qwen2.5-0.5B-Instruct`, `Qwen/Qwen2.5-1.5B-Instruct`, `Qwen/Qwen2.5-3B-Instruct`, `Qwen/Qwen2.5-7B-Instruct`, `Qwen/Qwen2.5-14B-Instruct`, `Qwen/Qwen2.5-32B-Instruct`
 - **Llama 3**: `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`
 - **GPT-OSS**: `openai/gpt-oss-20b`, `openai/gpt-oss-120b`
diff --git a/docs/features/context_extension.md b/docs/features/context_extension.md
index f622191aebc6..f96340c3183f 100644
--- a/docs/features/context_extension.md
+++ b/docs/features/context_extension.md
@@ -6,12 +6,12 @@ This directory contains examples for extending the context length of models usin
 
 ## Offline Inference Example
 
-The [`context_extension.py`](../../examples/offline_inference/context_extension) script demonstrates how to extend the context length of a Qwen model using the YARN method (rope_parameters) and run a simple chat example.
+The [`context_extension.py`](../../examples/features/context_extension/context_extension_offline.py) script demonstrates how to extend the context length of a Qwen model using the YARN method (rope_parameters) and run a simple chat example.
 
 ### Usage
 
 ```bash
-python examples/offline_inference/context_extension.py
+python examples/features/context_extension/context_extension_offline.py
 ```
 
 ## OpenAI Online Method
diff --git a/docs/features/disagg_encoder.md b/docs/features/disagg_encoder.md
index af6da94aa622..c27d6b277284 100644
--- a/docs/features/disagg_encoder.md
+++ b/docs/features/disagg_encoder.md
@@ -36,10 +36,10 @@ The current reference pathway is **ExampleConnector**.
 Below ready-to-run scripts shows the workflow:
 
 1 Encoder instance + 1 PD instance:
-`examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh`
+`examples/disaggregated/disaggregated_encoder/disagg_1e1pd_example.sh`
 
 1 Encoder instance + 1 Prefill instance + 1 Decode instance:
-`examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh`
+`examples/disaggregated/disaggregated_encoder/disagg_1e1p1d_example.sh`
 
 ---
 
diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md
index 5167d612391d..9ad005be3fed 100644
--- a/docs/features/disagg_prefill.md
+++ b/docs/features/disagg_prefill.md
@@ -17,15 +17,15 @@ Two main reasons:
 
 ## Usage example
 
-Please refer to [examples/online_serving/disaggregated_prefill.sh](../../examples/online_serving/disaggregated_prefill.sh) for the example usage of disaggregated prefilling.
+Please refer to [examples/disaggregated/disaggregated_prefill.sh](../../examples/disaggregated/disaggregated_prefill.sh) for the example usage of disaggregated prefilling.
 
 Now supports 6 types of connectors:
 
-- **ExampleConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of ExampleConnector disaggregated prefilling.
-- **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
+- **ExampleConnector**: refer to [examples/disaggregated/example_connector/run.sh](../../examples/disaggregated/example_connector/run.sh) for the example usage of ExampleConnector disaggregated prefilling.
+- **LMCacheConnectorV1**: refer to [examples/disaggregated/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/disaggregated/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
 - **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. For detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md). For feature compatibility details, see [NixlConnector Compatibility Matrix](nixl_connector_compatibility.md).
-- **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
-- **MooncakeConnector**: refer to [examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh](../../examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh) for the example usage of ExampleConnector disaggregated prefilling. For detailed usage guide, see [MooncakeConnector Usage Guide](mooncake_connector_usage.md).
+- **P2pNcclConnector**: refer to [examples/disaggregated/p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/disaggregated/p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
+- **MooncakeConnector**: refer to [examples/disaggregated/mooncake_connector/run_mooncake_connector.sh](../../examples/disaggregated/mooncake_connector/run_mooncake_connector.sh) for the example usage of MooncakeConnector disaggregated prefilling. For detailed usage guide, see [MooncakeConnector Usage Guide](mooncake_connector_usage.md).
 - **MultiConnector**: take advantage of the kv_connector_extra_config: dict[str, Any] already present in KVTransferConfig to stash all the connectors we want in an ordered list of kwargs.such as:
 
   ```bash
@@ -44,7 +44,7 @@ For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
   --kv-transfer-config '{"kv_connector":"OffloadingConnector","kv_role":"kv_both","kv_connector_extra_config":{"block_size": 64, "cpu_bytes_to_use": 1000000000}}'
   ```
 
-- **FlexKVConnectorV1**: refer to [examples/offline_inference/prefix_caching_flexkv.py](../../examples/offline_inference/prefix_caching_flexkv.py) for the example usage of FlexKVConnectorV1. FlexKV is a distributed KV Store and multi-level cache management system for ultra-large-scale LLM inference.
+- **FlexKVConnectorV1**: refer to [examples/disaggregated/flexkv_connector/prefix_caching_flexkv.py](../../examples/disaggregated/flexkv_connector/prefix_caching_flexkv.py) for the example usage of FlexKVConnectorV1. FlexKV is a distributed KV Store and multi-level cache management system for ultra-large-scale LLM inference.
 
   ```bash
   --kv-transfer-config '{"kv_connector":"FlexKVConnectorV1","kv_role":"kv_both"}'
diff --git a/docs/features/index_cache.md b/docs/features/index_cache.md
new file mode 100644
index 000000000000..7e73d0949554
--- /dev/null
+++ b/docs/features/index_cache.md
@@ -0,0 +1,54 @@
+# IndexCache
+
+IndexCache reduces redundant top-k computation in DeepSeek-V3.2 (DSA) models by caching and reusing top-k indices across layers.
+
+## Background
+
+DeepSeek-V3.2 uses a DeepSeek Sparse Attention (DSA) mechanism where top-k token selection is computed per layer. For deep models with many layers, this computation can be expensive. IndexCache allows skipping redundant top-k computations by reusing indices from previous layers.
+
+See: [IndexCache Paper](https://arxiv.org/abs/2603.12201)
+
+## Usage
+
+### CLI
+
+```bash
+vllm serve deepseek-ai/DeepSeek-V3.2 \
+    --hf-overrides '{"use_index_cache": true, "index_topk_freq": 4}' ...
+```
+
+### Configuration Reference
+
+| Parameter            | Type | Default | Description                                                                                                                                      |
+|----------------------|------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------|
+| `use_index_cache`    | bool | false   | Enable IndexCache. Must be set to true to use this feature                                                                                       |
+| `index_topk_freq`    | int  | 1       | Frequency (in layers) at which top-k is computed. 1 = compute on every layer (disabled), 4 = compute on 1/4 of layers                            |
+| `index_topk_pattern` | str  | null    | Per-layer F/S pattern. Overrides index_topk_freq if set. Each character maps to one DSA layer: F = Full, S = Shared                              |
+
+### Configuration Examples
+
+**Using `index_topk_freq`** (compute every N layers):
+
+```bash
+vllm serve deepseek-ai/DeepSeek-V3.2 \
+    --hf-overrides '{"use_index_cache": true, "index_topk_freq": 4}' ...
+```
+
+**Using `index_topk_pattern`** (explicit per-layer control):
+
+```bash
+# custom pattern for 61 layers: F = compute, S = reuse
+vllm serve deepseek-ai/DeepSeek-V3.2 \
+    --hf-overrides '{"use_index_cache": true, "index_topk_pattern": "FFSFSSSFSSFFFSSSFFFSFSSSSSSFFSFFSFFSSFFFFFFSFFFFFSFFSSSSSSFSF"}'
+```
+
+## How It Works
+
+1. When IndexCache is enabled, layers marked with `"F"` (Full) calculate and store top-k indices
+2. Subsequent layers marked with `"S"` (Shared) receive the cached indices from the previous layer instead of recomputing
+3. The cached indices are passed through the layer stack, reducing total computation
+
+## Requirements
+
+- DeepSeek-V3.2 or compatible DSA model
+- `use_index_cache: true` via `--hf-overrides`
diff --git a/docs/features/lora.md b/docs/features/lora.md
index 2e7b36545d46..d78fdc05792e 100644
--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@@ -47,7 +47,7 @@ the third parameter is the path to the LoRA adapter.
     )
     ```
 
-Check out [examples/offline_inference/multilora_inference.py](../../examples/offline_inference/multilora_inference.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
+Check out [examples/features/lora/multilora_offline.py](../../examples/features/lora/multilora_offline.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
 
 ## Serving LoRA Adapters
 
diff --git a/docs/features/mooncake_connector_usage.md b/docs/features/mooncake_connector_usage.md
index 0e2478924ead..cc8c8ecff258 100644
--- a/docs/features/mooncake_connector_usage.md
+++ b/docs/features/mooncake_connector_usage.md
@@ -31,7 +31,7 @@ vllm serve Qwen/Qwen2.5-7B-Instruct --port 8020 --kv-transfer-config '{"kv_conne
 ### Proxy
 
 ```bash
-python examples/online_serving/disaggregated_serving/mooncake_connector/mooncake_connector_proxy.py --prefill http://192.168.0.2:8010 --decode http://192.168.0.3:8020
+python examples/disaggregated/disaggregated_serving/mooncake_connector/mooncake_connector_proxy.py --prefill http://192.168.0.2:8010 --decode http://192.168.0.3:8020
 ```
 
 Now you can send requests to the proxy server through port 8000.
@@ -65,5 +65,5 @@ Now you can send requests to the proxy server through port 8000.
 
 Refer to these example scripts in the vLLM repository:
 
-- [run_mooncake_connector.sh](../../examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh)
-- [mooncake_connector_proxy.py](../../examples/online_serving/disaggregated_serving/mooncake_connector/mooncake_connector_proxy.py)
+- [run_mooncake_connector.sh](../../examples/disaggregated/mooncake_connector/run_mooncake_connector.sh)
+- [mooncake_connector_proxy.py](../../examples/disaggregated/mooncake_connector/mooncake_connector_proxy.py)
diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index 33796e20e76b..f6d4f3f86d80 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -215,6 +215,67 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
     - This setting only affects RGBA images with transparency; RGB images are unchanged
     - If not specified, the default white background `(255, 255, 255)` is used for backward compatibility
 
+#### Moondream3 Prompt Recipes { #moondream3-prompt-recipes }
+
+`Moondream3ForCausalLM` supports two task-specific prompt formats:
+
+- `query`: ask a question about the image.
+- `caption`: generate a caption for the image.
+
+```python
+from vllm import LLM, SamplingParams
+from vllm.assets.image import ImageAsset
+
+llm = LLM(
+    model="moondream/moondream3-preview",
+    tokenizer="moondream/starmie-v1",
+    trust_remote_code=True,
+    max_model_len=2048,
+    limit_mm_per_prompt={"image": 1},
+)
+
+image = ImageAsset("stop_sign").pil_image
+
+
+def make_query_prompt(question: str) -> str:
+    return (
+        "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>"
+        f"{question}<|md_reserved_2|>"
+    )
+
+
+def make_caption_prompt(length: str = "normal") -> str:
+    return (
+        "<|endoftext|><image><|md_reserved_0|>"
+        f"describe<|md_reserved_1|>{length}<|md_reserved_2|>"
+    )
+
+
+query_out = llm.generate(
+    {
+        "prompt": make_query_prompt("What is shown in this image?"),
+        "multi_modal_data": {"image": image},
+    },
+    SamplingParams(max_tokens=64, temperature=0),
+)[0].outputs[0].text
+
+caption_out = llm.generate(
+    {
+        "prompt": make_caption_prompt(),
+        "multi_modal_data": {"image": image},
+    },
+    SamplingParams(max_tokens=100, temperature=0),
+)[0].outputs[0].text
+
+print("query:", query_out)
+print("caption:", caption_out)
+```
+
+!!! note
+    The native Moondream3 model also has `detect` and `point` skills. Those
+    require custom coordinate decoding and are not exposed by this vLLM
+    implementation.
+
 ### Video Inputs
 
 You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md
index b81d2f28e3b9..dd0b4d62c423 100644
--- a/docs/features/prompt_embeds.md
+++ b/docs/features/prompt_embeds.md
@@ -16,16 +16,51 @@ To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]:
 
 You can pass prompt embeddings from Hugging Face Transformers models to the  `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
 
-[examples/offline_inference/prompt_embed_inference.py](../../examples/offline_inference/prompt_embed_inference.py)
+[examples/features/prompt_embed/prompt_embed_offline.py](../../examples/features/prompt_embed/prompt_embed_offline.py)
 
 ## Online Serving
 
-Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package and are enabled by the `--enable-prompt-embeds` flag in `vllm serve`.
+Our OpenAI-compatible server accepts prompt embeddings inputs via both the [Completions API](https://platform.openai.com/docs/api-reference/completions) and the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Both are enabled by the `--enable-prompt-embeds` flag in `vllm serve`.
+
+### Completions API
+
+Prompt embeddings inputs are added via a `'prompt_embeds'` key in the JSON request body.
 
 When a mixture of `'prompt_embeds'` and `'prompt'` inputs are provided in a single request, the prompt embeds are always returned first.
 
 Prompt embeddings are passed in as base64 encoded torch tensors.
 
+The Completions endpoint does **not** apply a chat template to `prompt_embeds`. If the model assumes some chat template, the caller is responsible for producing embeddings for the full, already-templated prompt: apply the chat template, then embed the resulting token IDs. Anything the model would normally need (system prompt, role markers, generation prompt, etc.) must already be baked into the embedded tokens.
+
+### Chat Completions API
+
+Prompt embeddings can be included as content parts in chat messages, interleaved with text:
+
+```json
+{
+  "messages": [
+    {
+      "role": "system",
+      "content": [
+        {"type": "text", "text": "You are a helpful assistant."},
+        {"type": "prompt_embeds", "data": "<base64_encoded_tensor>"}
+      ]
+    },
+    {
+      "role": "user",
+      "content": [
+        {"type": "prompt_embeds", "data": "<base64_encoded_tensor>"},
+        {"type": "text", "text": "Summarize the above."}
+      ]
+    }
+  ]
+}
+```
+
+Each `prompt_embeds` content part contains a `data` field with a base64-encoded `torch.Tensor` of shape `(num_tokens, hidden_size)`. Multiple `prompt_embeds` parts can appear in any message, in any position relative to text parts. The server expands each part into the correct number of placeholder tokens during chat template rendering, then splices the pre-computed embeddings into the model's input at the corresponding positions.
+
+Unlike the Completions API, a `prompt_embeds` content part should encode **only** the content, not a templated conversation. The server wraps the chat template around the embedded content at request time, the same way it would for a plain text `content` string. Embedding a full templated conversation here would double-apply the template and produce incorrect inputs to the model.
+
 !!! warning
     The vLLM engine may crash if incorrect shape of embeddings is passed.
     Only enable this flag for trusted users!
@@ -41,4 +76,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \
 
 Then, you can use the OpenAI client as follows:
 
-[examples/online_serving/prompt_embed_inference_with_openai_client.py](../../examples/online_serving/prompt_embed_inference_with_openai_client.py)
+[examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py](../../examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py)
diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index ef3b3ad6ec07..374149786e14 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -13,6 +13,7 @@ vLLM currently supports the following reasoning models:
 
 | Model Series | Parser Name | Structured Output Support | Tool Calling |
 | ------------ | ----------- | ---------------- | ----------- |
+| [Cohere Command A Reasoning](https://huggingface.co/CohereLabs/command-a-reasoning-08-2025) | `cohere_command3` | `json`, `regex` | ✅ |
 | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
 | [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
 | [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
diff --git a/docs/features/speculative_decoding/README.md b/docs/features/speculative_decoding/README.md
index 25cda8059b24..bef71a4f5a37 100644
--- a/docs/features/speculative_decoding/README.md
+++ b/docs/features/speculative_decoding/README.md
@@ -32,7 +32,7 @@ depend on your model family, traffic pattern, hardware, and sampling settings.
 | Suffix decoding | Low to medium gain | Medium gain | No extra draft model; dynamic speculation depth. |
 
 For reproducible measurements in your environment, use
-[`examples/offline_inference/spec_decode.py`](../../../examples/offline_inference/spec_decode.py)
+[`examples/features/speculative_decoding/spec_decode_offline.py`](../../../examples/features/speculative_decoding/spec_decode_offline.py)
 or the [benchmark CLI guide](../../benchmarking/cli.md).
 
 ## `--speculative-config` schema
diff --git a/docs/features/speculative_decoding/eagle.md b/docs/features/speculative_decoding/eagle.md
index 3e0f3add416e..cc9e4fd4c0c1 100644
--- a/docs/features/speculative_decoding/eagle.md
+++ b/docs/features/speculative_decoding/eagle.md
@@ -1,6 +1,6 @@
 # EAGLE Draft Models
 
-The following code configures vLLM to use speculative decoding where proposals are generated by an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found in [examples/offline_inference/spec_decode.py](../../../examples/offline_inference/spec_decode.py)
+The following code configures vLLM to use speculative decoding where proposals are generated by an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found in [examples/features/speculative_decoding/spec_decode_offline.py](../../../examples/features/speculative_decoding/spec_decode_offline.py)
 
 ## Eagle Drafter Example
 
diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md
index 41cf7be89291..fa39f7ae6e48 100644
--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@@ -165,7 +165,7 @@ As an example, we can use to define a specific format of simplified SQL queries:
     print(completion.choices[0].message.content)
     ```
 
-See also: [full example](../examples/online_serving/structured_outputs.md)
+See also: [full example](../../examples/features/structured_outputs/README.md)
 
 ## Reasoning Outputs
 
@@ -208,7 +208,7 @@ Note that you can use reasoning with any provided structured outputs feature. Th
     print("content: ", completion.choices[0].message.content)
     ```
 
-See also: [full example](../examples/online_serving/structured_outputs.md)
+See also: [full example](../../examples/features/structured_outputs/README.md)
 
 !!! note
     When using Qwen3 Coder models with reasoning enabled, structured outputs might become disabled if the reasoning content does not get parsed into the `reasoning` field separately (v0.11.2+).
@@ -304,7 +304,7 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
 Answer: x = -29/8
 ```
 
-An example of using `structural_tag` can be found here: [examples/online_serving/structured_outputs](../../examples/online_serving/structured_outputs)
+An example of using `structural_tag` can be found here: [examples/features/structured_outputs](../../examples/features/structured_outputs/README.md)
 
 ## Offline Inference
 
@@ -339,4 +339,4 @@ shown below:
     print(outputs[0].outputs[0].text)
     ```
 
-See also: [full example](../examples/online_serving/structured_outputs.md)
+See also: [full example](../../examples/features/structured_outputs/structured_outputs_offline.py)
diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index e9aa87a69647..9c60255d6928 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -369,6 +369,16 @@ Flags:
 * For non-reasoning: `--tool-call-parser hunyuan_a13b`
 * For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b`
 
+### Cohere Command A Reasoning (`cohere_command3`)
+
+Supported models:
+
+* [`CohereLabs/command-a-reasoning-08-2025`](https://huggingface.co/CohereLabs/command-a-reasoning-08-2025)
+
+Flags: `--tool-call-parser cohere_command3 --reasoning-parser cohere_command3`
+
+Note: the Cohere tool parser requires the `cohere_melody` package, which is not installed by default. Before using this parser please install the [cohere_melody](https://pypi.org/project/cohere-melody/) package.
+
 ### LongCat-Flash-Chat Models (`longcat`)
 
 Supported models:
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index a798b8e34530..2c19dc1763f6 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -7,7 +7,7 @@
 import textwrap
 import traceback
 from argparse import SUPPRESS, Action, HelpFormatter
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from importlib.machinery import ModuleSpec
 from pathlib import Path
 from typing import TYPE_CHECKING, Literal
@@ -38,8 +38,21 @@ def decorator(cls):
         return decorator
 
 
+class MockPluggableLayer:
+    @staticmethod
+    def register(name):
+        def decorator(cls):
+            return cls
+
+        return decorator
+
+
 mock_if_no_torch("vllm._C", MagicMock())
-mock_if_no_torch("vllm.model_executor.custom_op", MagicMock(CustomOp=MockCustomOp))
+mock_if_no_torch("vllm._C_stable_libtorch", MagicMock())
+mock_if_no_torch(
+    "vllm.model_executor.custom_op",
+    MagicMock(CustomOp=MockCustomOp, PluggableLayer=MockPluggableLayer),
+)
 mock_if_no_torch(
     "vllm.utils.torch_utils", MagicMock(direct_register_custom_op=lambda *a, **k: None)
 )
@@ -55,6 +68,31 @@ def decorator(cls):
 mock_if_no_torch("torch.nn", MagicMock(Parameter=object))
 
 
+# Mock torch.library.infer_schema for vllm.ir.ops.IrOpInplaceOverload.__init__
+# We need to return the corresponding number of inputs, as IR infra will assert it
+def get_outputs(native_fn: Callable) -> str:
+    """
+    Extract output schema from function's return type annotation,
+    e.g. 'Tensor' or 'Tensor, Tensor'.
+    """
+    import typing
+
+    return_type = typing.get_type_hints(native_fn)["return"]
+    origin = typing.get_origin(return_type)
+    arg_name = lambda a: a.__name__ if hasattr(a, "__name__") else str(a)
+    if origin is tuple:
+        args = typing.get_args(return_type)
+        return ", ".join(arg_name(arg) for arg in args)
+    else:
+        return f"{arg_name(return_type)}"
+
+
+mock_if_no_torch(
+    "torch.library",
+    MagicMock(infer_schema=lambda fn, **k: f"(Tensor x) -> {get_outputs(fn)}"),
+)
+
+
 class PydanticMagicMock(MagicMock):
     """`MagicMock` that's able to generate pydantic-core schemas."""
 
diff --git a/docs/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md
index 38c603b46e10..965b2932ffaa 100644
--- a/docs/models/extensions/runai_model_streamer.md
+++ b/docs/models/extensions/runai_model_streamer.md
@@ -101,7 +101,7 @@ vllm serve /path/to/sharded/model \
     --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}'
 ```
 
-To create sharded model files, you can use the script provided in [examples/offline_inference/save_sharded_state.py](../../../examples/offline_inference/save_sharded_state.py). This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader.
+To create sharded model files, you can use the script provided in [examples/features/sharded_state/save_sharded_state_offline.py](../../../examples/features/sharded_state/save_sharded_state_offline.py). This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader.
 
 The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way:
 
diff --git a/docs/models/pooling_models/scoring.md b/docs/models/pooling_models/scoring.md
index a4424642cd2a..baaf15f14fb4 100644
--- a/docs/models/pooling_models/scoring.md
+++ b/docs/models/pooling_models/scoring.md
@@ -41,6 +41,7 @@ The score models is designed to compute similarity scores between two input prom
 | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma`(see note), etc. | [bge-reranker-v2-gemma.jinja](../../../examples/pooling/score/template/bge-reranker-v2-gemma.jinja) | ✅︎ | ✅︎ |
 | `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | N/A | | |
 | `LlamaBidirectionalForSequenceClassification`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2`, etc. | [nemotron-rerank.jinja](../../../examples/pooling/score/template/nemotron-rerank.jinja) | ✅︎ | ✅︎ |
+| `ModernBertForSequenceClassification` | ModernBERT-based | `Alibaba-NLP/gte-reranker-modernbert-base`, etc. | N/A | | |
 | `Qwen2ForSequenceClassification`<sup>C</sup> | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2`(see note), etc. | [mxbai_rerank_v2.jinja](../../../examples/pooling/score/template/mxbai_rerank_v2.jinja) | ✅︎ | ✅︎ |
 | `Qwen3ForSequenceClassification`<sup>C</sup> | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B`(see note), etc. | [qwen3_reranker.jinja](../../../examples/pooling/score/template/qwen3_reranker.jinja) | ✅︎ | ✅︎ |
 | `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | N/A | | |
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index b78428f68ce1..e79fec8169f2 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -378,6 +378,7 @@ th {
 | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
 | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `thu-coai/ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
 | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ |
+| `CohereMoeForCausalLM` | Command (MoE) | (model checkpoints loaded with `trust_remote_code=True`) | ✅︎ | ✅︎ |
 | `CwmForCausalLM` | CWM | `facebook/cwm`, etc. | ✅︎ | ✅︎ |
 | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ |
 | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ |
@@ -598,6 +599,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ |
 | `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ |
 | `Molmo2ForConditionalGeneration` | Molmo2 | T + I<sup>+</sup> / V | `allenai/Molmo2-4B`, `allenai/Molmo2-8B`, `allenai/Molmo2-O-7B` | ✅︎ | ✅︎ |
+| `Moondream3ForCausalLM` | Moondream3 | T + I | `moondream/moondream3-preview` | | ✅︎ |
 | `MusicFlamingoForConditionalGeneration` | MusicFlamingo | T + A | `nvidia/music-flamingo-2601-hf`, `nvidia/music-flamingo-think-2601-hf` | ✅︎ | ✅︎ |
 | `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
 | `OpenCUAForConditionalGeneration` | OpenCUA-7B | T + I<sup>E+</sup> | `xlangai/OpenCUA-7B` | ✅︎ | ✅︎ |
@@ -612,6 +614,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
 | `Phi4ForCausalLMV` | Phi-4-reasoning-vision | T + I<sup>+</sup> | `microsoft/Phi-4-reasoning-vision-15B`, etc. | | ✅︎ |
 | `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512` `mistralai/Pixtral-12B-2409` etc. | ✅︎ | ✅︎ |
+| `QianfanOCRForConditionalGeneration` | QianfanOCR | T + I<sup>E+</sup> | `baidu/Qianfan-OCR`, etc. | ✅︎ | ✅︎ |
 | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
 | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |
 | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
@@ -660,6 +663,12 @@ Some models are supported only via the [Transformers modeling backend](#transfor
 !!! note
     For `InternVLChatModel`, only InternVL2.5 with Qwen2.5 text backbone (`OpenGVLab/InternVL2.5-1B` etc.), InternVL3 and InternVL3.5 have video inputs support currently.
 
+!!! note
+    `Moondream3ForCausalLM` uses task-specific prompt templates for `query`
+    and `caption`. The native `detect` and `point` skills require custom
+    coordinate decoding and are not exposed by this vLLM implementation.
+    See [Moondream3 prompt recipes](../features/multimodal_inputs.md#moondream3-prompt-recipes).
+
 !!! note
     To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
 
diff --git a/docs/serving/data_parallel_deployment.md b/docs/serving/data_parallel_deployment.md
index f0946eaf407a..1f18b92f95b4 100644
--- a/docs/serving/data_parallel_deployment.md
+++ b/docs/serving/data_parallel_deployment.md
@@ -16,7 +16,7 @@ For MoE models, when any requests are in progress in any rank, we must ensure th
 
 In all cases, it is beneficial to load-balance requests between DP ranks. For online deployments, this balancing can be optimized by taking into account the state of each DP engine - in particular its currently scheduled and waiting (queued) requests, and KV cache state. Each DP engine has an independent KV cache, and the benefit of prefix caching can be maximized by directing prompts intelligently.
 
-This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see [examples/offline_inference/data_parallel.py](../../examples/offline_inference/data_parallel.py).
+This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see [examples/features/data_parallel/data_parallel_offline.py](../../examples/features/data_parallel/data_parallel_offline.py).
 
 There are two distinct modes supported for online deployments - self-contained with internal load balancing, or externally per-rank process deployment and load balancing.
 
@@ -98,7 +98,7 @@ For larger scale deployments especially, it can make sense to handle the orchest
 
 In this case, it's more convenient to treat each DP rank like a separate vLLM deployment, with its own endpoint, and have an external router balance HTTP requests between them, making use of appropriate real-time telemetry from each server for routing decisions.
 
-This can already be done trivially for non-MoE models, since each deployed server is fully independent. No data parallel CLI options need to be used for this.
+This can already be done trivially for non-MoE models, since each deployed server is fully independent. In that case, launch independent vLLM instances without any `--data-parallel-*` arguments; external DP CLI options are only supported for MoE deployments.
 
 We support an equivalent topology for MoE DP+EP which can be configured via the following CLI arguments.
 
diff --git a/docs/serving/distributed_troubleshooting.md b/docs/serving/distributed_troubleshooting.md
index b5354a7e55d5..e6dde4944284 100644
--- a/docs/serving/distributed_troubleshooting.md
+++ b/docs/serving/distributed_troubleshooting.md
@@ -4,11 +4,11 @@ For general troubleshooting, see [Troubleshooting](../usage/troubleshooting.md).
 
 ## Verify inter-node GPU communication
 
-After you start the Ray cluster, verify GPU-to-GPU communication across nodes. Proper configuration can be non-trivial. For more information, see [troubleshooting script](../usage/troubleshooting.md#incorrect-hardwaredriver). If you need additional environment variables for communication configuration, append them to [examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh), for example `-e NCCL_SOCKET_IFNAME=eth0`. Setting environment variables during cluster creation is recommended because the variables propagate to all nodes. In contrast, setting environment variables in the shell affects only the local node. For more information, see <https://github.com/vllm-project/vllm/issues/6803>.
+After you start the Ray cluster, verify GPU-to-GPU communication across nodes. Proper configuration can be non-trivial. For more information, see [troubleshooting script](../usage/troubleshooting.md#incorrect-hardwaredriver). If you need additional environment variables for communication configuration, append them to [examples/ray_serving/run_cluster.sh](../../examples/ray_serving/run_cluster.sh), for example `-e NCCL_SOCKET_IFNAME=eth0`. Setting environment variables during cluster creation is recommended because the variables propagate to all nodes. In contrast, setting environment variables in the shell affects only the local node. For more information, see <https://github.com/vllm-project/vllm/issues/6803>.
 
 ## No available node types can fulfill resource request
 
-The error message `Error: No available node types can fulfill resource request` can appear even when the cluster has enough GPUs. The issue often occurs when nodes have multiple IP addresses and vLLM can't select the correct one. Ensure that vLLM and Ray use the same IP address by setting `VLLM_HOST_IP` in [examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh) (with a different value on each node). Use `ray status` and `ray list nodes` to verify the chosen IP address. For more information, see <https://github.com/vllm-project/vllm/issues/7815>.
+The error message `Error: No available node types can fulfill resource request` can appear even when the cluster has enough GPUs. The issue often occurs when nodes have multiple IP addresses and vLLM can't select the correct one. Ensure that vLLM and Ray use the same IP address by setting `VLLM_HOST_IP` in [examples/ray_serving/run_cluster.sh](../../examples/ray_serving/run_cluster.sh) (with a different value on each node). Use `ray status` and `ray list nodes` to verify the chosen IP address. For more information, see <https://github.com/vllm-project/vllm/issues/7815>.
 
 ## Ray observability
 
diff --git a/docs/serving/integrations/codex.md b/docs/serving/integrations/codex.md
new file mode 100644
index 000000000000..48148acfd51f
--- /dev/null
+++ b/docs/serving/integrations/codex.md
@@ -0,0 +1,88 @@
+# Codex
+
+[Codex](https://github.com/openai/codex) is OpenAI's official agentic coding tool that lives in your terminal. It can understand your codebase, edit files, run commands, and help you write code more efficiently.
+
+By pointing Codex at a vLLM server, you can use your own models as the backend instead of the OpenAI API. This is useful for:
+
+- Running fully local/private coding assistance
+- Using open-weight models with tool calling capabilities
+- Testing and developing with custom models
+
+## How It Works
+
+vLLM implements the OpenAI-Responses API, which is the same API that Codex uses to communicate with OpenAI's servers. By configuring Codex to point at your vLLM server, Codex sends its requests to vLLM instead of OpenAI. vLLM then translates these requests to work with your local model and returns responses in the format Codex expects.
+
+This means any model served by vLLM with proper tool calling support can act as a drop-in replacement for OpenAI models in Codex.
+
+## Requirements
+
+Codex requires a model with strong tool calling capabilities. The model must support the OpenAI-Responses tool calling API. See [Tool Calling](../../features/tool_calling.md) for details on enabling tool calling for your model.
+
+## Installation
+
+First, install Codex by following the [official installation guide](https://github.com/openai/codex).
+
+## Starting the vLLM Server
+
+Start vLLM with a tool-calling capable model - here's an example using `Qwen/Qwen3-27B`:
+
+```bash
+vllm serve Qwen/Qwen3.6-27B --port 8000 --tensor-parallel-size 8 --max-model-len 262144 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder
+
+```
+
+For other models, you'll need to enable tool calling explicitly with `--enable-auto-tool-choice` and the right `--tool-call-parser`. Refer to the [Tool Calling documentation](../../features/tool_calling.md) for the correct flags for your model.
+
+## Configuring Codex
+
+Codex is configured via a TOML file located at `~/.codex/config.toml`. Create or edit this file to point Codex at your vLLM server:
+
+```toml
+model = "my-model"
+model_provider = "vllm"
+
+[model_providers.vllm]
+name = "vLLM"
+env_key = "VLLM_API_KEY"
+base_url = "http://localhost:8000/v1"
+wire_api = "responses"
+```
+
+The configuration fields:
+
+| Field | Description |
+| ----- | ----------- |
+| `model` | The model name to use. Must match the `--served-model-name` you passed to vLLM. |
+| `model_provider` | Set to `"vllm"` to use your local vLLM server. |
+| `[model_providers.vllm]` | Configuration section for the vLLM provider. |
+| `name` | A display name for your vLLM provider. |
+| `env_key` | The name of an environment variable that Codex will read for the API key. vLLM does not require authentication by default, so this can be any value. |
+| `base_url` | The URL of your vLLM server's OpenAI-compatible API endpoint (default is `http://localhost:8000/v1`). |
+| `wire_api` | The API style to use. Set to `"responses"` for the OpenAI Responses API |
+
+!!! tip
+    You can set the `env_key` to any dummy environment variable since vLLM doesn't require authentication by default:
+    ```bash
+    export VLLM_API_KEY=dummy
+    ```
+
+!!! warning
+    When using the `responses` API, ensure your vLLM version supports the OpenAI Responses API.
+
+## Testing the Setup
+
+Once Codex is configured, launch it in your project directory:
+
+```bash
+codex
+```
+
+Try a simple prompt to verify the connection, such as asking it to explain a file in your project. If the model responds correctly, your setup is working. You can now use Codex with your vLLM-served model for coding tasks.
+
+## Troubleshooting
+
+**Connection refused**: Ensure vLLM is running and accessible at the specified URL. Check that the port matches and that `base_url` includes the `/v1` path suffix.
+
+**Tool calls not working**: Verify that your model supports tool calling and that you've enabled it with the correct `--tool-call-parser` flag. See [Tool Calling](../../features/tool_calling.md).
+
+**Model not found**: Ensure the `model` field in `~/.codex/config.toml` matches the `--served-model-name` you passed to vLLM.
diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index 59f02a006567..bf1c153ec0d5 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -542,6 +542,6 @@ Key capabilities:
 - Scales from a single GPU to a multi-node cluster without code changes.
 - Provides observability and autoscaling policies through Ray dashboards and metrics.
 
-The following example shows how to deploy a large model like DeepSeek R1 with Ray Serve LLM: [examples/online_serving/ray_serve_deepseek.py](../../examples/online_serving/ray_serve_deepseek.py).
+The following example shows how to deploy a large model like DeepSeek R1 with Ray Serve LLM: [examples/ray_serving/ray_serve_deepseek.py](../../examples/ray_serving/ray_serve_deepseek.py).
 
 Learn more about Ray Serve LLM with the official [Ray Serve LLM documentation](https://docs.ray.io/en/latest/serve/llm/index.html).
diff --git a/docs/serving/parallelism_scaling.md b/docs/serving/parallelism_scaling.md
index b69ca17e8334..0f86a256727c 100644
--- a/docs/serving/parallelism_scaling.md
+++ b/docs/serving/parallelism_scaling.md
@@ -78,7 +78,7 @@ For details, see the [Ray documentation](https://docs.ray.io/en/latest/index.htm
 
 ### Ray cluster setup with containers
 
-The helper script [examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh) starts containers across nodes and initializes Ray. By default, the script runs Docker without administrative privileges, which prevents access to the GPU performance counters when profiling or tracing. To enable admin privileges, add the `--cap-add=CAP_SYS_ADMIN` flag to the Docker command.
+The helper script [examples/ray_serving/run_cluster.sh](../../examples/ray_serving/run_cluster.sh) starts containers across nodes and initializes Ray. By default, the script runs Docker without administrative privileges, which prevents access to the GPU performance counters when profiling or tracing. To enable admin privileges, add the `--cap-add=CAP_SYS_ADMIN` flag to the Docker command.
 
 Choose one node as the head node and run:
 
@@ -162,7 +162,7 @@ vllm serve /path/to/the/model/in/the/container \
 
 Efficient tensor parallelism requires fast internode communication, preferably through high-speed network adapters such as InfiniBand.
 To set up the cluster to use InfiniBand, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the
-[examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh) helper script.
+[examples/ray_serving/run_cluster.sh](../../examples/ray_serving/run_cluster.sh) helper script.
 Contact your system administrator for more information about the required flags.
 
 ## Enabling GPUDirect RDMA
diff --git a/docs/training/layerwise.md b/docs/training/layerwise.md
new file mode 100644
index 000000000000..d304c4a8425d
--- /dev/null
+++ b/docs/training/layerwise.md
@@ -0,0 +1,146 @@
+# What is Layerwise (Re)loading?
+
+Layerwise reloading is the system used to handle the loading of new weight data into existing weight data destinations without triggering recompilation of the cuda graph and other runtime artifacts. This system is used to enable [QeRL](https://arxiv.org/pdf/2510.11696)-style post training flows, where full-precision trainer weights are quantized and loaded into a target vLLM instance for fast, high-exploration rollouts. The core implementation can be found in [layerwise.py](../../vllm/model_executor/model_loader/reload/layerwise.py).
+
+![Layerwise](../assets/training/layerwise.png)
+
+## Layerwise Reloading for QeRL
+
+In order to load new weights into existing weight data destinations, a weight must undergo the following operations:
+
+- Transfer: weights must be transferred from trainer model to target node/device
+- Fuse: weight partitions must be fused, for example qkv/gate_up
+- Process: this typically means online quantization and kernel-specific padding or striding
+- Shard: weights must be sharded according to the selected parallelism strategy
+- Copy: weights must be copied into the existing weight data destinations
+
+Layerwise reloading achieves this using the following steps:
+
+1. Weights are **transferred** from the trainer to the target (see [weight_transfer](weight_transfer/README.md))
+2. Weights loaded via `model.load_weights`, during which they are **sharded** and **fused**
+3. Weights are **processed** in an online fashion as soon as all of a layer's weights are loaded
+4. Weights are **copied** into the existing weight data destinations
+
+For more information on implementation, see [Low Level `layerwise` API](#low-level-layerwise-api).
+
+## Layerwise Loading with Online Quantization
+
+Online quantization refers to when a user provides full precision weights and those weights are quantized on-the-fly as they are loaded into the model. The layerwise reloading system handles this by treating online quantization as a **processing** step, which is then handled in an online way both during first-time load and during reload. A typical online quantization method implementation should look like this:
+
+```python
+class Fp8OnlineLinearMethod(Fp8LinearMethod):
+    """Online version of Fp8LinearMethod which loads a full precision checkpoint
+    and quantizes weights during loading."""
+
+    uses_meta_device: bool = True
+
+    def create_weights(self, layer: torch.nn.Module, ...):
+        # weight is materialized and processed during loading
+        layer.weight = ModelWeightParameter(
+            data=torch.empty(..., device="meta"),
+            weight_loader=weight_loader,
+        )
+
+        # set up online processing
+        initialize_online_processing(layer)
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        if getattr(layer, "_already_called_process_weights_after_loading", False):
+            return
+
+        layer.weight, layer.weight_scale = ops.scaled_fp8_quant(layer.weight)
+
+        # Prevent duplicate processing (e.g., during weight reload)
+        layer._already_called_process_weights_after_loading = True
+```
+
+## Example Usages
+
+### High Level Weight Transfer API
+
+The layerwise reloading system is integrated with the post-training weight transfer system. To use layerwise reloading in conjunction to the weight transfer system, follow the examples found [here](../../examples/rl/). Layerwise reloading is controlled by the `WeightTransferUpdateInfo.is_checkpoint_format` flag and is set to `True` by default.
+
+### Mid Level `reload_weights` API
+
+Layerwise reloading is also exposed via the `reload_weights` API. This interface can be called using the following code:
+
+```python
+from vllm import LLM
+
+llm = LLM("Qwen/Qwen3-0.6B")
+llm.collective_rpc("reload_weights")
+```
+
+This interface also allows specifying a `weights_path` which can be used to select a checkpoint path to load from:
+
+```python
+from vllm import LLM
+
+# fine tuned model checkpoints for testing
+mul_path = "inference-optimization/Qwen3-0.6B-debug-multiply"
+add_path = "inference-optimization/Qwen3-0.6B-debug-add"
+
+llm = LLM("Qwen/Qwen3-0.6B")
+llm.collective_rpc("reload_weights", kwargs={"weights_path": mul_path})
+llm.generate("3 4 = ")  # 12
+
+llm.collective_rpc("reload_weights", kwargs={"weights_path": add_path})
+llm.generate("3 4 = ")  # 7
+```
+
+Finally, a `weights_iterator` can be provided directly. This iterator can be lazy or eagerly defined.
+
+```python
+from vllm import LLM
+
+weights_iterator = [("q_proj", ...), ("k_proj", ...), ...]
+
+llm = LLM("Qwen/Qwen3-0.6B")
+llm.collective_rpc("reload_weights", kwargs={"weights_iterator": weights_iterator})
+```
+
+### Low Level `layerwise` API
+
+[layerwise.py](../../vllm/model_executor/model_loader/reload/layerwise.py) Implements the following functions to execute its lifecycle:
+
+| Function | Purpose | Quantized Reload | Online Quantization |
+| - | - | - | - |
+| `record_metadata_for_reloading` | Record tensor metadata so that layers can be restored on the meta device | Called by `BaseModelLoader` | Called by `BaseModelLoader` |
+| `restore_layer_on_meta` | Restore layer to model format at start of reload | Called by `initialize_layerwise_reload` | Not called. Online quantized weights already start on meta device via `...OnlineLinearMethod.create_weights` |
+| `initialize_online_processing` | Wrap weight loaders with the `online_process_loader` wrapper, which buffers weights until all layer weights have been loaded | Called by `initialize_layerwise_reload` | Called by `...OnlineLinearMethod.create_weights` |
+| `_layerwise_process` | Process layer once all weights are loaded | Called by `online_process_loader` during loading | Called by `online_process_loader` during loading |
+| `_copy_and_restore_kernel_tensors` | Copy processed weights into original tensor locations to affect compiled cuda graphs, etc. | Called by `_layerwise_process` after `process_weights_after_loading` | Not called. There is no compiled cuda graph yet |
+| `finalize_layerwise_processing` | Catch any layers which did not load all weights (for example attention weights or weights with padding) | Called by `BaseModelLoader` | Called by `BaseModelLoader` |
+
+You can plug into this lifecycle directly by calling the `initialize_layerwise_reload`, loading weights, then calling `finalize_layerwise_processing`:
+
+```python
+from vllm import LLM
+from vllm.model_executor.model_loader.reload import initialize_layerwise_reload, finalize_layerwise_processing
+
+llm = LLM("Qwen/Qwen3-0.6B")
+
+# this model path requires `VLLM_ENABLE_V1_MULTIPROCESSING=0` and is not stable
+model = llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.get_model()
+
+# layerwise reload
+initialize_layerwise_reload(model)
+model.load_weights(...)
+finalize_layerwise_processing(model, llm.model_config)
+```
+
+## Troubleshooting Excessive Memory Usage
+
+Layerwise reloading allows users to incrementally load and process weights as they are loaded into the model. This system relies on buffering layer weights on device until all weights of a layer have been loaded. However, without offloading, this approach necessarily causes excessive buffering if weights are loaded out of order.
+
+For this reason, users must take care as to the order of weights when they are reloading into the model. Weight should be loaded "in order", meaning that each layer's weights are fully loaded before beginning to load the next layer's weights. "Out of order" loading can cause layer weights to stay buffered while other layer weights are loading, leading to excessive memory usage. In the example below, q_proj, k_proj, v_proj, and up_proj are all buffered at the same time, using more memory than if up_proj was loaded after q_proj, k_proj and v_proj.
+
+| Correct Loading | Incorrect Loading |
+| - | - |
+| ![Layerwise](../assets/training/layerwise_good_loading.png) | ![Layerwise](../assets/training/layerwise_bad_loading.png) |
+
+Users will see a warning like the one below if weights are loaded out-of-order.
+
+```console
+WARNING [layerwise.py:198] Allocating 28.5 MB of device memory to buffers to load ["QKVParallelLinear", "MergedColumnParallelLinear"] layers. This extra memory usage can be avoided by ordering weights by their parent layer when reloading.
+```
diff --git a/docs/usage/reproducibility.md b/docs/usage/reproducibility.md
index a8e49d0a3398..680791bbe24a 100644
--- a/docs/usage/reproducibility.md
+++ b/docs/usage/reproducibility.md
@@ -7,7 +7,7 @@ reproducible results:
   or enable [batch invariance](../features/batch_invariance.md) to make the outputs insensitive to scheduling.
 - In online mode, you can only enable [batch invariance](../features/batch_invariance.md).
 
-Example: [examples/offline_inference/reproducibility.py](../../examples/offline_inference/reproducibility.py)
+Example: [examples/features/batch_invariance/reproducibility_offline.py](../../examples/features/batch_invariance/reproducibility_offline.py)
 
 !!! warning
 
diff --git a/docs/usage/security.md b/docs/usage/security.md
index 4879ddbf64ef..e548899abbf1 100644
--- a/docs/usage/security.md
+++ b/docs/usage/security.md
@@ -138,14 +138,22 @@ When `--api-key` is configured, the following `/v1` endpoints require Bearer tok
 
 - `/v1/models` - List available models
 - `/v1/chat/completions` - Chat completions
+- `/v1/chat/completions/batch` - Batch chat completions
+- `/v1/chat/completions/render` - Render chat completion requests
 - `/v1/completions` - Text completions
+- `/v1/completions/render` - Render completion requests
 - `/v1/embeddings` - Generate embeddings
 - `/v1/audio/transcriptions` - Audio transcription
 - `/v1/audio/translations` - Audio translation
 - `/v1/messages` - Anthropic-compatible messages API
-- `/v1/responses` - Response management
+- `/v1/messages/count_tokens` - Count tokens for Anthropic messages
+- `/v1/responses` - Create a response
+- `/v1/responses/{response_id}` - Retrieve a response
+- `/v1/responses/{response_id}/cancel` - Cancel a response
 - `/v1/score` - Scoring API
 - `/v1/rerank` - Reranking API
+- `/v1/load_lora_adapter` - Load a LoRA adapter (can alter model behavior; only available when `--enable-lora` is set and `VLLM_ALLOW_RUNTIME_LORA_UPDATING=True`)
+- `/v1/unload_lora_adapter` - Unload a LoRA adapter (can alter model behavior; only available when `--enable-lora` is set and `VLLM_ALLOW_RUNTIME_LORA_UPDATING=True`)
 
 ### Unprotected Endpoints (No API Key Required)
 
@@ -155,16 +163,23 @@ The following endpoints **do not require authentication** even when `--api-key`
 
 - `/invocations` - SageMaker-compatible endpoint (routes to the same inference functions as `/v1` endpoints)
 - `/inference/v1/generate` - Generate completions
+- `/generative_scoring` - Generative scoring API
 - `/pooling` - Pooling API
 - `/classify` - Classification API
 - `/score` - Scoring API (non-`/v1` variant)
 - `/rerank` - Reranking API (non-`/v1` variant)
 
-**Operational control endpoints (always enabled):**
+**Operational control endpoints (only when `"generate"` task is supported):**
 
 - `/pause` - Pause generation (causes denial of service)
 - `/resume` - Resume generation
+- `/is_paused` - Check if generation is paused
 - `/scale_elastic_ep` - Trigger scaling operations
+- `/is_scaling_elastic_ep` - Check if scaling is in progress
+- `/init_weight_transfer_engine` - Initialize weight transfer engine for RLHF
+- `/update_weights` - Update model weights (can alter model behavior)
+- `/get_world_size` - Get distributed world size
+- `/abort_requests` - Abort in-flight requests (only when `--tokens-only` is also set)
 
 **Utility endpoints:**
 
@@ -207,9 +222,9 @@ These endpoints are only available when profiling is enabled and should only be
 
 An attacker who can reach the vLLM HTTP server can:
 
-1. **Bypass authentication** by using non-`/v1` endpoints like `/invocations`, `/inference/v1/generate`, `/pooling`, `/classify`, `/score`, or `/rerank` to run arbitrary inference without credentials
-2. **Cause denial of service** by calling `/pause` or `/scale_elastic_ep` without a token
-3. **Access operational controls** to manipulate server state (e.g., pausing generation)
+1. **Bypass authentication** by using non-`/v1` endpoints like `/invocations`, `/inference/v1/generate`, `/generative_scoring`, `/pooling`, `/classify`, `/score`, or `/rerank` to run arbitrary inference without credentials
+2. **Cause denial of service** by calling `/pause`, `/scale_elastic_ep`, or `/abort_requests` without a token
+3. **Access operational controls** to manipulate server state (e.g., pausing generation, updating model weights via `/update_weights`)
 4. **If `--enable-tokenizer-info-endpoint` is set:** Access sensitive tokenizer configuration including chat templates, which may reveal prompt engineering strategies or other implementation details
 5. **If `VLLM_SERVER_DEV_MODE=1` is set:** Execute arbitrary RPC commands via `/collective_rpc`, reset caches, put the engine to sleep, and access detailed server configuration
 
@@ -288,6 +303,36 @@ To disable the Python code interpreter specifically, omit `code_interpreter` fro
 
 **Consider a custom implementation**: The GPT-OSS Python tool is a reference implementation. For production deployments, consider implementing a custom code execution sandbox with stricter isolation guarantees. See the [GPT-OSS documentation](https://github.com/openai/gpt-oss?tab=readme-ov-file#python) for guidance.
 
+## Dynamic LoRA Loading
+
+vLLM supports dynamically loading and unloading LoRA adapters at runtime via the `/v1/load_lora_adapter` and `/v1/unload_lora_adapter` API endpoints. This functionality is **not enabled by default** — it requires both `--enable-lora` and the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING=True` to be set.
+
+**Warning:** Dynamic LoRA loading is not a secure operation and should not be enabled in deployments exposed to untrusted clients. If you must enable dynamic LoRA loading, restrict access to the `/v1/load_lora_adapter` and `/v1/unload_lora_adapter` endpoints to trusted administrators only, using a reverse proxy or network-level access controls. Do not expose these endpoints to end users. For details on configuring LoRA adapters, see the [LoRA Adapters documentation](../features/lora.md).
+
+## Cache Directory Security
+
+vLLM assumes that its cache directories are **private and trusted**. Cache contents are loaded without cryptographic integrity verification, including formats that support arbitrary code execution. If an untrusted user or process can write to vLLM's cache directories, they may be able to crash vLLM or cause it to execute arbitrary code.
+
+**Do not share vLLM cache directories with untrusted users or mount them from untrusted storage.** Treat the cache directory with the same care as the vLLM installation itself.
+
+### Cache Directory Configuration
+
+Most cache paths default to subdirectories under a single root. Changing `VLLM_CACHE_ROOT` changes the default location for all features that inherit from it. When `torch.compile` caching is enabled (the default), vLLM also redirects `TRITON_CACHE_DIR` into this tree. If compile caching is disabled, Triton falls back to its own default location (`~/.triton/cache`).
+
+| Environment Variable | Default | Description |
+| --- | --- | --- |
+| `VLLM_CACHE_ROOT` | `~/.cache/vllm` | Base cache directory. Respects `XDG_CACHE_HOME` if set. All paths below inherit from this unless explicitly overridden. |
+| *(torch.compile)* | `$VLLM_CACHE_ROOT/torch_compile_cache/` | Compilation cache for AOT-compiled models, Inductor graphs, and Triton kernels. Controlled by `VLLM_DISABLE_COMPILE_CACHE` (set to `1` to disable). |
+| `VLLM_ASSETS_CACHE` | `$VLLM_CACHE_ROOT/assets/` | Downloaded assets (e.g., tokenizer files). |
+| `VLLM_XLA_CACHE_PATH` | `$VLLM_CACHE_ROOT/xla_cache/` | XLA/TPU compilation cache. |
+| `VLLM_MEDIA_CACHE` | *(disabled)* | Optional cache for downloaded media (images, video, audio). Not enabled unless explicitly set. |
+
+### Recommendations
+
+- **Restrict file permissions** on `VLLM_CACHE_ROOT` (and any other cache directories used by dependencies, such as `~/.triton` if compile caching is disabled) so that only the vLLM process owner can read and write to them.
+- **Do not copy cache contents from untrusted sources.** If you distribute cache artifacts between environments, ensure they originate from a trusted build pipeline.
+- **Container deployments:** If mounting cache directories into containers, ensure the volume source is trusted.
+
 ## Reporting Security Vulnerabilities
 
 If you believe you have found a security vulnerability in vLLM, please report it following the project's security policy. For more information on how to report security issues and the project's security policy, please see the [vLLM Security Policy](https://github.com/vllm-project/vllm/blob/main/SECURITY.md).
diff --git a/examples/online_serving/disaggregated_encoder/README.md b/examples/disaggregated/disaggregated_encoder/README.md
similarity index 100%
rename from examples/online_serving/disaggregated_encoder/README.md
rename to examples/disaggregated/disaggregated_encoder/README.md
diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh b/examples/disaggregated/disaggregated_encoder/disagg_1e1p1d_example.sh
similarity index 100%
rename from examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
rename to examples/disaggregated/disaggregated_encoder/disagg_1e1p1d_example.sh
diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh b/examples/disaggregated/disaggregated_encoder/disagg_1e1pd_example.sh
similarity index 100%
rename from examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
rename to examples/disaggregated/disaggregated_encoder/disagg_1e1pd_example.sh
diff --git a/examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py b/examples/disaggregated/disaggregated_encoder/disagg_epd_proxy.py
similarity index 100%
rename from examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py
rename to examples/disaggregated/disaggregated_encoder/disagg_epd_proxy.py
diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/disaggregated/disaggregated_prefill.py
similarity index 100%
rename from examples/offline_inference/disaggregated_prefill.py
rename to examples/disaggregated/disaggregated_prefill.py
diff --git a/examples/online_serving/disaggregated_prefill.sh b/examples/disaggregated/disaggregated_prefill.sh
similarity index 100%
rename from examples/online_serving/disaggregated_prefill.sh
rename to examples/disaggregated/disaggregated_prefill.sh
diff --git a/examples/online_serving/disaggregated_serving/README.md b/examples/disaggregated/disaggregated_serving/README.md
similarity index 100%
rename from examples/online_serving/disaggregated_serving/README.md
rename to examples/disaggregated/disaggregated_serving/README.md
diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/disaggregated/disaggregated_serving/disagg_proxy_demo.py
similarity index 99%
rename from examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
rename to examples/disaggregated/disaggregated_serving/disagg_proxy_demo.py
index 763361a30e02..57deef6a15d2 100644
--- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
+++ b/examples/disaggregated/disaggregated_serving/disagg_proxy_demo.py
@@ -5,7 +5,7 @@
 example usage of XpYd disaggregated prefilling.
 We can launch multiple vllm instances (2 for prefill and 2 for decode), and
 launch this proxy demo through:
-  python3 examples/online_serving/disaggregated_serving/disagg_proxy_demo.py  \
+  python3 examples/disaggregated/disaggregated_serving/disagg_proxy_demo.py  \
        --model $model_name  \
        --prefill localhost:8100 localhost:8101   \
        --decode localhost:8200 localhost:8201   \
diff --git a/examples/online_serving/disaggregated_serving/example_mm_serve.py b/examples/disaggregated/disaggregated_serving/example_mm_serve.py
similarity index 100%
rename from examples/online_serving/disaggregated_serving/example_mm_serve.py
rename to examples/disaggregated/disaggregated_serving/example_mm_serve.py
diff --git a/examples/online_serving/disaggregated_serving/kv_events.sh b/examples/disaggregated/disaggregated_serving/kv_events.sh
similarity index 100%
rename from examples/online_serving/disaggregated_serving/kv_events.sh
rename to examples/disaggregated/disaggregated_serving/kv_events.sh
diff --git a/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py b/examples/disaggregated/disaggregated_serving/moriio_toy_proxy_server.py
similarity index 100%
rename from examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py
rename to examples/disaggregated/disaggregated_serving/moriio_toy_proxy_server.py
diff --git a/examples/online_serving/ec_both_encoder/ec_both_encoder.sh b/examples/disaggregated/ec_both_encoder/ec_both_encoder.sh
old mode 100755
new mode 100644
similarity index 100%
rename from examples/online_serving/ec_both_encoder/ec_both_encoder.sh
rename to examples/disaggregated/ec_both_encoder/ec_both_encoder.sh
diff --git a/examples/offline_inference/disaggregated-prefill-v1/README.md b/examples/disaggregated/example_connector/README.md
similarity index 81%
rename from examples/offline_inference/disaggregated-prefill-v1/README.md
rename to examples/disaggregated/example_connector/README.md
index abf6883f8d3e..43f16223896c 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/README.md
+++ b/examples/disaggregated/example_connector/README.md
@@ -5,6 +5,6 @@ This example contains scripts that demonstrate disaggregated prefill in the offl
 ## Files
 
 - `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially.
-    - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`.
+    - Make sure you are in the `examples/disaggregated/example_connector` directory before running `run.sh`.
 - `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`.
 - `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`.
diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/disaggregated/example_connector/decode_example.py
similarity index 100%
rename from examples/offline_inference/disaggregated-prefill-v1/decode_example.py
rename to examples/disaggregated/example_connector/decode_example.py
diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/disaggregated/example_connector/prefill_example.py
similarity index 100%
rename from examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
rename to examples/disaggregated/example_connector/prefill_example.py
diff --git a/examples/offline_inference/disaggregated-prefill-v1/run.sh b/examples/disaggregated/example_connector/run.sh
similarity index 100%
rename from examples/offline_inference/disaggregated-prefill-v1/run.sh
rename to examples/disaggregated/example_connector/run.sh
diff --git a/examples/offline_inference/prefix_caching_flexkv.py b/examples/disaggregated/flexkv_connector/prefix_caching_flexkv.py
similarity index 98%
rename from examples/offline_inference/prefix_caching_flexkv.py
rename to examples/disaggregated/flexkv_connector/prefix_caching_flexkv.py
index f2ffb75ef845..b67c2459319f 100644
--- a/examples/offline_inference/prefix_caching_flexkv.py
+++ b/examples/disaggregated/flexkv_connector/prefix_caching_flexkv.py
@@ -14,7 +14,7 @@
 
 Usage:
     1. Run this script:
-       python examples/offline_inference/prefix_caching_flexkv.py \
+       python examples/disaggregated/flexkv_connector/prefix_caching_flexkv.py \
            --model /path/to/your/model
 
     2. Arguments:
diff --git a/examples/offline_inference/kv_load_failure_recovery/README.md b/examples/disaggregated/kv_load_failure_recovery_offline/README.md
similarity index 91%
rename from examples/offline_inference/kv_load_failure_recovery/README.md
rename to examples/disaggregated/kv_load_failure_recovery_offline/README.md
index 176141b5de4a..7205b2135ebb 100644
--- a/examples/offline_inference/kv_load_failure_recovery/README.md
+++ b/examples/disaggregated/kv_load_failure_recovery_offline/README.md
@@ -1,12 +1,12 @@
 # KV Load Failure Recovery Test
 
-This example builds upon the `disaggregated-prefill-v1` example in `examples/offline_inference`.
+This example builds upon the `example_connector` example in `examples/disaggregated`.
 
 It demonstrates vLLM's ability to recover from KV load failures in both synchronous and asynchronous loading modes. The goal is to verify that vLLM correctly identifies invalid KV blocks, reschedules the affected requests, and ensures successful and consistent output.
 
 ## Files
 
-- `prefill_example.py` – performs the prefill stage and saves KV data (same as in `disaggregated-prefill-v1`).
+- `prefill_example.py` – performs the prefill stage and saves KV data (same as in `example_connector`).
 - `decode_example.py` – performs the decode stage. Accepts:
     - `--simulate-failure`: simulates KV load failure using a custom connector.
     - `--async-load`: enables asynchronous KV loading mode.
diff --git a/examples/offline_inference/kv_load_failure_recovery/decode_example.py b/examples/disaggregated/kv_load_failure_recovery_offline/decode_example.py
similarity index 100%
rename from examples/offline_inference/kv_load_failure_recovery/decode_example.py
rename to examples/disaggregated/kv_load_failure_recovery_offline/decode_example.py
diff --git a/examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py b/examples/disaggregated/kv_load_failure_recovery_offline/load_recovery_example_connector.py
similarity index 100%
rename from examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py
rename to examples/disaggregated/kv_load_failure_recovery_offline/load_recovery_example_connector.py
diff --git a/examples/offline_inference/kv_load_failure_recovery/prefill_example.py b/examples/disaggregated/kv_load_failure_recovery_offline/prefill_example.py
similarity index 100%
rename from examples/offline_inference/kv_load_failure_recovery/prefill_example.py
rename to examples/disaggregated/kv_load_failure_recovery_offline/prefill_example.py
diff --git a/examples/offline_inference/kv_load_failure_recovery/run.sh b/examples/disaggregated/kv_load_failure_recovery_offline/run.sh
old mode 100755
new mode 100644
similarity index 100%
rename from examples/offline_inference/kv_load_failure_recovery/run.sh
rename to examples/disaggregated/kv_load_failure_recovery_offline/run.sh
diff --git a/examples/others/lmcache/README.md b/examples/disaggregated/lmcache/README.md
similarity index 100%
rename from examples/others/lmcache/README.md
rename to examples/disaggregated/lmcache/README.md
diff --git a/examples/others/lmcache/cpu_offload_lmcache.py b/examples/disaggregated/lmcache/cpu_offload_lmcache.py
similarity index 100%
rename from examples/others/lmcache/cpu_offload_lmcache.py
rename to examples/disaggregated/lmcache/cpu_offload_lmcache.py
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v0.py b/examples/disaggregated/lmcache/disagg_prefill_lmcache_v0.py
similarity index 100%
rename from examples/others/lmcache/disagg_prefill_lmcache_v0.py
rename to examples/disaggregated/lmcache/disagg_prefill_lmcache_v0.py
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml b/examples/disaggregated/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml
similarity index 100%
rename from examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml
rename to examples/disaggregated/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml b/examples/disaggregated/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml
similarity index 100%
rename from examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml
rename to examples/disaggregated/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/disaggregated/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
similarity index 100%
rename from examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
rename to examples/disaggregated/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/disaggregated/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
similarity index 100%
rename from examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
rename to examples/disaggregated/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh b/examples/disaggregated/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
similarity index 100%
rename from examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
rename to examples/disaggregated/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
diff --git a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py b/examples/disaggregated/lmcache/kv_cache_sharing_lmcache_v1.py
similarity index 100%
rename from examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
rename to examples/disaggregated/lmcache/kv_cache_sharing_lmcache_v1.py
diff --git a/examples/online_serving/disaggregated_serving/mooncake_connector/mooncake_connector_proxy.py b/examples/disaggregated/mooncake_connector/mooncake_connector_proxy.py
similarity index 100%
rename from examples/online_serving/disaggregated_serving/mooncake_connector/mooncake_connector_proxy.py
rename to examples/disaggregated/mooncake_connector/mooncake_connector_proxy.py
diff --git a/examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh b/examples/disaggregated/mooncake_connector/run_mooncake_connector.sh
similarity index 100%
rename from examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh
rename to examples/disaggregated/mooncake_connector/run_mooncake_connector.sh
diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh b/examples/disaggregated/p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
similarity index 100%
rename from examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
rename to examples/disaggregated/p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py b/examples/disaggregated/p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
similarity index 100%
rename from examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
rename to examples/disaggregated/p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
diff --git a/examples/offline_inference/automatic_prefix_caching.py b/examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py
similarity index 98%
rename from examples/offline_inference/automatic_prefix_caching.py
rename to examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py
index 2d3c28d9dd4f..801b4b769792 100644
--- a/examples/offline_inference/automatic_prefix_caching.py
+++ b/examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py
@@ -15,7 +15,7 @@
 but ask different questions.
 
 Run:
-python examples/offline_inference/automatic_prefix_caching.py
+python examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py
 """
 
 import time
diff --git a/examples/offline_inference/prefix_caching.py b/examples/features/automatic_prefix_caching/prefix_caching_offline.py
similarity index 100%
rename from examples/offline_inference/prefix_caching.py
rename to examples/features/automatic_prefix_caching/prefix_caching_offline.py
diff --git a/examples/offline_inference/reproducibility.py b/examples/features/batch_invariance/reproducibility_offline.py
similarity index 100%
rename from examples/offline_inference/reproducibility.py
rename to examples/features/batch_invariance/reproducibility_offline.py
diff --git a/examples/offline_inference/context_extension.py b/examples/features/context_extension/context_extension_offline.py
similarity index 96%
rename from examples/offline_inference/context_extension.py
rename to examples/features/context_extension/context_extension_offline.py
index fae8590f914e..3874288b5e11 100644
--- a/examples/offline_inference/context_extension.py
+++ b/examples/features/context_extension/context_extension_offline.py
@@ -6,7 +6,7 @@
 and run a simple chat example.
 
 Usage:
-    python examples/offline_inference/context_extension.py
+    python examples/features/context_extension/context_extension_offline.py
 """
 
 from vllm import LLM, RequestOutput, SamplingParams
diff --git a/examples/offline_inference/data_parallel.py b/examples/features/data_parallel/data_parallel_offline.py
similarity index 96%
rename from examples/offline_inference/data_parallel.py
rename to examples/features/data_parallel/data_parallel_offline.py
index 287409fa2b5c..c38ff7297afc 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/features/data_parallel/data_parallel_offline.py
@@ -3,14 +3,14 @@
 """
 Usage:
 Single node:
-    python examples/offline_inference/data_parallel.py \
+    python examples/features/data_parallel/data_parallel_offline.py \
             --model="ibm-research/PowerMoE-3b" \
             -dp=2 \
             -tp=2
 
 Multi-node:
     Node 0 (assume the node has ip of 10.99.48.128):
-            python examples/offline_inference/data_parallel.py \
+            python examples/features/data_parallel/data_parallel_offline.py \
                     --model="ibm-research/PowerMoE-3b" \
                     -dp=2 \
                     -tp=2 \
@@ -19,7 +19,7 @@
                     --dp-master-addr=10.99.48.128 \
                     --dp-master-port=13345
     Node 1:
-            python examples/offline_inference/data_parallel.py \
+            python examples/features/data_parallel/data_parallel_offline.py \
                     --model="ibm-research/PowerMoE-3b" \
                     -dp=2 \
                     -tp=2 \
diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/features/data_parallel/multi_instance_data_parallel.py
similarity index 97%
rename from examples/online_serving/multi_instance_data_parallel.py
rename to examples/features/data_parallel/multi_instance_data_parallel.py
index 04d21e048940..66fcd3d24644 100644
--- a/examples/online_serving/multi_instance_data_parallel.py
+++ b/examples/features/data_parallel/multi_instance_data_parallel.py
@@ -12,7 +12,7 @@
 """
 To run this example, run the following commands simultaneously with
 different CUDA_VISIBLE_DEVICES:
-    python examples/online_serving/multi_instance_data_parallel.py
+    python examples/features/data_parallel/multi_instance_data_parallel.py
 
     vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \
         --data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \
diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/features/kv_events/kv_events_subscriber.py
similarity index 100%
rename from examples/online_serving/kv_events_subscriber.py
rename to examples/features/kv_events/kv_events_subscriber.py
diff --git a/examples/offline_inference/logits_processor/README.md b/examples/features/logits_processor/README.md
similarity index 90%
rename from examples/offline_inference/logits_processor/README.md
rename to examples/features/logits_processor/README.md
index 6b6e16942f85..07ca07dc71ed 100644
--- a/examples/offline_inference/logits_processor/README.md
+++ b/examples/features/logits_processor/README.md
@@ -9,7 +9,7 @@ This directory contains examples demonstrating how to use custom logits processo
 Demonstrates how to instantiate vLLM with a custom logits processor class that operates at the batch level. The example uses a `DummyLogitsProcessor` that masks out all tokens except a specified `target_token` when passed via `SamplingParams.extra_args`.
 
 ```bash
-python examples/offline_inference/logits_processor/custom.py
+python examples/features/logits_processor/custom.py
 ```
 
 ### `custom_req.py` — Request-level logits processor wrapper
@@ -17,7 +17,7 @@ python examples/offline_inference/logits_processor/custom.py
 Shows how to wrap a request-level logits processor (which operates on individual requests) to be compatible with vLLM's batch-level logits processing interface.
 
 ```bash
-python examples/offline_inference/logits_processor/custom_req.py
+python examples/features/logits_processor/custom_req.py
 ```
 
 ### `custom_req_init.py` — Request-level processor with engine config
@@ -25,7 +25,7 @@ python examples/offline_inference/logits_processor/custom_req.py
 A special case of wrapping a request-level logits processor where the processor needs access to engine configuration or model metadata during initialization (e.g., vocabulary size, tokenizer info).
 
 ```bash
-python examples/offline_inference/logits_processor/custom_req_init.py
+python examples/features/logits_processor/custom_req_init.py
 ```
 
 ## Key Concepts
diff --git a/examples/offline_inference/logits_processor/custom.py b/examples/features/logits_processor/custom.py
similarity index 100%
rename from examples/offline_inference/logits_processor/custom.py
rename to examples/features/logits_processor/custom.py
diff --git a/examples/offline_inference/logits_processor/custom_req.py b/examples/features/logits_processor/custom_req.py
similarity index 100%
rename from examples/offline_inference/logits_processor/custom_req.py
rename to examples/features/logits_processor/custom_req.py
diff --git a/examples/offline_inference/logits_processor/custom_req_init.py b/examples/features/logits_processor/custom_req_init.py
similarity index 100%
rename from examples/offline_inference/logits_processor/custom_req_init.py
rename to examples/features/logits_processor/custom_req_init.py
diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/features/lora/lora_with_quantization_offline.py
similarity index 100%
rename from examples/offline_inference/lora_with_quantization_inference.py
rename to examples/features/lora/lora_with_quantization_offline.py
diff --git a/examples/offline_inference/multilora_inference.py b/examples/features/lora/multilora_offline.py
similarity index 100%
rename from examples/offline_inference/multilora_inference.py
rename to examples/features/lora/multilora_offline.py
diff --git a/examples/offline_inference/openai_batch/README.md b/examples/features/openai_batch/README.md
similarity index 94%
rename from examples/offline_inference/openai_batch/README.md
rename to examples/features/openai_batch/README.md
index ef4e438d6b72..a9bd31691210 100644
--- a/examples/offline_inference/openai_batch/README.md
+++ b/examples/features/openai_batch/README.md
@@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format
 
 The OpenAI batch file format consists of a series of json objects on new lines.
 
-[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl)
+[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/features/openai_batch/openai_example_batch.jsonl)
 
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
 
@@ -30,13 +30,13 @@ We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` e
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```bash
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```bash
-cat offline_inference/openai_batch/openai_example_batch.jsonl
+cat features/openai_batch/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -49,7 +49,7 @@ You can run the batch with the following command, which will write its results t
 
 ```bash
 python -m vllm.entrypoints.openai.run_batch \
-    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -i features/openai_batch/openai_example_batch.jsonl \
     -o results.jsonl \
     --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
@@ -58,7 +58,7 @@ or use command-line:
 
 ```bash
 vllm run-batch \
-    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -i features/openai_batch/openai_example_batch.jsonl \
     -o results.jsonl \
     --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
@@ -77,11 +77,11 @@ cat results.jsonl
 
 The batch runner supports remote input and output urls that are accessible via http/https.
 
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run
+For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl`, you can run
 
 ```bash
 python -m vllm.entrypoints.openai.run_batch \
-    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl \
     -o results.jsonl \
     --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
@@ -90,7 +90,7 @@ or use command-line:
 
 ```bash
 vllm run-batch \
-    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl \
     -o results.jsonl \
     --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
@@ -113,13 +113,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```bash
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```bash
-cat offline_inference/openai_batch/openai_example_batch.jsonl
+cat features/openai_batch/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -127,7 +127,7 @@ cat offline_inference/openai_batch/openai_example_batch.jsonl
 Now upload your batch file to your S3 bucket.
 
 ```bash
-aws s3 cp offline_inference/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
+aws s3 cp features/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
 ```
 
 ### Step 2: Generate your presigned urls
diff --git a/examples/offline_inference/openai_batch/openai_example_batch.jsonl b/examples/features/openai_batch/openai_example_batch.jsonl
similarity index 100%
rename from examples/offline_inference/openai_batch/openai_example_batch.jsonl
rename to examples/features/openai_batch/openai_example_batch.jsonl
diff --git a/examples/online_serving/data_parallel_pause_resume.py b/examples/features/pause_resume/data_parallel_pause_resume.py
similarity index 96%
rename from examples/online_serving/data_parallel_pause_resume.py
rename to examples/features/pause_resume/data_parallel_pause_resume.py
index e94de22a1271..1f11536e5366 100644
--- a/examples/online_serving/data_parallel_pause_resume.py
+++ b/examples/features/pause_resume/data_parallel_pause_resume.py
@@ -1,135 +1,135 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Test pause/resume with Data Parallel (DP) via HTTP API.
-
-This example demonstrates coordinated pause/resume across multiple DP ranks.
-The pause synchronizes across all DP engines via all-reduce.
-
-Prerequisites:
-    Start a vLLM server with data parallelism:
-
-    $ VLLM_SERVER_DEV_MODE=1 vllm serve facebook/opt-125m \
-        --enforce-eager \
-        --data-parallel-size 4 \
-        --tensor-parallel-size 1
-
-    Then run this script:
-
-    $ python data_parallel_pause_resume.py
-
-The test verifies pause works by:
-1. Starting a streaming generation request
-2. Pausing the server mid-generation
-3. Sleeping for PAUSE_DURATION seconds
-4. Resuming the server
-5. Verifying there was a gap in token generation matching the pause duration
-"""
-
-import argparse
-import threading
-import time
-
-import requests
-from openai import OpenAI
-
-BASE_URL = "http://localhost:8000"
-MODEL_NAME = "facebook/opt-125m"
-PAUSE_DURATION = 3.0
-
-
-def pause_generation(base_url: str, mode: str = "keep") -> None:
-    """Pause generation via HTTP endpoint."""
-    url = f"{base_url}/pause"
-    response = requests.post(url, params={"mode": mode}, timeout=60)
-    response.raise_for_status()
-    print("Server paused")
-
-
-def resume_generation(base_url: str) -> None:
-    """Resume generation via HTTP endpoint."""
-    url = f"{base_url}/resume"
-    response = requests.post(url, timeout=60)
-    response.raise_for_status()
-    print("Server resumed")
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--base-url", default=BASE_URL)
-    parser.add_argument("--model", default=MODEL_NAME)
-    args = parser.parse_args()
-
-    client = OpenAI(
-        base_url=f"{args.base_url}/v1",
-        api_key="EMPTY",
-    )
-
-    prompt = "Write a long story about a dragon. Once upon a time"
-    token_times: list[float] = []
-    pause_token_idx = 0
-    pause_triggered = threading.Event()
-
-    def generator_thread():
-        """Stream tokens and record timestamps."""
-        stream = client.completions.create(
-            model=args.model,
-            prompt=prompt,
-            max_tokens=50,
-            stream=True,
-        )
-        for chunk in stream:
-            if chunk.choices[0].text:
-                token_times.append(time.monotonic())
-                token_count = len(token_times)
-                print(f"Token {token_count}: {chunk.choices[0].text!r}")
-
-                # Signal controller after some tokens
-                if token_count >= 5 and not pause_triggered.is_set():
-                    pause_triggered.set()
-
-    def controller_thread():
-        """Pause and resume the server."""
-        nonlocal pause_token_idx
-
-        # Wait for some tokens
-        pause_triggered.wait()
-
-        print(f"\nPausing server (keep mode) at token {len(token_times)}...")
-        pause_generation(args.base_url, mode="keep")
-        pause_token_idx = len(token_times)
-        print(f"Sleeping for {PAUSE_DURATION}s...")
-
-        time.sleep(PAUSE_DURATION)
-
-        print("Resuming server...")
-        resume_generation(args.base_url)
-        print("Resumed!\n")
-
-    # Run both threads
-    gen_thread = threading.Thread(target=generator_thread)
-    ctrl_thread = threading.Thread(target=controller_thread)
-
-    gen_thread.start()
-    ctrl_thread.start()
-
-    gen_thread.join()
-    ctrl_thread.join()
-
-    # Check gap at the pause point
-    if pause_token_idx < len(token_times):
-        pause_gap = token_times[pause_token_idx] - token_times[pause_token_idx - 1]
-        print(
-            f"\nGap after pause (token {pause_token_idx} -> "
-            f"{pause_token_idx + 1}): {pause_gap:.3f}s"
-        )
-        if pause_gap >= PAUSE_DURATION * 0.9:
-            print("Test passed! Pause synchronized across DP ranks.")
-        else:
-            print(f"Test failed! Expected ~{PAUSE_DURATION}s gap, got {pause_gap:.3f}s")
-    else:
-        print("Test failed! No tokens were generated after resuming.")
-
-
-if __name__ == "__main__":
-    main()
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test pause/resume with Data Parallel (DP) via HTTP API.
+
+This example demonstrates coordinated pause/resume across multiple DP ranks.
+The pause synchronizes across all DP engines via all-reduce.
+
+Prerequisites:
+    Start a vLLM server with data parallelism:
+
+    $ VLLM_SERVER_DEV_MODE=1 vllm serve facebook/opt-125m \
+        --enforce-eager \
+        --data-parallel-size 4 \
+        --tensor-parallel-size 1
+
+    Then run this script:
+
+    $ python data_parallel_pause_resume.py
+
+The test verifies pause works by:
+1. Starting a streaming generation request
+2. Pausing the server mid-generation
+3. Sleeping for PAUSE_DURATION seconds
+4. Resuming the server
+5. Verifying there was a gap in token generation matching the pause duration
+"""
+
+import argparse
+import threading
+import time
+
+import requests
+from openai import OpenAI
+
+BASE_URL = "http://localhost:8000"
+MODEL_NAME = "facebook/opt-125m"
+PAUSE_DURATION = 3.0
+
+
+def pause_generation(base_url: str, mode: str = "keep") -> None:
+    """Pause generation via HTTP endpoint."""
+    url = f"{base_url}/pause"
+    response = requests.post(url, params={"mode": mode}, timeout=60)
+    response.raise_for_status()
+    print("Server paused")
+
+
+def resume_generation(base_url: str) -> None:
+    """Resume generation via HTTP endpoint."""
+    url = f"{base_url}/resume"
+    response = requests.post(url, timeout=60)
+    response.raise_for_status()
+    print("Server resumed")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-url", default=BASE_URL)
+    parser.add_argument("--model", default=MODEL_NAME)
+    args = parser.parse_args()
+
+    client = OpenAI(
+        base_url=f"{args.base_url}/v1",
+        api_key="EMPTY",
+    )
+
+    prompt = "Write a long story about a dragon. Once upon a time"
+    token_times: list[float] = []
+    pause_token_idx = 0
+    pause_triggered = threading.Event()
+
+    def generator_thread():
+        """Stream tokens and record timestamps."""
+        stream = client.completions.create(
+            model=args.model,
+            prompt=prompt,
+            max_tokens=50,
+            stream=True,
+        )
+        for chunk in stream:
+            if chunk.choices[0].text:
+                token_times.append(time.monotonic())
+                token_count = len(token_times)
+                print(f"Token {token_count}: {chunk.choices[0].text!r}")
+
+                # Signal controller after some tokens
+                if token_count >= 5 and not pause_triggered.is_set():
+                    pause_triggered.set()
+
+    def controller_thread():
+        """Pause and resume the server."""
+        nonlocal pause_token_idx
+
+        # Wait for some tokens
+        pause_triggered.wait()
+
+        print(f"\nPausing server (keep mode) at token {len(token_times)}...")
+        pause_generation(args.base_url, mode="keep")
+        pause_token_idx = len(token_times)
+        print(f"Sleeping for {PAUSE_DURATION}s...")
+
+        time.sleep(PAUSE_DURATION)
+
+        print("Resuming server...")
+        resume_generation(args.base_url)
+        print("Resumed!\n")
+
+    # Run both threads
+    gen_thread = threading.Thread(target=generator_thread)
+    ctrl_thread = threading.Thread(target=controller_thread)
+
+    gen_thread.start()
+    ctrl_thread.start()
+
+    gen_thread.join()
+    ctrl_thread.join()
+
+    # Check gap at the pause point
+    if pause_token_idx < len(token_times):
+        pause_gap = token_times[pause_token_idx] - token_times[pause_token_idx - 1]
+        print(
+            f"\nGap after pause (token {pause_token_idx} -> "
+            f"{pause_token_idx + 1}): {pause_gap:.3f}s"
+        )
+        if pause_gap >= PAUSE_DURATION * 0.9:
+            print("Test passed! Pause synchronized across DP ranks.")
+        else:
+            print(f"Test failed! Expected ~{PAUSE_DURATION}s gap, got {pause_gap:.3f}s")
+    else:
+        print("Test failed! No tokens were generated after resuming.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/pause_resume.py b/examples/features/pause_resume/pause_resume_offline.py
similarity index 100%
rename from examples/offline_inference/pause_resume.py
rename to examples/features/pause_resume/pause_resume_offline.py
diff --git a/examples/offline_inference/run_one_batch.py b/examples/features/profiling/run_one_batch_offline.py
similarity index 100%
rename from examples/offline_inference/run_one_batch.py
rename to examples/features/profiling/run_one_batch_offline.py
diff --git a/examples/offline_inference/simple_profiling.py b/examples/features/profiling/simple_profiling_offline.py
similarity index 100%
rename from examples/offline_inference/simple_profiling.py
rename to examples/features/profiling/simple_profiling_offline.py
diff --git a/examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py b/examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py
new file mode 100644
index 000000000000..f3204645d0a0
--- /dev/null
+++ b/examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py
@@ -0,0 +1,158 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""vLLM OpenAI-Compatible Client with Prompt Embeddings.
+
+This script demonstrates how to:
+1. Generate prompt embeddings using Hugging Face Transformers.
+2. Encode them in base64 format.
+3. Send them to a vLLM server for inference via both:
+    - OpenAI-compatible Chat Completions API
+    - OpenAI-compatible Completions API
+
+Important distinction between the two APIs:
+
+- Chat Completions API: `prompt_embeds` content parts should encode ONLY
+  the user-provided content, not a templated conversation. The server
+  renders the surrounding chat template around the embedded content at
+  request time, the same way it would for a plain text `content` string.
+  Embedding a full templated conversation here would double-apply the
+  template and likely produce undesirable results.
+
+- Completions API: the server does NOT apply a chat template to
+  `prompt_embeds`. The caller is responsible for producing embeddings for
+  the full, already-templated prompt (i.e. apply the chat template first, 
+  then embed the resulting token IDs). Anything the model would normally
+  need (system prompt, role markers, generation prompt, etc.) must already
+  be baked into the embedded tokens.
+
+Run the vLLM server first:
+vllm serve meta-llama/Llama-3.2-1B-Instruct \
+  --runner generate \
+  --max-model-len 4096 \
+  --enable-prompt-embeds
+
+Run the client:
+python examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py
+
+Model: meta-llama/Llama-3.2-1B-Instruct
+Note: This model is gated on Hugging Face Hub.
+      You must request access to use it:
+      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
+
+Dependencies:
+- transformers
+- torch
+- openai
+"""
+
+import transformers
+from openai import OpenAI
+
+from vllm.utils.serial_utils import tensor2base64
+
+
+def run_chat_completion_prompt_embeds(
+    client: OpenAI,
+    model_name: str,
+    tokenizer: transformers.PreTrainedTokenizerBase,
+    embedding_layer,
+    messages: list[dict],
+) -> None:
+    """Run a Chat Completions API request using prompt_embeds content parts.
+
+    This example embeds ONLY the user-provided content of the final user turn, the
+    vLLM server applies the chat template around it at request time.
+    """
+    user_content = messages[-1]["content"]
+    content_token_ids = tokenizer(
+        user_content, return_tensors="pt", add_special_tokens=False
+    ).input_ids
+    content_prompt_embeds = embedding_layer(content_token_ids).squeeze(0)
+    encoded_embeds = tensor2base64(content_prompt_embeds)
+
+    api_messages = [
+        *messages[:-1],
+        {
+            "role": messages[-1]["role"],
+            "content": [{"type": "prompt_embeds", "data": encoded_embeds}],
+        },
+    ]
+
+    chat_completion = client.chat.completions.create(
+        model=model_name,
+        max_tokens=6,
+        temperature=0.0,
+        messages=api_messages,
+    )
+
+    print("-" * 30)
+    print("Chat Completions API")
+    print(chat_completion.choices[0].message.content)
+    print("-" * 30)
+
+
+def run_completion_prompt_embeds(
+    client: OpenAI,
+    model_name: str,
+    tokenizer: transformers.PreTrainedTokenizerBase,
+    embedding_layer,
+    messages: list[dict],
+) -> None:
+    """Run a Completions API request using prompt embeddings.
+
+    The Completions endpoint does not apply a chat template,
+    so the caller must apply it and embed the full templated prompt.
+    """
+    templated_token_ids = tokenizer.apply_chat_template(
+        messages, add_generation_prompt=True, return_tensors="pt", return_dict=True
+    ).input_ids
+    templated_prompt_embeds = embedding_layer(templated_token_ids).squeeze(0)
+    encoded_embeds = tensor2base64(templated_prompt_embeds)
+
+    completion = client.completions.create(
+        model=model_name,
+        prompt=None,
+        max_tokens=6,
+        temperature=0.0,
+        # NOTE: The OpenAI client allows passing in extra JSON body via the
+        # `extra_body` argument.
+        extra_body={"prompt_embeds": encoded_embeds},
+    )
+
+    print("-" * 30)
+    print("Completions API")
+    print(completion.choices[0].text)
+    print("-" * 30)
+
+
+def main() -> None:
+    client = OpenAI(
+        api_key="EMPTY",
+        base_url="http://localhost:8000/v1",
+    )
+
+    model_name = "meta-llama/Llama-3.2-1B-Instruct"
+
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
+    transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
+    embedding_layer = transformers_model.get_input_embeddings()
+
+    messages = [
+        {"role": "user", "content": "Please tell me about the capital of France."}
+    ]
+
+    # Chat Completions API: embed ONLY the user content. The server wraps
+    # the embedding in the chat template when it renders the messages.
+    run_chat_completion_prompt_embeds(
+        client, model_name, tokenizer, embedding_layer, messages
+    )
+
+    # Completions API: embed the FULL templated prompt. The caller must
+    # apply the chat template up-front.
+    run_completion_prompt_embeds(
+        client, model_name, tokenizer, embedding_layer, messages
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/features/prompt_embed/prompt_embed_offline.py
similarity index 97%
rename from examples/offline_inference/prompt_embed_inference.py
rename to examples/features/prompt_embed/prompt_embed_offline.py
index a0eaeb6810a2..29853bce9673 100644
--- a/examples/offline_inference/prompt_embed_inference.py
+++ b/examples/features/prompt_embed/prompt_embed_offline.py
@@ -15,7 +15,7 @@
 - transformers
 
 Run:
-    python examples/offline_inference/prompt_embed_inference.py
+    python examples/features/prompt_embed/prompt_embed_offline.py
 """
 
 import torch
diff --git a/examples/offline_inference/llm_engine_reset_kv.py b/examples/features/reset_kv/reset_kv_offline.py
similarity index 100%
rename from examples/offline_inference/llm_engine_reset_kv.py
rename to examples/features/reset_kv/reset_kv_offline.py
diff --git a/examples/offline_inference/load_sharded_state.py b/examples/features/sharded_state/load_sharded_state_offline.py
similarity index 94%
rename from examples/offline_inference/load_sharded_state.py
rename to examples/features/sharded_state/load_sharded_state_offline.py
index 0085e8e8e32b..e867db5d12fe 100644
--- a/examples/offline_inference/load_sharded_state.py
+++ b/examples/features/sharded_state/load_sharded_state_offline.py
@@ -3,16 +3,16 @@
 """
 Validates the loading of a model saved with the sharded_state format.
 This script demonstrates how to load a model that was previously saved
-using save_sharded_state.py and validates it by running inference.
+using save_sharded_state_offline.py and validates it by running inference.
 Example usage:
 (First need to save a sharded_state mode)
 
-python save_sharded_state.py \
+python save_sharded_state_offline.py \
     --model /path/to/load \
     --tensor-parallel-size 8 \
     --output /path/to/save/sharded/model
 
-python load_sharded_state.py \
+python load_sharded_state_offline.py \
     --model /path/to/saved/sharded/model \
     --load-format sharded_state \
     --tensor-parallel-size 8 \
diff --git a/examples/offline_inference/save_sharded_state.py b/examples/features/sharded_state/save_sharded_state_offline.py
similarity index 98%
rename from examples/offline_inference/save_sharded_state.py
rename to examples/features/sharded_state/save_sharded_state_offline.py
index 14d472ee3f23..675f2e35a53f 100644
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/features/sharded_state/save_sharded_state_offline.py
@@ -7,7 +7,7 @@
 
 Example usage:
 
-python save_sharded_state.py \
+python save_sharded_state_offline.py \
     --model /path/to/load \
     --tensor-parallel-size 8 \
     --output /path/to/save
diff --git a/examples/offline_inference/extract_hidden_states.py b/examples/features/speculative_decoding/extract_hidden_states_offline.py
similarity index 100%
rename from examples/offline_inference/extract_hidden_states.py
rename to examples/features/speculative_decoding/extract_hidden_states_offline.py
diff --git a/examples/offline_inference/mlpspeculator.py b/examples/features/speculative_decoding/mlpspeculator_offline.py
similarity index 100%
rename from examples/offline_inference/mlpspeculator.py
rename to examples/features/speculative_decoding/mlpspeculator_offline.py
diff --git a/examples/offline_inference/spec_decode.py b/examples/features/speculative_decoding/spec_decode_offline.py
similarity index 100%
rename from examples/offline_inference/spec_decode.py
rename to examples/features/speculative_decoding/spec_decode_offline.py
diff --git a/examples/online_serving/structured_outputs/README.md b/examples/features/structured_outputs/README.md
similarity index 85%
rename from examples/online_serving/structured_outputs/README.md
rename to examples/features/structured_outputs/README.md
index 7f539716ecf8..f2863eb0cbcf 100644
--- a/examples/online_serving/structured_outputs/README.md
+++ b/examples/features/structured_outputs/README.md
@@ -20,7 +20,7 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
 If you want to run this script standalone with `uv`, you can use the following:
 
 ```bash
-uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
+uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/features/structured_outputs \
     structured-outputs
 ```
 
@@ -34,19 +34,19 @@ See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.ht
 Run all constraints, non-streaming:
 
 ```bash
-uv run structured_outputs.py
+uv run structured_outputs_offline.py
 ```
 
 Run all constraints, streaming:
 
 ```bash
-uv run structured_outputs.py --stream
+uv run structured_outputs_offline.py --stream
 ```
 
 Run certain constraints, for example `structural_tag` and `regex`, streaming:
 
 ```bash
-uv run structured_outputs.py \
+uv run structured_outputs_offline.py \
     --constraint structural_tag regex \
     --stream
 ```
@@ -54,5 +54,5 @@ uv run structured_outputs.py \
 Run all constraints, with reasoning models and streaming:
 
 ```bash
-uv run structured_outputs.py --reasoning --stream
+uv run structured_outputs_offline.py --reasoning --stream
 ```
diff --git a/examples/online_serving/structured_outputs/pyproject.toml b/examples/features/structured_outputs/pyproject.toml
similarity index 100%
rename from examples/online_serving/structured_outputs/pyproject.toml
rename to examples/features/structured_outputs/pyproject.toml
diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/features/structured_outputs/structured_outputs_client.py
similarity index 100%
rename from examples/online_serving/structured_outputs/structured_outputs.py
rename to examples/features/structured_outputs/structured_outputs_client.py
diff --git a/examples/offline_inference/structured_outputs.py b/examples/features/structured_outputs/structured_outputs_offline.py
similarity index 100%
rename from examples/offline_inference/structured_outputs.py
rename to examples/features/structured_outputs/structured_outputs_offline.py
diff --git a/examples/offline_inference/torchrun_dp_example.py b/examples/features/torchrun/torchrun_dp_example_offline.py
similarity index 95%
rename from examples/offline_inference/torchrun_dp_example.py
rename to examples/features/torchrun/torchrun_dp_example_offline.py
index eb7ed969ea4b..f18f6042e9c6 100644
--- a/examples/offline_inference/torchrun_dp_example.py
+++ b/examples/features/torchrun/torchrun_dp_example_offline.py
@@ -7,15 +7,15 @@
 
 To run this example:
 ```bash
-$ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
+$ torchrun --nproc-per-node=2 examples/features/torchrun/torchrun_dp_example_offline.py
 ```
 
 With custom parallelism settings:
 ```bash
-$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+$ torchrun --nproc-per-node=8 examples/features/torchrun/torchrun_dp_example_offline.py \
     --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 ```
-"""
+"""  # noqa: E501
 
 import argparse
 
diff --git a/examples/offline_inference/torchrun_example.py b/examples/features/torchrun/torchrun_example_offline.py
similarity index 99%
rename from examples/offline_inference/torchrun_example.py
rename to examples/features/torchrun/torchrun_example_offline.py
index 2960d329968a..e41bcd420c20 100644
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/features/torchrun/torchrun_example_offline.py
@@ -4,7 +4,7 @@
 experimental support for tensor-parallel inference with torchrun,
 see https://github.com/vllm-project/vllm/issues/11400 for
 the motivation and use case for this example.
-run the script with `torchrun --nproc-per-node=4 torchrun_example.py`,
+run the script with `torchrun --nproc-per-node=4 torchrun_example_offline.py`,
 the argument `4` should match the product of `tensor_parallel_size` and
 `pipeline_parallel_size` below. see `tests/distributed/test_torchrun_example.py`
 for the unit test.
diff --git a/examples/generate/multimodal/vision_language_offline.py b/examples/generate/multimodal/vision_language_offline.py
index 87d42c036ec1..794f20dd0a52 100644
--- a/examples/generate/multimodal/vision_language_offline.py
+++ b/examples/generate/multimodal/vision_language_offline.py
@@ -2466,6 +2466,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
 MODELS_SUPPORT_VIT_CUDA_GRAPH = [
     "qwen3_vl",
     "qwen3_vl_moe",
+    "qwen2_5_vl",
 ]
 
 
diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_multiturn.py b/examples/online_serving/disaggregated_serving/disagg_proxy_multiturn.py
new file mode 100644
index 000000000000..24d90eab0292
--- /dev/null
+++ b/examples/online_serving/disaggregated_serving/disagg_proxy_multiturn.py
@@ -0,0 +1,562 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Disaggregated Prefill/Decode Proxy with Bidirectional KV Transfer
+
+This proxy sits between clients and a vLLM Prefill/Decode (P/D) deployment,
+routing multi-turn chat requests so that each turn reuses KV cache blocks
+from the previous turn's Decode node via bidirectional KV transfer.
+
+Architecture:
+    Client  ──►  Proxy  ──►  Prefill (P)  ──►  Decode (D)
+                   │              │                 │
+                   │   kv_transfer_params flow:     │
+                   │   D finish ──► proxy caches    │
+                   │   next turn ──► proxy sends    │
+                   │   cached D blocks to P ──►     │
+                   │   P reads D blocks (bidir)     │
+                   │   P sends its blocks to D      │
+
+Per-request flow:
+    1. Client sends chat/completions request to proxy.
+    2. Proxy looks up cached D block info from the previous turn
+       (keyed by conversation_id).
+    3. If cache hit, proxy attaches D's block info to the request
+       so P can read D's KV blocks instead of recomputing.
+    4. Proxy sends request to P (max_tokens=1, non-streaming).
+    5. P returns kv_transfer_params with its own block info.
+    6. Proxy forwards request + P's block info to D (streaming).
+    7. D streams the response. The final chunk includes D's
+       kv_transfer_params, which the proxy caches for the next turn.
+    8. Proxy returns D's response to the client.
+
+Conversation isolation:
+    Each request must include a ``conversation_id`` field (top-level in
+    the JSON body) to scope the KV cache across turns. Without it, the
+    proxy cannot link turns and falls back to no-cache behavior.
+
+Usage:
+    python disagg_proxy_multiturn.py \\
+        --host 0.0.0.0 --port 8000 \\
+        --prefiller-host 10.0.0.1 --prefiller-port 8100 \\
+        --decoder-host 10.0.0.2 --decoder-port 8200
+
+Dependencies:
+    pip install fastapi uvicorn httpx
+"""
+
+from __future__ import annotations
+
+import argparse
+import itertools
+import json
+import logging
+import os
+import time
+import uuid
+from contextlib import asynccontextmanager
+from dataclasses import dataclass, field
+from typing import Any
+
+import httpx
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+
+# Logging
+logging.basicConfig(
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger("disagg_proxy")
+
+
+# Data structures
+@dataclass
+class CachedKVEntry:
+    """KV transfer parameters cached from D's response for one turn."""
+
+    kv_transfer_params: dict[str, Any]
+    timestamp: float = field(default_factory=time.time)
+
+
+class ConversationKVCache:
+    """Per-conversation KV block cache.
+
+    Each conversation is identified by a ``conversation_id`` supplied by
+    the client. After D finishes a turn, its ``kv_transfer_params`` are
+    stored here. On the next turn, the proxy retrieves them so P can
+    read D's blocks via bidirectional KV transfer.
+    """
+
+    def __init__(self, ttl_seconds: float = 600.0) -> None:
+        self._store: dict[str, CachedKVEntry] = {}
+        self._ttl = ttl_seconds
+
+    def get(self, conversation_id: str) -> dict[str, Any] | None:
+        """Retrieve and consume cached KV params for a conversation.
+
+        Returns a *copy* of the kv_transfer_params dict, or None.
+        The entry is removed after retrieval (single-use).
+        """
+        entry = self._store.pop(conversation_id, None)
+        if entry is None:
+            return None
+        age = time.time() - entry.timestamp
+        if age > self._ttl:
+            logger.info(
+                "conv=%s: stale cache entry (age=%.1fs > ttl=%.1fs), discarding",
+                conversation_id,
+                age,
+                self._ttl,
+            )
+            return None
+        logger.info(
+            "conv=%s: cache HIT (age=%.1fs)",
+            conversation_id,
+            age,
+        )
+        return dict(entry.kv_transfer_params)
+
+    def put(self, conversation_id: str, kv_params: dict[str, Any]) -> None:
+        """Store D's kv_transfer_params for a conversation."""
+        self._store[conversation_id] = CachedKVEntry(
+            kv_transfer_params=dict(kv_params),  # defensive copy
+        )
+        logger.info(
+            "conv=%s: cached D blocks (remote_request_id=%s, blocks=%d)",
+            conversation_id,
+            kv_params.get("remote_request_id", "?"),
+            len(kv_params.get("remote_block_ids", [[]])[0])
+            if kv_params.get("remote_block_ids")
+            else 0,
+        )
+
+    def evict_stale(self) -> int:
+        """Remove entries older than TTL. Returns count of evicted entries."""
+        now = time.time()
+        stale = [
+            cid
+            for cid, entry in self._store.items()
+            if now - entry.timestamp > self._ttl
+        ]
+        for cid in stale:
+            del self._store[cid]
+        return len(stale)
+
+    @property
+    def size(self) -> int:
+        return len(self._store)
+
+
+# Global state
+kv_cache = ConversationKVCache(
+    ttl_seconds=450.0
+)  # Must be < VLLM_NIXL_ABORT_REQUEST_TIMEOUT (480s)
+
+
+# Service client helpers
+@dataclass
+class ServiceClient:
+    """Wrapper around an httpx.AsyncClient for a P or D instance."""
+
+    client: httpx.AsyncClient
+    host: str
+    port: int
+    id: int
+
+
+def _make_headers(request_id: str) -> dict[str, str]:
+    """Build HTTP headers for upstream requests."""
+    headers = {"X-Request-Id": request_id}
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+    return headers
+
+
+async def _send_to_prefill(
+    client: ServiceClient,
+    endpoint: str,
+    req_data: dict[str, Any],
+    request_id: str,
+) -> dict[str, Any]:
+    """Send a non-streaming prefill request (max_tokens=1).
+
+    Returns the JSON response from P, which includes kv_transfer_params.
+    """
+    payload = req_data.copy()
+    payload["stream"] = False
+    payload["max_tokens"] = 1
+    payload.pop("max_completion_tokens", None)
+    payload.pop("min_tokens", None)
+    payload.pop("stream_options", None)
+
+    resp = await client.client.post(
+        endpoint,
+        json=payload,
+        headers=_make_headers(request_id),
+    )
+    resp.raise_for_status()
+    return resp.json()
+
+
+async def _stream_from_decode(
+    client: ServiceClient,
+    endpoint: str,
+    req_data: dict[str, Any],
+    request_id: str,
+    conversation_id: str,
+) -> tuple[str, str | None, dict[str, Any] | None, str, str | None, int | None]:
+    """Stream response from D, capturing text and kv_transfer_params.
+
+    Returns (collected_text, finish_reason, kv_params, response_id, created).
+    Also stores kv_params in the conversation cache.
+    """
+    payload = req_data.copy()
+    payload["stream"] = True
+
+    collected_text = ""
+    finish_reason: str | None = None
+    response_id: str | None = None
+    model_name: str | None = None
+    created: int | None = None
+    captured_kv: dict[str, Any] | None = None
+
+    async with client.client.stream(
+        "POST",
+        endpoint,
+        json=payload,
+        headers=_make_headers(request_id),
+    ) as resp:
+        resp.raise_for_status()
+        async for line in resp.aiter_lines():
+            if not line or not line.startswith("data: "):
+                continue
+            if line == "data: [DONE]":
+                break
+            try:
+                chunk = json.loads(line[6:])
+            except json.JSONDecodeError:
+                continue
+
+            if response_id is None:
+                response_id = chunk.get("id")
+                model_name = chunk.get("model")
+                created = chunk.get("created")
+
+            for choice in chunk.get("choices", []):
+                collected_text += choice.get("text", "")
+                delta = choice.get("delta", {})
+                collected_text += delta.get("content", "")
+                if choice.get("finish_reason"):
+                    finish_reason = choice["finish_reason"]
+
+            kv_params = chunk.get("kv_transfer_params")
+            if kv_params:
+                kv_params["remote_host"] = client.host
+                captured_kv = kv_params
+                if conversation_id:
+                    kv_cache.put(conversation_id, kv_params)
+
+    return (
+        collected_text,
+        finish_reason,
+        captured_kv,
+        response_id or request_id,
+        model_name,
+        created,
+    )
+
+
+async def _stream_from_decode_sse(
+    client: ServiceClient,
+    endpoint: str,
+    req_data: dict[str, Any],
+    request_id: str,
+    conversation_id: str,
+):
+    """Yield SSE chunks from D to the client, capturing kv_transfer_params."""
+    payload = req_data.copy()
+    payload["stream"] = True
+
+    async with client.client.stream(
+        "POST",
+        endpoint,
+        json=payload,
+        headers=_make_headers(request_id),
+    ) as resp:
+        resp.raise_for_status()
+        async for line in resp.aiter_lines():
+            if not line:
+                yield "\n"
+                continue
+
+            if line.startswith("data: ") and line != "data: [DONE]":
+                try:
+                    chunk = json.loads(line[6:])
+                    kv_params = chunk.get("kv_transfer_params")
+                    if kv_params and conversation_id:
+                        kv_params["remote_host"] = client.host
+                        kv_cache.put(conversation_id, kv_params)
+                except json.JSONDecodeError:
+                    pass
+
+            yield line + "\n"
+
+
+# FastAPI application
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Initialize HTTP clients for P and D instances."""
+    app.state.prefill_clients: list[ServiceClient] = []
+    app.state.decode_clients: list[ServiceClient] = []
+
+    for i, (host, port) in enumerate(global_args.prefiller_instances):
+        app.state.prefill_clients.append(
+            ServiceClient(
+                client=httpx.AsyncClient(
+                    timeout=None,
+                    base_url=f"http://{host}:{port}/v1",
+                ),
+                host=host,
+                port=port,
+                id=i,
+            )
+        )
+
+    for i, (host, port) in enumerate(global_args.decoder_instances):
+        app.state.decode_clients.append(
+            ServiceClient(
+                client=httpx.AsyncClient(
+                    timeout=None,
+                    base_url=f"http://{host}:{port}/v1",
+                ),
+                host=host,
+                port=port,
+                id=i,
+            )
+        )
+
+    app.state.prefill_iter = itertools.cycle(range(len(app.state.prefill_clients)))
+    app.state.decode_iter = itertools.cycle(range(len(app.state.decode_clients)))
+
+    logger.info(
+        "Ready: %d prefill, %d decode instances",
+        len(app.state.prefill_clients),
+        len(app.state.decode_clients),
+    )
+    yield
+
+    for sc in app.state.prefill_clients + app.state.decode_clients:
+        await sc.client.aclose()
+
+
+app = FastAPI(title="Disaggregated P/D Proxy (Multi-turn)", lifespan=lifespan)
+
+
+def _next_client(app_state, role: str) -> ServiceClient:
+    if role == "prefill":
+        return app_state.prefill_clients[next(app_state.prefill_iter)]
+    return app_state.decode_clients[next(app_state.decode_iter)]
+
+
+# Request handler
+async def _handle_request(api_path: str, request: Request):
+    """Core request handler for both /v1/chat/completions and /v1/completions."""
+    req_data = await request.json()
+    request_id = str(uuid.uuid4())
+    conversation_id: str = req_data.pop("conversation_id", "")
+    client_wants_stream = req_data.get("stream", False)
+
+    if not conversation_id:
+        logger.warning(
+            "[%s] No conversation_id provided — KV cache reuse disabled "
+            "for this request. Add a 'conversation_id' field to enable "
+            "cross-turn KV sharing.",
+            request_id,
+        )
+
+    # Step 1: Look up cached D blocks from the previous turn
+    cached_kv = kv_cache.get(conversation_id) if conversation_id else None
+
+    if cached_kv:
+        # Tell P to read D's blocks (bidirectional transfer)
+        cached_kv["do_remote_decode"] = True
+        cached_kv["do_remote_prefill"] = False
+        req_data["kv_transfer_params"] = cached_kv
+        logger.info(
+            "[%s] conv=%s: sending D's cached blocks to P (remote_request_id=%s)",
+            request_id,
+            conversation_id,
+            cached_kv.get("remote_request_id"),
+        )
+    else:
+        # No cached blocks — P recomputes from scratch
+        req_data["kv_transfer_params"] = {
+            "do_remote_decode": True,
+            "do_remote_prefill": False,
+            "remote_engine_id": None,
+            "remote_block_ids": None,
+            "remote_host": None,
+            "remote_port": None,
+        }
+        logger.info("[%s] conv=%s: cache MISS", request_id, conversation_id)
+
+    # Step 2: Send to Prefill node (non-streaming, max_tokens=1)
+    prefill_client = _next_client(request.app.state, "prefill")
+    t0 = time.time()
+    prefill_resp = await _send_to_prefill(
+        prefill_client,
+        api_path,
+        req_data,
+        request_id,
+    )
+    logger.info(
+        "[%s] Prefill done in %.0fms",
+        request_id,
+        (time.time() - t0) * 1000,
+    )
+
+    # Attach P's kv_transfer_params for D to read P's blocks
+    p_kv_params = prefill_resp.get("kv_transfer_params", {})
+    if p_kv_params:
+        p_kv_params["remote_host"] = prefill_client.host
+        req_data["kv_transfer_params"] = p_kv_params
+
+    # Step 3: Stream from Decode node, capturing kv_transfer_params
+    decode_client = _next_client(request.app.state, "decode")
+
+    if client_wants_stream:
+        return StreamingResponse(
+            _stream_from_decode_sse(
+                decode_client,
+                api_path,
+                req_data,
+                request_id,
+                conversation_id,
+            ),
+            media_type="text/event-stream",
+        )
+
+    text, finish_reason, _, resp_id, model, created = await _stream_from_decode(
+        decode_client,
+        api_path,
+        req_data,
+        request_id,
+        conversation_id,
+    )
+
+    # Build OpenAI-compatible response
+    is_chat = "messages" in req_data
+    if is_chat:
+        body = {
+            "id": resp_id,
+            "object": "chat.completion",
+            "created": created or int(time.time()),
+            "model": model or req_data.get("model", ""),
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {"role": "assistant", "content": text},
+                    "finish_reason": finish_reason,
+                }
+            ],
+            "usage": None,
+        }
+    else:
+        body = {
+            "id": resp_id,
+            "object": "text_completion",
+            "created": created or int(time.time()),
+            "model": model or req_data.get("model", ""),
+            "choices": [
+                {
+                    "index": 0,
+                    "text": text,
+                    "logprobs": None,
+                    "finish_reason": finish_reason,
+                }
+            ],
+            "usage": None,
+        }
+    return JSONResponse(content=body)
+
+
+# Routes
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    return await _handle_request("/chat/completions", request)
+
+
+@app.post("/v1/completions")
+async def completions(request: Request):
+    return await _handle_request("/completions", request)
+
+
+@app.get("/health")
+async def health():
+    evicted = kv_cache.evict_stale()
+    return {
+        "status": "ok",
+        "cached_conversations": kv_cache.size,
+        "evicted_stale": evicted,
+    }
+
+
+# CLI
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(
+        description="Disaggregated P/D proxy with bidirectional KV transfer",
+    )
+    p.add_argument("--host", default="0.0.0.0")
+    p.add_argument("--port", type=int, default=8000)
+    p.add_argument(
+        "--prefiller-host",
+        "--prefiller-hosts",
+        dest="prefiller_hosts",
+        nargs="+",
+        default=["localhost"],
+    )
+    p.add_argument(
+        "--prefiller-port",
+        "--prefiller-ports",
+        dest="prefiller_ports",
+        type=int,
+        nargs="+",
+        default=[8100],
+    )
+    p.add_argument(
+        "--decoder-host",
+        "--decoder-hosts",
+        dest="decoder_hosts",
+        nargs="+",
+        default=["localhost"],
+    )
+    p.add_argument(
+        "--decoder-port",
+        "--decoder-ports",
+        dest="decoder_ports",
+        type=int,
+        nargs="+",
+        default=[8200],
+    )
+    args = p.parse_args()
+
+    if len(args.prefiller_hosts) != len(args.prefiller_ports):
+        p.error("Number of prefiller hosts must match ports")
+    if len(args.decoder_hosts) != len(args.decoder_ports):
+        p.error("Number of decoder hosts must match ports")
+
+    args.prefiller_instances = list(zip(args.prefiller_hosts, args.prefiller_ports))
+    args.decoder_instances = list(zip(args.decoder_hosts, args.decoder_ports))
+    return args
+
+
+if __name__ == "__main__":
+    global global_args
+    global_args = parse_args()
+
+    import uvicorn
+
+    uvicorn.run(app, host=global_args.host, port=global_args.port)
diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/online_serving/prompt_embed_inference_with_openai_client.py
deleted file mode 100644
index fa4b64c00703..000000000000
--- a/examples/online_serving/prompt_embed_inference_with_openai_client.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-vLLM OpenAI-Compatible Client with Prompt Embeddings
-
-This script demonstrates how to:
-1. Generate prompt embeddings using Hugging Face Transformers
-2. Encode them in base64 format
-3. Send them to a vLLM server via the OpenAI-compatible Completions API
-
-Run the vLLM server first:
-vllm serve meta-llama/Llama-3.2-1B-Instruct \
-  --runner generate \
-  --max-model-len 4096 \
-  --enable-prompt-embeds
-
-Run the client:
-python examples/online_serving/prompt_embed_inference_with_openai_client.py
-
-Model: meta-llama/Llama-3.2-1B-Instruct
-Note: This model is gated on Hugging Face Hub.
-      You must request access to use it:
-      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
-
-Dependencies:
-- transformers
-- torch
-- openai
-"""
-
-import transformers
-from openai import OpenAI
-
-from vllm.utils.serial_utils import tensor2base64
-
-
-def main():
-    client = OpenAI(
-        api_key="EMPTY",
-        base_url="http://localhost:8000/v1",
-    )
-
-    model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-    # Transformers
-    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-    transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-    # Refer to the HuggingFace repo for the correct format to use
-    chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-    token_ids = tokenizer.apply_chat_template(
-        chat, add_generation_prompt=True, return_tensors="pt", return_dict=True
-    ).input_ids
-
-    embedding_layer = transformers_model.get_input_embeddings()
-    prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-    # Prompt embeddings
-    encoded_embeds = tensor2base64(prompt_embeds)
-
-    completion = client.completions.create(
-        model=model_name,
-        prompt=None,
-        max_tokens=5,
-        temperature=0.0,
-        # NOTE: The OpenAI client allows passing in extra JSON body via the
-        # `extra_body` argument.
-        extra_body={"prompt_embeds": encoded_embeds},
-    )
-
-    print("-" * 30)
-    print(completion.choices[0].text)
-    print("-" * 30)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/offline_inference/batch_llm_inference.py b/examples/ray_serving/batch_llm_inference.py
similarity index 100%
rename from examples/offline_inference/batch_llm_inference.py
rename to examples/ray_serving/batch_llm_inference.py
diff --git a/examples/online_serving/elastic_ep/bench.sh b/examples/ray_serving/elastic_ep/bench.sh
similarity index 100%
rename from examples/online_serving/elastic_ep/bench.sh
rename to examples/ray_serving/elastic_ep/bench.sh
diff --git a/examples/online_serving/elastic_ep/scale.py b/examples/ray_serving/elastic_ep/scale.py
similarity index 100%
rename from examples/online_serving/elastic_ep/scale.py
rename to examples/ray_serving/elastic_ep/scale.py
diff --git a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh b/examples/ray_serving/elastic_ep/serve_deepseek_v2.sh
similarity index 100%
rename from examples/online_serving/elastic_ep/serve_deepseek_v2.sh
rename to examples/ray_serving/elastic_ep/serve_deepseek_v2.sh
diff --git a/examples/online_serving/multi-node-serving.sh b/examples/ray_serving/multi-node-serving.sh
similarity index 100%
rename from examples/online_serving/multi-node-serving.sh
rename to examples/ray_serving/multi-node-serving.sh
diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/ray_serving/ray_serve_deepseek.py
similarity index 100%
rename from examples/online_serving/ray_serve_deepseek.py
rename to examples/ray_serving/ray_serve_deepseek.py
diff --git a/examples/online_serving/run_cluster.sh b/examples/ray_serving/run_cluster.sh
similarity index 100%
rename from examples/online_serving/run_cluster.sh
rename to examples/ray_serving/run_cluster.sh
diff --git a/examples/offline_inference/routed_experts_e2e.py b/examples/rl/routed_experts_e2e.py
similarity index 99%
rename from examples/offline_inference/routed_experts_e2e.py
rename to examples/rl/routed_experts_e2e.py
index bb1d7b411f99..1666bc3ffe16 100644
--- a/examples/offline_inference/routed_experts_e2e.py
+++ b/examples/rl/routed_experts_e2e.py
@@ -9,7 +9,7 @@
 3. Results are deterministic across runs (baseline vs reference).
 
 Usage:
-    python examples/offline_inference/routed_experts_e2e.py \
+    python examples/rl/routed_experts_e2e.py \
         --model Qwen/Qwen3-30B-A3B \
         --tp 4 \
         --max-model-len 4096 \
diff --git a/examples/offline_inference/skip_loading_weights_in_engine_init.py b/examples/rl/skip_loading_weights_in_engine_init.py
similarity index 100%
rename from examples/offline_inference/skip_loading_weights_in_engine_init.py
rename to examples/rl/skip_loading_weights_in_engine_init.py
diff --git a/examples/tool_chat_template_gemma4.jinja b/examples/tool_chat_template_gemma4.jinja
index 15c5238ac332..f62ca843a405 100644
--- a/examples/tool_chat_template_gemma4.jinja
+++ b/examples/tool_chat_template_gemma4.jinja
@@ -1,9 +1,9 @@
-{%- macro format_parameters(properties, required) -%}
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
     {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
     {%- set ns = namespace(found_first=false) -%}
     {%- for key, value in properties | dictsort -%}
         {%- set add_comma = false -%}
-        {%- if key not in standard_keys -%}
+        {%- if not filter_keys or key not in standard_keys -%}
             {%- if ns.found_first %},{% endif -%}
             {%- set ns.found_first = true -%}
             {{ key }}:{
@@ -11,34 +11,15 @@
                 description:<|"|>{{ value['description'] }}<|"|>
                 {%- set add_comma = true -%}
             {%- endif -%}
-            {%- if value['nullable'] %}
-                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
-                nullable:true
-            {%- endif -%}
             {%- if value['type'] | upper == 'STRING' -%}
                 {%- if value['enum'] -%}
                     {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
                     enum:{{ format_argument(value['enum']) }}
                 {%- endif -%}
-            {%- elif value['type'] | upper == 'OBJECT' -%}
-                ,properties:{
-                {%- if value['properties'] is defined and value['properties'] is mapping -%}
-                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
-                {%- elif value is mapping -%}
-                    {{- format_parameters(value, value['required'] | default([])) -}}
-                {%- endif -%}
-                }
-                {%- if value['required'] -%}
-                    ,required:[
-                    {%- for item in value['required'] | default([]) -%}
-                        <|"|>{{- item -}}<|"|>
-                        {%- if not loop.last %},{% endif -%}
-                    {%- endfor -%}
-                    ]
-                {%- endif -%}
             {%- elif value['type'] | upper == 'ARRAY' -%}
                 {%- if value['items'] is mapping and value['items'] -%}
-                    ,items:{
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
                     {%- set ns_items = namespace(found_first=false) -%}
                     {%- for item_key, item_value in value['items'] | dictsort -%}
                         {%- if item_value is not none -%}
@@ -71,6 +52,32 @@
                     }
                 {%- endif -%}
             {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
             {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
             type:<|"|>{{ value['type'] | upper }}<|"|>}
         {%- endif -%}
@@ -167,20 +174,25 @@
 
 {%- set ns = namespace(prev_message_type=None) -%}
 {%- set loop_messages = messages -%}
-{{ bos_token }}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
 {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
     {{- '<|turn>system\n' -}}
-
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
     {%- if enable_thinking is defined and enable_thinking -%}
-        {{- '<|think|>' -}}
+        {{- '<|think|>\n' -}}
         {%- set ns.prev_message_type = 'think' -%}
     {%- endif -%}
-
     {%- if messages[0]['role'] in ['system', 'developer'] -%}
-        {{- messages[0]['content'] | trim -}}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
         {%- set loop_messages = messages[1:] -%}
     {%- endif -%}
-
     {%- if tools -%}
         {%- for tool in tools %}
             {{- '<|tool>' -}}
@@ -189,10 +201,10 @@
         {%- endfor %}
         {%- set ns.prev_message_type = 'tool' -%}
     {%- endif -%}
-
     {{- '<turn|>\n' -}}
 {%- endif %}
 
+{#- Pre-scan: find last user message index for reasoning guard -#}
 {%- set ns_turn = namespace(last_user_idx=-1) -%}
 {%- for i in range(loop_messages | length) -%}
     {%- if loop_messages[i]['role'] == 'user' -%}
@@ -200,12 +212,12 @@
     {%- endif -%}
 {%- endfor -%}
 
+{#- Loop through messages -#}
 {%- for message in loop_messages -%}
     {%- if message['role'] != 'tool' -%}
     {%- set ns.prev_message_type = None -%}
     {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
-    {#- OpenAI may emit multiple assistant messages in one tool loop (user → asst → tool → asst → tool).
-        Only the first of those should open <|turn>model; later ones continue the same model turn. -#}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
     {%- set prev_nt = namespace(role=None, found=false) -%}
     {%- if loop.index0 > 0 -%}
         {%- for j in range(loop.index0 - 1, -1, -1) -%}
@@ -222,8 +234,10 @@
         {{- '<|turn>' + role + '\n' }}
     {%- endif -%}
 
-    {%- if message.get('reasoning') and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
-        {{- '<|channel>thought\n' + message['reasoning'] + '\n<channel|>'}}
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
     {%- endif -%}
 
             {%- if message['tool_calls'] -%}
@@ -247,14 +261,14 @@
 
             {%- set ns_tr_out = namespace(flag=false) -%}
             {%- if message.get('tool_responses') -%}
-                {#- Legacy: tool_responses embedded on the assistant message -#}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
                 {%- for tool_response in message['tool_responses'] -%}
                     {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
                     {%- set ns_tr_out.flag = true -%}
                     {%- set ns.prev_message_type = 'tool_response' -%}
                 {%- endfor -%}
             {%- elif message.get('tool_calls') -%}
-                {#- OpenAI Chat Completions: consecutive following messages with role "tool" (no break/continue; range scan) -#}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
                 {%- set ns_tool_scan = namespace(stopped=false) -%}
                 {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
                     {%- if ns_tool_scan.stopped -%}
@@ -262,12 +276,14 @@
                         {%- set ns_tool_scan.stopped = true -%}
                     {%- else -%}
                         {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
                         {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
                         {%- for tc in message['tool_calls'] -%}
                             {%- if tc.get('id') == follow.get('tool_call_id') -%}
                                 {%- set ns_tname.name = tc['function']['name'] -%}
                             {%- endif -%}
                         {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
                         {%- set tool_body = follow.get('content') -%}
                         {%- if tool_body is string -%}
                             {{- format_tool_response_block(ns_tname.name, tool_body) -}}
@@ -288,6 +304,7 @@
                 {%- endfor -%}
             {%- endif -%}
 
+            {%- set captured_content -%}
             {%- if message['content'] is string -%}
                 {%- if role == 'model' -%}
                     {{- strip_thinking(message['content']) -}}
@@ -303,29 +320,35 @@
                             {{- item['text'] | trim -}}
                         {%- endif -%}
                     {%- elif item['type'] == 'image' -%}
-                        {{- '\n\n<|image|>\n\n' -}}
+                        {{- '<|image|>' -}}
                         {%- set ns.prev_message_type = 'image' -%}
                     {%- elif item['type'] == 'audio' -%}
                         {{- '<|audio|>' -}}
                         {%- set ns.prev_message_type = 'audio' -%}
                     {%- elif item['type'] == 'video' -%}
-                        {{- '\n\n<|video|>\n\n' -}}
+                        {{- '<|video|>' -}}
                         {%- set ns.prev_message_type = 'video' -%}
                     {%- endif -%}
                 {%- endfor -%}
             {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
 
-        {%- if not (ns_tr_out.flag and not message.get('content')) -%}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
             {{- '<turn|>\n' -}}
         {%- endif -%}
     {%- endif -%}
 {%- endfor -%}
 
 {%- if add_generation_prompt -%}
-    {%- if ns.prev_message_type != 'tool_response' -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
         {{- '<|turn>model\n' -}}
+        {%- if not enable_thinking | default(false) -%}
+            {{- '<|channel>thought\n<channel|>' -}}
+        {%- endif -%}
     {%- endif -%}
-    {%- if not enable_thinking | default(false) -%}
-        {{- '<|channel>thought\n<channel|>' -}}
-    {%- endif -%}
-{%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/requirements/common.txt b/requirements/common.txt
index 5d4519204ee9..68f5e165b923 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -24,14 +24,14 @@ outlines_core == 0.2.14
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
-xgrammar >= 0.1.32, < 1.0.0; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
+xgrammar >= 0.2.0, < 1.0.0; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq >= 25.0.0
 msgspec
 gguf >= 0.17.0
-mistral_common[image] >= 1.11.0
+mistral_common[image] >= 1.11.2
 opencv-python-headless >= 4.13.0    # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
@@ -49,7 +49,7 @@ ijson # Required for mistral streaming tool parser
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3  # Required for gpt-oss
 anthropic >= 0.71.0
-model-hosting-container-standards >= 0.1.13, < 1.0.0
+model-hosting-container-standards >= 0.1.14, < 1.0.0
 mcp
 opentelemetry-sdk >= 1.27.0
 opentelemetry-api >= 1.27.0
diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index 0b472b90c026..037b20874b52 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -21,3 +21,6 @@ timm>=1.0.17
 # amd-quark: required for Quark quantization on ROCm 
 # To be consistent with test_quark.py
 amd-quark>=0.8.99
+# tilelang has to be installed for mhc module to be
+# imported correctly.
+tilelang==0.1.9
diff --git a/requirements/test/cuda.in b/requirements/test/cuda.in
index 73d50104d86a..71e496ccf650 100644
--- a/requirements/test/cuda.in
+++ b/requirements/test/cuda.in
@@ -31,7 +31,7 @@ torchaudio==2.11.0
 torchvision==0.26.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio] >= 1.11.0 # required for voxtral test
+mistral_common[image,audio] >= 1.11.2 # required for voxtral test
 num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
 opencv-python-headless >= 4.13.0 # required for video test
@@ -61,7 +61,11 @@ fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
 instanttensor>=0.1.5
 pydantic>=2.12 # 2.11 leads to error on python 3.13
 decord==0.6.0; platform_machine == "x86_64"
-terratorch >= 1.2.2 # Required for Prithvi tests
+# terratorch is temporarily disabled while PyPI has the `lightning` package
+# in `quarantined` status (every published terratorch version transitively
+# requires `lightning`, so the resolver fails with "no versions of lightning").
+# Re-enable once PyPI lifts the quarantine. Tracked in #41376.
+# terratorch >= 1.2.2 # Required for Prithvi tests
 imagehash # Required for Prithvi tests
 segmentation-models-pytorch > 0.4.0 # Required for Prithvi tests
 
diff --git a/requirements/test/cuda.txt b/requirements/test/cuda.txt
index ffb0d9d00c3d..8544f7b70969 100644
--- a/requirements/test/cuda.txt
+++ b/requirements/test/cuda.txt
@@ -1,15 +1,9 @@
 # This file was autogenerated by uv via the following command:
 #    uv pip compile requirements/test/cuda.in -c requirements/cuda.txt -o requirements/test/cuda.txt --index-strategy unsafe-best-match --torch-backend cu130 --python-platform x86_64-manylinux_2_28 --python-version 3.12
 absl-py==2.1.0
-    # via
-    #   rouge-score
-    #   tensorboard
+    # via rouge-score
 accelerate==1.13.0
     # via peft
-aenum==3.1.16
-    # via lightly
-affine==2.4.0
-    # via rasterio
 aiohappyeyeballs==2.6.1
     # via aiohttp
 aiohttp==3.13.3
@@ -25,22 +19,14 @@ aiohttp-cors==0.8.1
     # via ray
 aiosignal==1.4.0
     # via aiohttp
-albucore==0.0.16
-    # via terratorch
 albumentations==1.4.6
-    # via
-    #   -r requirements/test/cuda.in
-    #   terratorch
+    # via -r requirements/test/cuda.in
 alembic==1.16.4
     # via optuna
 annotated-doc==0.0.4
     # via fastapi
 annotated-types==0.7.0
     # via pydantic
-antlr4-python3-runtime==4.9.3
-    # via
-    #   hydra-core
-    #   omegaconf
 anyio==4.6.2.post1
     # via
     #   httpx
@@ -54,12 +40,10 @@ arrow==1.3.0
 attrs==24.2.0
     # via
     #   aiohttp
-    #   fiona
     #   hypothesis
     #   jsonlines
     #   jsonschema
     #   pytest-subtests
-    #   rasterio
     #   referencing
 audioread==3.0.1
     # via librosa
@@ -78,9 +62,7 @@ backoff==2.2.1
     #   -r requirements/test/cuda.in
     #   schemathesis
 bitsandbytes==0.49.2
-    # via
-    #   -r requirements/test/cuda.in
-    #   lightning
+    # via -r requirements/test/cuda.in
 black==24.10.0
     # via datamodel-code-generator
 blobfile==3.0.0
@@ -103,15 +85,9 @@ cachetools==5.5.2
     # via google-auth
 certifi==2024.8.30
     # via
-    #   fiona
     #   httpcore
     #   httpx
-    #   lightly
-    #   pyogrio
-    #   pyproj
-    #   rasterio
     #   requests
-    #   sentry-sdk
 cffi==2.0.0
     # via
     #   cryptography
@@ -125,25 +101,12 @@ chz==0.3.0
 click==8.1.7
     # via
     #   black
-    #   click-plugins
-    #   cligj
-    #   fiona
     #   jiwer
     #   nltk
-    #   rasterio
     #   ray
     #   schemathesis
     #   typer
     #   uvicorn
-    #   wandb
-click-plugins==1.1.1.2
-    # via
-    #   fiona
-    #   rasterio
-cligj==0.7.2
-    # via
-    #   fiona
-    #   rasterio
 colorama==0.4.6
     # via
     #   perceptron
@@ -191,8 +154,6 @@ decorator==5.1.1
     # via librosa
 decord==0.6.0
     # via -r requirements/test/cuda.in
-diffusers==0.36.0
-    # via terratorch
 dill==0.3.8
     # via
     #   datasets
@@ -207,14 +168,10 @@ docker==7.1.0
     # via gpt-oss
 docopt==0.6.2
     # via num2words
-docstring-parser==0.17.0
-    # via jsonargparse
 einops==0.8.1
     # via
     #   -r requirements/test/cuda.in
     #   encodec
-    #   terratorch
-    #   torchgeo
     #   vector-quantize-pytorch
     #   vocos
 einx==0.3.0
@@ -244,13 +201,10 @@ filelock==3.16.1
     #   -c requirements/common.txt
     #   blobfile
     #   datasets
-    #   diffusers
     #   huggingface-hub
     #   ray
     #   torch
     #   virtualenv
-fiona==1.10.1
-    # via torchgeo
 fonttools==4.55.0
     # via matplotlib
 fqdn==1.5.1
@@ -267,9 +221,6 @@ fsspec==2024.12.0
     #   evaluate
     #   fastparquet
     #   huggingface-hub
-    #   lightning
-    #   pytorch-lightning
-    #   tacoreader
     #   torch
 ftfy==6.3.1
     # via open-clip-torch
@@ -277,12 +228,6 @@ genai-perf==0.0.16
     # via -r requirements/test/cuda.in
 genson==1.3.0
     # via datamodel-code-generator
-geopandas==1.0.1
-    # via terratorch
-gitdb==4.0.12
-    # via gitpython
-gitpython==3.1.44
-    # via wandb
 google-api-core==2.24.2
     # via
     #   google-cloud-core
@@ -317,7 +262,6 @@ grpcio==1.78.0
     #   -r requirements/test/cuda.in
     #   grpcio-reflection
     #   ray
-    #   tensorboard
 grpcio-reflection==1.78.0
     # via -r requirements/test/cuda.in
 h11==0.14.0
@@ -326,8 +270,6 @@ h11==0.14.0
     #   uvicorn
 h2==4.3.0
     # via httpx
-h5py==3.13.0
-    # via terratorch
 harfile==0.3.0
     # via schemathesis
 hf-xet==1.4.3
@@ -343,7 +285,6 @@ httpcore==1.0.6
 httpx==0.27.2
     # via
     #   -r requirements/test/cuda.in
-    #   diffusers
     #   huggingface-hub
     #   perceptron
     #   schemathesis
@@ -351,23 +292,17 @@ huggingface-hub==1.10.2
     # via
     #   accelerate
     #   datasets
-    #   diffusers
     #   evaluate
     #   open-clip-torch
     #   peft
     #   segmentation-models-pytorch
     #   sentence-transformers
-    #   terratorch
     #   timm
     #   tokenizers
     #   transformers
     #   vocos
 humanize==4.11.0
     # via runai-model-streamer
-hydra-core==1.3.2
-    # via
-    #   lightly
-    #   lightning
 hyperframe==6.1.0
     # via h2
 hypothesis==6.131.0
@@ -392,11 +327,7 @@ imagehash==4.3.2
 imageio==2.37.0
     # via scikit-image
 importlib-metadata==8.7.0
-    # via
-    #   diffusers
-    #   opentelemetry-api
-importlib-resources==6.5.2
-    # via typeshed-client
+    # via opentelemetry-api
 inflect==5.6.2
     # via datamodel-code-generator
 iniconfig==2.0.0
@@ -426,14 +357,8 @@ joblib==1.4.2
     #   librosa
     #   nltk
     #   scikit-learn
-jsonargparse==4.46.0
-    # via
-    #   lightning
-    #   terratorch
 jsonlines==4.0.0
     # via lm-eval
-jsonnet==0.21.0
-    # via jsonargparse
 jsonpointer==3.0.0
     # via jsonschema
 jsonschema==4.23.0
@@ -452,10 +377,6 @@ kaleido==0.2.1
     # via genai-perf
 kiwisolver==1.4.7
     # via matplotlib
-kornia==0.8.1
-    # via torchgeo
-kornia-rs==0.1.9
-    # via kornia
 lazy-loader==0.4
     # via
     #   librosa
@@ -464,21 +385,6 @@ libnacl==2.1.0
     # via tensorizer
 librosa==0.10.2.post1
     # via -r requirements/test/cuda.in
-lightly==1.5.22
-    # via
-    #   terratorch
-    #   torchgeo
-lightly-utils==0.0.2
-    # via lightly
-lightning==2.6.1
-    # via
-    #   terratorch
-    #   torchgeo
-lightning-utilities==0.14.3
-    # via
-    #   lightning
-    #   pytorch-lightning
-    #   torchmetrics
 llvmlite==0.47.0
     # via numba
 lm-eval==0.4.11
@@ -490,8 +396,6 @@ lxml==5.3.0
     #   sacrebleu
 mako==1.3.10
     # via alembic
-markdown==3.8.2
-    # via tensorboard
 markdown-it-py==3.0.0
     # via rich
 markupsafe==3.0.1
@@ -500,11 +404,7 @@ markupsafe==3.0.1
     #   mako
     #   werkzeug
 matplotlib==3.9.2
-    # via
-    #   -r requirements/test/cuda.in
-    #   lightning
-    #   pycocotools
-    #   torchgeo
+    # via -r requirements/test/cuda.in
 mbstrdecoder==1.1.3
     # via
     #   dataproperty
@@ -512,7 +412,7 @@ mbstrdecoder==1.1.3
     #   typepy
 mdurl==0.1.2
     # via markdown-it-py
-mistral-common==1.11.0
+mistral-common==1.11.2
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/cuda.in
@@ -559,7 +459,6 @@ numpy==2.2.6
     # via
     #   -r requirements/test/cuda.in
     #   accelerate
-    #   albucore
     #   albumentations
     #   bitsandbytes
     #   bm25s
@@ -567,19 +466,14 @@ numpy==2.2.6
     #   cupy-cuda12x
     #   datasets
     #   decord
-    #   diffusers
     #   einx
     #   encodec
     #   evaluate
     #   fastparquet
     #   genai-perf
-    #   geopandas
-    #   h5py
     #   imagehash
     #   imageio
     #   librosa
-    #   lightly
-    #   lightly-utils
     #   lm-eval
     #   matplotlib
     #   mistral-common
@@ -591,11 +485,7 @@ numpy==2.2.6
     #   patsy
     #   peft
     #   perceptron
-    #   pycocotools
-    #   pyogrio
     #   pywavelets
-    #   rasterio
-    #   rioxarray
     #   rouge-score
     #   runai-model-streamer
     #   sacrebleu
@@ -603,21 +493,14 @@ numpy==2.2.6
     #   scikit-learn
     #   scipy
     #   segmentation-models-pytorch
-    #   shapely
     #   soxr
     #   statsmodels
-    #   tensorboard
-    #   tensorboardx
     #   tensorizer
-    #   terratorch
     #   tifffile
-    #   torchgeo
-    #   torchmetrics
     #   torchvision
     #   transformers
     #   tritonclient
     #   vocos
-    #   xarray
 nvidia-cublas==13.1.0.3
     # via
     #   cuda-toolkit
@@ -657,10 +540,6 @@ nvidia-nvshmem-cu13==3.4.5
     # via torch
 nvidia-nvtx==13.0.85
     # via cuda-toolkit
-omegaconf==2.3.0
-    # via
-    #   hydra-core
-    #   lightning
 open-clip-torch==2.32.0
     # via -r requirements/test/cuda.in
 openai-harmony==0.0.4
@@ -675,7 +554,6 @@ opencv-python-headless==4.13.0.90
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/cuda.in
-    #   albucore
     #   albumentations
     #   mistral-common
 openpyxl==3.1.5
@@ -710,44 +588,27 @@ packaging==24.2
     #   datasets
     #   evaluate
     #   fastparquet
-    #   geopandas
     #   huggingface-hub
-    #   hydra-core
-    #   kornia
     #   lazy-loader
-    #   lightning
-    #   lightning-utilities
     #   matplotlib
     #   optuna
     #   peft
     #   plotly
     #   pooch
-    #   pyogrio
     #   pytest
     #   pytest-rerunfailures
-    #   pytorch-lightning
     #   ray
-    #   rioxarray
     #   scikit-image
     #   statsmodels
-    #   tensorboard
-    #   tensorboardx
-    #   torchmetrics
     #   transformers
     #   typepy
-    #   wandb
-    #   xarray
 pandas==2.2.3
     # via
     #   datasets
     #   evaluate
     #   fastparquet
     #   genai-perf
-    #   geopandas
     #   statsmodels
-    #   tacoreader
-    #   torchgeo
-    #   xarray
 pathspec==0.12.1
     # via black
 pathvalidate==3.2.1
@@ -762,25 +623,20 @@ perf-analyzer==0.1.0
     # via genai-perf
 pillow==10.4.0
     # via
-    #   diffusers
     #   genai-perf
     #   imagehash
     #   imageio
-    #   lightly-utils
     #   matplotlib
     #   mistral-common
     #   perceptron
     #   scikit-image
     #   segmentation-models-pytorch
-    #   tensorboard
-    #   torchgeo
     #   torchvision
 platformdirs==4.3.6
     # via
     #   black
     #   pooch
     #   virtualenv
-    #   wandb
 plotly==5.24.1
     # via
     #   -r requirements/test/cuda.in
@@ -817,10 +673,7 @@ protobuf==6.33.6
     #   opentelemetry-proto
     #   proto-plus
     #   ray
-    #   tensorboard
-    #   tensorboardx
     #   tensorizer
-    #   wandb
 psutil==6.1.0
     # via
     #   accelerate
@@ -834,16 +687,12 @@ pyarrow==23.0.0
     # via
     #   datasets
     #   genai-perf
-    #   tacoreader
-    #   terratorch
 pyasn1==0.6.1
     # via
     #   pyasn1-modules
     #   rsa
 pyasn1-modules==0.4.2
     # via google-auth
-pycocotools==2.0.8
-    # via terratorch
 pycountry==24.6.1
     # via pydantic-extra-types
 pycparser==2.22
@@ -858,13 +707,11 @@ pydantic==2.12.0
     #   datamodel-code-generator
     #   fastapi
     #   gpt-oss
-    #   lightly
     #   mistral-common
     #   mteb
     #   openai-harmony
     #   pydantic-extra-types
     #   ray
-    #   wandb
 pydantic-core==2.41.1
     # via pydantic
 pydantic-extra-types==2.10.5
@@ -873,17 +720,8 @@ pygments==2.18.0
     # via rich
 pyjwt==2.11.0
     # via msal
-pyogrio==0.11.0
-    # via geopandas
 pyparsing==3.2.0
-    # via
-    #   matplotlib
-    #   rasterio
-pyproj==3.7.1
-    # via
-    #   geopandas
-    #   rioxarray
-    #   torchgeo
+    # via matplotlib
 pyrate-limiter==3.7.0
     # via schemathesis
 pystemmer==3.0.0
@@ -920,22 +758,15 @@ pytest-subtests==0.14.1
     # via schemathesis
 pytest-timeout==2.3.1
     # via -r requirements/test/cuda.in
-python-box==7.3.2
-    # via terratorch
 python-dateutil==2.9.0.post0
     # via
     #   arrow
     #   botocore
-    #   lightly
     #   matplotlib
     #   pandas
     #   typepy
 python-rapidjson==1.20
     # via tritonclient
-pytorch-lightning==2.5.2
-    # via
-    #   lightly
-    #   lightning
 pytrec-eval-terrier==0.5.7
     # via mteb
 pytz==2024.2
@@ -952,26 +783,16 @@ pyyaml==6.0.2
     #   datasets
     #   genai-perf
     #   huggingface-hub
-    #   jsonargparse
-    #   lightning
-    #   omegaconf
     #   optuna
     #   peft
-    #   pytorch-lightning
     #   ray
     #   responses
     #   schemathesis
     #   timm
     #   transformers
     #   vocos
-    #   wandb
 rapidfuzz==3.12.1
     # via jiwer
-rasterio==1.4.3
-    # via
-    #   rioxarray
-    #   terratorch
-    #   torchgeo
 ray==2.48.0
     # via -r requirements/test/cuda.in
 redis==5.2.0
@@ -982,7 +803,6 @@ referencing==0.35.1
     #   jsonschema-specifications
 regex==2026.2.28
     # via
-    #   diffusers
     #   nltk
     #   open-clip-torch
     #   sacrebleu
@@ -994,13 +814,11 @@ requests==2.32.3
     #   azure-core
     #   buildkite-test-collector
     #   datasets
-    #   diffusers
     #   docker
     #   evaluate
     #   google-api-core
     #   google-cloud-storage
     #   gpt-oss
-    #   lightly
     #   lm-eval
     #   mistral-common
     #   msal
@@ -1010,9 +828,7 @@ requests==2.32.3
     #   responses
     #   schemathesis
     #   starlette-testclient
-    #   tacoreader
     #   tiktoken
-    #   wandb
 responses==0.25.3
     # via genai-perf
 rfc3339-validator==0.1.4
@@ -1022,13 +838,9 @@ rfc3987==1.3.8
 rich==13.9.4
     # via
     #   genai-perf
-    #   lightning
     #   mteb
     #   perceptron
-    #   terratorch
     #   typer
-rioxarray==0.19.0
-    # via terratorch
 rouge-score==0.1.2
     # via lm-eval
 rpds-py==0.20.1
@@ -1037,8 +849,6 @@ rpds-py==0.20.1
     #   referencing
 rsa==4.9.1
     # via google-auth
-rtree==1.4.0
-    # via torchgeo
 runai-model-streamer==0.15.7
     # via -r requirements/test/cuda.in
 runai-model-streamer-azure==0.15.7
@@ -1054,7 +864,6 @@ sacrebleu==2.4.3
 safetensors==0.4.5
     # via
     #   accelerate
-    #   diffusers
     #   open-clip-torch
     #   peft
     #   segmentation-models-pytorch
@@ -1063,9 +872,7 @@ safetensors==0.4.5
 schemathesis==3.39.15
     # via -r requirements/test/cuda.in
 scikit-image==0.25.2
-    # via
-    #   albumentations
-    #   terratorch
+    # via albumentations
 scikit-learn==1.5.2
     # via
     #   albumentations
@@ -1073,7 +880,6 @@ scikit-learn==1.5.2
     #   lm-eval
     #   mteb
     #   sentence-transformers
-    #   terratorch
 scipy==1.13.1
     # via
     #   albumentations
@@ -1087,27 +893,16 @@ scipy==1.13.1
     #   statsmodels
     #   vocos
 segmentation-models-pytorch==0.5.0
-    # via
-    #   -r requirements/test/cuda.in
-    #   terratorch
-    #   torchgeo
+    # via -r requirements/test/cuda.in
 sentence-transformers==5.2.0
     # via
     #   -r requirements/test/cuda.in
     #   mteb
-sentry-sdk==2.52.0
-    # via wandb
 setuptools==77.0.3
     # via
     #   -c requirements/common.txt
-    #   lightning-utilities
     #   pytablewriter
-    #   tensorboard
     #   torch
-shapely==2.1.1
-    # via
-    #   geopandas
-    #   torchgeo
 shellingham==1.5.4
     # via
     #   perceptron
@@ -1116,15 +911,12 @@ six==1.16.0
     # via
     #   -c requirements/common.txt
     #   junit-xml
-    #   lightly
     #   opencensus
     #   python-dateutil
     #   rfc3339-validator
     #   rouge-score
 smart-open==7.1.0
     # via ray
-smmap==5.0.2
-    # via gitdb
 sniffio==1.3.1
     # via
     #   anyio
@@ -1166,8 +958,6 @@ tabledata==1.3.3
     # via pytablewriter
 tabulate==0.9.0
     # via sacrebleu
-tacoreader==0.5.6
-    # via terratorch
 tblib==3.1.0
     # via -r requirements/test/cuda.in
 tcolorpy==0.1.6
@@ -1177,26 +967,14 @@ tenacity==9.1.2
     #   gpt-oss
     #   lm-eval
     #   plotly
-tensorboard==2.20.0
-    # via terratorch
-tensorboard-data-server==0.7.2
-    # via tensorboard
-tensorboardx==2.6.4
-    # via lightning
 tensorizer==2.10.1
     # via -r requirements/test/cuda.in
 termcolor==3.1.0
-    # via
-    #   gpt-oss
-    #   terratorch
-terratorch==1.2.2
-    # via -r requirements/test/cuda.in
+    # via gpt-oss
 threadpoolctl==3.5.0
     # via scikit-learn
 tifffile==2025.3.30
-    # via
-    #   scikit-image
-    #   terratorch
+    # via scikit-image
 tiktoken==0.12.0
     # via
     #   -c requirements/common.txt
@@ -1208,8 +986,6 @@ timm==1.0.17
     #   -r requirements/test/cuda.in
     #   open-clip-torch
     #   segmentation-models-pytorch
-    #   terratorch
-    #   torchgeo
 tokenizers==0.22.2
     # via
     #   -c requirements/common.txt
@@ -1227,21 +1003,14 @@ torch==2.11.0+cu130
     #   bitsandbytes
     #   encodec
     #   instanttensor
-    #   kornia
-    #   lightly
-    #   lightning
     #   mteb
     #   open-clip-torch
     #   peft
-    #   pytorch-lightning
     #   runai-model-streamer
     #   segmentation-models-pytorch
     #   sentence-transformers
     #   tensorizer
-    #   terratorch
     #   timm
-    #   torchgeo
-    #   torchmetrics
     #   torchvision
     #   vector-quantize-pytorch
     #   vocos
@@ -1251,31 +1020,18 @@ torchaudio==2.11.0+cu130
     #   -r requirements/test/cuda.in
     #   encodec
     #   vocos
-torchgeo==0.7.0
-    # via terratorch
-torchmetrics==1.7.4
-    # via
-    #   lightning
-    #   pytorch-lightning
-    #   terratorch
-    #   torchgeo
 torchvision==0.26.0+cu130
     # via
     #   -c requirements/cuda.txt
     #   -r requirements/test/cuda.in
-    #   lightly
     #   open-clip-torch
     #   segmentation-models-pytorch
-    #   terratorch
     #   timm
-    #   torchgeo
 tqdm==4.67.3
     # via
     #   datasets
     #   evaluate
     #   huggingface-hub
-    #   lightly
-    #   lightning
     #   lm-eval
     #   mteb
     #   nltk
@@ -1283,11 +1039,8 @@ tqdm==4.67.3
     #   optuna
     #   peft
     #   pqdm
-    #   pytorch-lightning
     #   segmentation-models-pytorch
     #   sentence-transformers
-    #   tacoreader
-    #   terratorch
     #   transformers
 transformers==5.5.3
     # via
@@ -1316,8 +1069,6 @@ typer==0.15.2
     #   transformers
 types-python-dateutil==2.9.0.20241206
     # via arrow
-typeshed-client==2.8.2
-    # via jsonargparse
 typing-extensions==4.15.0
     # via
     #   -c requirements/common.txt
@@ -1332,8 +1083,6 @@ typing-extensions==4.15.0
     #   grpcio
     #   huggingface-hub
     #   librosa
-    #   lightning
-    #   lightning-utilities
     #   lm-eval
     #   mistral-common
     #   mteb
@@ -1344,16 +1093,12 @@ typing-extensions==4.15.0
     #   pydantic
     #   pydantic-core
     #   pydantic-extra-types
-    #   pytorch-lightning
     #   sentence-transformers
     #   sqlalchemy
     #   starlette
     #   torch
-    #   torchgeo
     #   typer
-    #   typeshed-client
     #   typing-inspection
-    #   wandb
 typing-inspection==0.4.2
     # via pydantic
 tzdata==2024.2
@@ -1365,10 +1110,8 @@ urllib3==2.2.3
     #   blobfile
     #   botocore
     #   docker
-    #   lightly
     #   requests
     #   responses
-    #   sentry-sdk
     #   tritonclient
 uvicorn==0.35.0
     # via gpt-oss
@@ -1378,22 +1121,16 @@ virtualenv==20.31.2
     # via ray
 vocos==0.1.0
     # via -r requirements/test/cuda.in
-wandb==0.24.2
-    # via terratorch
 wcwidth==0.2.13
     # via ftfy
 webcolors==24.11.1
     # via jsonschema
 werkzeug==3.1.3
-    # via
-    #   schemathesis
-    #   tensorboard
+    # via schemathesis
 word2number==1.1
     # via lm-eval
 wrapt==1.17.2
     # via smart-open
-xarray==2025.7.1
-    # via rioxarray
 xxhash==3.5.0
     # via
     #   datasets
diff --git a/requirements/test/nightly-torch.txt b/requirements/test/nightly-torch.txt
index 0c34cf012031..75928e088dab 100644
--- a/requirements/test/nightly-torch.txt
+++ b/requirements/test/nightly-torch.txt
@@ -23,7 +23,7 @@ jiwer # required for audio tests
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio] >= 1.11.0 # required for voxtral test
+mistral_common[image,audio] >= 1.11.2 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
diff --git a/requirements/test/rocm.in b/requirements/test/rocm.in
index b7329b5a9a48..105c9c3527d2 100644
--- a/requirements/test/rocm.in
+++ b/requirements/test/rocm.in
@@ -30,7 +30,7 @@ tblib # for pickling test exceptions
 timm>=1.0.17 # required for internvl and gemma3n-mm test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio]>=1.11.0 # required for voxtral test
+mistral_common[image,audio]>=1.11.2 # required for voxtral test
 num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
 opencv-python-headless>=4.13.0 # required for video test
@@ -61,7 +61,11 @@ pydantic>=2.12 # 2.11 leads to error on python 3.13
 decord==0.6.0
 
 # Prithvi tests
-terratorch>=1.2.2
+# terratorch is temporarily disabled while PyPI has the `lightning` package
+# in `quarantined` status (every published terratorch version transitively
+# requires `lightning`, so the resolver fails with "no versions of lightning").
+# Re-enable once PyPI lifts the quarantine. Tracked in #41376.
+# terratorch>=1.2.2
 imagehash # Required for Prithvi tests
 segmentation-models-pytorch>0.4.0 # Required for Prithvi tests
 
@@ -79,5 +83,7 @@ plotly # required for perf comparison html report
 
 # ROCm-specific extras (not in CUDA cuda.in)
 rapidfuzz
-torchgeo==0.7.0
+# torchgeo also pulled in `lightning` transitively; disabled for the same
+# quarantine reason as terratorch above. Restore once the quarantine clears.
+# torchgeo==0.7.0
 multiprocess==0.70.16
diff --git a/requirements/test/rocm.txt b/requirements/test/rocm.txt
index ca33e2d09aa0..6c0f8accdeb1 100644
--- a/requirements/test/rocm.txt
+++ b/requirements/test/rocm.txt
@@ -1,15 +1,9 @@
 # This file was autogenerated by uv via the following command:
 #    uv pip compile requirements/test/rocm.in -c requirements/rocm.txt -o requirements/test/rocm.txt --index-strategy unsafe-best-match --python-platform x86_64-manylinux_2_28 --python-version 3.12 --no-emit-package torch --no-emit-package torchvision --no-emit-package torchaudio --no-emit-package triton --no-emit-package cuda-bindings --no-emit-package cuda-pathfinder --no-emit-package cuda-toolkit --no-emit-package cupy-cuda12x --no-emit-package nvidia-cublas --no-emit-package nvidia-cuda-cupti --no-emit-package nvidia-cuda-nvrtc --no-emit-package nvidia-cuda-runtime --no-emit-package nvidia-cudnn --no-emit-package nvidia-cufft --no-emit-package nvidia-cufile --no-emit-package nvidia-curand --no-emit-package nvidia-cusolver --no-emit-package nvidia-cusparse --no-emit-package nvidia-cusparselt --no-emit-package nvidia-nccl --no-emit-package nvidia-nvjitlink --no-emit-package nvidia-nvshmem --no-emit-package nvidia-nvtx --no-emit-package nvidia-cublas-cu12 --no-emit-package nvidia-cuda-cupti-cu12 --no-emit-package nvidia-cuda-nvrtc-cu12 --no-emit-package nvidia-cuda-runtime-cu12 --no-emit-package nvidia-cudnn-cu12 --no-emit-package nvidia-cufft-cu12 --no-emit-package nvidia-cufile-cu12 --no-emit-package nvidia-curand-cu12 --no-emit-package nvidia-cusolver-cu12 --no-emit-package nvidia-cusparse-cu12 --no-emit-package nvidia-cusparselt-cu12 --no-emit-package nvidia-nccl-cu12 --no-emit-package nvidia-nvjitlink-cu12 --no-emit-package nvidia-nvshmem-cu12 --no-emit-package nvidia-nvtx-cu12 --no-emit-package nvidia-cublas-cu13 --no-emit-package nvidia-cuda-cupti-cu13 --no-emit-package nvidia-cuda-nvrtc-cu13 --no-emit-package nvidia-cuda-runtime-cu13 --no-emit-package nvidia-cudnn-cu13 --no-emit-package nvidia-cufft-cu13 --no-emit-package nvidia-cufile-cu13 --no-emit-package nvidia-curand-cu13 --no-emit-package nvidia-cusolver-cu13 --no-emit-package nvidia-cusparse-cu13 --no-emit-package nvidia-cusparselt-cu13 --no-emit-package nvidia-nccl-cu13 --no-emit-package nvidia-nvjitlink-cu13 --no-emit-package nvidia-nvshmem-cu13 --no-emit-package nvidia-nvtx-cu13
 absl-py==2.4.0
-    # via
-    #   rouge-score
-    #   tensorboard
+    # via rouge-score
 accelerate==1.13.0
     # via peft
-aenum==3.1.17
-    # via lightly
-affine==2.4.0
-    # via rasterio
 aiohappyeyeballs==2.6.1
     # via aiohttp
 aiohttp==3.13.3
@@ -25,12 +19,8 @@ aiohttp-cors==0.8.1
     # via ray
 aiosignal==1.4.0
     # via aiohttp
-albucore==0.1.2
-    # via terratorch
 albumentations==1.4.6
-    # via
-    #   -r requirements/test/rocm.in
-    #   terratorch
+    # via -r requirements/test/rocm.in
 alembic==1.18.4
     # via optuna
 annotated-doc==0.0.4
@@ -43,10 +33,6 @@ anthropic==0.93.0
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/../common.txt
-antlr4-python3-runtime==4.9.3
-    # via
-    #   hydra-core
-    #   omegaconf
 anyio==4.13.0
     # via
     #   anthropic
@@ -56,6 +42,8 @@ anyio==4.13.0
     #   sse-starlette
     #   starlette
     #   watchfiles
+apache-tvm-ffi==0.1.10
+    # via xgrammar
 arctic-inference==0.1.1
     # via -r requirements/test/rocm.in
 argcomplete==3.6.3
@@ -67,11 +55,9 @@ astor==0.8.1
 attrs==26.1.0
     # via
     #   aiohttp
-    #   fiona
     #   jsonlines
     #   jsonschema
     #   pytest-subtests
-    #   rasterio
     #   referencing
 audioread==3.0.1
     # via librosa
@@ -90,9 +76,7 @@ backoff==2.2.1
     #   -r requirements/test/rocm.in
     #   schemathesis
 bitsandbytes==0.49.2
-    # via
-    #   -r requirements/test/rocm.in
-    #   lightning
+    # via -r requirements/test/rocm.in
 black==26.3.1
     # via datamodel-code-generator
 blake3==1.0.8
@@ -119,13 +103,8 @@ cbor2==5.9.0
     # via -r requirements/test/../common.txt
 certifi==2026.2.25
     # via
-    #   fiona
     #   httpcore
     #   httpx
-    #   lightly
-    #   pyogrio
-    #   pyproj
-    #   rasterio
     #   requests
     #   sentry-sdk
 cffi==1.17.1
@@ -143,24 +122,13 @@ chz==0.4.0
 click==8.3.1
     # via
     #   black
-    #   click-plugins
-    #   cligj
-    #   fiona
     #   jiwer
     #   nltk
-    #   rasterio
     #   ray
     #   rich-toolkit
     #   schemathesis
     #   typer
     #   uvicorn
-    #   wandb
-click-plugins==1.1.1.2
-    # via fiona
-cligj==0.7.2
-    # via
-    #   fiona
-    #   rasterio
 cloudpickle==3.1.2
     # via -r requirements/test/../common.txt
 colorama==0.4.6
@@ -211,8 +179,6 @@ depyf==0.20.0
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/../common.txt
-diffusers==0.37.0
-    # via terratorch
 dill==0.3.8
     # via
     #   datasets
@@ -237,16 +203,12 @@ docker==7.1.0
 docopt==0.6.2
     # via num2words
 docstring-parser==0.17.0
-    # via
-    #   anthropic
-    #   jsonargparse
+    # via anthropic
 einops==0.8.2
     # via
     #   -r requirements/test/../common.txt
     #   -r requirements/test/rocm.in
     #   encodec
-    #   terratorch
-    #   torchgeo
     #   vector-quantize-pytorch
     #   vocos
 einx==0.4.2
@@ -283,14 +245,11 @@ filelock==3.25.2
     #   -r requirements/test/../common.txt
     #   blobfile
     #   datasets
-    #   diffusers
     #   huggingface-hub
     #   python-discovery
     #   ray
     #   torch
     #   virtualenv
-fiona==1.10.1
-    # via torchgeo
 fonttools==4.62.1
     # via matplotlib
 fqdn==1.5.1
@@ -307,9 +266,6 @@ fsspec==2025.3.0
     #   evaluate
     #   fastparquet
     #   huggingface-hub
-    #   lightning
-    #   pytorch-lightning
-    #   tacoreader
     #   torch
 ftfy==6.3.1
     # via open-clip-torch
@@ -317,16 +273,10 @@ genai-perf==0.0.16
     # via -r requirements/test/rocm.in
 genson==1.3.0
     # via datamodel-code-generator
-geopandas==1.1.3
-    # via terratorch
 gguf==0.18.0
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/../common.txt
-gitdb==4.0.12
-    # via gitpython
-gitpython==3.1.46
-    # via wandb
 google-api-core==2.30.0
     # via
     #   google-cloud-core
@@ -366,7 +316,6 @@ grpcio==1.78.0
     #   grpcio-reflection
     #   opentelemetry-exporter-otlp-proto-grpc
     #   ray
-    #   tensorboard
 grpcio-reflection==1.78.0
     # via
     #   -c requirements/rocm.txt
@@ -377,8 +326,6 @@ h11==0.16.0
     #   uvicorn
 h2==4.3.0
     # via httpx
-h5py==3.16.0
-    # via terratorch
 harfile==0.4.0
     # via schemathesis
 hf-xet==1.4.3
@@ -397,7 +344,6 @@ httpx==0.27.2
     # via
     #   -r requirements/test/rocm.in
     #   anthropic
-    #   diffusers
     #   fastapi
     #   fastapi-cloud-cli
     #   huggingface-hub
@@ -412,23 +358,17 @@ huggingface-hub==1.10.2
     # via
     #   accelerate
     #   datasets
-    #   diffusers
     #   evaluate
     #   open-clip-torch
     #   peft
     #   segmentation-models-pytorch
     #   sentence-transformers
-    #   terratorch
     #   timm
     #   tokenizers
     #   transformers
     #   vocos
 humanize==4.15.0
     # via runai-model-streamer
-hydra-core==1.3.2
-    # via
-    #   lightly
-    #   lightning
 hyperframe==6.1.0
     # via h2
 hypothesis==6.151.9
@@ -455,11 +395,7 @@ imagehash==4.3.2
 imageio==2.37.3
     # via scikit-image
 importlib-metadata==8.7.1
-    # via
-    #   diffusers
-    #   opentelemetry-api
-importlib-resources==6.5.2
-    # via typeshed-client
+    # via opentelemetry-api
 inflect==7.5.0
     # via datamodel-code-generator
 iniconfig==2.3.0
@@ -497,14 +433,8 @@ joblib==1.5.3
     #   librosa
     #   nltk
     #   scikit-learn
-jsonargparse==4.47.0
-    # via
-    #   lightning
-    #   terratorch
 jsonlines==4.0.0
     # via lm-eval
-jsonnet==0.21.0
-    # via jsonargparse
 jsonpointer==3.1.0
     # via jsonschema
 jsonschema==4.26.0
@@ -524,10 +454,6 @@ kaleido==1.0.0
     # via genai-perf
 kiwisolver==1.5.0
     # via matplotlib
-kornia==0.8.2
-    # via torchgeo
-kornia-rs==0.1.10
-    # via kornia
 lark==1.2.2
     # via
     #   -c requirements/common.txt
@@ -540,21 +466,6 @@ libnacl==2.1.0
     # via tensorizer
 librosa==0.10.2.post1
     # via -r requirements/test/rocm.in
-lightly==1.5.22
-    # via
-    #   terratorch
-    #   torchgeo
-lightly-utils==0.0.2
-    # via lightly
-lightning==2.6.1
-    # via
-    #   terratorch
-    #   torchgeo
-lightning-utilities==0.15.3
-    # via
-    #   lightning
-    #   pytorch-lightning
-    #   torchmetrics
 llguidance==1.3.0
     # via
     #   -c requirements/common.txt
@@ -580,8 +491,6 @@ lxml==6.0.2
     #   sacrebleu
 mako==1.3.10
     # via alembic
-markdown==3.10.2
-    # via tensorboard
 markdown-it-py==4.0.0
     # via rich
 markupsafe==3.0.3
@@ -590,10 +499,7 @@ markupsafe==3.0.3
     #   mako
     #   werkzeug
 matplotlib==3.10.8
-    # via
-    #   -r requirements/test/rocm.in
-    #   lightning
-    #   torchgeo
+    # via -r requirements/test/rocm.in
 mbstrdecoder==1.1.4
     # via
     #   dataproperty
@@ -603,7 +509,7 @@ mcp==1.27.0
     # via -r requirements/test/../common.txt
 mdurl==0.1.2
     # via markdown-it-py
-mistral-common==1.11.0
+mistral-common==1.11.2
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/../common.txt
@@ -660,14 +566,11 @@ numba==0.65.0
     #   -c requirements/rocm.txt
     #   -r requirements/test/rocm.in
     #   librosa
-numkong==7.1.1
-    # via albucore
 numpy==2.2.6
     # via
     #   -r requirements/test/../common.txt
     #   -r requirements/test/rocm.in
     #   accelerate
-    #   albucore
     #   albumentations
     #   bitsandbytes
     #   bm25s
@@ -675,20 +578,15 @@ numpy==2.2.6
     #   cupy-cuda12x
     #   datasets
     #   decord
-    #   diffusers
     #   einx
     #   encodec
     #   evaluate
     #   fastparquet
     #   genai-perf
-    #   geopandas
     #   gguf
-    #   h5py
     #   imagehash
     #   imageio
     #   librosa
-    #   lightly
-    #   lightly-utils
     #   lm-eval
     #   matplotlib
     #   mistral-common
@@ -700,12 +598,8 @@ numpy==2.2.6
     #   patsy
     #   peft
     #   perceptron
-    #   pycocotools
-    #   pyogrio
     #   pytrec-eval-terrier
     #   pywavelets
-    #   rasterio
-    #   rioxarray
     #   rouge-score
     #   runai-model-streamer
     #   sacrebleu
@@ -714,27 +608,16 @@ numpy==2.2.6
     #   scipy
     #   segmentation-models-pytorch
     #   sentence-transformers
-    #   shapely
     #   soundfile
     #   soxr
     #   statsmodels
-    #   tensorboard
-    #   tensorboardx
     #   tensorizer
-    #   terratorch
     #   tifffile
-    #   torchgeo
-    #   torchmetrics
     #   torchvision
     #   transformers
     #   tritonclient
     #   vocos
-    #   xarray
     #   xgrammar
-omegaconf==2.3.0
-    # via
-    #   hydra-core
-    #   lightning
 open-clip-torch==2.32.0
     # via -r requirements/test/rocm.in
 openai==2.31.0
@@ -824,46 +707,29 @@ packaging==26.0
     #   datasets
     #   evaluate
     #   fastparquet
-    #   geopandas
     #   huggingface-hub
-    #   hydra-core
     #   kaleido
-    #   kornia
     #   lazy-loader
-    #   lightning
-    #   lightning-utilities
     #   lm-format-enforcer
     #   matplotlib
     #   optuna
     #   peft
     #   plotly
     #   pooch
-    #   pyogrio
     #   pytest
     #   pytest-rerunfailures
-    #   pytorch-lightning
     #   ray
-    #   rioxarray
     #   scikit-image
     #   statsmodels
-    #   tensorboard
-    #   tensorboardx
-    #   torchmetrics
     #   transformers
     #   typepy
-    #   wandb
-    #   xarray
 pandas==3.0.1
     # via
     #   datasets
     #   evaluate
     #   fastparquet
     #   genai-perf
-    #   geopandas
     #   statsmodels
-    #   tacoreader
-    #   torchgeo
-    #   xarray
 partial-json-parser==0.2.1.1.post7
     # via -r requirements/test/../common.txt
 pathspec==1.0.4
@@ -881,18 +747,14 @@ perf-analyzer==0.1.0
 pillow==12.1.1
     # via
     #   -r requirements/test/../common.txt
-    #   diffusers
     #   genai-perf
     #   imagehash
     #   imageio
-    #   lightly-utils
     #   matplotlib
     #   mistral-common
     #   perceptron
     #   scikit-image
     #   segmentation-models-pytorch
-    #   tensorboard
-    #   torchgeo
     #   torchvision
 platformdirs==4.3.6
     # via
@@ -900,7 +762,6 @@ platformdirs==4.3.6
     #   pooch
     #   python-discovery
     #   virtualenv
-    #   wandb
 plotly==6.6.0
     # via
     #   -r requirements/test/rocm.in
@@ -946,10 +807,7 @@ protobuf==6.33.6
     #   opentelemetry-proto
     #   proto-plus
     #   ray
-    #   tensorboard
-    #   tensorboardx
     #   tensorizer
-    #   wandb
 psutil==7.2.2
     # via
     #   -r requirements/test/../common.txt
@@ -966,16 +824,12 @@ pyarrow==23.0.1
     # via
     #   datasets
     #   genai-perf
-    #   tacoreader
-    #   terratorch
 pyasn1==0.6.3
     # via pyasn1-modules
 pyasn1-modules==0.4.2
     # via google-auth
 pybase64==1.4.3
     # via -r requirements/test/../common.txt
-pycocotools==2.0.11
-    # via terratorch
 pycountry==26.2.16
     # via pydantic-extra-types
 pycparser==3.0
@@ -994,7 +848,6 @@ pydantic==2.12.5
     #   fastapi
     #   fastapi-cloud-cli
     #   gpt-oss
-    #   lightly
     #   lm-format-enforcer
     #   mcp
     #   mistral-common
@@ -1005,7 +858,6 @@ pydantic==2.12.5
     #   pydantic-extra-types
     #   pydantic-settings
     #   ray
-    #   wandb
     #   xgrammar
 pydantic-core==2.41.5
     # via pydantic
@@ -1023,17 +875,8 @@ pyjwt==2.12.1
     # via
     #   mcp
     #   msal
-pyogrio==0.12.1
-    # via geopandas
 pyparsing==3.3.2
-    # via
-    #   matplotlib
-    #   rasterio
-pyproj==3.7.2
-    # via
-    #   geopandas
-    #   rioxarray
-    #   torchgeo
+    # via matplotlib
 pyrate-limiter==3.9.0
     # via schemathesis
 pystemmer==3.0.0
@@ -1070,13 +913,10 @@ pytest-subtests==0.14.2
     # via schemathesis
 pytest-timeout==2.3.1
     # via -r requirements/test/rocm.in
-python-box==7.4.1
-    # via terratorch
 python-dateutil==2.9.0.post0
     # via
     #   arrow
     #   botocore
-    #   lightly
     #   matplotlib
     #   pandas
     #   typepy
@@ -1096,10 +936,6 @@ python-rapidjson==1.23
     # via tritonclient
 pytokens==0.4.1
     # via black
-pytorch-lightning==2.6.1
-    # via
-    #   lightly
-    #   lightning
 pytrec-eval-terrier==0.5.10
     # via mteb
 pytz==2026.1.post1
@@ -1116,13 +952,9 @@ pyyaml==6.0.3
     #   genai-perf
     #   gguf
     #   huggingface-hub
-    #   jsonargparse
-    #   lightning
     #   lm-format-enforcer
-    #   omegaconf
     #   optuna
     #   peft
-    #   pytorch-lightning
     #   ray
     #   responses
     #   schemathesis
@@ -1130,7 +962,6 @@ pyyaml==6.0.3
     #   transformers
     #   uvicorn
     #   vocos
-    #   wandb
 pyzmq==27.1.0
     # via
     #   -c requirements/common.txt
@@ -1139,11 +970,6 @@ rapidfuzz==3.12.1
     # via
     #   -r requirements/test/rocm.in
     #   jiwer
-rasterio==1.5.0
-    # via
-    #   rioxarray
-    #   terratorch
-    #   torchgeo
 ray==2.54.0
     # via -r requirements/test/rocm.in
 redis==7.3.0
@@ -1155,7 +981,6 @@ referencing==0.37.0
 regex==2026.2.28
     # via
     #   -r requirements/test/../common.txt
-    #   diffusers
     #   nltk
     #   open-clip-torch
     #   sacrebleu
@@ -1168,14 +993,12 @@ requests==2.32.5
     #   azure-core
     #   buildkite-test-collector
     #   datasets
-    #   diffusers
     #   docker
     #   evaluate
     #   gguf
     #   google-api-core
     #   google-cloud-storage
     #   gpt-oss
-    #   lightly
     #   lm-eval
     #   mistral-common
     #   msal
@@ -1186,9 +1009,7 @@ requests==2.32.5
     #   responses
     #   schemathesis
     #   starlette-testclient
-    #   tacoreader
     #   tiktoken
-    #   wandb
 responses==0.26.0
     # via genai-perf
 rfc3339-validator==0.1.4
@@ -1198,11 +1019,9 @@ rfc3987==1.3.8
 rich==14.3.3
     # via
     #   genai-perf
-    #   lightning
     #   mteb
     #   perceptron
     #   rich-toolkit
-    #   terratorch
     #   typer
 rich-toolkit==0.19.7
     # via
@@ -1210,16 +1029,12 @@ rich-toolkit==0.19.7
     #   fastapi-cloud-cli
 rignore==0.7.6
     # via fastapi-cloud-cli
-rioxarray==0.22.0
-    # via terratorch
 rouge-score==0.1.2
     # via lm-eval
 rpds-py==0.30.0
     # via
     #   jsonschema
     #   referencing
-rtree==1.4.1
-    # via torchgeo
 runai-model-streamer==0.15.7
     # via
     #   -c requirements/rocm.txt
@@ -1237,7 +1052,6 @@ sacrebleu==2.6.0
 safetensors==0.7.0
     # via
     #   accelerate
-    #   diffusers
     #   open-clip-torch
     #   peft
     #   segmentation-models-pytorch
@@ -1246,9 +1060,7 @@ safetensors==0.7.0
 schemathesis==3.39.15
     # via -r requirements/test/rocm.in
 scikit-image==0.26.0
-    # via
-    #   albumentations
-    #   terratorch
+    # via albumentations
 scikit-learn==1.8.0
     # via
     #   albumentations
@@ -1256,7 +1068,6 @@ scikit-learn==1.8.0
     #   lm-eval
     #   mteb
     #   sentence-transformers
-    #   terratorch
 scipy==1.17.1
     # via
     #   albumentations
@@ -1271,10 +1082,7 @@ scipy==1.17.1
     #   statsmodels
     #   vocos
 segmentation-models-pytorch==0.5.0
-    # via
-    #   -r requirements/test/rocm.in
-    #   terratorch
-    #   torchgeo
+    # via -r requirements/test/rocm.in
 sentence-transformers==5.3.0
     # via
     #   -r requirements/test/rocm.in
@@ -1282,9 +1090,7 @@ sentence-transformers==5.3.0
 sentencepiece==0.2.1
     # via -r requirements/test/../common.txt
 sentry-sdk==2.55.0
-    # via
-    #   fastapi-cloud-cli
-    #   wandb
+    # via fastapi-cloud-cli
 setproctitle==1.3.7
     # via -r requirements/test/../common.txt
 setuptools==79.0.1
@@ -1294,12 +1100,7 @@ setuptools==79.0.1
     #   -r requirements/test/../common.txt
     #   model-hosting-container-standards
     #   pytablewriter
-    #   tensorboard
     #   torch
-shapely==2.1.2
-    # via
-    #   geopandas
-    #   torchgeo
 shellingham==1.5.4
     # via
     #   perceptron
@@ -1311,15 +1112,12 @@ six==1.17.0
     #   -c requirements/common.txt
     #   -r requirements/test/../common.txt
     #   junit-xml
-    #   lightly
     #   opencensus
     #   python-dateutil
     #   rfc3339-validator
     #   rouge-score
 smart-open==7.5.1
     # via ray
-smmap==5.0.3
-    # via gitdb
 sniffio==1.3.1
     # via
     #   anthropic
@@ -1358,8 +1156,6 @@ starlette-testclient==0.4.1
     # via schemathesis
 statsmodels==0.14.6
     # via genai-perf
-stringzilla==4.6.0
-    # via albucore
 structlog==25.5.0
     # via gpt-oss
 supervisor==4.3.0
@@ -1372,8 +1168,6 @@ tabledata==1.3.4
     # via pytablewriter
 tabulate==0.10.0
     # via sacrebleu
-tacoreader==0.5.6
-    # via terratorch
 tblib==3.1.0
     # via -r requirements/test/rocm.in
 tcolorpy==0.1.7
@@ -1382,28 +1176,16 @@ tenacity==9.1.4
     # via
     #   gpt-oss
     #   lm-eval
-tensorboard==2.20.0
-    # via terratorch
-tensorboard-data-server==0.7.2
-    # via tensorboard
-tensorboardx==2.6.4
-    # via lightning
 tensorizer==2.10.1
     # via
     #   -c requirements/rocm.txt
     #   -r requirements/test/rocm.in
 termcolor==3.3.0
-    # via
-    #   gpt-oss
-    #   terratorch
-terratorch==1.2.2
-    # via -r requirements/test/rocm.in
+    # via gpt-oss
 threadpoolctl==3.6.0
     # via scikit-learn
 tifffile==2026.3.3
-    # via
-    #   scikit-image
-    #   terratorch
+    # via scikit-image
 tiktoken==0.12.0
     # via
     #   -c requirements/common.txt
@@ -1417,8 +1199,6 @@ timm==1.0.17
     #   -r requirements/test/rocm.in
     #   open-clip-torch
     #   segmentation-models-pytorch
-    #   terratorch
-    #   torchgeo
 tokenizers==0.22.2
     # via
     #   -c requirements/common.txt
@@ -1429,16 +1209,6 @@ tomli==2.4.0
     # via schemathesis
 tomli-w==1.2.0
     # via schemathesis
-torchgeo==0.7.0
-    # via
-    #   -r requirements/test/rocm.in
-    #   terratorch
-torchmetrics==1.9.0
-    # via
-    #   lightning
-    #   pytorch-lightning
-    #   terratorch
-    #   torchgeo
 tqdm==4.67.3
     # via
     #   -r requirements/test/../common.txt
@@ -1446,8 +1216,6 @@ tqdm==4.67.3
     #   evaluate
     #   gguf
     #   huggingface-hub
-    #   lightly
-    #   lightning
     #   lm-eval
     #   mteb
     #   nltk
@@ -1456,11 +1224,8 @@ tqdm==4.67.3
     #   optuna
     #   peft
     #   pqdm
-    #   pytorch-lightning
     #   segmentation-models-pytorch
     #   sentence-transformers
-    #   tacoreader
-    #   terratorch
     #   transformers
 transformers==5.5.3
     # via
@@ -1492,8 +1257,6 @@ typer==0.24.1
     #   huggingface-hub
     #   perceptron
     #   transformers
-typeshed-client==2.9.0
-    # via jsonargparse
 typing-extensions==4.15.0
     # via
     #   -c requirements/common.txt
@@ -1503,6 +1266,7 @@ typing-extensions==4.15.0
     #   alembic
     #   anthropic
     #   anyio
+    #   apache-tvm-ffi
     #   azure-core
     #   azure-identity
     #   azure-storage-blob
@@ -1511,8 +1275,6 @@ typing-extensions==4.15.0
     #   grpcio
     #   huggingface-hub
     #   librosa
-    #   lightning
-    #   lightning-utilities
     #   lm-eval
     #   mcp
     #   mistral-common
@@ -1527,18 +1289,14 @@ typing-extensions==4.15.0
     #   pydantic
     #   pydantic-core
     #   pydantic-extra-types
-    #   pytorch-lightning
     #   referencing
     #   rich-toolkit
     #   sentence-transformers
     #   sqlalchemy
     #   starlette
     #   torch
-    #   torchgeo
     #   typeguard
-    #   typeshed-client
     #   typing-inspection
-    #   wandb
     #   xgrammar
 typing-inspection==0.4.2
     # via
@@ -1555,7 +1313,6 @@ urllib3==2.6.3
     #   blobfile
     #   botocore
     #   docker
-    #   lightly
     #   requests
     #   responses
     #   sentry-sdk
@@ -1575,8 +1332,6 @@ virtualenv==21.2.0
     # via ray
 vocos==0.1.0
     # via -r requirements/test/rocm.in
-wandb==0.25.1
-    # via terratorch
 watchfiles==1.1.1
     # via
     #   -r requirements/test/../common.txt
@@ -1588,16 +1343,12 @@ webcolors==25.10.0
 websockets==16.0
     # via uvicorn
 werkzeug==3.1.6
-    # via
-    #   schemathesis
-    #   tensorboard
+    # via schemathesis
 word2number==1.1
     # via lm-eval
 wrapt==2.1.2
     # via smart-open
-xarray==2026.2.0
-    # via rioxarray
-xgrammar==0.1.33
+xgrammar==0.2.0
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/../common.txt
diff --git a/requirements/test/xpu.txt b/requirements/test/xpu.txt
index 601838f843f9..80b0c148116d 100644
--- a/requirements/test/xpu.txt
+++ b/requirements/test/xpu.txt
@@ -266,7 +266,7 @@ mbstrdecoder==1.1.4
     #   typepy
 mdurl==0.1.2
     # via markdown-it-py
-mistral-common==1.11.0
+mistral-common==1.11.2
     # via
     #   -c requirements/common.txt
     #   -r requirements/test/xpu.in
diff --git a/requirements/tpu.txt b/requirements/tpu.txt
index cee9fa6576e7..f1074e4a6f2f 100644
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -11,4 +11,4 @@ ray[default]
 ray[data]
 setuptools==78.1.0
 nixl==0.3.0
-tpu-inference==0.18.0
+tpu-inference==0.19.0
diff --git a/tests/benchmarks/test_custom_dataset_seed.py b/tests/benchmarks/test_custom_dataset_seed.py
new file mode 100644
index 000000000000..dac87e6e6d98
--- /dev/null
+++ b/tests/benchmarks/test_custom_dataset_seed.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import json
+from pathlib import Path
+
+import pytest
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.benchmarks.datasets import get_samples
+
+
+@pytest.fixture(scope="session")
+def hf_tokenizer() -> PreTrainedTokenizerBase:
+    return AutoTokenizer.from_pretrained("gpt2")
+
+
+def _write_jsonl(path: Path, n_rows: int) -> None:
+    with path.open("w") as f:
+        for i in range(n_rows):
+            f.write(json.dumps({"prompt": f"row {i}: unique prompt content."}) + "\n")
+
+
+def _args_for_custom(dataset_path: str, seed: int) -> argparse.Namespace:
+    return argparse.Namespace(
+        dataset_name="custom",
+        dataset_path=dataset_path,
+        disable_shuffle=False,
+        num_prompts=30,
+        custom_output_len=32,
+        skip_chat_template=True,
+        no_oversample=False,
+        seed=seed,
+        request_id_prefix="",
+    )
+
+
+@pytest.mark.benchmark
+def test_custom_dataset_seed_propagates(
+    hf_tokenizer: PreTrainedTokenizerBase, tmp_path: Path
+) -> None:
+    """--seed must control the CustomDataset shuffle used by get_samples.
+
+    Without the fix, CustomDataset was instantiated without random_seed,
+    so its load-time shuffle always used DEFAULT_SEED=0 regardless of
+    args.seed, causing every run with --dataset-name custom to pick the
+    same subset of rows from a larger file.
+    """
+    jsonl = tmp_path / "data.jsonl"
+    _write_jsonl(jsonl, n_rows=60)
+
+    samples_a = get_samples(_args_for_custom(str(jsonl), seed=0), hf_tokenizer)
+    samples_b = get_samples(_args_for_custom(str(jsonl), seed=42), hf_tokenizer)
+
+    prompts_a = {s.prompt for s in samples_a}
+    prompts_b = {s.prompt for s in samples_b}
+
+    assert len(prompts_a) == 30
+    assert len(prompts_b) == 30
+    assert prompts_a != prompts_b
+
+
+@pytest.mark.benchmark
+def test_custom_dataset_same_seed_is_deterministic(
+    hf_tokenizer: PreTrainedTokenizerBase, tmp_path: Path
+) -> None:
+    """Same --seed must yield the same CustomDataset subset."""
+    jsonl = tmp_path / "data.jsonl"
+    _write_jsonl(jsonl, n_rows=60)
+
+    samples_a = get_samples(_args_for_custom(str(jsonl), seed=7), hf_tokenizer)
+    samples_b = get_samples(_args_for_custom(str(jsonl), seed=7), hf_tokenizer)
+
+    prompts_a = [s.prompt for s in samples_a]
+    prompts_b = [s.prompt for s in samples_b]
+
+    assert prompts_a == prompts_b
diff --git a/tests/compile/backend.py b/tests/compile/backend.py
index d61c128a59b6..87f98946a8ad 100644
--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
@@ -12,10 +12,17 @@
 from torch.fx._utils import lazy_format_graph_code
 
 from vllm.compilation.passes.fx_utils import find_op_nodes
-from vllm.compilation.passes.inductor_pass import InductorPass
+from vllm.compilation.passes.inductor_pass import (
+    InductorPass,
+    pass_context,
+)
+from vllm.compilation.passes.ir.inplace_functionalization import (
+    VllmIRInplaceFunctionalizationPass,
+)
 from vllm.compilation.passes.pass_manager import with_pattern_match_debug
 from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.config.utils import Range
 from vllm.logger import init_logger
 
 logger = init_logger("vllm.tests.compile.backend")
@@ -53,11 +60,17 @@ def __init__(self, *passes: InductorPass | Callable[[fx.Graph], None]):
         self.custom_passes = list(passes)
         vllm_config = get_current_vllm_config()
         compile_config = vllm_config.compilation_config
+        self.range = Range(1, vllm_config.scheduler_config.max_num_batched_tokens)
         # Deepcopy to allow multiple TestBackend instances to use the same VllmConfig
         self.inductor_config = deepcopy(compile_config.inductor_compile_config)
         self.inductor_config["force_disable_caches"] = True
         self.inductor_config["post_grad_custom_post_pass"] = self.post_pass
 
+        # Add VllmIRInplaceFunctionalizationPass as pre-grad pass by default
+        self.inductor_config["pre_grad_custom_pass"] = (
+            VllmIRInplaceFunctionalizationPass(vllm_config)
+        )
+
         if debug_dump_path := vllm_config.compile_debug_dump_path():
             logger.debug("Dumping depyf output to %s", debug_dump_path)
             self.debug_ctx = depyf.prepare_debug(debug_dump_path.as_posix())
@@ -68,7 +81,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs):
         self.graph_pre_compile = deepcopy(graph)
         from torch._inductor.compile_fx import compile_fx
 
-        with self.debug_ctx:
+        with self.debug_ctx, pass_context(self.range):
             return compile_fx(
                 graph, example_inputs, config_patches=self.inductor_config
             )
diff --git a/tests/compile/conftest.py b/tests/compile/conftest.py
index 6aafac7bcad3..1263cce04c6c 100644
--- a/tests/compile/conftest.py
+++ b/tests/compile/conftest.py
@@ -24,10 +24,24 @@ def test_something(mock_cuda_platform):
     def _mock_platform(is_cuda: bool = True, capability: tuple[int, int] | None = None):
         mock_platform = MagicMock()
         mock_platform.is_cuda.return_value = is_cuda
-        if capability is not None:
-            mock_platform.get_device_capability.return_value = DeviceCapability(
-                *capability
+        device_capability = (
+            DeviceCapability(*capability) if capability is not None else None
+        )
+        mock_platform.get_device_capability.return_value = device_capability
+
+        def is_device_capability_family(
+            requested_capability: int, device_id: int = 0
+        ) -> bool:
+            current_capability = mock_platform.get_device_capability(
+                device_id=device_id
             )
+            if current_capability is None:
+                return False
+            return current_capability.major == (requested_capability // 10)
+
+        mock_platform.is_device_capability_family.side_effect = (
+            is_device_capability_family
+        )
         with patch("vllm.platforms.current_platform", mock_platform):
             yield mock_platform
 
diff --git a/tests/compile/fusions_e2e/conftest.py b/tests/compile/fusions_e2e/conftest.py
index b017f88881cb..b4922a3fdc5e 100644
--- a/tests/compile/fusions_e2e/conftest.py
+++ b/tests/compile/fusions_e2e/conftest.py
@@ -97,6 +97,12 @@ def run(
                 f"attention backend '{attn_backend.backend.name}'"
             )
 
+        if attn_backend.backend.name == "FLASHINFER":
+            from vllm.utils.flashinfer import supports_trtllm_attention
+
+            if not supports_trtllm_attention():
+                matches = matches._replace(attn_quant_fusion=0)
+
         # TODO: remove this after finishing migration from envs to model kwargs
         if model_name == "openai/gpt-oss-20b":
             from .common import is_blackwell
diff --git a/tests/compile/fusions_e2e/test_tp2_ar_rms.py b/tests/compile/fusions_e2e/test_tp2_ar_rms.py
index 9156f6afa06a..b5e2b2dc07ea 100644
--- a/tests/compile/fusions_e2e/test_tp2_ar_rms.py
+++ b/tests/compile/fusions_e2e/test_tp2_ar_rms.py
@@ -19,6 +19,8 @@
     FLASHINFER_ATTN,
     FLASHINFER_MLA_ATTN,
     FLASHMLA_SPARSE_ATTN,
+    ROCM_AITER_UNIFIED_ATTN,
+    ROCM_ATTN,
     TRITON_ATTN,
     deepseek_coder_v2_lite_fp8,
     deepseek_r1_fp4,
@@ -34,7 +36,9 @@
     qwen3_a3b_fp8,
 )
 
-pytestmark = pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")
+pytestmark = pytest.mark.skipif(
+    not current_platform.is_cuda_alike(), reason="Only test CUDA/ROCm"
+)
 
 
 @multi_gpu_test(num_gpus=2)
@@ -55,6 +59,7 @@
 @pytest.mark.parametrize("n_layers", [4])
 @pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
 @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")
 def test_tp2_ar_rms_fp8_fusions(
     model_name: str,
     matches_fn: Callable[[int], Matches],
@@ -124,6 +129,7 @@ def test_tp2_ar_rms_fp8_fusions(
 @pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
 @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
 @pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4")
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")
 def test_tp2_ar_rms_fp4_fusions(
     model_name: str,
     matches_fn: Callable[[int], Matches],
@@ -176,10 +182,19 @@ def test_tp2_ar_rms_fp4_fusions(
     "model_name, matches_fn, model_kwargs, hf_overrides",
     [llama3_8b, qwen3_a3b, gpt_oss_20b],
 )
-@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
+@pytest.mark.parametrize(
+    "attn_backend",
+    [
+        TRITON_ATTN,
+        FLASHINFER_ATTN,
+        ROCM_ATTN,
+        ROCM_AITER_UNIFIED_ATTN,
+    ],
+)
 @pytest.mark.parametrize("n_layers", [4])
-@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("custom_ops", tuple(custom_ops_combos("rms_norm")))
 @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+@pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Only test CUDA/ROCm")
 def test_tp2_ar_rms_fusions(
     model_name: str,
     matches_fn: Callable[[int], Matches],
@@ -221,4 +236,5 @@ def test_tp2_ar_rms_fusions(
         compilation_config,
         matches_check,
         tp_size=2,
+        use_aiter=current_platform.is_rocm(),
     )
diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py
index 609377e68958..baa7bdef0a7d 100644
--- a/tests/compile/fusions_e2e/test_tp2_async_tp.py
+++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py
@@ -13,7 +13,6 @@
     AttentionBackendCase,
     Matches,
     custom_ops_combos,
-    is_blackwell,
 )
 from .models import (
     FLASHINFER_ATTN,
@@ -46,14 +45,9 @@ def test_tp2_async_tp_fp8_fusions(
     custom_ops: str,
     inductor_graph_partition: bool,
     run_e2e_fusion_test,
-    monkeypatch,
 ):
     matches = matches_fn(n_layers)
 
-    if is_blackwell():
-        # Disable FlashInfer scaled_mm FP8 as it's not supported in async tp patterns
-        monkeypatch.setenv("VLLM_DISABLED_KERNELS", "FlashInferFP8ScaledMMLinearKernel")
-
     # Reduce size of model and skip weight loading time
     model_kwargs["hf_overrides"] = hf_overrides(n_layers)
     model_kwargs["load_format"] = "dummy"
@@ -173,14 +167,9 @@ def test_tp2_sp_ar_rms_fp8_fusions(
     custom_ops: str,
     inductor_graph_partition: bool,
     run_e2e_fusion_test,
-    monkeypatch,
 ):
     matches = matches_fn(n_layers)
 
-    if is_blackwell():
-        # Disable FlashInfer scaled_mm FP8 as it's not supported in async tp patterns
-        monkeypatch.setenv("VLLM_DISABLED_KERNELS", "FlashInferFP8ScaledMMLinearKernel")
-
     # Reduce size of model and skip weight loading time
     model_kwargs["hf_overrides"] = hf_overrides(n_layers)
     model_kwargs["load_format"] = "dummy"
diff --git a/tests/compile/h100/test_startup.py b/tests/compile/h100/test_startup.py
index ff4496c2ba6d..78554a3e93da 100644
--- a/tests/compile/h100/test_startup.py
+++ b/tests/compile/h100/test_startup.py
@@ -34,7 +34,10 @@ def _run_vllm(vllm_runner):
             mode=CompilationMode.VLLM_COMPILE,
             cudagraph_mode=CUDAGraphMode.NONE,
         ),
-        num_gpu_blocks_override=8,
+        # Phi-tiny-MoE uses SWA, whose admission cap is `cdiv(L, block_size) + 1`
+        # at default block_size=16 — i.e. 17 blocks for max_model_len=256. Use
+        # 32 for headroom.
+        num_gpu_blocks_override=32,
     ):
         pass
 
@@ -190,7 +193,7 @@ def _run_model(vllm_runner, spec: ModelStartupSpec):
             cudagraph_mode=CUDAGraphMode.NONE,
             pass_config=PassConfig(fuse_allreduce_rms=False),
         ),
-        num_gpu_blocks_override=8,
+        num_gpu_blocks_override=16,
     ):
         pass
 
diff --git a/tests/compile/passes/distributed/test_fusion_all_reduce.py b/tests/compile/passes/distributed/test_fusion_all_reduce.py
index e2c461e6692d..1a175b8dd335 100644
--- a/tests/compile/passes/distributed/test_fusion_all_reduce.py
+++ b/tests/compile/passes/distributed/test_fusion_all_reduce.py
@@ -8,8 +8,12 @@
 import vllm.envs as envs
 from tests.compile.backend import TestBackend
 from tests.utils import TestFP8Layer, has_module_attribute, multi_gpu_test
+from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-from vllm.compilation.passes.fusion.allreduce_rms_fusion import AllReduceFusionPass
+from vllm.compilation.passes.fusion.allreduce_rms_fusion import (
+    AllReduceFusionPass,
+    RocmAiterAllReduceFusionPass,
+)
 from vllm.compilation.passes.utility.fix_functionalization import (
     FixFunctionalizationPass,
 )
@@ -42,13 +46,19 @@
 
 class TestAllReduceRMSNormModel(torch.nn.Module):
     def __init__(
-        self, hidden_size=16, token_num=16, eps=1e-6, dtype: torch.dtype = torch.float16
+        self,
+        hidden_size=16,
+        token_num=16,
+        eps=1e-6,
+        dtype: torch.dtype = torch.float16,
+        use_aiter: bool = False,
     ):
         super().__init__()
         self.hidden_size = hidden_size
         self.eps = eps
         self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
         self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)]
+        self.use_aiter = use_aiter
 
     def forward(self, x):
         # avoid having graph input be an arg to a pattern directly
@@ -76,6 +86,8 @@ def ops_in_model_before(self):
         return [torch.ops.vllm.all_reduce.default]
 
     def ops_in_model_after(self):
+        if self.use_aiter:
+            return [rocm_aiter_ops.get_fused_allreduce_rmsnorm_op()]
         return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
 
 
@@ -194,12 +206,36 @@ def ops_in_model_before(self):
 
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
-    "test_model, enable_quant_fp8_custom_op",
+    "test_model, enable_quant_fp8_custom_op, use_aiter",
     [
-        (TestAllReduceRMSNormModel, False),
-        (TestAllReduceRMSNormStaticQuantFP8Model, True),
-        (TestAllReduceRMSNormStaticQuantFP8Model, False),
-        (TestAllReduceFusedAddRMSNormStaticQuantFP4Model, False),
+        (TestAllReduceRMSNormModel, False, IS_AITER_FOUND),
+        pytest.param(
+            TestAllReduceRMSNormStaticQuantFP8Model,
+            True,
+            False,
+            marks=pytest.mark.skipif(
+                current_platform.is_rocm(),
+                reason="Not supported on ROCm platform",
+            ),
+        ),
+        pytest.param(
+            TestAllReduceRMSNormStaticQuantFP8Model,
+            False,
+            False,
+            marks=pytest.mark.skipif(
+                current_platform.is_rocm(),
+                reason="Not supported on ROCm platform",
+            ),
+        ),
+        pytest.param(
+            TestAllReduceFusedAddRMSNormStaticQuantFP4Model,
+            False,
+            False,
+            marks=pytest.mark.skipif(
+                current_platform.is_rocm(),
+                reason="Not supported on ROCm platform",
+            ),
+        ),
     ],
 )
 @pytest.mark.parametrize("batch_size", [8])
@@ -210,9 +246,18 @@ def ops_in_model_before(self):
 @pytest.mark.parametrize("flashinfer_allreduce_backend", ["trtllm", "mnnvl"])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 @pytest.mark.skipif(
-    not find_spec("flashinfer")
-    or not has_module_attribute("flashinfer.comm", "allreduce_fusion")
-    or not has_module_attribute("flashinfer.comm", "create_allreduce_fusion_workspace"),
+    current_platform.is_rocm() and not IS_AITER_FOUND,
+    reason="aiter is not found",
+)
+@pytest.mark.skipif(
+    current_platform.is_cuda()
+    and (
+        not find_spec("flashinfer")
+        or not has_module_attribute("flashinfer.comm", "allreduce_fusion")
+        or not has_module_attribute(
+            "flashinfer.comm", "create_allreduce_fusion_workspace"
+        )
+    ),
     reason="flashinfer is not found or flashinfer "
     "is not compiled with allreduce_fusion",
 )
@@ -225,7 +270,14 @@ def test_all_reduce_fusion_pass_replace(
     enable_rms_norm_custom_op,
     enable_quant_fp8_custom_op,
     flashinfer_allreduce_backend,
+    use_aiter: bool,
+    monkeypatch: pytest.MonkeyPatch,
 ):
+    if use_aiter:
+        with monkeypatch.context() as m:
+            m.setenv("VLLM_ROCM_USE_AITER", str(use_aiter))
+            rocm_aiter_ops.refresh_env_variables()
+
     num_processes = 2
     if (
         test_model == TestAllReduceFusedAddRMSNormStaticQuantFP4Model
@@ -249,6 +301,8 @@ def run_torch_spawn(fn, nprocs):
                 enable_rms_norm_custom_op,
                 enable_quant_fp8_custom_op,
                 flashinfer_allreduce_backend,
+                use_aiter,
+                monkeypatch,
             ),
             nprocs=nprocs,
         )
@@ -267,6 +321,8 @@ def all_reduce_fusion_pass_on_test_model(
     enable_rms_norm_custom_op,
     enable_quant_fp8_custom_op,
     flashinfer_allreduce_backend,
+    use_aiter: bool,
+    monkeypatch: pytest.MonkeyPatch,
 ):
     set_random_seed(0)
 
@@ -313,7 +369,11 @@ def all_reduce_fusion_pass_on_test_model(
     )
     with set_current_vllm_config(vllm_config):
         initialize_model_parallel(tensor_model_parallel_size=world_size)
-        all_reduce_fusion_pass = AllReduceFusionPass(vllm_config)
+        all_reduce_fusion_pass = (
+            RocmAiterAllReduceFusionPass(vllm_config)
+            if use_aiter
+            else AllReduceFusionPass(vllm_config)
+        )
         noop_pass = NoOpEliminationPass(vllm_config)
         func_pass = FixFunctionalizationPass(vllm_config)
         cleanup_pass = PostCleanupPass(vllm_config)
@@ -323,7 +383,12 @@ def all_reduce_fusion_pass_on_test_model(
         )
 
         token_num = batch_size * seq_len
-        model = test_model_cls(hidden_size, token_num, dtype=dtype)
+        if test_model_cls is TestAllReduceRMSNormModel:
+            model = test_model_cls(
+                hidden_size, token_num, dtype=dtype, use_aiter=use_aiter
+            )
+        else:
+            model = test_model_cls(hidden_size, token_num, dtype=dtype)
 
         hidden_states = torch.randn((token_num, hidden_size), requires_grad=False)
 
diff --git a/tests/compile/passes/distributed/test_sequence_parallelism.py b/tests/compile/passes/distributed/test_sequence_parallelism.py
index 1f1eeb8b4789..c40d75f6754a 100644
--- a/tests/compile/passes/distributed/test_sequence_parallelism.py
+++ b/tests/compile/passes/distributed/test_sequence_parallelism.py
@@ -88,14 +88,10 @@ def ops_in_model_after(self):
         ]
 
     def ops_in_model(self):
-        return (
-            [torch.ops.vllm_ir.rms_norm]
-            + [
-                torch.ops._C.fused_add_rms_norm.default,
-            ]
-            if RMSNorm.enabled()
-            else []
-        )
+        return [
+            torch.ops.vllm_ir.rms_norm,
+            torch.ops.vllm_ir.fused_add_rms_norm,
+        ]
 
 
 class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
@@ -152,16 +148,17 @@ def ops_in_model_before(self):
     def ops_in_model(self):
         if self.vllm_config.compilation_config.pass_config.fuse_norm_quant:
             return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
-        elif RMSNorm.enabled():
-            return [
-                torch.ops._C.fused_add_rms_norm.default,
-            ]
-        elif any(layer.is_quant_fp8_enabled() for layer in self.fp8_linear_layers):
+        else:
+            quant_ops = (
+                [torch.ops._C.static_scaled_fp8_quant.default]
+                if any(layer.is_quant_fp8_enabled() for layer in self.fp8_linear_layers)
+                else [torch.ops.aten.reciprocal]
+            )
             return [
-                torch.ops._C.static_scaled_fp8_quant.default,
+                torch.ops.vllm_ir.rms_norm,
+                torch.ops.vllm_ir.fused_add_rms_norm,
+                *quant_ops,
             ]
-        else:
-            return []
 
 
 @multi_gpu_test(num_gpus=2)
diff --git a/tests/compile/passes/ir/test_clone_cleanup.py b/tests/compile/passes/ir/test_clone_cleanup.py
new file mode 100644
index 000000000000..9fedb5fc9177
--- /dev/null
+++ b/tests/compile/passes/ir/test_clone_cleanup.py
@@ -0,0 +1,412 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Comprehensive tests for UnsafeCloneEliminationPass.
+
+This test suite exercises all possible valid FX graph patterns involving clones:
+1. Clone with no users (dead code)
+2. Clone with read-only users
+3. Clone with mutation users
+4. Clone of graph input
+5. Clone with original used after mutation
+6. Clone chains
+"""
+
+import pytest
+import torch
+from torch import fx
+from torch.fx.experimental.proxy_tensor import make_fx
+
+from vllm.compilation.passes.fx_utils import find_op_nodes
+from vllm.compilation.passes.inductor_pass import get_pass_context, pass_context
+from vllm.compilation.passes.ir.clone_elimination import (
+    UnsafeCloneEliminationPass,
+    user_writes_to_node,
+)
+from vllm.config import VllmConfig
+from vllm.config.utils import Range
+
+
+def count_clones(graph: fx.Graph) -> int:
+    """Count clone nodes in a graph."""
+    return len(list(find_op_nodes(torch.ops.aten.clone.default, graph)))
+
+
+@pytest.fixture(scope="function")
+def clone_cleanup_pass():
+    return UnsafeCloneEliminationPass(VllmConfig())
+
+
+@pytest.fixture(autouse=True)
+def setup_pass_context():
+    """Set up pass context for each test."""
+    with pass_context(compile_range=Range(1, 8192)):
+        yield
+
+
+class TestCloneCleanup:
+    """Test UnsafeCloneEliminationPass behavior on various graph patterns."""
+
+    def test_remove_clone_readonly_users(self, clone_cleanup_pass):
+        """Clone with only read-only users should be removed."""
+
+        def f(x: torch.Tensor) -> torch.Tensor:
+            x_clone = x.clone()
+            return x_clone + 1
+
+        inp = torch.randn(2, 3)
+        graph_module = make_fx(f)(inp)
+        assert count_clones(graph_module.graph) == 1
+
+        expected = graph_module(inp)
+        clone_cleanup_pass(graph_module.graph)
+        graph_module.recompile()
+        actual = graph_module(inp)
+
+        assert count_clones(graph_module.graph) == 0
+        torch.testing.assert_close(actual, expected)
+
+    def test_keep_clone_with_mutation_and_original_used_after(self, clone_cleanup_pass):
+        """Clone must be kept if it's mutated AND original is used after mutation."""
+
+        def f(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+            x = x.relu()  # not a graph param
+            x_clone = x.clone()
+            x_clone.add_(1)
+            return x, x_clone
+
+        inp = torch.randn(2, 3)
+        graph_module = make_fx(f)(inp)
+        assert count_clones(graph_module.graph) == 1
+
+        expected = graph_module(inp)
+        clone_cleanup_pass(graph_module.graph)
+        graph_module.recompile()
+        actual = graph_module(inp)
+
+        # Clone should be KEPT because original is used after mutation
+        assert count_clones(graph_module.graph) == 1
+        torch.testing.assert_close(actual[0], expected[0])
+        torch.testing.assert_close(actual[1], expected[1])
+
+    def test_remove_clone_with_mutation_no_original_use(self, clone_cleanup_pass):
+        """Clone can be removed if it's mutated but original is not used after."""
+
+        def f(x: torch.Tensor) -> torch.Tensor:
+            x = x.relu()  # not a graph param
+            x_clone = x.clone()
+            x_clone.add_(1)
+            return x_clone
+
+        inp = torch.randn(2, 3)
+        graph_module = make_fx(f)(inp)
+        assert count_clones(graph_module.graph) == 1
+
+        expected = graph_module(inp)
+        clone_cleanup_pass(graph_module.graph)
+        graph_module.recompile()
+        actual = graph_module(inp)
+
+        assert count_clones(graph_module.graph) == 0
+        torch.testing.assert_close(actual, expected)
+
+    def test_clone_chain(self, clone_cleanup_pass):
+        """Test handling of clone chains: x -> clone1 -> clone2."""
+
+        def f(x: torch.Tensor) -> torch.Tensor:
+            x = x.relu()  # not a graph param
+            x1 = x.clone()
+            x2 = x1.clone()
+            return x2 + 1
+
+        inp = torch.randn(2, 3)
+        graph_module = make_fx(f)(inp)
+        assert count_clones(graph_module.graph) == 2
+
+        expected = graph_module(inp)
+        clone_cleanup_pass(graph_module.graph)
+        graph_module.recompile()
+        actual = graph_module(inp)
+
+        # Both clones should be removed
+        assert count_clones(graph_module.graph) == 0
+        torch.testing.assert_close(actual, expected)
+
+    def test_multiple_clones_of_same_input(self, clone_cleanup_pass):
+        """Test multiple independent clones of the same input."""
+
+        def f(x: torch.Tensor) -> torch.Tensor:
+            x1 = x.clone()
+            x2 = x.clone()
+            return x1 + x2
+
+        inp = torch.randn(2, 3)
+        graph_module = make_fx(f)(inp)
+        assert count_clones(graph_module.graph) == 2
+
+        expected = graph_module(inp)
+        clone_cleanup_pass(graph_module.graph)
+        graph_module.recompile()
+        actual = graph_module(inp)
+
+        # Both clones should be removed (only readonly uses)
+        assert count_clones(graph_module.graph) == 0
+        torch.testing.assert_close(actual, expected)
+
+    def test_no_clones_in_graph(self, clone_cleanup_pass):
+        """Test pass behavior when graph has no clones."""
+
+        def f(x: torch.Tensor) -> torch.Tensor:
+            return x + 1
+
+        inp = torch.randn(2, 3)
+        graph_module = make_fx(f)(inp)
+        assert count_clones(graph_module.graph) == 0
+
+        expected = graph_module(inp)
+        clone_cleanup_pass(graph_module.graph)
+        graph_module.recompile()
+        actual = graph_module(inp)
+
+        assert count_clones(graph_module.graph) == 0
+        torch.testing.assert_close(actual, expected)
+
+    def test_multiple_passes(self, clone_cleanup_pass):
+        """Test running the pass multiple times (should be idempotent)."""
+
+        def f(x: torch.Tensor) -> torch.Tensor:
+            x1 = x.clone()
+            return x1 + 1
+
+        inp = torch.randn(2, 3)
+        graph_module = make_fx(f)(inp)
+        assert count_clones(graph_module.graph) == 1
+
+        expected = graph_module(inp)
+
+        clone_cleanup_pass(graph_module.graph)
+        assert count_clones(graph_module.graph) == 0
+        graph_module.recompile()
+        actual = graph_module(inp)
+        torch.testing.assert_close(actual, expected)
+
+        clone_cleanup_pass(graph_module.graph)
+        assert count_clones(graph_module.graph) == 0
+        graph_module.recompile()
+        actual = graph_module(inp)
+        torch.testing.assert_close(actual, expected)
+
+    def test_output_node_no_write(self):
+        """Output nodes never write to their inputs."""
+
+        def f(x: torch.Tensor) -> torch.Tensor:
+            return x
+
+        graph_module = make_fx(f)(torch.randn(2, 3))
+        x_node = [n for n in graph_module.graph.nodes if n.op == "placeholder"][0]
+        output_node = [n for n in graph_module.graph.nodes if n.op == "output"][0]
+
+        assert not user_writes_to_node(output_node, x_node)
+
+    def test_readonly_op_no_write(self):
+        """Readonly operations don't write to inputs."""
+
+        def f(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            return x + y
+
+        graph_module = make_fx(f)(torch.randn(2, 3), torch.randn(2, 3))
+        placeholders = [n for n in graph_module.graph.nodes if n.op == "placeholder"]
+        add_node = [
+            n
+            for n in graph_module.graph.nodes
+            if n.op == "call_function" and n.target == torch.ops.aten.add.Tensor
+        ][0]
+
+        assert not user_writes_to_node(add_node, placeholders[0])
+        assert not user_writes_to_node(add_node, placeholders[1])
+
+    def test_inplace_op_writes(self):
+        """Inplace operations write to first argument."""
+
+        def f(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            x.add_(y)
+            return x
+
+        graph_module = make_fx(f)(torch.randn(2, 3), torch.randn(2, 3))
+        placeholders = [n for n in graph_module.graph.nodes if n.op == "placeholder"]
+        add_node = [
+            n
+            for n in graph_module.graph.nodes
+            if n.op == "call_function" and "add_" in str(n.target)
+        ][0]
+
+        # add_ writes to first arg but not second
+        assert user_writes_to_node(add_node, placeholders[0])
+        assert not user_writes_to_node(add_node, placeholders[1])
+
+    def test_copy_writes(self):
+        """copy_ operation writes to first argument."""
+
+        def f(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            x.copy_(y)
+            return x
+
+        graph_module = make_fx(f)(torch.randn(2, 3), torch.randn(2, 3))
+        placeholders = [n for n in graph_module.graph.nodes if n.op == "placeholder"]
+        copy_node = [
+            n
+            for n in graph_module.graph.nodes
+            if n.op == "call_function" and "copy_" in str(n.target)
+        ][0]
+
+        assert user_writes_to_node(copy_node, placeholders[0])
+        assert not user_writes_to_node(copy_node, placeholders[1])
+
+    def test_auto_functionalized_not_a_write(self):
+        """auto_functionalized ops are follow-up uses, not writes."""
+        from torch._higher_order_ops.auto_functionalize import auto_functionalized
+
+        def f(x: torch.Tensor) -> torch.Tensor:
+            return x
+
+        graph_module = make_fx(f)(torch.randn(2, 3))
+        x_node = [n for n in graph_module.graph.nodes if n.op == "placeholder"][0]
+
+        # Create an auto_functionalized node in the graph
+        with graph_module.graph.inserting_before(None):
+            af_node = graph_module.graph.call_function(
+                auto_functionalized, kwargs={"input": x_node}
+            )
+
+        # auto_functionalized should not be treated as a write
+        assert not user_writes_to_node(af_node, x_node)
+
+    def test_higher_order_op_conservatively_writes(self):
+        """Other higher-order operators are conservatively treated as writes."""
+        from torch._ops import HigherOrderOperator
+
+        def f(x: torch.Tensor) -> torch.Tensor:
+            return x
+
+        graph_module = make_fx(f)(torch.randn(2, 3))
+        x_node = [n for n in graph_module.graph.nodes if n.op == "placeholder"][0]
+
+        # Create a concrete higher-order operator subclass
+        class MockHigherOrderOp(HigherOrderOperator):
+            def __call__(self, *args, **kwargs):
+                return args[0] if args else None
+
+        mock_hoo = MockHigherOrderOp("mock_higher_order_op")
+
+        with graph_module.graph.inserting_before(None):
+            hoo_node = graph_module.graph.call_function(mock_hoo, args=(x_node,))
+
+        # Should be conservative and assume it could write
+        assert user_writes_to_node(hoo_node, x_node)
+
+
+class TestCloneCleanupWithDonatedInputs:
+    """Test UnsafeCloneEliminationPass with donated input tracking via PassContext."""
+
+    @pytest.fixture(autouse=True)
+    def setup_pass_context(self):
+        """Set up pass context for each test."""
+        with pass_context(compile_range=Range(1, 8192)):
+            yield
+
+    def test_donated_input_clone_removed(self, clone_cleanup_pass):
+        """Clone of donated input should be removed."""
+
+        def f(x: torch.Tensor) -> torch.Tensor:
+            x_clone = x.clone()
+            x_clone.add_(1)
+            return x_clone
+
+        inp = torch.randn(2, 3)
+        graph_module = make_fx(f)(inp)
+        assert count_clones(graph_module.graph) == 1
+
+        # Mark first parameter as donated
+        get_pass_context().donated_input_ids = {0}
+
+        expected = graph_module(inp.clone())
+        clone_cleanup_pass(graph_module.graph)
+        graph_module.recompile()
+
+        # Clone should be removed since input is donated
+        assert count_clones(graph_module.graph) == 0
+
+        # Input can be mutated (donated)
+        inp_copy = inp.clone()
+        actual = graph_module(inp_copy)
+        torch.testing.assert_close(actual, expected)
+
+    def test_non_donated_input_clone_kept(self, clone_cleanup_pass):
+        """Clone of non-donated input with mutation should be kept."""
+
+        def f(x: torch.Tensor, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+            x_clone = x.clone()
+            x_clone.add_(1)
+            return x, x_clone
+
+        inp_x = torch.randn(2, 3)
+        inp_y = torch.randn(2, 3)
+        graph_module = make_fx(f)(inp_x, inp_y)
+        assert count_clones(graph_module.graph) == 1
+
+        # No donated inputs
+        get_pass_context().donated_input_ids = set()
+
+        expected = graph_module(inp_x.clone(), inp_y.clone())
+        clone_cleanup_pass(graph_module.graph)
+        graph_module.recompile()
+
+        # Clone should be kept since input is not donated and original is used
+        assert count_clones(graph_module.graph) == 1
+
+        # Verify inputs are not mutated
+        inp_x_before = inp_x.clone()
+        inp_y_before = inp_y.clone()
+        actual = graph_module(inp_x, inp_y)
+        torch.testing.assert_close(
+            inp_x, inp_x_before, msg="Input x should not be mutated"
+        )
+        torch.testing.assert_close(
+            inp_y, inp_y_before, msg="Input y should not be mutated"
+        )
+        torch.testing.assert_close(actual[0], expected[0])
+        torch.testing.assert_close(actual[1], expected[1])
+
+    def test_mixed_donated_inputs(self, clone_cleanup_pass):
+        """Test with some inputs donated and some not."""
+
+        def f(x: torch.Tensor, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+            x_clone = x.clone()
+            x_clone.add_(1)
+            y_clone = y.clone()
+            y_clone.add_(2)
+            return x_clone, y_clone
+
+        inp_x = torch.randn(2, 3)
+        inp_y = torch.randn(2, 3)
+        graph_module = make_fx(f)(inp_x, inp_y)
+        assert count_clones(graph_module.graph) == 2
+
+        # Only x is donated
+        get_pass_context().donated_input_ids = {0}
+
+        expected = graph_module(inp_x.clone(), inp_y.clone())
+        clone_cleanup_pass(graph_module.graph)
+        graph_module.recompile()
+
+        # x_clone removed (x is donated), y_clone kept (y is not donated)
+        assert count_clones(graph_module.graph) == 1
+
+        # Verify y is not mutated (x can be mutated since it's donated)
+        inp_y_before = inp_y.clone()
+        actual = graph_module(inp_x.clone(), inp_y)
+        torch.testing.assert_close(
+            inp_y, inp_y_before, msg="Input y should not be mutated"
+        )
+        torch.testing.assert_close(actual[0], expected[0])
+        torch.testing.assert_close(actual[1], expected[1])
diff --git a/tests/compile/passes/ir/test_inplace_functionalization.py b/tests/compile/passes/ir/test_inplace_functionalization.py
new file mode 100644
index 000000000000..1e8d5662162f
--- /dev/null
+++ b/tests/compile/passes/ir/test_inplace_functionalization.py
@@ -0,0 +1,465 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for IR inplace functionalization pass integration.
+
+This test suite verifies that the inplace functionalization pass, lowering pass,
+and clone cleanup pass work together correctly with donated buffer tracking.
+"""
+
+from collections.abc import Callable
+
+import pytest
+import torch
+import torch._dynamo.exc
+from torch import nn
+
+import vllm.kernels  # noqa: F401 to register kernels
+from vllm.compilation.passes.inductor_pass import InductorPass, get_pass_context
+from vllm.compilation.passes.ir.clone_elimination import (
+    UnsafeCloneEliminationPass,
+)
+from vllm.compilation.passes.ir.inplace_functionalization import (
+    VllmIRInplaceFunctionalizationPass,
+)
+from vllm.compilation.passes.ir.lowering_pass import VllmIRLoweringPass
+from vllm.config import VllmConfig
+from vllm.ir import ops
+from vllm.platforms import current_platform
+from vllm.triton_utils import HAS_TRITON, tl, triton
+
+from ...backend import TestBackend
+
+
+class StoreDonationInfoPass(InductorPass):
+    def __init__(self):
+        self.donated_input_ids_sets: list[set[int]] = []
+
+    def __call__(self, *args, **kwargs):
+        ctx = get_pass_context()
+        self.donated_input_ids_sets += [ctx.donated_input_ids]
+
+
+class MaybeInplaceModel(nn.Module):
+    """Model using only maybe_inplace variants."""
+
+    def __init__(self, hidden_size=16):
+        super().__init__()
+        self.weight1 = nn.Parameter(torch.ones(hidden_size, dtype=torch.bfloat16))
+        self.weight2 = nn.Parameter(torch.ones(hidden_size, dtype=torch.bfloat16))
+
+    def forward(
+        self, x: torch.Tensor, residual1: torch.Tensor, residual2: torch.Tensor
+    ):
+        # First maybe_inplace - x & residual1 are donated
+        x_normed1, residual_out1 = ops.fused_add_rms_norm.maybe_inplace(
+            x, residual1, self.weight1, 1e-5
+        )
+        # Second maybe_inplace - residual2 is donated
+        x_normed2, residual_out2 = ops.fused_add_rms_norm.maybe_inplace(
+            x_normed1, residual2, self.weight2, 1e-5
+        )
+        return x_normed2, residual_out1, residual_out2
+
+
+class FunctionalModel(nn.Module):
+    """Model using only functional (default) variants."""
+
+    def __init__(self, hidden_size=16):
+        super().__init__()
+        self.weight1 = nn.Parameter(torch.ones(hidden_size, dtype=torch.bfloat16))
+        self.weight2 = nn.Parameter(torch.ones(hidden_size, dtype=torch.bfloat16))
+
+    def forward(
+        self, x: torch.Tensor, residual1: torch.Tensor, residual2: torch.Tensor
+    ):
+        # First functional - no donation
+        x_normed1, residual_out1 = ops.fused_add_rms_norm(
+            x, residual1, self.weight1, 1e-5
+        )
+        # Second functional - no donation
+        x_normed2, residual_out2 = ops.fused_add_rms_norm(
+            x_normed1, residual2, self.weight2, 1e-5
+        )
+        return x_normed2, residual_out1, residual_out2
+
+
+class MixedModel(nn.Module):
+    """Model mixing maybe_inplace and functional variants."""
+
+    def __init__(self, hidden_size=16):
+        super().__init__()
+        self.weight1 = nn.Parameter(torch.ones(hidden_size, dtype=torch.bfloat16))
+        self.weight2 = nn.Parameter(torch.ones(hidden_size, dtype=torch.bfloat16))
+
+    def forward(
+        self, x: torch.Tensor, residual1: torch.Tensor, residual2: torch.Tensor
+    ):
+        # First maybe_inplace - x & residual1 are donated
+        x_normed1, residual_out1 = ops.fused_add_rms_norm.maybe_inplace(
+            x, residual1, self.weight1, 1e-5
+        )
+        # Second functional - no donation, x_normed1 must be preserved as it's returned
+        x_normed2, residual_out2 = ops.fused_add_rms_norm(
+            x_normed1, residual2, self.weight2, 1e-5
+        )
+        # Return both to prevent x_normed1 from being optimized away
+        return x_normed1, x_normed2, residual_out1, residual_out2
+
+
+class ModelWithTritonAfterMaybeInplace(nn.Module):
+    """
+    Model using maybe_inplace followed by a Triton kernel.
+    Test clone elimination can handle Triton in the graph
+    """
+
+    def __init__(self, hidden_size=16):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size, dtype=torch.bfloat16))
+
+        @triton.jit
+        def _triton_add_kernel(
+            x_ptr,
+            y_ptr,
+            n_elements,
+            BLOCK_SIZE: tl.constexpr,
+        ):
+            pid = tl.program_id(axis=0)
+            block_start = pid * BLOCK_SIZE
+            offsets = block_start + tl.arange(0, BLOCK_SIZE)
+            mask = offsets < n_elements
+            x = tl.load(x_ptr + offsets, mask=mask)
+            y = x + 0.1
+            tl.store(y_ptr + offsets, y, mask=mask)
+
+        def triton_add(x: torch.Tensor) -> torch.Tensor:
+            """Simple Triton add kernel."""
+            y = torch.empty_like(x)
+            n_elements = x.numel()
+            grid = (triton.cdiv(n_elements, 256),)
+            _triton_add_kernel[grid](x, y, n_elements, BLOCK_SIZE=256)
+            return y
+
+        self.triton_add = triton_add
+
+    def forward(self, x: torch.Tensor, residual: torch.Tensor, residual2: torch.Tensor):
+        x_normed, residual_out = ops.fused_add_rms_norm.maybe_inplace(
+            x, residual, self.weight, 1e-5
+        )
+
+        x_processed = self.triton_add(x_normed)
+
+        # x_processed does not need to be cloned, residual2 does
+        x_normed2, residual_out2 = ops.fused_add_rms_norm(
+            x_processed, residual2, self.weight, 1e-5
+        )
+        return x_normed2, residual_out2
+
+
+skipif_no_triton = pytest.mark.skipif(not HAS_TRITON, reason="Requires Triton")
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(),
+    reason="Only test on cuda and rocm platform",
+)
+@pytest.mark.parametrize(
+    "model_class,expected_functionalized,expected_donated,expected_clones",
+    [
+        # 2 inplace calls, all activations donated, all clones eliminated
+        (MaybeInplaceModel, 2, 3, 0),
+        # No inplace calls, no donations, 3 clones (one eliminated)
+        (FunctionalModel, 0, 0, 3),
+        # One inplace call, two donated activations, 2 clones
+        (MixedModel, 1, 2, 2),
+        # One inplace call, two donated, 1 clone remaining
+        pytest.param(ModelWithTritonAfterMaybeInplace, 1, 2, 1, marks=skipif_no_triton),
+    ],
+)
+def test_inplace_functionalization(
+    default_vllm_config: VllmConfig,
+    model_class,
+    expected_functionalized: int,
+    expected_clones: int,
+    expected_donated: int,
+):
+    """Test inplace functionalization, lowering, and clone cleanup."""
+    torch.set_default_device(current_platform.device_type)
+
+    # Use vllm_c so inplace path is triggered
+    default_vllm_config.kernel_config.ir_op_priority.fused_add_rms_norm = [
+        "vllm_c",
+        "native",
+    ]
+
+    # Create passes in order they run during compilation
+    functionalization_pass = VllmIRInplaceFunctionalizationPass(default_vllm_config)
+    lowering_pass = VllmIRLoweringPass(default_vllm_config)
+    donated_info_pass = StoreDonationInfoPass()
+    cleanup_pass = UnsafeCloneEliminationPass(default_vllm_config)
+
+    # Set up backend with pre-grad pass
+    backend = TestBackend(lowering_pass, donated_info_pass, cleanup_pass)
+    backend.inductor_config["pre_grad_custom_pass"] = functionalization_pass
+
+    model = model_class()
+    x = torch.randn(8, 16, dtype=torch.bfloat16)
+    residual1 = torch.randn(8, 16, dtype=torch.bfloat16)
+    residual2 = torch.randn(8, 16, dtype=torch.bfloat16)
+
+    with default_vllm_config.kernel_config.ir_op_priority.set_priority():
+        # Reference output without optimization
+        ref_output = model(x.clone(), residual1.clone(), residual2.clone())
+
+        # Compile with inplace optimization
+        compiled_model = torch.compile(model, backend=backend, fullgraph=True)
+        output = compiled_model(x.clone(), residual1.clone(), residual2.clone())
+
+    # Verify correctness (relaxed tolerance for bfloat16)
+    for i in range(len(ref_output)):
+        torch.testing.assert_close(output[i], ref_output[i], rtol=1e-2, atol=1e-2)
+
+    # Verify expected number of ops were functionalized
+    func_ops = functionalization_pass.functionalized_ops
+    assert len(func_ops) == int(bool(expected_functionalized))
+    if expected_functionalized > 0:
+        assert "fused_add_rms_norm" in func_ops
+        assert func_ops["fused_add_rms_norm"] == expected_functionalized
+
+    # Verify lowering happened (2 ops in all cases)
+    assert "fused_add_rms_norm" in lowering_pass.selected_impls
+    assert len(lowering_pass.selected_impls["fused_add_rms_norm"]) == 2
+    assert all(
+        provider == "vllm_c"
+        for node, provider in lowering_pass.selected_impls["fused_add_rms_norm"].items()
+    ), lowering_pass.selected_impls
+
+    # Verify correct number of donated IDs
+    assert len(donated_info_pass.donated_input_ids_sets) == 1
+    assert len(donated_info_pass.donated_input_ids_sets[0]) == expected_donated
+
+    # Verify expected number of clones after cleanup
+    actual_clones = backend.op_count(torch.ops.aten.clone.default, before=False)
+    assert actual_clones == expected_clones, (
+        f"Expected {expected_clones} clones, got {actual_clones}:"
+        f"{backend.print_graphs()}"
+    )
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(),
+    reason="Only test on cuda and rocm platform",
+)
+def test_donated_buffer_context_propagation(default_vllm_config):
+    """Test that donated_input_ids propagates correctly through pass_context."""
+    torch.set_default_device(current_platform.device_type)
+
+    # Create a custom backend that inspects pass_context in cleanup pass
+    functionalization_pass = VllmIRInplaceFunctionalizationPass(default_vllm_config)
+    lowering_pass = VllmIRLoweringPass(default_vllm_config)
+
+    donation_info_pass = StoreDonationInfoPass()
+    cleanup_pass = UnsafeCloneEliminationPass(default_vllm_config)
+
+    backend = TestBackend(lowering_pass, donation_info_pass, cleanup_pass)
+    backend.inductor_config["pre_grad_custom_pass"] = functionalization_pass
+
+    model = MaybeInplaceModel()
+    x = torch.randn(8, 16, dtype=torch.bfloat16)
+    residual1 = torch.randn(8, 16, dtype=torch.bfloat16)
+    residual2 = torch.randn(8, 16, dtype=torch.bfloat16)
+
+    compiled_model = torch.compile(model, backend=backend, fullgraph=True)
+    compiled_model(x.clone(), residual1.clone(), residual2.clone())
+
+    donated_ids_seen = donation_info_pass.donated_input_ids_sets
+    # Verify donated_input_ids was set and propagated
+    assert len(donated_ids_seen) == 1
+    # Should have donated inputs (exact indices depend on AOTAutograd)
+    assert len(donated_ids_seen[0]) == 3
+    # All donated ids should be valid non-negative integers
+    for idx in donated_ids_seen[0]:
+        assert isinstance(idx, int) and idx >= 0, f"Invalid donated index: {idx}"
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(),
+    reason="Only test on cuda and rocm platform",
+)
+def test_maybe_inplace_reuse_error(default_vllm_config):
+    """Test that reusing a donated activation input raises ValueError."""
+    torch.set_default_device(current_platform.device_type)
+
+    class ReuseModel(nn.Module):
+        """Model that incorrectly reuses a donated activation input."""
+
+        def __init__(self, hidden_size=16):
+            super().__init__()
+            self.weight = nn.Parameter(torch.ones(hidden_size, dtype=torch.bfloat16))
+
+        def forward(self, x: torch.Tensor, residual: torch.Tensor):
+            # x is donated to maybe_inplace
+            x_normed, residual_out = ops.fused_add_rms_norm.maybe_inplace(
+                x, residual, self.weight, 1e-5
+            )
+            # ERROR: x is used again after being donated
+            return x_normed + x  # This should raise ValueError
+
+    functionalization_pass = VllmIRInplaceFunctionalizationPass(default_vllm_config)
+    lowering_pass = VllmIRLoweringPass(default_vllm_config)
+    cleanup_pass = UnsafeCloneEliminationPass(default_vllm_config)
+
+    backend = TestBackend(lowering_pass, cleanup_pass)
+    backend.inductor_config["pre_grad_custom_pass"] = functionalization_pass
+
+    model = ReuseModel()
+    x = torch.randn(8, 16, dtype=torch.bfloat16)
+    residual = torch.randn(8, 16, dtype=torch.bfloat16)
+
+    # Compilation should raise BackendCompilerFailed wrapping ValueError
+    with pytest.raises(
+        torch._dynamo.exc.BackendCompilerFailed,
+        match="is used again after the node",
+    ):
+        compiled_model = torch.compile(model, backend=backend, fullgraph=True)
+        compiled_model(x.clone(), residual.clone())
+
+
+# Piecewise compilation tests with graph splitting
+
+
+@torch.library.custom_op("vllm::test_split_marker", mutates_args=())
+def test_split_marker(x: torch.Tensor) -> torch.Tensor:
+    """Identity op that marks a split point for piecewise compilation."""
+    return x.clone()
+
+
+@test_split_marker.register_fake
+def _fake_split_marker(x: torch.Tensor) -> torch.Tensor:
+    return torch.empty_like(x)
+
+
+class TransformerBlockWithSplits(nn.Module):
+    """Transformer block with explicit split points for piecewise compilation."""
+
+    def __init__(self, hidden_size=32, intermediate_size=128):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+
+        # Attention-like projection
+        self.attn_proj = nn.Linear(
+            hidden_size, hidden_size, bias=False, dtype=torch.bfloat16
+        )
+
+        # Post-attention norm
+        self.post_attn_norm = nn.Parameter(
+            torch.ones(hidden_size, dtype=torch.bfloat16)
+        )
+
+        # MLP
+        self.gate_proj = nn.Linear(
+            hidden_size, intermediate_size, bias=False, dtype=torch.bfloat16
+        )
+        self.up_proj = nn.Linear(
+            hidden_size, intermediate_size, bias=False, dtype=torch.bfloat16
+        )
+        self.down_proj = nn.Linear(
+            intermediate_size, hidden_size, bias=False, dtype=torch.bfloat16
+        )
+
+        # Post-MLP norm
+        self.post_mlp_norm = nn.Parameter(torch.ones(hidden_size, dtype=torch.bfloat16))
+
+    def forward(self, x: torch.Tensor):
+        # Attention block with residual
+        residual1 = x
+        attn_out = self.attn_proj(x)
+
+        # Fused add + norm (maybe_inplace: residual1 is donated)
+        normed1, residual1 = ops.fused_add_rms_norm.maybe_inplace(
+            attn_out, residual1, self.post_attn_norm, 1e-5
+        )
+
+        # Force a graph split here
+        normed1 = torch.ops.vllm.test_split_marker(normed1)
+
+        # MLP block
+        gate = self.gate_proj(normed1)
+        up = self.up_proj(normed1)
+        mlp_out = self.down_proj(gate * torch.nn.functional.silu(up))
+
+        # Fused add + norm (maybe_inplace: residual1 is donated)
+        normed2, residual2 = ops.fused_add_rms_norm.maybe_inplace(
+            mlp_out, residual1, self.post_mlp_norm, 1e-5
+        )
+
+        return normed2, residual2
+
+
+def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
+    def inner(*args):
+        torch._dynamo.mark_dynamic(args[arg_index], dim_index)
+        return fn(*args)
+
+    return inner
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(),
+    reason="Only test on cuda and rocm platform",
+)
+def test_piecewise_compilation_with_donated_buffers(monkeypatch, fresh_vllm_cache):
+    """
+    Test piecewise compilation with donated buffers across graph splits.
+    Utilizes a custom splitting op. Uses fresh cache to avoid compilation caching.
+    """
+    torch.set_default_device(current_platform.device_type)
+
+    # Disable compilation cache to avoid serialization issues
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    from vllm.compilation.backends import VllmBackend
+    from vllm.config import CompilationConfig, VllmConfig
+
+    # Create config with custom splitting op
+    store_donation_info = StoreDonationInfoPass()
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            custom_ops=["all"],
+            splitting_ops=["vllm::test_split_marker"],
+            inductor_compile_config={"post_grad_custom_post_pass": store_donation_info},
+        )
+    )
+
+    backend = VllmBackend(vllm_config)
+
+    model = TransformerBlockWithSplits()
+    x = torch.randn(8, 32, dtype=torch.bfloat16)
+
+    # Reference output
+    ref_output = with_dyn_arg(model, 0, 0)(x.clone())
+
+    # Compile with piecewise compilation (graph will split at split_marker)
+    compiled_model = torch.compile(model, backend=backend, fullgraph=False)
+    output = with_dyn_arg(compiled_model, 0, 0)(x.clone())
+
+    # Verify correctness (relaxed tolerance for bfloat16)
+    torch.testing.assert_close(output[0], ref_output[0], rtol=1e-2, atol=1e-2)
+    torch.testing.assert_close(output[1], ref_output[1], rtol=1e-2, atol=1e-2)
+
+    # Verify the model was split into multiple submodules
+    assert hasattr(backend, "split_gm"), "Backend should have split graph module"
+
+    # Should have at least 2 submodules (split by test_split_marker op)
+    submodules = list(backend.split_gm.named_children())
+    num_submodules = len(submodules)
+    assert num_submodules >= 2, (
+        f"Expected at least 2 submodules (split), got {num_submodules}"
+    )
+
+    # Check that donation info was propagated correctly
+    donated_inputs_sets = store_donation_info.donated_input_ids_sets
+    assert len(donated_inputs_sets) == 2
+    assert len(donated_inputs_sets[0]) == 1
+    assert len(donated_inputs_sets[1]) == 1
diff --git a/tests/compile/passes/test_functionalization.py b/tests/compile/passes/test_functionalization.py
index 9a03a6988763..31bf225d4135 100644
--- a/tests/compile/passes/test_functionalization.py
+++ b/tests/compile/passes/test_functionalization.py
@@ -126,7 +126,7 @@ def ops_in_model(self, do_fusion):
         if TEST_FP8 and do_fusion:
             return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
         else:
-            return [torch.ops._C.fused_add_rms_norm.default]
+            return []
 
     def ops_not_in_model(self):
         return []
diff --git a/tests/compile/passes/test_fuse_act_padding.py b/tests/compile/passes/test_fuse_act_padding.py
index f3f3bda47277..bfbe762abdb6 100644
--- a/tests/compile/passes/test_fuse_act_padding.py
+++ b/tests/compile/passes/test_fuse_act_padding.py
@@ -59,7 +59,7 @@ def forward(self, x):
 
     def ops_in_model_before(self):
         return [
-            rocm_aiter_ops.get_rmsnorm_fused_add_op(),
+            torch.ops.vllm_ir.fused_add_rms_norm,
             torch.ops.aten.constant_pad_nd,
         ]
 
diff --git a/tests/compile/passes/test_fusion.py b/tests/compile/passes/test_fusion.py
index 32803aad8c1c..2feb0bc4f787 100644
--- a/tests/compile/passes/test_fusion.py
+++ b/tests/compile/passes/test_fusion.py
@@ -17,7 +17,6 @@
     FusedRMSQuantKey,
     RMSNormQuantFusionPass,
 )
-from vllm.compilation.passes.fx_utils import find_op_nodes
 from vllm.compilation.passes.utility.noop_elimination import NoOpEliminationPass
 from vllm.compilation.passes.utility.post_cleanup import PostCleanupPass
 from vllm.config import (
@@ -243,9 +242,10 @@ def ops_in_model_after(self):
         ]
 
     def ops_in_model_before_partial(self):
-        return [torch.ops.vllm_ir.rms_norm] + (
-            [RMS_ADD_OP] if self.enable_rms_norm_custom_op else [torch.ops.aten.rsqrt]
-        )
+        return [
+            torch.ops.vllm_ir.rms_norm,
+            torch.ops.vllm_ir.fused_add_rms_norm.default,
+        ]
 
 
 def _run_fusion_test(
@@ -383,17 +383,6 @@ def test_fusion_rmsnorm_quant(
             model.ops_in_model_before_partial(), fully_replaced=False
         )
 
-        # If RMSNorm custom op is disabled (native/torch impl used),
-        # there's a risk that the fused add doesn't get included in the
-        # replacement and only the rms part gets fused with quant.
-        # Hence, we check only 2 add nodes are left (final fused rmsnorm add).
-        if not enable_rms_norm_custom_op:
-            n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g))
-            # rms_norm is IR, not included
-            # 6 = 3x2 (3xRMS_ADD, 2 each)
-            assert n_add_nodes(backend.graph_pre_pass) == 6
-            assert n_add_nodes(backend.graph_post_pass) == 2
-
 
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("hidden_size", [256])
diff --git a/tests/compile/test_codegen.py b/tests/compile/test_codegen.py
new file mode 100644
index 000000000000..21db287a2478
--- /dev/null
+++ b/tests/compile/test_codegen.py
@@ -0,0 +1,376 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for vllm.compilation.codegen — execution code generation.
+
+Each test runs a real Python function through the same pipeline vLLM uses
+in production: ``make_fx`` to obtain an aten-level fx graph, ``split_graph``
+to split it into the stitching layer + submodules, and then
+``generate_execution_code``/``compile_execution_fn`` for codegen.
+"""
+
+from collections.abc import Callable
+
+import pytest
+import regex as re
+import torch
+import torch.fx as fx
+from torch.fx.experimental.proxy_tensor import make_fx
+
+from vllm.compilation.backends import split_graph
+from vllm.compilation.codegen import (
+    _node_ref,
+    compile_execution_fn,
+    generate_execution_code,
+    generate_execution_code_with_name,
+)
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+
+
+def _trace_and_split(
+    model_fn: Callable[..., torch.Tensor],
+    example_inputs: tuple[torch.Tensor, ...],
+    split_ops: list[str],
+) -> fx.GraphModule:
+    """Trace ``model_fn`` with make_fx, then split on the named aten ops."""
+    gm = make_fx(model_fn)(*example_inputs)
+    split_gm, _ = split_graph(gm, split_ops)
+    return split_gm
+
+
+def _to_copy_model(x: torch.Tensor) -> torch.Tensor:
+    """Traces to ``aten._to_copy.default`` with device + dtype kwargs."""
+    return x.to(device=torch.device("cpu"), dtype=torch.float16)
+
+
+def _empty_model(x: torch.Tensor) -> torch.Tensor:
+    """Traces to ``aten.empty.memory_format`` with device + dtype kwargs."""
+    buf = torch.empty(x.shape, device=torch.device("cpu"), dtype=torch.float16)
+    return buf.fill_(0).add(x.to(dtype=torch.float16))
+
+
+@pytest.fixture
+def x() -> torch.Tensor:
+    return torch.zeros(2, 3)
+
+
+@pytest.mark.parametrize(
+    "model_fn,split_ops",
+    [
+        (_to_copy_model, ["aten::_to_copy.default"]),
+        (_empty_model, []),
+    ],
+    ids=["aten::_to_copy.default", "aten::empty.memory_format"],
+)
+def test_non_primitive_kwargs_lifted_to_consts(
+    model_fn: Callable[[torch.Tensor], torch.Tensor],
+    split_ops: list[str],
+    x: torch.Tensor,
+) -> None:
+    """Regression: arguments whose ``repr()`` is not a valid Python
+    expression in the generated function's namespace (notably
+    ``torch.device``) used to be inlined via ``repr()``, producing source
+    like
+
+        out = torch.ops.aten._to_copy.default(x, device=device(type='cpu'))
+
+    which fails at call time — only ``torch`` and ``operator`` are imported
+    into the namespace, so ``device`` is unbound. The fix collects such
+    objects into ``__vllm_consts__`` and references them by index. The
+    unqualified ``device(type=...)`` form must never appear in the
+    generated source."""
+    split_gm = _trace_and_split(model_fn, (x,), split_ops)
+    code, submod_names, consts = generate_execution_code(split_gm)
+
+    assert "device(type=" not in code, (
+        "Generated code contains unqualified `device(type=...)` from repr(); "
+        "torch.device should be lifted into __vllm_consts__"
+    )
+    assert torch.device("cpu") in consts, "torch.device kwarg not lifted to consts"
+    assert torch.float16 in consts, "torch.dtype kwarg not lifted to consts"
+
+    fn = compile_execution_fn(code, {}, submod_names, consts)
+    out = fn(x)
+    expected = model_fn(x)
+    assert torch.equal(out, expected), "Compiled output does not match reference"
+
+
+def test_dtype_singleton_deduped(x: torch.Tensor) -> None:
+    """``torch.float16`` is a process-wide singleton, so two ops referring
+    to it in the traced graph share a single consts slot via ``id()``-based
+    dedup. Distinct expressions (``x.to(...)`` vs ``(x*2).to(...)``) ensure
+    the tracer can't CSE the two ops into a single node."""
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        return x.to(dtype=torch.float16) + (x * 2).to(dtype=torch.float16)
+
+    split_gm = _trace_and_split(model_fn, (x,), [])
+    code, submod_names, consts = generate_execution_code(split_gm)
+
+    # The traced graph must have two distinct _to_copy nodes (otherwise the
+    # dedup assertion below is trivially satisfied).
+    n_to_copy = sum(
+        1
+        for n in split_gm.graph.nodes
+        if n.op == "call_module"
+        for sn in getattr(split_gm, n.target).graph.nodes
+        if sn.op == "call_function" and "to_copy" in sn.name
+    )
+    assert n_to_copy >= 2, (
+        f"Test setup failed: expected ≥2 _to_copy nodes, got {n_to_copy}"
+    )
+
+    assert consts.count(torch.float16) == 1, (
+        f"torch.float16 should occupy exactly one slot, got consts={consts}"
+    )
+    assert code.count("__vllm_consts__[0]") >= 2, (
+        "Deduped const slot should be referenced from both _to_copy nodes"
+    )
+
+    fn = compile_execution_fn(code, {}, submod_names, consts)
+    assert torch.equal(fn(x), model_fn(x))
+
+
+def test_distinct_dtypes_get_distinct_slots(x: torch.Tensor) -> None:
+    """Distinct dtype singletons in the traced graph occupy distinct slots."""
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        return x.to(dtype=torch.float16) + x.to(dtype=torch.bfloat16)
+
+    split_gm = _trace_and_split(model_fn, (x,), [])
+    _, _, consts = generate_execution_code(split_gm)
+
+    assert torch.float16 in consts
+    assert torch.bfloat16 in consts
+    assert len(consts) == 2, f"Expected 2 distinct dtype slots, got {consts}"
+
+
+def test_consts_ordering_deterministic(x: torch.Tensor) -> None:
+    """Two independent traces of the same model must produce equal consts
+    lists *in the same order*. Cache artifacts identify const slots by
+    index, so a non-deterministic order would invalidate cached code."""
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        # Multiple distinct non-primitives encountered in a fixed graph order.
+        a = x.to(device=torch.device("cpu"), dtype=torch.float16)
+        return a.to(dtype=torch.bfloat16)
+
+    _, _, consts1 = generate_execution_code(_trace_and_split(model_fn, (x,), []))
+    _, _, consts2 = generate_execution_code(_trace_and_split(model_fn, (x,), []))
+
+    assert len(consts1) >= 2, "Test setup: model should produce ≥2 const slots"
+    assert consts1 == consts2, (
+        f"consts ordering must be reproducible across traces; "
+        f"got {consts1} vs {consts2}"
+    )
+
+
+def test_primitive_args_inlined(x: torch.Tensor) -> None:
+    """Primitive args (int dim, etc.) stay inline as repr — no consts."""
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        return torch.transpose(x, 0, 1).relu()
+
+    split_gm = _trace_and_split(model_fn, (x,), [])
+    code, submod_names, consts = generate_execution_code(split_gm)
+
+    assert consts == [], "Primitive-only graph must produce empty consts"
+
+    fn = compile_execution_fn(code, {}, submod_names, consts)
+    assert torch.equal(fn(x), model_fn(x))
+
+
+def test_consts_shared_across_split_submods(x: torch.Tensor) -> None:
+    """Dedup must apply across inlined submodules, not just within one.
+
+    The function below splits into three inlined submods, two of which
+    independently reference ``torch.float16``. The shared ``const_index``
+    threaded through recursive ``generate_execution_code_with_name`` calls
+    must collapse the dtype to a single slot used from both submods."""
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        a = x.to(dtype=torch.float16)  # submod_0: _to_copy(fp16)
+        b = a.relu()  # submod_1: relu (split point)
+        c = b.to(dtype=torch.float32)  # submod_2: _to_copy(fp32)
+        return c.to(dtype=torch.float16) + 1  # submod_2: another _to_copy(fp16)
+
+    split_gm = _trace_and_split(model_fn, (x,), ["aten::relu.default"])
+
+    n_submods = sum(1 for _ in split_gm.named_children())
+    assert n_submods >= 3, (
+        f"Test setup failed: expected ≥3 submods after split, got {n_submods}"
+    )
+
+    code, submod_names, consts = generate_execution_code(split_gm)
+
+    assert consts.count(torch.float16) == 1, (
+        f"fp16 singleton must dedup across submods, got consts={consts}"
+    )
+
+    # Find the consts index for fp16 and confirm at least two distinct
+    # inlined submods reference it. This rules out the false-positive where
+    # one submod references it twice and the other not at all.
+    fp16_idx = consts.index(torch.float16)
+    submod_bodies = re.findall(
+        r"def __vllm_inlined_submods__(\d+)\([^)]*\):\n((?:    .*\n)+)", code
+    )
+    assert len(submod_bodies) >= 2
+    referencing_submods = [
+        name for name, body in submod_bodies if f"__vllm_consts__[{fp16_idx}]" in body
+    ]
+    assert len(referencing_submods) >= 2, (
+        f"fp16 slot should be referenced from ≥2 inlined submods, "
+        f"got {referencing_submods}"
+    )
+
+    fn = compile_execution_fn(code, {}, submod_names, consts)
+    assert torch.equal(fn(x), model_fn(x))
+
+
+def test_non_graphmodule_submod_uses_indexed_callable(x: torch.Tensor) -> None:
+    """When a child of split_gm is *not* a ``torch.fx.GraphModule`` — as
+    happens in production once ``PiecewiseBackend`` replaces submods —
+    codegen emits ``__vllm_submods__[idx](...)`` instead of inlining, and
+    the runtime callable is bound from ``submod_callables``."""
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        return x.relu().sigmoid()
+
+    split_gm = _trace_and_split(model_fn, (x,), ["aten::relu.default"])
+
+    # Find a GraphModule child and wrap it in a non-GraphModule nn.Module
+    # that delegates to the original — this is the structural shape vLLM
+    # produces after PiecewiseBackend takes over a submod.
+    child_names = [name for name, _ in split_gm.named_children()]
+    target_name = child_names[0]
+
+    class NonGMWrapper(torch.nn.Module):
+        def __init__(self, gm: fx.GraphModule) -> None:
+            super().__init__()
+            self.gm = gm
+
+        def forward(self, *args, **kwargs):
+            return self.gm(*args, **kwargs)
+
+    original = getattr(split_gm, target_name)
+    del split_gm._modules[target_name]
+    split_gm.add_module(target_name, NonGMWrapper(original))
+
+    code, submod_names, consts = generate_execution_code(split_gm)
+
+    assert "__vllm_submods__[" in code, (
+        "Non-GraphModule submod should produce an indexed callable reference"
+    )
+    assert target_name in submod_names
+
+    submod_callables = {
+        name: getattr(split_gm, name)
+        for name in submod_names
+        if not isinstance(getattr(split_gm, name), fx.GraphModule)
+    }
+    fn = compile_execution_fn(code, submod_callables, submod_names, consts)
+    assert torch.equal(fn(x), model_fn(x))
+
+
+# split_graph only passes tuple_return=True to split_module on PyTorch >= 2.12,
+# so getitem nodes only appear in the stitching graph from that version onward.
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.12.0.dev"),
+    reason="split_module tuple_return requires PyTorch >= 2.12",
+)
+def test_getitem_in_stitching_graph(x: torch.Tensor) -> None:
+    """``operator.getitem`` on submod tuple returns is the ``call_function``
+    special case at codegen.py — emitted as ``name = source[index]``
+    rather than a function call."""
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        return x.relu().sigmoid()
+
+    split_gm = _trace_and_split(model_fn, (x,), ["aten::relu.default"])
+    code, _, _ = generate_execution_code(split_gm)
+
+    # split_module wraps each submod return in a tuple, so the stitching
+    # graph unpacks via getitem. The codegen must emit it as indexing.
+    assert re.search(r"\b\w+ = \w+\[\d+\]\n", code), (
+        "Stitching graph should emit `name = source[N]` for getitem nodes"
+    )
+
+
+def test_del_emitted_for_intermediate_values(x: torch.Tensor) -> None:
+    """The codegen schedules ``del`` after a value's last use to free
+    memory early. Multi-submod splits naturally have intermediates whose
+    last use is not the output node."""
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        return x.relu().sigmoid().tanh()
+
+    split_gm = _trace_and_split(
+        model_fn, (x,), ["aten::relu.default", "aten::sigmoid.default"]
+    )
+    code, _, _ = generate_execution_code(split_gm)
+
+    assert re.search(r"^    del \w+", code, re.MULTILINE), (
+        "Liveness analysis should emit `del` for intermediates with "
+        "last-use before the output"
+    )
+
+
+def test_with_submod_false_rejects_call_module() -> None:
+    """``generate_execution_code_with_name(with_submod=False)`` is the
+    recursive entry for inlining a GraphModule into its parent. It must
+    refuse a graph that itself contains ``call_module`` nodes — the parent
+    is responsible for handling those."""
+    g = fx.Graph()
+    x_node = g.placeholder("x")
+    root = torch.nn.Module()
+    root.add_module("inner", torch.nn.Identity())
+    call = g.call_module("inner", args=(x_node,))
+    g.output(call)
+    gm = fx.GraphModule(root, g)
+
+    with pytest.raises(RuntimeError, match="call_module is not allowed"):
+        generate_execution_code_with_name(gm, "f", with_submod=False)
+
+
+def test_node_ref_recurses_through_containers() -> None:
+    """``_node_ref`` is the recursive walker that lifts non-primitives
+    nested inside list/tuple/dict args. Real aten ops rarely produce such
+    structures, but the path is needed for DTensor placement lists and
+    other future cases — unit-test the walker directly."""
+    consts: list = []
+    const_index: dict[int, int] = {}
+    cpu = torch.device("cpu")
+
+    # Non-primitive in a list, primitive alongside.
+    assert _node_ref([cpu, 1], consts, const_index) == "[__vllm_consts__[0], 1]"
+    assert consts == [cpu]
+
+    # Same object in a tuple — id-based dedup reuses the existing slot.
+    assert _node_ref((cpu, 2), consts, const_index) == "(__vllm_consts__[0], 2)"
+    assert consts == [cpu]
+
+    # Single-element tuple uses the trailing-comma form.
+    assert _node_ref((cpu,), consts, const_index) == "(__vllm_consts__[0],)"
+
+    # Dict value lifts the same way.
+    ref = _node_ref({"k": cpu}, consts, const_index)
+    assert ref == "{'k': __vllm_consts__[0]}"
+
+
+def test_legacy_code_without_consts() -> None:
+    """``compile_execution_fn(consts=None)`` must still load code that has
+    no ``__vllm_consts__`` reference, so older serialized cache artifacts
+    keep working."""
+    # Pre-consts codegen: no __vllm_consts__ reference, only torch/operator.
+    legacy_code = (
+        "import torch\n"
+        "def execution_fn(x, *, __vllm_submods__):\n"
+        "    return __vllm_submods__[0](x) + 1\n"
+    )
+
+    class AddOne(torch.nn.Module):
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            return x + 1
+
+    fn = compile_execution_fn(legacy_code, {"sub": AddOne()}, ["sub"], consts=None)
+    out = fn(torch.zeros(3))
+    assert torch.equal(out, torch.full((3,), 2.0))
diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index bbb9cb1fcbcc..d822b68c5036 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -405,6 +405,9 @@ def test_should_split():
         (None, 0, 1, False, 2048, CUDAGraphMode.NONE, 0),
         # truncated to nearest multiple of 8 or 16
         (None, 257, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
+        # max_num_batched_tokens <= max_cudagraph_capture_size should always be
+        # captured even if not landing on a 16-stride step
+        (None, 2048, 1, False, 257, CUDAGraphMode.FULL_AND_PIECEWISE, 257),
         # max from list
         ([1, 2, 4, 15], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 15),
         # SP forces full-graph compilation, sizes are filtered by TP
diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py
index 373d856d1caa..e45e5cf425fe 100644
--- a/tests/compile/test_dynamic_shapes_compilation.py
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -55,12 +55,10 @@ def test_dynamic_shapes_compilation(
     evaluate_guards,
 ):
     """Test that all dynamic shapes types compile successfully"""
-    if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
-        pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")
-
     if evaluate_guards and shapes_type == DynamicShapesType.UNBACKED:
         pytest.skip("unbacked dynamic shapes do not add guards")
 
+    # TODO is this still a requirement?
     if evaluate_guards and use_aot_compile:
         pytest.skip("evaluate_guards requires use_aot_compile=0")
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 9ec31d83c757..779bd475f34b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -371,6 +371,8 @@ def __init__(
         is_cross_encoder: bool = False,
         skip_tokenizer_init: bool = False,
         auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
+        tokenizer_name: str | None = None,
+        processor: Any | None = None,
         # Set this to avoid hanging issue
         default_torch_num_threads: int | None = None,
     ) -> None:
@@ -391,6 +393,8 @@ def __init__(
                 is_cross_encoder=is_cross_encoder,
                 skip_tokenizer_init=skip_tokenizer_init,
                 auto_cls=auto_cls,
+                tokenizer_name=tokenizer_name,
+                processor=processor,
             )
 
     def _init(
@@ -405,6 +409,8 @@ def _init(
         is_cross_encoder: bool = False,
         skip_tokenizer_init: bool = False,
         auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
+        tokenizer_name: str | None = None,
+        processor: Any | None = None,
     ) -> None:
         model_name = maybe_model_redirect(model_name)
         self.model_name = model_name
@@ -484,20 +490,27 @@ def _init(
         if not skip_tokenizer_init:
             self.tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast" = (
                 AutoTokenizer.from_pretrained(
-                    model_name,
+                    tokenizer_name or model_name,
                     trust_remote_code=trust_remote_code,
                 )
             )
 
-        # don't put this import at the top level
-        # it will call torch.accelerator.device_count()
-        from transformers import AutoProcessor
+        if processor is not None:
+            self.processor = processor
+        else:
+            # don't put this import at the top level
+            # it will call torch.accelerator.device_count()
+            from transformers import AutoProcessor
 
-        self.processor = AutoProcessor.from_pretrained(
-            model_name,
-            trust_remote_code=trust_remote_code,
-        )
+            self.processor = AutoProcessor.from_pretrained(
+                model_name,
+                trust_remote_code=trust_remote_code,
+            )
         if skip_tokenizer_init:
+            if self.processor is None:
+                raise ValueError(
+                    "skip_tokenizer_init=True requires processor initialization."
+                )
             self.tokenizer = self.processor.tokenizer
 
     def get_inputs(
@@ -520,6 +533,12 @@ def get_inputs(
         all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = []
         for i, prompt in enumerate(prompts):
             if isinstance(prompt, str):
+                if self.processor is None:
+                    raise RuntimeError(
+                        "HfRunner.processor is not initialized. "
+                        "Pass processor=... to HfRunner or set "
+                        "hf_model.processor before generation."
+                    )
                 # Create a copy to avoid modifying the original dict
                 processor_kwargs = (
                     tokenization_kwargs.copy()
@@ -617,6 +636,10 @@ def generate(
                 use_cache=True,
                 **kwargs,
             )
+            if self.processor is None:
+                raise RuntimeError(
+                    "HfRunner.processor is not initialized; cannot decode output."
+                )
             output_str = self.processor.batch_decode(
                 output_ids,
                 skip_special_tokens=True,
@@ -973,6 +996,8 @@ def generate(
             req_sample_output_ids: list[list[int]] = []
             req_sample_output_strs: list[str] = []
             req_logprobs = []
+            if req_output.prompt_logprobs:
+                req_logprobs.extend(req_output.prompt_logprobs)
             for sample in req_output.outputs:
                 output_str = sample.text
                 output_ids = list(sample.token_ids)
diff --git a/tests/distributed/test_dcp_a2a.py b/tests/distributed/test_dcp_a2a.py
index 2f92413e58d9..d80ed36be650 100644
--- a/tests/distributed/test_dcp_a2a.py
+++ b/tests/distributed/test_dcp_a2a.py
@@ -10,10 +10,95 @@
 
 import math
 
+import multiprocess as mp
 import pytest
 import torch
+import torch.distributed as dist
 
 from vllm.config.parallel import ParallelConfig
+from vllm.utils.network_utils import get_open_port
+from vllm.utils.system_utils import update_environment_variables
+
+mp.set_start_method("spawn", force=True)
+
+
+class _FakeCPGroup:
+    def __init__(self, world_size: int, device_group: dist.ProcessGroup):
+        self.world_size = world_size
+        self.device_group = device_group
+
+
+def _dtype_from_name(dtype_name: str) -> torch.dtype:
+    return {
+        "float16": torch.float16,
+        "bfloat16": torch.bfloat16,
+        "float32": torch.float32,
+    }[dtype_name]
+
+
+def _packed_a2a_reference(
+    cp_attn_out: torch.Tensor,
+    cp_attn_lse: torch.Tensor,
+    world_size: int,
+    h_per_rank: int,
+    is_lse_base_on_e: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    from vllm.v1.attention.ops.dcp_alltoall import _lse_weighted_combine
+
+    B, _H, D = cp_attn_out.shape
+    outputs = (
+        cp_attn_out.view(B, world_size, h_per_rank, D)
+        .permute(1, 0, 2, 3)
+        .contiguous()
+        .float()
+    )
+    lses = cp_attn_lse.view(B, world_size, h_per_rank).permute(1, 0, 2).contiguous()
+    return _lse_weighted_combine(
+        outputs,
+        lses,
+        return_lse=True,
+        is_lse_base_on_e=is_lse_base_on_e,
+    )
+
+
+def _assert_packed_a2a_close(
+    actual: torch.Tensor,
+    expected: torch.Tensor,
+    dtype: torch.dtype,
+) -> None:
+    if dtype == torch.float32:
+        torch.testing.assert_close(actual, expected, rtol=1e-5, atol=1e-5)
+    else:
+        torch.testing.assert_close(
+            actual.float(), expected.float(), rtol=3e-2, atol=3e-2
+        )
+
+
+def _distributed_run(fn, world_size: int, extra_env: dict[str, str]) -> None:
+    port = str(get_open_port())
+    processes: list[mp.Process] = []
+    for rank in range(world_size):
+        env = {
+            "RANK": str(rank),
+            "LOCAL_RANK": str(rank),
+            "WORLD_SIZE": str(world_size),
+            "LOCAL_WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": port,
+            **extra_env,
+        }
+        process = mp.Process(target=fn, args=(env,))
+        processes.append(process)
+        process.start()
+
+    for process in processes:
+        process.join(timeout=120)
+
+    for process in processes:
+        if process.is_alive():
+            process.kill()
+            process.join()
+        assert process.exitcode == 0
 
 
 class TestDCPCommBackendConfig:
@@ -38,14 +123,14 @@ def test_a2a_with_dcp_valid(self):
         """A2A backend is valid when DCP > 1."""
         config = ParallelConfig(
             dcp_comm_backend="a2a",
-            tensor_parallel_size=8,
+            tensor_parallel_size=4,
             decode_context_parallel_size=4,
         )
         assert config.dcp_comm_backend == "a2a"
 
     def test_invalid_backend_rejected(self):
         """Invalid backend values are rejected."""
-        with pytest.raises(ValueError, match="must be one of"):
+        with pytest.raises(ValueError, match="must be one of|Input should be"):
             ParallelConfig(
                 dcp_comm_backend="invalid",
             )
@@ -134,7 +219,7 @@ def test_dominant_rank(self):
         result = _lse_weighted_combine(outputs, lses)
 
         assert result.shape == (B, H, D)
-        torch.testing.assert_close(result, outputs[1].squeeze(0), atol=1e-5, rtol=1e-5)
+        torch.testing.assert_close(result, outputs[1], atol=1e-5, rtol=1e-5)
 
     def test_mathematically_correct(self):
         """Verify mathematical correctness of LSE combination."""
@@ -187,6 +272,224 @@ def test_return_lse(self):
         assert global_lse.shape == (B, H)
         assert abs(global_lse.item() - expected_global_lse) < 1e-5
 
+    def test_base2_return_lse(self):
+        """Base-2 LSE mode returns log2-sum-exp2 global LSE."""
+        from vllm.v1.attention.ops.dcp_alltoall import _lse_weighted_combine
+
+        outputs = torch.tensor(
+            [
+                [[[1.0, 2.0]]],
+                [[[3.0, 4.0]]],
+            ]
+        )
+        lses = torch.tensor(
+            [
+                [[1.0]],
+                [[2.0]],
+            ]
+        )
+
+        result, global_lse = _lse_weighted_combine(
+            outputs,
+            lses,
+            return_lse=True,
+            is_lse_base_on_e=False,
+        )
+
+        expected_global_lse = math.log2(2**1 + 2**2)
+        w0 = 2**1 / (2**1 + 2**2)
+        w1 = 2**2 / (2**1 + 2**2)
+        expected = torch.tensor([[[w0 * 1.0 + w1 * 3.0, w0 * 2.0 + w1 * 4.0]]])
+
+        torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-5)
+        torch.testing.assert_close(
+            global_lse,
+            torch.tensor([[expected_global_lse]]),
+            rtol=1e-5,
+            atol=1e-5,
+        )
+
+    def test_lse_pack_dim(self):
+        """Packed A2A stores one fp32 LSE in output-dtype lanes."""
+        from vllm.v1.attention.ops.dcp_alltoall import _dcp_a2a_lse_pack_dim
+
+        assert _dcp_a2a_lse_pack_dim(torch.bfloat16) == 2
+        assert _dcp_a2a_lse_pack_dim(torch.float16) == 2
+        assert _dcp_a2a_lse_pack_dim(torch.float32) == 1
+
+
+class TestPackedA2AKernels:
+    @pytest.mark.skipif(
+        torch.accelerator.device_count() < 1, reason="CUDA is required."
+    )
+    @pytest.mark.parametrize("dtype_name", ["float16", "bfloat16", "float32"])
+    @pytest.mark.parametrize("return_lse", [False, True])
+    @pytest.mark.parametrize("is_lse_base_on_e", [False, True])
+    def test_pack_unpack_combine_matches_reference(
+        self,
+        dtype_name: str,
+        return_lse: bool,
+        is_lse_base_on_e: bool,
+    ):
+        from vllm.v1.attention.ops.dcp_alltoall import (
+            _dcp_a2a_lse_pack_dim,
+            _dcp_a2a_pack_send,
+            _dcp_a2a_unpack_combine,
+        )
+
+        torch.manual_seed(0)
+        dtype = _dtype_from_name(dtype_name)
+        device = torch.device("cuda")
+        world_size, B, h_per_rank, D = 4, 7, 2, 32
+        H = world_size * h_per_rank
+        cp_attn_out = torch.randn(B, H, D, device=device, dtype=dtype)
+        cp_attn_lse = torch.randn(B, H, device=device, dtype=torch.float32)
+        lse_pack_dim = _dcp_a2a_lse_pack_dim(dtype)
+        send_buffer = torch.empty(
+            (world_size, B, h_per_rank, D + lse_pack_dim),
+            device=device,
+            dtype=dtype,
+        )
+
+        _dcp_a2a_pack_send(
+            cp_attn_out,
+            cp_attn_lse,
+            send_buffer,
+            world_size,
+            h_per_rank,
+            D,
+            lse_pack_dim,
+        )
+        actual = _dcp_a2a_unpack_combine(
+            send_buffer, D, lse_pack_dim, return_lse, is_lse_base_on_e
+        )
+        expected_out, expected_lse = _packed_a2a_reference(
+            cp_attn_out, cp_attn_lse, world_size, h_per_rank, is_lse_base_on_e
+        )
+
+        if return_lse:
+            actual_out, actual_lse = actual
+            _assert_packed_a2a_close(actual_out, expected_out, dtype)
+            torch.testing.assert_close(actual_lse, expected_lse, rtol=1e-4, atol=1e-4)
+        else:
+            _assert_packed_a2a_close(actual, expected_out, dtype)
+
+
+def _distributed_packed_a2a_worker(env: dict[str, str]) -> None:
+    update_environment_variables(env)
+    local_rank = int(env["LOCAL_RANK"])
+    torch.accelerator.set_device_index(local_rank)
+    dist.init_process_group(backend="nccl")
+    use_workspace = env.get("USE_WORKSPACE") == "1"
+    if use_workspace:
+        from vllm.v1.worker.workspace import init_workspace_manager
+
+        init_workspace_manager(torch.device(f"cuda:{local_rank}"))
+    try:
+        from vllm.v1.attention.ops.dcp_alltoall import dcp_a2a_lse_reduce
+
+        dtype = _dtype_from_name(env["TEST_DTYPE"])
+        return_lse = env["RETURN_LSE"] == "1"
+        is_lse_base_on_e = env["LSE_BASE_E"] == "1"
+        rank = dist.get_rank()
+        world_size = dist.get_world_size()
+        B, h_per_rank, D = 5, 2, 32
+        H = world_size * h_per_rank
+
+        generator = torch.Generator(device=f"cuda:{local_rank}")
+        generator.manual_seed(1234 + rank)
+        cp_attn_out = torch.randn(
+            B,
+            H,
+            D,
+            device=f"cuda:{local_rank}",
+            dtype=dtype,
+            generator=generator,
+        )
+        cp_attn_lse = torch.randn(
+            B,
+            H,
+            device=f"cuda:{local_rank}",
+            dtype=torch.float32,
+            generator=generator,
+        )
+        actual = dcp_a2a_lse_reduce(
+            cp_attn_out,
+            cp_attn_lse,
+            _FakeCPGroup(world_size, dist.group.WORLD),
+            return_lse=return_lse,
+            is_lse_base_on_e=is_lse_base_on_e,
+        )
+
+        gathered_out = [torch.empty_like(cp_attn_out) for _ in range(world_size)]
+        gathered_lse = [torch.empty_like(cp_attn_lse) for _ in range(world_size)]
+        dist.all_gather(gathered_out, cp_attn_out)
+        dist.all_gather(gathered_lse, cp_attn_lse)
+        outputs = torch.stack(
+            [
+                t[:, rank * h_per_rank : (rank + 1) * h_per_rank, :]
+                for t in gathered_out
+            ],
+            dim=0,
+        ).float()
+        lses = torch.stack(
+            [t[:, rank * h_per_rank : (rank + 1) * h_per_rank] for t in gathered_lse],
+            dim=0,
+        )
+        from vllm.v1.attention.ops.dcp_alltoall import _lse_weighted_combine
+
+        expected_out, expected_lse = _lse_weighted_combine(
+            outputs,
+            lses,
+            return_lse=True,
+            is_lse_base_on_e=is_lse_base_on_e,
+        )
+
+        if return_lse:
+            actual_out, actual_lse = actual
+            _assert_packed_a2a_close(actual_out, expected_out, dtype)
+            torch.testing.assert_close(actual_lse, expected_lse, rtol=1e-4, atol=1e-4)
+        else:
+            _assert_packed_a2a_close(actual, expected_out, dtype)
+    finally:
+        if use_workspace:
+            from vllm.v1.worker.workspace import reset_workspace_manager
+
+            reset_workspace_manager()
+        dist.destroy_process_group()
+
+
+@pytest.mark.skipif(
+    torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs."
+)
+@pytest.mark.parametrize("dtype_name", ["float16", "bfloat16", "float32"])
+def test_distributed_packed_a2a_matches_reference(dtype_name: str):
+    _distributed_run(
+        _distributed_packed_a2a_worker,
+        world_size=4,
+        extra_env={
+            "TEST_DTYPE": dtype_name,
+            "RETURN_LSE": "1",
+            "LSE_BASE_E": "1",
+        },
+    )
+
+
+@pytest.mark.skipif(
+    torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs."
+)
+def test_distributed_packed_a2a_with_workspace_matches_reference():
+    _distributed_run(
+        _distributed_packed_a2a_worker,
+        world_size=4,
+        extra_env={
+            "TEST_DTYPE": "bfloat16",
+            "RETURN_LSE": "1",
+            "LSE_BASE_E": "1",
+            "USE_WORKSPACE": "1",
+        },
+    )
+
 
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])
diff --git a/tests/distributed/test_mnnvl_alltoall.py b/tests/distributed/test_mnnvl_alltoall.py
new file mode 100644
index 000000000000..f395c96a3d3a
--- /dev/null
+++ b/tests/distributed/test_mnnvl_alltoall.py
@@ -0,0 +1,774 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for MNNVL AllToAll operations.
+
+Requires: docker run ... --cap-add=SYS_PTRACE ...
+Run: pytest tests/distributed/test_mnnvl_alltoall.py -v
+"""
+
+import os
+import traceback
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+
+from vllm.distributed import get_ep_group
+from vllm.utils.flashinfer import (
+    has_flashinfer_nvlink_one_sided,
+    has_flashinfer_nvlink_two_sided,
+)
+from vllm.utils.network_utils import get_open_port
+
+from ..utils import init_test_distributed_environment
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _has_sys_ptrace() -> bool:
+    """Check for SYS_PTRACE capability (bit 19 in CapEff)."""
+    try:
+        with open("/proc/self/status") as f:
+            for line in f:
+                if line.startswith("CapEff:"):
+                    return bool(int(line.split()[1], 16) & (1 << 19))
+    except Exception:
+        pass
+    return False
+
+
+def _spawn_workers(worker_fn, world_size, *, dp_size=None):
+    """Spawn one process per GPU, run worker_fn, assert all succeed.
+
+    Uses an mp.Queue to propagate worker tracebacks back to the parent
+    so pytest shows the actual failure, not just an exit code.
+    """
+    if mp.get_start_method(allow_none=True) is None:
+        mp.set_start_method("spawn")
+
+    port = str(get_open_port())
+    # Allocate a second port for DP master when dp_size is set, so the
+    # distributed init port and DP port can't collide even under xdist.
+    dp_port = str(get_open_port()) if dp_size is not None else None
+    err_queue: mp.Queue = mp.Queue()
+    procs = []
+    for rank in range(world_size):
+        p = mp.Process(
+            target=_run_worker,
+            args=(rank, world_size, port, worker_fn, dp_size, dp_port, err_queue),
+        )
+        p.start()
+        procs.append(p)
+    for p in procs:
+        p.join()
+
+    # Collect any errors from workers before asserting.
+    errors = []
+    while not err_queue.empty():
+        errors.append(err_queue.get_nowait())
+    err_queue.close()
+    err_queue.join_thread()
+    if errors:
+        pytest.fail("Worker(s) failed:\n" + "\n---\n".join(errors))
+
+
+def _run_worker(rank, world_size, port, worker_fn, dp_size, dp_port, err_queue):
+    """Per-process setup: device, distributed env, then call worker_fn.
+
+    Args:
+        dp_size: If set, initialize with tp=1 and data_parallel_size=dp_size.
+                 Otherwise use tp=world_size (default for EP-based tests).
+        dp_port: Separate port for the DP master (only used when dp_size is set).
+        err_queue: Queue for propagating tracebacks to the parent process.
+    """
+    try:
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+        torch.accelerator.set_device_index(rank)
+        if dp_size is not None:
+            _init_dp_environment(world_size, rank, port, dp_size, dp_port)
+        else:
+            init_test_distributed_environment(world_size, 1, rank, port)
+        worker_fn(rank, world_size)
+        torch.distributed.barrier()
+    except Exception:
+        err_queue.put(f"[Rank {rank}]\n{traceback.format_exc()}")
+        # Don't re-raise: the parent reads errors from err_queue.
+        # A non-zero exit from the re-raise would be redundant.
+        import sys
+
+        sys.exit(1)
+
+
+def _init_dp_environment(world_size, rank, port, dp_size, dp_port):
+    """Initialize distributed env with data parallelism.
+
+    Sets up tp=1, pp=1, dp=dp_size. Each process is one DP rank
+    with local rank 0 within its (trivial) tp*pp group.
+
+    Args:
+        port: Port for torch.distributed init.
+        dp_port: Separate port for the DP master group init.
+    """
+    from vllm.config import VllmConfig, set_current_vllm_config
+    from vllm.config.parallel import ParallelConfig
+    from vllm.distributed.parallel_state import (
+        ensure_model_parallel_initialized,
+        init_distributed_environment,
+    )
+
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config = ParallelConfig(
+        data_parallel_size=dp_size,
+        data_parallel_rank=rank,
+        # Pre-populate port list so __post_init__ doesn't auto-generate
+        # random ports. All DP ranks must agree on the same port.
+        _data_parallel_master_port_list=[int(dp_port)],
+    )
+    with set_current_vllm_config(vllm_config):
+        # rank=0 here because each DP rank has a single (tp=1,pp=1) process,
+        # so the local rank within the tp*pp group is always 0.
+        # init_distributed_environment will offset by data_parallel_rank.
+        init_distributed_environment(
+            world_size=1,  # tp * pp = 1
+            rank=0,
+            distributed_init_method=f"tcp://localhost:{port}",
+            local_rank=rank,
+        )
+        ensure_model_parallel_initialized(1, 1)
+
+
+def _make_forward_context(rank, world_size, num_tokens_per_rank):
+    """Create a forward context with mock DP metadata for AgRs tests.
+
+    Returns a context manager suitable for ``with`` statements.
+    The real DPMetadata (with sp_local_sizes etc.) is created internally
+    by set_forward_context from num_tokens_across_dp; the attn_metadata
+    placeholder just satisfies the "attn_metadata is not None" guard.
+    """
+    from vllm.config.parallel import ParallelConfig
+    from vllm.config.vllm import VllmConfig
+    from vllm.forward_context import set_forward_context
+
+    class _AttnMeta:
+        """Minimal placeholder so set_forward_context's
+        ``attn_metadata is not None`` guard (forward_context.py:334)
+        is satisfied. The real DPMetadata is built from num_tokens_across_dp."""
+
+        dp_metadata = None
+
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config = ParallelConfig(
+        data_parallel_size=world_size,
+        is_moe_model=True,
+        data_parallel_rank=rank,
+    )
+    return set_forward_context(
+        _AttnMeta(),
+        vllm_config,
+        num_tokens=num_tokens_per_rank,
+        num_tokens_across_dp=torch.tensor(
+            [num_tokens_per_rank] * world_size, dtype=torch.int
+        ),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Skip conditions
+# ---------------------------------------------------------------------------
+
+requires_multi_gpu = pytest.mark.skipif(
+    torch.accelerator.device_count() < 2, reason="Need >= 2 GPUs"
+)
+requires_two_sided = pytest.mark.skipif(
+    not has_flashinfer_nvlink_two_sided(),
+    reason="FlashInfer NVLink two-sided not available",
+)
+requires_one_sided = pytest.mark.skipif(
+    not has_flashinfer_nvlink_one_sided(),
+    reason="FlashInfer NVLink one-sided not available",
+)
+requires_ptrace = pytest.mark.skipif(
+    not _has_sys_ptrace(),
+    reason="SYS_PTRACE required (docker run --cap-add=SYS_PTRACE)",
+)
+
+# NOTE: No module-level pytestmark here. The FlashInfer lifecycle tests have
+# their own @requires_two_sided / @requires_one_sided decorators, and
+# test_args_dispatch_combine uses only standard torch.distributed ops and
+# should run even when FlashInfer NVLink backends are not installed.
+
+
+# ---------------------------------------------------------------------------
+# Test 1: Two-sided manager lifecycle (init, cleanup, reinit, ensure_init)
+# ---------------------------------------------------------------------------
+#
+# Tests FlashInferNVLinkTwoSidedManager which wraps FlashInfer's MnnvlMoe.
+# initialize() allocates MNNVL shared workspaces via MnnvlMoe.get_moe_workspaces,
+# which uses pidfd_getfd() to share memory file descriptors across processes —
+# hence the SYS_PTRACE requirement.
+#
+# Uses EP group (get_ep_group) because the two-sided manager is constructed
+# with an EP-scoped communicator in production. With tp=world_size the EP
+# group spans all ranks, giving us a multi-rank group for testing.
+# ---------------------------------------------------------------------------
+
+
+def _two_sided_lifecycle_worker(rank, world_size):
+    from vllm.distributed.device_communicators.all2all import (
+        FlashInferNVLinkTwoSidedManager,
+    )
+
+    cpu_group = get_ep_group().cpu_group
+    num_gpus = torch.accelerator.device_count()
+    manager = FlashInferNVLinkTwoSidedManager(cpu_group)
+
+    # Not initialized yet
+    assert not manager.initialized
+    assert manager.rank == rank
+    assert manager.world_size == world_size
+
+    # Initialize
+    manager.initialize(world_size=world_size, rank=rank, gpus_per_node=num_gpus)
+    assert manager.initialized
+    assert manager.workspace_tensor is not None
+    assert manager.prepare_workspace_tensor is not None
+    assert manager.mapping is not None
+
+    torch.distributed.barrier()
+
+    # Cleanup
+    manager.cleanup()
+    assert not manager.initialized
+    assert manager.workspace_tensor is None
+    assert manager.prepare_workspace_tensor is None
+
+    torch.distributed.barrier()
+
+    # Reinitialize
+    manager.initialize(world_size=world_size, rank=rank, gpus_per_node=num_gpus)
+    assert manager.initialized
+
+    torch.distributed.barrier()
+
+    # ensure_alltoall_workspace_initialized is idempotent when already init'd
+    assert manager.ensure_alltoall_workspace_initialized()
+    assert manager.initialized
+
+    manager.cleanup()
+    assert not manager.initialized
+
+
+@requires_multi_gpu
+@requires_two_sided
+@requires_ptrace
+@pytest.mark.parametrize("world_size", [2])
+def test_two_sided_manager_lifecycle(world_size):
+    """Test init, cleanup, reinit, and ensure_initialized idempotency."""
+    _spawn_workers(_two_sided_lifecycle_worker, world_size)
+
+
+# ---------------------------------------------------------------------------
+# Test 2: One-sided manager lifecycle (init, cleanup, reinit)
+# ---------------------------------------------------------------------------
+#
+# Tests FlashInferNVLinkOneSidedManager which wraps FlashInfer's MoeAlltoAll.
+# initialize() creates MoeAlltoAll with an MnnvlConfig, which allocates MNNVL
+# shared workspaces — same cross-process memory sharing as two-sided, hence
+# the SYS_PTRACE requirement.
+#
+# Uses DP group (get_dp_group) because the one-sided manager's initialize()
+# internally calls get_dp_group() to set up the MnnvlConfig communicator.
+# We therefore need a real DP group with world_size > 1, which requires
+# dp_size=world_size via _init_dp_environment.
+# ---------------------------------------------------------------------------
+
+
+def _one_sided_lifecycle_worker(rank, world_size):
+    from vllm.distributed.device_communicators.all2all import (
+        FlashInferNVLinkOneSidedManager,
+    )
+    from vllm.distributed.parallel_state import get_dp_group
+
+    cpu_group = get_dp_group().cpu_group
+    manager = FlashInferNVLinkOneSidedManager(cpu_group)
+
+    assert not manager.initialized
+    assert manager.rank == rank
+    assert manager.world_size == world_size
+
+    init_kwargs = dict(
+        max_num_tokens=1024,
+        top_k=2,
+        num_experts=world_size * 8,
+        hidden_size=4096,
+    )
+
+    # Initialize
+    manager.initialize(**init_kwargs)
+    assert manager.initialized
+    assert manager.moe_alltoall is not None
+    assert manager.mapping is not None
+
+    torch.distributed.barrier()
+
+    # Cleanup
+    manager.cleanup()
+    assert not manager.initialized
+    assert manager.moe_alltoall is None
+
+    torch.distributed.barrier()
+
+    # Reinitialize with different token count
+    manager.initialize(**{**init_kwargs, "max_num_tokens": 2048})
+    assert manager.initialized
+
+    torch.distributed.barrier()
+    manager.cleanup()
+
+
+@requires_multi_gpu
+@requires_one_sided
+@requires_ptrace
+@pytest.mark.parametrize("world_size", [2])
+def test_one_sided_manager_lifecycle(world_size):
+    """Test init, cleanup, and reinit with different params."""
+    _spawn_workers(
+        _one_sided_lifecycle_worker,
+        world_size,
+        dp_size=world_size,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Test 3: AgRs dispatch/combine with value validation
+# ---------------------------------------------------------------------------
+#
+# Tests AgRsAll2AllManager which uses only standard torch.distributed
+# all_gatherv / reduce_scatterv — no FlashInfer or MNNVL dependency.
+# This test validates the reference all-to-all implementation that other
+# backends are compared against.
+# ---------------------------------------------------------------------------
+
+
+def _args_dispatch_combine_worker(rank, world_size):
+    from vllm.distributed.device_communicators.all2all import AgRsAll2AllManager
+    from vllm.forward_context import get_forward_context
+
+    cpu_group = get_ep_group().cpu_group
+    device = torch.device(f"cuda:{rank}")
+
+    hidden_size = 64
+    tokens_per_rank = 16
+    experts_per_token = 2
+    num_experts = world_size * 4
+    total_tokens = world_size * tokens_per_rank
+
+    # Deterministic per-rank data: rank r has value (r + 1)
+    hidden = torch.full(
+        (tokens_per_rank, hidden_size),
+        float(rank + 1),
+        device=device,
+        dtype=torch.float32,
+    )
+    router = torch.full(
+        (tokens_per_rank, num_experts),
+        float(rank + 1) * 10,
+        device=device,
+        dtype=torch.float32,
+    )
+    weights = torch.full(
+        (tokens_per_rank, experts_per_token),
+        float(rank + 1) * 100,
+        device=device,
+        dtype=torch.float32,
+    )
+    ids = torch.full(
+        (tokens_per_rank, experts_per_token),
+        rank,
+        device=device,
+        dtype=torch.long,
+    )
+
+    with _make_forward_context(rank, world_size, tokens_per_rank):
+        manager = AgRsAll2AllManager(cpu_group)
+        dp_metadata = get_forward_context().dp_metadata
+
+        with dp_metadata.sp_local_sizes(sequence_parallel_size=1):
+            # -- dispatch_router_logits --
+            d_hidden, d_router = manager.dispatch_router_logits(
+                hidden.clone(),
+                router.clone(),
+                is_sequence_parallel=True,
+            )
+            assert d_hidden.shape == (total_tokens, hidden_size)
+            assert d_router.shape == (total_tokens, num_experts)
+
+            for r in range(world_size):
+                s = r * tokens_per_rank
+                e = (r + 1) * tokens_per_rank
+                torch.testing.assert_close(
+                    d_hidden[s:e],
+                    torch.full_like(d_hidden[s:e], float(r + 1)),
+                )
+                torch.testing.assert_close(
+                    d_router[s:e],
+                    torch.full_like(d_router[s:e], float(r + 1) * 10),
+                )
+
+            # -- dispatch --
+            d_hidden2, d_weights, d_ids = manager.dispatch(
+                hidden.clone(),
+                weights.clone(),
+                ids.clone(),
+                is_sequence_parallel=True,
+            )
+            assert d_hidden2.shape == (total_tokens, hidden_size)
+            assert d_weights.shape == (total_tokens, experts_per_token)
+            assert d_ids.shape == (total_tokens, experts_per_token)
+
+            for r in range(world_size):
+                s = r * tokens_per_rank
+                e = (r + 1) * tokens_per_rank
+                torch.testing.assert_close(
+                    d_weights[s:e],
+                    torch.full_like(d_weights[s:e], float(r + 1) * 100),
+                )
+                assert (d_ids[s:e] == r).all()
+
+            # -- combine (reduce-scatter) --
+            # Each token i has value i in all columns; after reduce-scatter
+            # each rank gets its slice, summed across ranks.
+            expert_out = (
+                torch.arange(total_tokens, device=device, dtype=torch.float32)
+                .unsqueeze(1)
+                .expand(total_tokens, hidden_size)
+                .contiguous()
+            )
+
+            combined = manager.combine(expert_out, is_sequence_parallel=True)
+            assert combined.shape == (tokens_per_rank, hidden_size)
+
+            for i in range(tokens_per_rank):
+                expected_val = float(rank * tokens_per_rank + i) * world_size
+                torch.testing.assert_close(
+                    combined[i],
+                    torch.full_like(combined[i], expected_val),
+                )
+
+            torch.distributed.barrier()
+
+
+@requires_multi_gpu
+@pytest.mark.parametrize("world_size", [2])
+def test_args_dispatch_combine(world_size):
+    """Validate dispatch gathers all-rank data and combine reduces correctly."""
+    _spawn_workers(_args_dispatch_combine_worker, world_size)
+
+
+# ---------------------------------------------------------------------------
+# Test 4: FlashInfer two-sided dispatch/combine data communication
+# ---------------------------------------------------------------------------
+#
+# Tests actual data flow through the FlashInfer NVLink two-sided backend
+# by calling flashinfer_alltoall_dispatch (with defer_input_quant=True to
+# skip quantization) and flashinfer_alltoall_combine, then verifying exact
+# round-trip values. Dispatch sends each token once per distinct expert
+# rank, and combine performs an unweighted sum, so:
+#   dispatch(hidden) → identity → combine = hidden * num_distinct_ranks(i)
+# ---------------------------------------------------------------------------
+
+
+def _two_sided_data_worker(rank, world_size):
+    from vllm.distributed.device_communicators.all2all import (
+        FlashInferNVLinkTwoSidedManager,
+    )
+    from vllm.distributed.parallel_state import get_dp_group
+    from vllm.forward_context import get_forward_context
+    from vllm.model_executor.layers.fused_moe.config import (
+        FusedMoEQuantConfig,
+        FusedMoEQuantDesc,
+    )
+    from vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_two_sided import (  # noqa: E501
+        flashinfer_alltoall_combine,
+        flashinfer_alltoall_dispatch,
+    )
+
+    # Use DP group because MnnvlMoe workspace allocation calls get_dp_group()
+    # internally and requires dp_size == ep_size.
+    cpu_group = get_dp_group().cpu_group
+    device = torch.device(f"cuda:{rank}")
+    num_gpus = torch.accelerator.device_count()
+
+    hidden_size = 128
+    tokens_per_rank = 32
+    experts_per_token = 2
+    num_experts = world_size * 4
+
+    # Initialize the FlashInfer two-sided manager
+    manager = FlashInferNVLinkTwoSidedManager(cpu_group)
+    manager.initialize(world_size=world_size, rank=rank, gpus_per_node=num_gpus)
+    assert manager.initialized
+
+    torch.distributed.barrier()
+
+    # Create deterministic per-rank test data
+    torch.manual_seed(rank + 42)
+    hidden = torch.randn(
+        tokens_per_rank,
+        hidden_size,
+        device=device,
+        dtype=torch.bfloat16,
+    )
+    # Assign each token to experts spread across ranks so tokens move between GPUs
+    topk_ids = torch.randint(
+        0,
+        num_experts,
+        (tokens_per_rank, experts_per_token),
+        device=device,
+        dtype=torch.int32,
+    )
+    topk_weights = torch.rand(
+        tokens_per_rank,
+        experts_per_token,
+        device=device,
+        dtype=torch.float32,
+    )
+
+    # Unquantized config: quant_dtype=None means moe_kernel_quantize_input is a no-op
+    no_quant = FusedMoEQuantDesc()
+    quant_config = FusedMoEQuantConfig(
+        _a1=no_quant,
+        _a2=no_quant,
+        _w1=no_quant,
+        _w2=no_quant,
+    )
+    assert quant_config.quant_dtype is None  # sanity: no quantization
+
+    with _make_forward_context(rank, world_size, tokens_per_rank):
+        dp_metadata = get_forward_context().dp_metadata
+
+        with dp_metadata.sp_local_sizes(sequence_parallel_size=1):
+            local_sizes = dp_metadata.get_chunk_sizes_across_dp_rank()
+
+            # --- FlashInfer two-sided dispatch ---
+            alltoall_info, fi_topk_ids, fi_topk_weights, fi_hidden, fi_scale = (
+                flashinfer_alltoall_dispatch(
+                    manager,
+                    local_sizes,
+                    hidden.clone(),
+                    None,  # no global scale
+                    topk_ids.clone(),
+                    topk_weights.clone(),
+                    experts_per_token,
+                    num_experts,
+                    quant_config,
+                    defer_input_quant=True,
+                )
+            )
+            assert fi_scale is None  # deferred quant: no scale produced
+            assert fi_hidden is not None
+            assert fi_hidden.shape[1] == hidden_size
+            assert fi_hidden.numel() > 0
+
+            # --- Round-trip exact verification ---
+            # The all-to-all sends each token once per *distinct* expert
+            # rank. Combine performs an unweighted sum of the per-rank
+            # contributions. With identity expert (feeding dispatched
+            # hidden straight back):
+            #   result[i] = hidden[i] * num_distinct_expert_ranks(i)
+            combined = flashinfer_alltoall_combine(
+                manager,
+                fi_hidden,
+                top_k=experts_per_token,
+                token_count=tokens_per_rank,
+                alltoall_info=alltoall_info,
+            )
+            assert combined.shape == (tokens_per_rank, hidden_size)
+
+            experts_per_rank = num_experts // world_size
+            expert_ranks = topk_ids // experts_per_rank  # (tokens, top_k)
+            num_distinct = torch.tensor(
+                [len(set(row.tolist())) for row in expert_ranks],
+                device=device,
+                dtype=torch.float32,
+            ).unsqueeze(1)  # (tokens, 1)
+            expected = (hidden.float() * num_distinct).to(hidden.dtype)
+            torch.testing.assert_close(combined, expected)
+
+            # --- Linearity check with scaled expert output ---
+            # Scaling the expert output by a constant should scale the
+            # combined result by the same constant.
+            scale = 3.0
+            combined_scaled = flashinfer_alltoall_combine(
+                manager,
+                fi_hidden * scale,
+                top_k=experts_per_token,
+                token_count=tokens_per_rank,
+                alltoall_info=alltoall_info,
+            )
+            expected_scaled = (hidden.float() * num_distinct * scale).to(hidden.dtype)
+            torch.testing.assert_close(combined_scaled, expected_scaled)
+
+            torch.distributed.barrier()
+
+    manager.cleanup()
+
+
+@requires_multi_gpu
+@requires_two_sided
+@requires_ptrace
+@pytest.mark.parametrize("world_size", [2])
+def test_two_sided_dispatch_combine(world_size):
+    """Test FlashInfer two-sided dispatch/combine with exact value verification."""
+    _spawn_workers(_two_sided_data_worker, world_size, dp_size=world_size)
+
+
+# ---------------------------------------------------------------------------
+# Test 5: FlashInfer one-sided dispatch/combine data communication
+# ---------------------------------------------------------------------------
+#
+# Tests actual data flow through the FlashInfer NVLink one-sided backend
+# by calling MoeAlltoAll.dispatch() and MoeAlltoAll.combine() directly
+# with synthetic payloads, then verifying shapes and round-trip consistency.
+# ---------------------------------------------------------------------------
+
+
+def _one_sided_data_worker(rank, world_size):
+    from vllm.distributed.device_communicators.all2all import (
+        FlashInferNVLinkOneSidedManager,
+    )
+    from vllm.distributed.parallel_state import get_dp_group
+    from vllm.forward_context import get_forward_context
+
+    cpu_group = get_dp_group().cpu_group
+    device = torch.device(f"cuda:{rank}")
+
+    hidden_size = 256
+    tokens_per_rank = 32
+    experts_per_token = 2
+    num_experts = world_size * 8
+
+    # Initialize the one-sided manager
+    manager = FlashInferNVLinkOneSidedManager(cpu_group)
+    manager.initialize(
+        max_num_tokens=tokens_per_rank,
+        top_k=experts_per_token,
+        num_experts=num_experts,
+        hidden_size=hidden_size,
+    )
+    assert manager.initialized
+    assert manager.moe_alltoall is not None
+
+    with _make_forward_context(rank, world_size, tokens_per_rank):
+        dp_metadata = get_forward_context().dp_metadata
+
+        with dp_metadata.sp_local_sizes(sequence_parallel_size=1):
+            local_sizes = dp_metadata.get_chunk_sizes_across_dp_rank()
+            runtime_max_tokens = max(local_sizes)
+
+            # Create test data with raw tensors matching the nvfp4 payload
+            # sizes the workspace was allocated for:
+            #   a1q: (tokens, hidden_size // 2) — nvfp4 hidden states
+            #   a1q_scale: (tokens, hidden_size // 16) — fp8 scaling factors
+            torch.manual_seed(rank + 42)
+            a1q = torch.randint(
+                0,
+                256,
+                (tokens_per_rank, hidden_size // 2),
+                device=device,
+                dtype=torch.uint8,
+            )
+            a1q_scale = torch.randint(
+                0,
+                256,
+                (tokens_per_rank, hidden_size // 16),
+                device=device,
+                dtype=torch.uint8,
+            )
+            topk_ids = torch.randint(
+                0,
+                num_experts,
+                (tokens_per_rank, experts_per_token),
+                device=device,
+                dtype=torch.int32,
+            )
+            topk_weights = torch.rand(
+                tokens_per_rank,
+                experts_per_token,
+                device=device,
+                dtype=torch.float32,
+            )
+
+            # --- One-sided dispatch ---
+            payloads = [a1q, a1q_scale, topk_ids, topk_weights]
+            recv_payloads = manager.moe_alltoall.dispatch(
+                token_selected_experts=topk_ids,
+                input_payloads=payloads,
+                runtime_max_tokens_per_rank=runtime_max_tokens,
+            )
+            assert len(recv_payloads) == 4
+            recv_a1q, recv_scale, recv_ids, recv_weights = recv_payloads
+            assert recv_a1q.numel() > 0
+            assert recv_ids.numel() > 0
+
+            # --- Round-trip exact verification ---
+            # The dispatch routes each token once per *distinct* expert
+            # rank. Combine performs an unweighted sum of per-rank
+            # contributions. With constant expert output (all 1s):
+            #   result[i] = 1.0 * num_distinct_expert_ranks(i)
+            expert_output = torch.ones(
+                world_size,
+                runtime_max_tokens,
+                hidden_size,
+                device=device,
+                dtype=torch.bfloat16,
+            )
+            combined = manager.moe_alltoall.combine(
+                payload=expert_output,
+                runtime_max_tokens_per_rank=runtime_max_tokens,
+            )
+            assert combined.shape == (tokens_per_rank, hidden_size)
+
+            experts_per_rank = num_experts // world_size
+            expert_ranks = topk_ids // experts_per_rank  # (tokens, top_k)
+            num_distinct = torch.tensor(
+                [len(set(row.tolist())) for row in expert_ranks],
+                device=device,
+                dtype=torch.bfloat16,
+            ).unsqueeze(1)  # (tokens, 1)
+            expected = num_distinct.expand_as(combined)
+            torch.testing.assert_close(combined, expected)
+
+            # --- Linearity check with scaled expert output ---
+            # Scaling the expert output by a constant should scale the
+            # combined result by the same constant.
+            # Re-dispatch to reset internal state (one-sided requires a
+            # fresh dispatch before each combine).
+            manager.moe_alltoall.dispatch(
+                token_selected_experts=topk_ids,
+                input_payloads=payloads,
+                runtime_max_tokens_per_rank=runtime_max_tokens,
+            )
+            scale = 3.0
+            combined_scaled = manager.moe_alltoall.combine(
+                payload=expert_output * scale,
+                runtime_max_tokens_per_rank=runtime_max_tokens,
+            )
+            expected_scaled = (expected * scale).to(torch.bfloat16)
+            torch.testing.assert_close(combined_scaled, expected_scaled)
+
+            torch.distributed.barrier()
+
+    manager.cleanup()
+
+
+@requires_multi_gpu
+@requires_one_sided
+@requires_ptrace
+@pytest.mark.parametrize("world_size", [2])
+def test_one_sided_dispatch_combine(world_size):
+    """Test FlashInfer one-sided dispatch/combine with actual data flow."""
+    _spawn_workers(_one_sided_data_worker, world_size, dp_size=world_size)
diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py
index f56d037fa547..e72f00bc91e0 100644
--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# unit test for `examples/offline_inference/torchrun_example.py`
+# unit test for `examples/features/torchrun/torchrun_example_offline.py`
 import os
 import random
 
diff --git a/tests/distributed/test_torchrun_example_moe.py b/tests/distributed/test_torchrun_example_moe.py
index 8c1d00561b16..969b5e92e3fc 100644
--- a/tests/distributed/test_torchrun_example_moe.py
+++ b/tests/distributed/test_torchrun_example_moe.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# unit test for `examples/offline_inference/torchrun_example.py`
+# unit test for `examples/features/torchrun/torchrun_example_offline.py`
 import os
 import random
 
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index bf3b400d9d7e..1ab4949c4003 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -333,8 +333,6 @@ def test_attention_config():
             "true",
             "--attention-config.flash_attn_max_num_splits_for_cuda_graph",
             "16",
-            "--attention-config.use_cudnn_prefill",
-            "true",
             "--attention-config.use_trtllm_ragged_deepseek_prefill",
             "true",
             "--attention-config.use_trtllm_attention",
@@ -352,7 +350,6 @@ def test_attention_config():
     assert engine_args.attention_config.flash_attn_version == 3
     assert engine_args.attention_config.use_prefill_decode_attention is True
     assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 16
-    assert engine_args.attention_config.use_cudnn_prefill is True
     assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is True
     assert engine_args.attention_config.use_trtllm_attention is True
     assert engine_args.attention_config.disable_flashinfer_prefill is True
diff --git a/tests/entrypoints/llm/test_mm_processor_kwargs.py b/tests/entrypoints/llm/test_mm_processor_kwargs.py
index 19cf91230ca6..1b0092df011d 100644
--- a/tests/entrypoints/llm/test_mm_processor_kwargs.py
+++ b/tests/entrypoints/llm/test_mm_processor_kwargs.py
@@ -11,7 +11,9 @@
 
 def _make_mock_llm() -> LLM:
     llm = object.__new__(LLM)
-    llm.model_config = SimpleNamespace(runner_type="generate")
+    llm.model_config = SimpleNamespace(
+        runner_type="generate", enable_prompt_embeds=False
+    )
     return llm
 
 
diff --git a/tests/entrypoints/openai/chat_completion/test_chat.py b/tests/entrypoints/openai/chat_completion/test_chat.py
index 212839f78d5c..6703095aec4a 100644
--- a/tests/entrypoints/openai/chat_completion/test_chat.py
+++ b/tests/entrypoints/openai/chat_completion/test_chat.py
@@ -845,9 +845,10 @@ async def test_chat_completion_n_parameter_non_streaming(
     chat_completion = await client.chat.completions.create(
         model=model_name,
         messages=messages,
-        max_completion_tokens=20,
-        temperature=0.7,
+        max_completion_tokens=50,
+        temperature=1.0,
         n=3,
+        seed=42,
         stream=False,
     )
 
@@ -859,7 +860,6 @@ async def test_chat_completion_n_parameter_non_streaming(
         assert choice.message.content is not None
         assert len(choice.message.content) > 0
 
-    # Verify all responses are different (highly likely with temperature > 0)
     contents = [choice.message.content for choice in chat_completion.choices]
     assert len(set(contents)) > 1, "Expected different responses with n=3"
 
@@ -1002,6 +1002,31 @@ def test_chat_completion_request_n_parameter_default():
     assert sampling_params.n == 1, f"Expected n=1 (default), got n={sampling_params.n}"
 
 
+def test_chat_completion_request_accepts_model_specific_reasoning_effort():
+    request = ChatCompletionRequest(
+        model="test-model",
+        messages=[{"role": "user", "content": "Hello"}],
+        reasoning_effort="max",
+    )
+
+    chat_params = request.build_chat_params(
+        default_template=None,
+        default_template_content_format="auto",
+    )
+
+    assert request.reasoning_effort == "max"
+    assert chat_params.chat_template_kwargs["reasoning_effort"] == "max"
+
+
+def test_chat_completion_request_rejects_unknown_reasoning_effort():
+    with pytest.raises(ValueError, match="Input should be"):
+        ChatCompletionRequest(
+            model="test-model",
+            messages=[{"role": "user", "content": "Hello"}],
+            reasoning_effort="extra_high",
+        )
+
+
 def test_chat_completion_request_n_parameter_various_values():
     """Test n parameter with various values."""
     for n_value in [1, 2, 5, 10]:
diff --git a/tests/entrypoints/openai/chat_completion/test_chat_completion_with_mixed_audio_embeds.py b/tests/entrypoints/openai/chat_completion/test_chat_completion_with_mixed_audio_embeds.py
new file mode 100644
index 000000000000..d005edc950cc
--- /dev/null
+++ b/tests/entrypoints/openai/chat_completion/test_chat_completion_with_mixed_audio_embeds.py
@@ -0,0 +1,190 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""E2E test for mixing `prompt_embeds` with `audio_embeds` in a single
+Chat Completions request."""
+
+import json
+
+import openai
+import pytest
+import pytest_asyncio
+import safetensors
+import torch
+import torch.nn as nn
+from huggingface_hub import hf_hub_download
+from transformers import AutoConfig, AutoTokenizer
+
+from tests.utils import RemoteOpenAIServer
+from vllm.utils.serial_utils import tensor2base64
+
+QWEN2AUDIO_MODEL = "Qwen/Qwen2-Audio-7B-Instruct"
+
+# Use the model's native dtype to avoid an implicit cast inside
+# `safe_load_prompt_embeds` (mismatched floating-point dtypes are cast to the
+# model's dtype automatically, matching here just skips the conversion).
+QWEN2AUDIO_DTYPE = torch.bfloat16
+
+
+@pytest.fixture(scope="module")
+def qwen2audio_server_args() -> list[str]:
+    return [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "4",
+        "--enforce-eager",
+        "--trust-remote-code",
+        "--gpu-memory-utilization",
+        "0.85",
+        "--limit-mm-per-prompt",
+        json.dumps({"audio": 1}),
+        "--enable-prompt-embeds",
+        "--enable-mm-embeds",
+    ]
+
+
+@pytest.fixture(scope="module")
+def qwen2audio_server(qwen2audio_server_args):
+    with RemoteOpenAIServer(
+        QWEN2AUDIO_MODEL,
+        qwen2audio_server_args,
+        max_wait_seconds=600,
+    ) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def qwen2audio_client(qwen2audio_server):
+    async with qwen2audio_server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.fixture(scope="module")
+def qwen2audio_hidden_size() -> int:
+    config = AutoConfig.from_pretrained(QWEN2AUDIO_MODEL, trust_remote_code=True)
+    return config.text_config.hidden_size
+
+
+@pytest.fixture(scope="module")
+def qwen2audio_prompt_embeds_b64(qwen2audio_hidden_size: int) -> str:
+    tensor = torch.randn(4, qwen2audio_hidden_size, dtype=QWEN2AUDIO_DTYPE)
+    return tensor2base64(tensor)
+
+
+@pytest.fixture(scope="module")
+def qwen2audio_audio_embeds_b64(qwen2audio_hidden_size: int) -> str:
+    # Shape matches the `audio_embeds` unit-test fixture.
+    torch.manual_seed(0)
+    tensor = torch.randn(1, 128, qwen2audio_hidden_size, dtype=QWEN2AUDIO_DTYPE)
+    return tensor2base64(tensor)
+
+
+@pytest.mark.asyncio
+async def test_prompt_embeds_plus_audio_embeds(
+    qwen2audio_client: openai.AsyncOpenAI,
+    qwen2audio_prompt_embeds_b64: str,
+    qwen2audio_audio_embeds_b64: str,
+):
+    """Single user message carrying both prompt_embeds and audio_embeds parts."""
+    chat = await qwen2audio_client.chat.completions.create(
+        model=QWEN2AUDIO_MODEL,
+        max_tokens=5,
+        temperature=0.0,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "prompt_embeds",
+                        "data": qwen2audio_prompt_embeds_b64,
+                    },
+                    {
+                        "type": "audio_embeds",
+                        "audio_embeds": qwen2audio_audio_embeds_b64,
+                    },
+                    {"type": "text", "text": "Continue."},
+                ],
+            }
+        ],
+    )
+    assert chat.choices[0].message.content is not None
+    assert len(chat.choices[0].message.content) > 0
+
+
+@pytest.fixture(scope="module")
+def qwen2audio_aligned_content_and_embeds_b64() -> tuple[str, str]:
+    """Return `(content, base64_embeds)` where the embeddings are the model's
+    embedding of `content` tokenized WITHOUT special tokens.
+
+    Loads only the `embed_tokens` shard from disk on CPU (~1.1 GB of host
+    RAM) instead of the full 7B model on GPU.
+    """
+    content = "Describe this audio."
+    tokenizer = AutoTokenizer.from_pretrained(QWEN2AUDIO_MODEL, trust_remote_code=True)
+
+    index_path = hf_hub_download(QWEN2AUDIO_MODEL, "model.safetensors.index.json")
+    with open(index_path) as f:
+        weight_map = json.load(f)["weight_map"]
+    embed_key = next(k for k in weight_map if k.endswith("embed_tokens.weight"))
+    shard_path = hf_hub_download(QWEN2AUDIO_MODEL, weight_map[embed_key])
+    with safetensors.safe_open(shard_path, framework="pt", device="cpu") as f:
+        embed_weight = f.get_tensor(embed_key)
+    embed_layer = nn.Embedding.from_pretrained(embed_weight.to(QWEN2AUDIO_DTYPE))
+
+    ids = tokenizer(content, add_special_tokens=False, return_tensors="pt").input_ids
+    embeds = embed_layer(ids).squeeze(0)
+    return content, tensor2base64(embeds)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "audio_first",
+    [True, False],
+    ids=["audio_embeds-then-text", "text-then-audio_embeds"],
+)
+async def test_text_content_and_prompt_embeds_match_with_audio_embeds(
+    qwen2audio_client: openai.AsyncOpenAI,
+    qwen2audio_audio_embeds_b64: str,
+    qwen2audio_aligned_content_and_embeds_b64: tuple[str, str],
+    audio_first: bool,
+):
+    """Same content as text vs `prompt_embeds` should yield identical Chat
+    Completions output when mixed with `audio_embeds` in the same message.
+    """
+    content, encoded_text_embeds = qwen2audio_aligned_content_and_embeds_b64
+
+    audio_part = {
+        "type": "audio_embeds",
+        "audio_embeds": qwen2audio_audio_embeds_b64,
+    }
+    text_part = {"type": "text", "text": content}
+    embeds_part = {"type": "prompt_embeds", "data": encoded_text_embeds}
+
+    if audio_first:
+        text_content = [audio_part, text_part]
+        embeds_content = [audio_part, embeds_part]
+    else:
+        text_content = [text_part, audio_part]
+        embeds_content = [embeds_part, audio_part]
+
+    text_resp = await qwen2audio_client.chat.completions.create(
+        model=QWEN2AUDIO_MODEL,
+        max_tokens=10,
+        temperature=0.0,
+        messages=[{"role": "user", "content": text_content}],
+    )
+    embeds_resp = await qwen2audio_client.chat.completions.create(
+        model=QWEN2AUDIO_MODEL,
+        max_tokens=10,
+        temperature=0.0,
+        messages=[{"role": "user", "content": embeds_content}],
+    )
+
+    text_out = text_resp.choices[0].message.content
+    embeds_out = embeds_resp.choices[0].message.content
+    assert text_out is not None and len(text_out) > 0
+    assert embeds_out is not None and len(embeds_out) > 0
+    assert text_out == embeds_out
diff --git a/tests/entrypoints/openai/chat_completion/test_chat_completion_with_mixed_image_embeds.py b/tests/entrypoints/openai/chat_completion/test_chat_completion_with_mixed_image_embeds.py
new file mode 100644
index 000000000000..dbbed3c47127
--- /dev/null
+++ b/tests/entrypoints/openai/chat_completion/test_chat_completion_with_mixed_image_embeds.py
@@ -0,0 +1,212 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""E2E tests for mixing `prompt_embeds` with image content parts in a single
+Chat Completions request.
+"""
+
+import json
+
+import openai
+import pytest
+import pytest_asyncio
+import safetensors
+import torch
+import torch.nn as nn
+from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer
+
+from tests.utils import RemoteOpenAIServer
+from vllm.assets.image import ImageAsset
+from vllm.multimodal.utils import encode_image_url
+from vllm.utils.serial_utils import tensor2base64
+
+MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
+
+# Use the model's native dtype to skip the implicit cast inside
+# `safe_load_prompt_embeds` (mismatched floating-point dtypes are cast to the
+# model's dtype automatically).
+MODEL_DTYPE = torch.bfloat16
+
+
+@pytest.fixture(scope="module")
+def server_args() -> list[str]:
+    return [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "4",
+        "--enforce-eager",
+        "--gpu-memory-utilization",
+        "0.4",
+        "--limit-mm-per-prompt",
+        json.dumps({"image": 1}),
+        "--enable-prompt-embeds",
+        "--enable-mm-embeds",
+    ]
+
+
+@pytest.fixture(scope="module")
+def server(server_args):
+    with RemoteOpenAIServer(
+        MODEL_NAME,
+        server_args,
+        max_wait_seconds=600,
+    ) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.fixture(scope="module")
+def image_url() -> str:
+    """Stable real image as a data URL, kept identical across both the
+    text and prompt_embeds requests so any output difference must come from
+    how the text content is delivered."""
+    return encode_image_url(ImageAsset("stop_sign").pil_image)
+
+
+@pytest.fixture(scope="module")
+def aligned_content_and_embeds_b64() -> tuple[str, str]:
+    """`(content, base64_embeds)` where the embeddings are the model's
+    embedding of `content` tokenized WITHOUT special tokens.
+
+    Loads only the `embed_tokens` shard from disk on CPU instead of the full
+    model on GPU, so the fixture has zero VRAM footprint and won't contend
+    with the running vLLM server.
+    """
+    content = "Describe this image."
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+
+    index_path = hf_hub_download(MODEL_NAME, "model.safetensors.index.json")
+    with open(index_path) as f:
+        weight_map = json.load(f)["weight_map"]
+    embed_key = next(k for k in weight_map if k.endswith("embed_tokens.weight"))
+    shard_path = hf_hub_download(MODEL_NAME, weight_map[embed_key])
+    with safetensors.safe_open(shard_path, framework="pt", device="cpu") as f:
+        embed_weight = f.get_tensor(embed_key)
+    embed_layer = nn.Embedding.from_pretrained(embed_weight.to(MODEL_DTYPE))
+
+    ids = tokenizer(content, add_special_tokens=False, return_tensors="pt").input_ids
+    embeds = embed_layer(ids).squeeze(0)
+    return content, tensor2base64(embeds)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "image_first",
+    [True, False],
+    ids=["image_url-then-text", "text-then-image_url"],
+)
+async def test_text_content_and_prompt_embeds_match_with_image_url(
+    client: openai.AsyncOpenAI,
+    image_url: str,
+    aligned_content_and_embeds_b64: tuple[str, str],
+    image_first: bool,
+):
+    """Same content as text vs `prompt_embeds` should yield identical Chat
+    Completions output when mixed with an `image_url` part in the same
+    message under greedy decoding.
+    """
+    content, encoded_text_embeds = aligned_content_and_embeds_b64
+
+    image_part = {"type": "image_url", "image_url": {"url": image_url}}
+    text_part = {"type": "text", "text": content}
+    embeds_part = {"type": "prompt_embeds", "data": encoded_text_embeds}
+
+    if image_first:
+        text_content = [image_part, text_part]
+        embeds_content = [image_part, embeds_part]
+    else:
+        text_content = [text_part, image_part]
+        embeds_content = [embeds_part, image_part]
+
+    text_resp = await client.chat.completions.create(
+        model=MODEL_NAME,
+        max_tokens=10,
+        temperature=0.0,
+        messages=[{"role": "user", "content": text_content}],
+    )
+    embeds_resp = await client.chat.completions.create(
+        model=MODEL_NAME,
+        max_tokens=10,
+        temperature=0.0,
+        messages=[{"role": "user", "content": embeds_content}],
+    )
+
+    text_out = text_resp.choices[0].message.content
+    embeds_out = embeds_resp.choices[0].message.content
+    assert text_out is not None and len(text_out) > 0
+    assert embeds_out is not None and len(embeds_out) > 0
+    assert text_out == embeds_out
+
+
+@pytest.fixture(scope="module")
+def image_embeds_b64() -> dict[str, str]:
+    """Synthetic but stable `image_embeds` for Qwen2-VL."""
+    grid = (1, 4, 4)
+    spatial_merge_size = 2
+    num_patches = (grid[1] // spatial_merge_size) * (grid[2] // spatial_merge_size)
+    text_hidden_size = 1536  # Qwen2-VL-2B
+    torch.manual_seed(0)
+    return {
+        "image_embeds": tensor2base64(
+            torch.randn(num_patches, text_hidden_size, dtype=MODEL_DTYPE)
+        ),
+        "image_grid_thw": tensor2base64(torch.tensor(grid)),
+    }
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "image_first",
+    [True, False],
+    ids=["image_embeds-then-text", "text-then-image_embeds"],
+)
+async def test_text_content_and_prompt_embeds_match_with_image_embeds(
+    client: openai.AsyncOpenAI,
+    image_embeds_b64: dict[str, str],
+    aligned_content_and_embeds_b64: tuple[str, str],
+    image_first: bool,
+):
+    """Same content as text vs `prompt_embeds` should yield identical Chat
+    Completions output when mixed with a precomputed `image_embeds` part in
+    the same message under greedy decoding.
+    """
+    content, encoded_text_embeds = aligned_content_and_embeds_b64
+
+    image_part = {"type": "image_embeds", "image_embeds": image_embeds_b64}
+    text_part = {"type": "text", "text": content}
+    embeds_part = {"type": "prompt_embeds", "data": encoded_text_embeds}
+
+    if image_first:
+        text_content = [image_part, text_part]
+        embeds_content = [image_part, embeds_part]
+    else:
+        text_content = [text_part, image_part]
+        embeds_content = [embeds_part, image_part]
+
+    text_resp = await client.chat.completions.create(
+        model=MODEL_NAME,
+        max_tokens=10,
+        temperature=0.0,
+        messages=[{"role": "user", "content": text_content}],
+    )
+    embeds_resp = await client.chat.completions.create(
+        model=MODEL_NAME,
+        max_tokens=10,
+        temperature=0.0,
+        messages=[{"role": "user", "content": embeds_content}],
+    )
+
+    text_out = text_resp.choices[0].message.content
+    embeds_out = embeds_resp.choices[0].message.content
+    assert text_out is not None and len(text_out) > 0
+    assert embeds_out is not None and len(embeds_out) > 0
+    assert text_out == embeds_out
diff --git a/tests/entrypoints/openai/chat_completion/test_chat_completion_with_prompt_embeds.py b/tests/entrypoints/openai/chat_completion/test_chat_completion_with_prompt_embeds.py
new file mode 100644
index 000000000000..1813d74798de
--- /dev/null
+++ b/tests/entrypoints/openai/chat_completion/test_chat_completion_with_prompt_embeds.py
@@ -0,0 +1,293 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""E2E tests for `prompt_embeds` content parts in the Chat Completions API."""
+
+import asyncio
+import io
+
+import openai
+import pybase64 as base64
+import pytest
+import pytest_asyncio
+import torch
+from openai import BadRequestError
+
+from tests.utils import VLLM_PATH, RemoteOpenAIServer
+
+MODEL_NAME = "facebook/opt-125m"
+CHAT_TEMPLATE = VLLM_PATH / "examples/template_chatml.jinja"
+# Matches `--dtype` in `server_args` to avoid an implicit cast in
+# `safe_load_prompt_embeds` (mismatched floating-point dtypes are cast to the
+# model's dtype automatically, we match here just to skip the conversion).
+SERVER_DTYPE: torch.dtype = torch.bfloat16
+
+
+@pytest.fixture(scope="module")
+def server_args() -> list[str]:
+    return [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+        "--chat-template",
+        str(CHAT_TEMPLATE),
+        # Prompt Embeds server args
+        "--enable-prompt-embeds",
+    ]
+
+
+@pytest.fixture(scope="module")
+def server(server_args):
+    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+def _encode_embeds(embeds: torch.Tensor) -> str:
+    buf = io.BytesIO()
+    torch.save(embeds, buf)
+    return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+
+@pytest.fixture(scope="module")
+def prompt_embeds_b64(hf_runner) -> list[str]:
+    """Pre-compute embeddings for two short prompts and return as base64."""
+    prompts = ["Hello, my name is", "What is an LLM?"]
+    with hf_runner(MODEL_NAME) as hf_model:
+        embeddings = hf_model.get_prompt_embeddings(prompts)
+    # Cast to the server's dtype so `safe_load_prompt_embeds` doesn't need to
+    # convert on its own, the function accepts any floating-point dtype and
+    # will cast to the model's dtype, but matching up front skips the work.
+    return [_encode_embeds(e.to(SERVER_DTYPE)) for e in embeddings]
+
+
+@pytest.mark.asyncio
+async def test_single_prompt_embeds_part(
+    client: openai.AsyncOpenAI,
+    prompt_embeds_b64: list[str],
+):
+    """A user message with one prompt_embeds part + text."""
+    b64 = prompt_embeds_b64[0]
+    chat = await client.chat.completions.create(
+        model=MODEL_NAME,
+        max_tokens=5,
+        temperature=0.0,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "prompt_embeds", "data": b64},
+                    {"type": "text", "text": "Continue:"},
+                ],
+            }
+        ],
+    )
+    assert chat.choices[0].message.content is not None
+    assert len(chat.choices[0].message.content) > 0
+
+
+@pytest.mark.asyncio
+async def test_multiple_prompt_embeds_parts(
+    client: openai.AsyncOpenAI,
+    prompt_embeds_b64: list[str],
+):
+    """Multiple prompt_embeds parts in a single message."""
+    b64_a, b64_b = prompt_embeds_b64
+    chat = await client.chat.completions.create(
+        model=MODEL_NAME,
+        max_tokens=5,
+        temperature=0.0,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "prompt_embeds", "data": b64_a},
+                    {"type": "text", "text": " and "},
+                    {"type": "prompt_embeds", "data": b64_b},
+                ],
+            }
+        ],
+    )
+    assert chat.choices[0].message.content is not None
+    assert len(chat.choices[0].message.content) > 0
+
+
+@pytest.mark.asyncio
+async def test_multi_message_conversation(
+    client: openai.AsyncOpenAI,
+    prompt_embeds_b64: list[str],
+):
+    """prompt_embeds in both system and user messages."""
+    b64_sys, b64_usr = prompt_embeds_b64
+    chat = await client.chat.completions.create(
+        model=MODEL_NAME,
+        max_tokens=5,
+        temperature=0.0,
+        messages=[
+            {
+                "role": "system",
+                "content": [
+                    {"type": "text", "text": "You are helpful."},
+                    {"type": "prompt_embeds", "data": b64_sys},
+                ],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "prompt_embeds", "data": b64_usr},
+                    {"type": "text", "text": "Summarize."},
+                ],
+            },
+        ],
+    )
+    assert chat.choices[0].message.content is not None
+    assert len(chat.choices[0].message.content) > 0
+
+
+@pytest.mark.asyncio
+async def test_streaming(
+    client: openai.AsyncOpenAI,
+    prompt_embeds_b64: list[str],
+):
+    """Streaming chat completion with prompt_embeds."""
+    b64 = prompt_embeds_b64[0]
+
+    # Non-streaming baseline.
+    baseline = await client.chat.completions.create(
+        model=MODEL_NAME,
+        max_tokens=5,
+        temperature=0.0,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "prompt_embeds", "data": b64},
+                    {"type": "text", "text": "Continue:"},
+                ],
+            }
+        ],
+    )
+    expected = baseline.choices[0].message.content
+
+    # Streaming.
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "prompt_embeds", "data": b64},
+                    {"type": "text", "text": "Continue:"},
+                ],
+            }
+        ],
+    )
+    chunks: list[str] = []
+    async for chunk in stream:
+        delta = chunk.choices[0].delta.content
+        if delta:
+            chunks.append(delta)
+    assert "".join(chunks) == expected
+
+
+@pytest.fixture(scope="module")
+def aligned_content_and_embeds_b64(hf_runner) -> tuple[str, str]:
+    """Return `(content, base64_embeds)` where the embeddings are the model's
+    embedding of `content` tokenized WITHOUT special tokens.
+    """
+    content = "Hello, my name is"
+    with hf_runner(MODEL_NAME) as hf_model:
+        ids = hf_model.tokenizer(
+            content, add_special_tokens=False, return_tensors="pt"
+        ).input_ids
+        ids = hf_model.wrap_device({"input_ids": ids})["input_ids"]
+        embed_layer = hf_model.model.get_input_embeddings()
+        embeds = embed_layer(ids).squeeze(0).to(SERVER_DTYPE).cpu()
+    return content, _encode_embeds(embeds)
+
+
+@pytest.mark.asyncio
+async def test_text_content_and_prompt_embeds_match(
+    client: openai.AsyncOpenAI,
+    aligned_content_and_embeds_b64: tuple[str, str],
+):
+    """Equal content in text and `prompt_embeds` should yield identical
+    Chat Completions output under greedy decoding.
+    """
+    content, encoded_embeds = aligned_content_and_embeds_b64
+
+    text_resp, embeds_resp = await asyncio.gather(
+        client.chat.completions.create(
+            model=MODEL_NAME,
+            max_tokens=10,
+            temperature=0.0,
+            messages=[{"role": "user", "content": content}],
+        ),
+        client.chat.completions.create(
+            model=MODEL_NAME,
+            max_tokens=10,
+            temperature=0.0,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [{"type": "prompt_embeds", "data": encoded_embeds}],
+                }
+            ],
+        ),
+    )
+
+    text_out = text_resp.choices[0].message.content
+    embeds_out = embeds_resp.choices[0].message.content
+    assert text_out is not None and len(text_out) > 0
+    assert embeds_out is not None and len(embeds_out) > 0
+    assert text_out == embeds_out
+
+
+@pytest.mark.asyncio
+async def test_missing_data_field(
+    client: openai.AsyncOpenAI,
+):
+    """A prompt_embeds part without `data` should return a clear error."""
+    with pytest.raises(BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            max_tokens=5,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [{"type": "prompt_embeds"}],
+                }
+            ],
+        )
+
+
+@pytest.mark.asyncio
+async def test_invalid_base64(
+    client: openai.AsyncOpenAI,
+):
+    """Invalid base64 in the `data` field should return a clear error."""
+    with pytest.raises(BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            max_tokens=5,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "prompt_embeds", "data": "not_valid_base64!!"},
+                    ],
+                }
+            ],
+        )
diff --git a/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py b/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py
index 965b21351302..839793fde856 100644
--- a/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py
@@ -518,7 +518,13 @@ async def test_inconsistent_tool_choice_and_tools(
 
 
 @pytest.mark.asyncio
-async def test_max_tokens_with_tool_choice_required(client: openai.AsyncOpenAI):
+@pytest.mark.parametrize(
+    "tool_choice",
+    ["required", {"type": "function", "function": {"name": "get_current_weather"}}],
+)
+async def test_max_tokens_with_tool_choice_required(
+    client: openai.AsyncOpenAI, tool_choice
+):
     """ """
     models = await client.models.list()
     model_name: str = models.data[0].id
@@ -530,7 +536,7 @@ async def test_max_tokens_with_tool_choice_required(client: openai.AsyncOpenAI):
         max_completion_tokens=1,
         model=model_name,
         tools=tools,
-        tool_choice="required",
+        tool_choice=tool_choice,
     )
     # When `tool_choice="required"` and the tokens of `tools` exceed `max_tokens`,
     # both `tool_calls` and `content` should be empty.
@@ -538,4 +544,3 @@ async def test_max_tokens_with_tool_choice_required(client: openai.AsyncOpenAI):
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
     assert len(choice.message.tool_calls) == 0
-    assert choice.message.content == ""
diff --git a/tests/entrypoints/openai/chat_completion/test_serving_chat.py b/tests/entrypoints/openai/chat_completion/test_serving_chat.py
index 39d59d28f854..c44e07a4c10d 100644
--- a/tests/entrypoints/openai/chat_completion/test_serving_chat.py
+++ b/tests/entrypoints/openai/chat_completion/test_serving_chat.py
@@ -538,6 +538,7 @@ class MockModelConfig:
     is_encoder_decoder: bool = False
     is_multimodal_model: bool = False
     renderer_num_workers: int = 1
+    enable_prompt_embeds: bool = False
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
@@ -806,6 +807,57 @@ async def test_serving_chat_should_set_correct_max_tokens():
     assert mock_engine.generate.call_args.args[1].max_tokens == 5
 
 
+@pytest.mark.asyncio
+async def test_serving_chat_truncate_prompt_tokens_max_token_accounting():
+    """When truncate_prompt_tokens is set, max_tokens must be calculated using
+    the truncated prompt length, not the original prompt length.
+
+    Regression: without the fix, get_max_tokens received the untruncated prompt
+    length, causing the output budget to be underestimated.
+    """
+    mock_engine = MagicMock(spec=AsyncLLM)
+    mock_engine.errored = False
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.input_processor = MagicMock()
+    mock_engine.renderer = _build_renderer(mock_engine.model_config)
+
+    serving_chat = _build_serving_chat(mock_engine)
+
+    # "what is 1+1?" tokenizes to 7 tokens with the test chat template
+    # (max_model_len=100 -> max_tokens = 93 without truncation, confirmed by
+    # test_serving_chat_should_set_correct_max_tokens above).
+    messages = [{"role": "user", "content": "what is 1+1?"}]
+
+    # Baseline: no truncation -> max_tokens = 100 - 7 = 93.
+    req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
+    with suppress(Exception):
+        await serving_chat.create_chat_completion(req)
+    assert mock_engine.generate.call_args.args[1].max_tokens == 93
+
+    # With truncate_prompt_tokens=5 (less than 7): the effective prompt length
+    # is 5, so max_tokens should be 100 - 5 = 95, not 93.
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=messages,
+        truncate_prompt_tokens=5,
+    )
+    with suppress(Exception):
+        await serving_chat.create_chat_completion(req)
+    assert mock_engine.generate.call_args.args[1].max_tokens == 95
+
+    # With truncate_prompt_tokens=-1 (meaning use full max_model_len as the
+    # truncation limit, i.e., no practical truncation vs the window): effective
+    # length = min(7, 100) = 7 -> max_tokens = 93 again.
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=messages,
+        truncate_prompt_tokens=-1,
+    )
+    with suppress(Exception):
+        await serving_chat.create_chat_completion(req)
+    assert mock_engine.generate.call_args.args[1].max_tokens == 93
+
+
 @pytest.mark.asyncio
 async def test_serving_chat_mistral_token_ids_prompt_is_validated():
     """Regression test: when the Mistral tokenizer path returns token IDs
diff --git a/tests/entrypoints/openai/chat_completion/test_thinking_token_budget.py b/tests/entrypoints/openai/chat_completion/test_thinking_token_budget.py
index d2db50082a55..ae2b597e13ac 100644
--- a/tests/entrypoints/openai/chat_completion/test_thinking_token_budget.py
+++ b/tests/entrypoints/openai/chat_completion/test_thinking_token_budget.py
@@ -1,18 +1,62 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-"""E2E tests for thinking_token_budget with reasoning models."""
+"""E2E tests for ``thinking_token_budget`` with reasoning models.
+
+Covers Qwen3-0.6B and Qwen3.5 FP8 + MTP.
+"""
+
+import asyncio
+import json
+from typing import Literal
 
 import openai
 import pytest
 import pytest_asyncio
 
-from tests.utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer, multi_gpu_only, requires_fp8
+from vllm.platforms import current_platform
+from vllm.tokenizers import get_tokenizer
 
 MODEL_NAME = "Qwen/Qwen3-0.6B"
+QWEN35_FP8_MTP_MODEL = "Qwen/Qwen3.5-35B-A3B-FP8"
 MESSAGES = [{"role": "user", "content": "What is 1+1? Be concise."}]
 THINK_BUDGET = 5
 
+REASONING_START_STR = "<think>"
+REASONING_END_STR = "</think>"
+
+
+def _count_reasoning_decode_token_ids_between_markers(
+    full_token_ids: list[int],
+    reasoning_start_ids: list[int],
+    reasoning_end_ids: list[int],
+) -> int | None:
+    """Count decode tokens in the thinking span (after last start, before first end)."""
+
+    if not reasoning_start_ids or not reasoning_end_ids:
+        raise ValueError("reasoning marker token id lists must be non-empty")
+
+    def _last_subseq_index(haystack: list[int], needle: list[int]) -> int:
+        n = len(needle)
+        if n > len(haystack):
+            return -1
+        for i in range(len(haystack) - n, -1, -1):
+            if haystack[i : i + n] == needle:
+                return i
+        return -1
+
+    last_start = _last_subseq_index(full_token_ids, reasoning_start_ids)
+    if last_start < 0:
+        return None
+
+    pos_after_start = last_start + len(reasoning_start_ids)
+    end_n = len(reasoning_end_ids)
+    for j in range(pos_after_start, len(full_token_ids) - end_n + 1):
+        if full_token_ids[j : j + end_n] == reasoning_end_ids:
+            return j - pos_after_start
+    return len(full_token_ids) - pos_after_start
+
 
 @pytest.fixture(scope="module")
 def server():
@@ -48,6 +92,51 @@ def server_with_auto_reasoning_config():
         yield remote_server
 
 
+@pytest.fixture(scope="module")
+def server_qwen35_fp8_mtp_tp2():
+    """Qwen3.5-35B FP8 with MTP speculative decoding and tensor parallel size 2."""
+    if current_platform.device_count() < 2:
+        pytest.skip("Need at least 2 GPUs for --tensor-parallel-size 2")
+    if not current_platform.supports_fp8():
+        pytest.skip("FP8 is not supported on this platform")
+
+    spec_cfg = {
+        "method": "mtp",
+        "num_speculative_tokens": 2,
+        "max_model_len": 32768,
+    }
+    args = [
+        "--tensor-parallel-size",
+        "2",
+        "--max-model-len",
+        "32768",
+        "--speculative-config",
+        json.dumps(spec_cfg),
+        "--reasoning-parser",
+        "qwen3",
+        "--reasoning-config",
+        json.dumps(
+            {
+                "reasoning_start_str": REASONING_START_STR,
+                "reasoning_end_str": REASONING_END_STR,
+            }
+        ),
+    ]
+    # With 4+ GPUs, run TP=2 on physical devices 2,3 so module-scoped 0.6B servers
+    # on 0,1 do not exhaust memory on the same devices as this worker.
+    env_dict = None
+    if current_platform.device_count() >= 4:
+        env_dict = {"CUDA_VISIBLE_DEVICES": "2,3"}
+
+    with RemoteOpenAIServer(
+        QWEN35_FP8_MTP_MODEL,
+        args,
+        max_wait_seconds=3000,
+        env_dict=env_dict,
+    ) as remote_server:
+        yield remote_server
+
+
 @pytest_asyncio.fixture
 async def client(request, server, server_with_auto_reasoning_config):
     server_map = {
@@ -89,8 +178,10 @@ async def test_thinking_token_budget_mixed_requests(client: openai.AsyncOpenAI):
 async def test_thinking_token_budget_limits_reasoning(client: openai.AsyncOpenAI):
     """Test that thinking_token_budget limits the number of reasoning tokens.
 
-    In streaming mode each reasoning delta corresponds to one token, so
-    counting non-empty reasoning_content chunks gives the exact token count.
+    Counts non-empty streaming ``delta.reasoning`` chunks (coarse proxy; each
+    chunk may represent multiple decode tokens — see
+    ``_count_reasoning_decode_token_ids_between_markers`` and the Qwen3.5 MTP
+    test for id-based checks).
     """
 
     reasoning_token_count = 0
@@ -110,3 +201,133 @@ async def test_thinking_token_budget_limits_reasoning(client: openai.AsyncOpenAI
         f"reasoning tokens ({reasoning_token_count}) exceeded "
         f"thinking_token_budget ({THINK_BUDGET})"
     )
+
+
+@pytest.mark.asyncio
+@multi_gpu_only(num_gpus=2)
+@requires_fp8
+async def test_thinking_token_budget_qwen35_fp8_mtp_concurrent_mixed_budget_and_plain(
+    server_qwen35_fp8_mtp_tp2,
+):
+    """Concurrent chat requests: some with ``thinking_token_budget``, some without.
+
+    Exercises the scheduler / input processor under a mixed batch on the same
+    Qwen3.5 FP8 + MTP (TP=2) server. Budgeted calls are checked with
+    ``_count_reasoning_decode_token_ids_between_markers`` on full token ids.
+    """
+
+    _batch_spec: list[tuple[Literal["budget"], int] | tuple[Literal["plain"], None]] = [
+        ("budget", 1),
+        ("budget", 12),
+        ("plain", None),
+        ("budget", 20),
+        ("budget", 14),
+        ("plain", None),
+        ("plain", None),
+        ("budget", 12),
+        ("plain", None),
+    ]
+
+    tokenizer = get_tokenizer(tokenizer_name=QWEN35_FP8_MTP_MODEL)
+    start_ids = list(tokenizer.encode(REASONING_START_STR, add_special_tokens=False))
+    end_ids = list(tokenizer.encode(REASONING_END_STR, add_special_tokens=False))
+
+    async with server_qwen35_fp8_mtp_tp2.get_async_client() as client:
+
+        async def budgeted_call(expected_budget: int):
+            return await client.chat.completions.create(
+                model=QWEN35_FP8_MTP_MODEL,
+                messages=MESSAGES,
+                max_tokens=256,
+                stream=False,
+                extra_body={
+                    "thinking_token_budget": expected_budget,
+                    "return_token_ids": True,
+                },
+            )
+
+        async def plain_call():
+            return await client.chat.completions.create(
+                model=QWEN35_FP8_MTP_MODEL,
+                messages=MESSAGES,
+                max_tokens=256,
+                stream=False,
+            )
+
+        coros = []
+        for row in _batch_spec:
+            if row[0] == "budget":
+                b = row[1]
+                assert isinstance(b, int)
+                coros.append(budgeted_call(b))
+            else:
+                coros.append(plain_call())
+        results = await asyncio.gather(*coros)
+
+    for i, (response, (kind, expected_budget)) in enumerate(
+        zip(results, _batch_spec, strict=True)
+    ):
+        msg = response.choices[0].message
+        assert msg.content or getattr(msg, "reasoning", None), (
+            f"index {i} ({kind}): empty message"
+        )
+
+        if kind == "budget":
+            assert expected_budget is not None
+            assert response.prompt_token_ids is not None
+            assert response.choices[0].token_ids is not None
+            full_ids = list(response.prompt_token_ids) + list(
+                response.choices[0].token_ids
+            )
+            n_reason = _count_reasoning_decode_token_ids_between_markers(
+                full_ids, start_ids, end_ids
+            )
+            assert n_reason is not None, f"index {i}: missing reasoning start in ids"
+            assert n_reason == expected_budget, (
+                f"index {i}: reasoning decode token ids ({n_reason}) != "
+                f"thinking_token_budget ({expected_budget})"
+            )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("client", ["default", "auto_config"], indirect=True)
+async def test_streaming_with_thinking_disabled_stays_in_content(
+    client: openai.AsyncOpenAI,
+):
+    request_kwargs = {
+        "model": MODEL_NAME,
+        "messages": [
+            {
+                "role": "user",
+                "content": "Which is larger, 4 or 12?"
+                " Output exactly one token: 4 or 12.",
+            }
+        ],
+        "max_tokens": 16,
+        "temperature": 0.0,
+        "extra_body": {"chat_template_kwargs": {"enable_thinking": False}},
+    }
+
+    response = await client.chat.completions.create(**request_kwargs)
+    message = response.choices[0].message
+    assert message.content is not None and message.content.strip() != ""
+    assert getattr(message, "reasoning", None) in (None, "")
+
+    stream = await client.chat.completions.create(
+        **request_kwargs,
+        stream=True,
+    )
+
+    content_chunks = []
+    reasoning_chunks = []
+    async for chunk in stream:
+        if not chunk.choices:
+            continue
+        delta = chunk.choices[0].delta
+        if getattr(delta, "content", None):
+            content_chunks.append(delta.content)
+        if getattr(delta, "reasoning", None):
+            reasoning_chunks.append(delta.reasoning)
+
+    assert "".join(content_chunks).strip() != ""
+    assert reasoning_chunks == []
diff --git a/tests/entrypoints/openai/chat_completion/test_vision_embeds.py b/tests/entrypoints/openai/chat_completion/test_vision_embeds.py
index 574a8f1c86a9..da9787e3f89e 100644
--- a/tests/entrypoints/openai/chat_completion/test_vision_embeds.py
+++ b/tests/entrypoints/openai/chat_completion/test_vision_embeds.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import importlib.util
+
 import numpy as np
 import pybase64 as base64
 import pytest
@@ -10,7 +12,16 @@
 from tests.utils import RemoteOpenAIServer
 from vllm.utils.serial_utils import tensor2base64
 
+# Prithvi requires terratorch, which is temporarily unavailable while PyPI has
+# `lightning` quarantined (#41376). Skip just the Prithvi case; leave the
+# Qwen3-VL case in the same file untouched.
+_TERRATORCH_AVAILABLE = importlib.util.find_spec("terratorch") is not None
+
 
+@pytest.mark.skipif(
+    not _TERRATORCH_AVAILABLE,
+    reason="terratorch unavailable while PyPI has `lightning` quarantined; see #41376",
+)
 @pytest.mark.parametrize(
     "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
 )
diff --git a/tests/entrypoints/openai/completion/test_prompt_validation.py b/tests/entrypoints/openai/completion/test_prompt_validation.py
index f44d13c555c5..81204b27bc0b 100644
--- a/tests/entrypoints/openai/completion/test_prompt_validation.py
+++ b/tests/entrypoints/openai/completion/test_prompt_validation.py
@@ -62,6 +62,8 @@ def test_load_prompt_embeds(
 ):
     model_config = Mock(spec=ModelConfig)
     model_config.enable_prompt_embeds = True
+    model_config.get_hidden_size.return_value = hidden_size
+    model_config.dtype = dtype
 
     # construct arbitrary tensors of various dtypes, layouts, and sizes.
     # We need to check against different layouts to make sure that if a user
diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
index f847b7b0d88d..fedbd74795b5 100644
--- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
+++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
@@ -26,6 +26,10 @@
 from ....models.registry import HF_EXAMPLE_MODELS
 from ....utils import RemoteOpenAIServer
 
+# Tuned to prevent OOM on 18GB GPUs in transcription correctness tests.
+MAX_SEQS_FOR_TRANSCRIPTION_TEST = 8
+GPU_UTIL_FOR_TRANSCRIPTION_TEST = 0.5
+
 
 def to_bytes(y, sr):
     buffer = io.BytesIO()
@@ -167,9 +171,8 @@ def run_evaluation(
     "model_config",
     [
         ("openai/whisper-large-v3", 12.744980),
-        # TODO (ekagra): turn on after asr release
         # CohereASR is used to test the variable encoder length code paths
-        # ("CohereLabs/cohere-transcribe-03-2026", 11.92),
+        ("CohereLabs/cohere-transcribe-03-2026", 11.92),
     ],
 )
 # Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
@@ -185,6 +188,8 @@ def test_wer_correctness(
     server_args = [
         "--enforce-eager",
         f"--tokenizer_mode={model_info.tokenizer_mode}",
+        f"--max_num_seqs={MAX_SEQS_FOR_TRANSCRIPTION_TEST}",
+        f"--gpu_memory_utilization={GPU_UTIL_FOR_TRANSCRIPTION_TEST}",
     ]
     if model_info.trust_remote_code:
         server_args.append("--trust-remote-code")
diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py
index 21b53dff1507..69b9f101ea28 100644
--- a/tests/entrypoints/openai/parser/test_harmony_utils.py
+++ b/tests/entrypoints/openai/parser/test_harmony_utils.py
@@ -843,6 +843,13 @@ def test_all_standard_channels_present(self) -> None:
                     f"{channel} missing when with_custom_tools={with_tools}"
                 )
 
+    def test_unsupported_reasoning_effort_raises_clear_error(self) -> None:
+        with pytest.raises(
+            ValueError,
+            match="reasoning_effort='max' is not supported by Harmony",
+        ):
+            get_system_message(reasoning_effort="max")
+
 
 class TestResponseInputToHarmonyReasoningItem:
     """Tests for response_input_to_harmony handling of reasoning input items.
diff --git a/tests/entrypoints/openai/responses/test_errors.py b/tests/entrypoints/openai/responses/test_errors.py
index 0ef9bb901a64..e21f6aa2a42a 100644
--- a/tests/entrypoints/openai/responses/test_errors.py
+++ b/tests/entrypoints/openai/responses/test_errors.py
@@ -6,7 +6,9 @@
 
 import pytest
 
+import vllm.envs as envs
 from vllm.entrypoints.openai.engine.serving import GenerationError, OpenAIServing
+from vllm.envs import disable_envs_cache
 
 
 @pytest.mark.asyncio
@@ -60,3 +62,35 @@ async def test_convert_generation_error_to_streaming_response():
     assert isinstance(error_json, str)
     assert "Internal server error" in error_json
     assert "InternalServerError" in error_json
+
+
+def test_is_model_supported_skip_name_validation_env(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """When VLLM_SKIP_MODEL_NAME_VALIDATION is set, accept any model id."""
+    disable_envs_cache()
+    monkeypatch.delenv("VLLM_SKIP_MODEL_NAME_VALIDATION", raising=False)
+
+    mock_engine = MagicMock()
+    mock_engine.model_config = MagicMock()
+    mock_engine.model_config.max_model_len = 100
+    mock_models = MagicMock()
+    mock_models.is_base_model.return_value = False
+
+    serving = OpenAIServing(
+        engine_client=mock_engine,
+        models=mock_models,
+        request_logger=None,
+    )
+
+    assert serving._is_model_supported("not-a-registered-model") is False
+
+    monkeypatch.setenv("VLLM_SKIP_MODEL_NAME_VALIDATION", "1")
+    disable_envs_cache()
+    assert envs.VLLM_SKIP_MODEL_NAME_VALIDATION is True
+    assert serving._is_model_supported("not-a-registered-model") is True
+
+    monkeypatch.setenv("VLLM_SKIP_MODEL_NAME_VALIDATION", "true")
+    disable_envs_cache()
+    assert envs.VLLM_SKIP_MODEL_NAME_VALIDATION is True
+    assert serving._is_model_supported("another-alias") is True
diff --git a/tests/entrypoints/openai/responses/test_function_call.py b/tests/entrypoints/openai/responses/test_function_call.py
index 515f31b399ee..8ca43feaca4f 100644
--- a/tests/entrypoints/openai/responses/test_function_call.py
+++ b/tests/entrypoints/openai/responses/test_function_call.py
@@ -323,7 +323,7 @@ async def test_function_calling_with_streaming_expected_arguments(
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize(
     "tool_choice",
-    ["auto", "required"],
+    ["auto", "required", {"type": "function", "name": "get_current_weather"}],
 )
 async def test_function_calling_with_streaming_types(
     client: openai.AsyncOpenAI, model_name: str, tool_choice
@@ -462,7 +462,7 @@ async def test_function_calling_with_streaming_types(
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize(
     "tool_choice",
-    ["required", "auto"],
+    ["required", "auto", {"type": "function", "name": "get_weather"}],
 )
 async def test_function_calling_with_streaming_forced_tool_choice(
     client: openai.AsyncOpenAI, model_name: str, tool_choice: str
diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py
index 083290ed5b3a..7d17eb9d4569 100644
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@@ -1,18 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
-from http import HTTPStatus
 from typing import Final
 
 import pytest
 import schemathesis
-from httpx import URL
-from hypothesis import settings
+from hypothesis import HealthCheck, settings
 from schemathesis import GenerationConfig
-from schemathesis.checks import not_a_server_error
-from schemathesis.internal.checks import CheckContext
 from schemathesis.models import Case
-from schemathesis.transports.responses import GenericResponse
 
 from vllm.platforms import current_platform
 
@@ -65,20 +60,10 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
 
     def no_invalid_types(case: schemathesis.models.Case):
         """
-        This filter skips test cases with invalid data that schemathesis
-        incorrectly generates due to permissive schema configurations.
-        
-        1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in 
-           message content, which isn't implemented.
-        
-        2. Skips tool_calls with `"type": "custom"` which schemathesis 
-           incorrectly generates instead of the valid `"type": "function"`.
-
-        Example test cases that are skipped:
-        curl -X POST -H 'Content-Type: application/json' \
-            -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
-            http://localhost:8000/tokenize
+        Skips tool_calls with `"type": "custom"` which schemathesis incorrectly
+        generates instead of the valid `"type": "function"`.
 
+        Example test case that is skipped:
         curl -X POST -H 'Content-Type: application/json' \
             -d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
             http://localhost:8000/v1/chat/completions
@@ -93,20 +78,6 @@ def no_invalid_types(case: schemathesis.models.Case):
                     if not isinstance(message, dict):
                         continue
 
-                    # Check for invalid file type in tokenize endpoint
-                    if op.method.lower() == "post" and op.path == "/tokenize":
-                        content = message.get("content", [])
-                        if (
-                            isinstance(content, list)
-                            and len(content) > 0
-                            and any(
-                                isinstance(item, dict) and item.get("type") == "file"
-                                for item in content
-                            )
-                        ):
-                            return False
-
-                    # Check for invalid tool_calls with non-function types
                     tool_calls = message.get("tool_calls", [])
                     if isinstance(tool_calls, list):
                         for tool_call in tool_calls:
@@ -136,24 +107,19 @@ def no_invalid_types(case: schemathesis.models.Case):
     return strategy.filter(no_invalid_types)
 
 
-def customized_not_a_server_error(
-    ctx: CheckContext, response: GenericResponse, case: Case
-) -> bool | None:
-    try:
-        return not_a_server_error(ctx, response, case)
-    except Exception:
-        if (
-            URL(response.request.url).path
-            in ["/v1/chat/completions/render", "/v1/chat/completions"]
-            and response.status_code == HTTPStatus.NOT_IMPLEMENTED.value
-        ):
-            return True
-        raise
-
-
 @schema.parametrize()
 @schema.override(headers={"Content-Type": "application/json"})
-@settings(deadline=LONG_TIMEOUT_SECONDS * 1000, max_examples=50)
+@settings(
+    deadline=LONG_TIMEOUT_SECONDS * 1000,
+    max_examples=50,
+    # Under CI's derandomized hypothesis seed, the schemathesis strategy
+    # for /v1/chat/completions/batch's nested-message body, combined with
+    # the no_invalid_types filter (notably the grammar=="" rule), exceeds
+    # the default filtered-vs-good ratio. The filter is intentional, so
+    # suppress the health check rather than drop the filter — dropping it
+    # exposes pre-existing server bugs out of scope here.
+    suppress_health_check=[HealthCheck.filter_too_much],
+)
 def test_openapi_stateless(case: Case):
     key = (
         case.operation.method.upper(),
@@ -180,9 +146,4 @@ def test_openapi_stateless(case: Case):
     }.get(key, DEFAULT_TIMEOUT_SECONDS)
 
     # No need to verify SSL certificate for localhost
-    case.call_and_validate(
-        verify=False,
-        timeout=timeout,
-        additional_checks=(customized_not_a_server_error,),
-        excluded_checks=(not_a_server_error,),
-    )
+    case.call_and_validate(verify=False, timeout=timeout)
diff --git a/tests/entrypoints/openai/test_tool_choice_content_none.py b/tests/entrypoints/openai/test_tool_choice_content_none.py
new file mode 100644
index 000000000000..c1da5918697c
--- /dev/null
+++ b/tests/entrypoints/openai/test_tool_choice_content_none.py
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.serving import OpenAIServing
+from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+from vllm.parser.abstract_parser import DelegatingParser
+
+pytestmark = pytest.mark.skip_global_cleanup
+
+
+class _DummyDelegatingParser(DelegatingParser):
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        return False
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        return input_ids
+
+    def extract_reasoning(self, model_output: str, request):
+        return None, model_output
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: list[int],
+        current_token_ids: list[int],
+        delta_token_ids: list[int],
+    ):
+        return None
+
+    def extract_tool_calls(self, model_output: str, request):
+        return None
+
+
+def test_parse_tool_calls_from_content_allows_named_tool_choice_with_none_content():
+    request = ChatCompletionRequest.model_validate(
+        {
+            "model": "test-model",
+            "messages": [{"role": "user", "content": "test"}],
+            "tools": [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "get_weather",
+                        "parameters": {"type": "object", "properties": {}},
+                    },
+                }
+            ],
+            "tool_choice": {"type": "function", "function": {"name": "get_weather"}},
+        }
+    )
+
+    tool_calls, content = OpenAIServing._parse_tool_calls_from_content(
+        request=request,
+        tokenizer=None,
+        enable_auto_tools=True,
+        tool_parser_cls=None,
+        content=None,
+    )
+
+    assert content is None
+    assert tool_calls is not None
+    assert tool_calls == []
+
+
+def test_responses_parser_allows_named_tool_choice_with_none_content():
+    request = ResponsesRequest.model_validate(
+        {
+            "model": "test-model",
+            "input": "test",
+            "tools": [
+                {
+                    "type": "function",
+                    "name": "get_weather",
+                    "parameters": {"type": "object", "properties": {}},
+                }
+            ],
+            "tool_choice": {"type": "function", "name": "get_weather"},
+        }
+    )
+    parser = _DummyDelegatingParser(tokenizer=None)
+
+    tool_calls, content = parser._parse_tool_calls(
+        request=request,
+        content=None,
+        enable_auto_tools=False,
+    )
+
+    assert content is None
+    assert tool_calls == []
diff --git a/tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py b/tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py
index d773802a9c24..a815e6e509c2 100644
--- a/tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py
+++ b/tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py
@@ -38,6 +38,15 @@
     "FLEX_ATTENTION": 0.045,  # gfx950:~3.25%, gfx942:~1.10%
 }
 
+# ROCm 7.2/gfx950 shows small absolute drift on the low text-vs-text
+# probability even though larger scores remain well inside the relative
+# tolerance. Keep the relative tolerances tight and add only a small floor.
+BACKEND_ABS_TOL: dict[str, float] = {
+    "default": 0.0,
+    "ROCM_AITER_FA": 0.005,
+    "FLEX_ATTENTION": 0.006,
+}
+
 # ROCm: disable skinny GEMM to avoid non-deterministic results from
 # atomic reductions in wvSplitKrc kernel.
 # See: https://github.com/vllm-project/vllm/pull/33493#issuecomment-3906083975
@@ -57,18 +66,23 @@ def get_tol(backend: str) -> float:
     return BACKEND_TOL.get(backend, BACKEND_TOL["default"])
 
 
+def get_abs_tol(backend: str) -> float:
+    return BACKEND_ABS_TOL.get(backend, BACKEND_ABS_TOL["default"])
+
+
 def assert_score(actual: float, expected: float, backend: str, label: str):
     tol = get_tol(backend)
+    abs_tol = get_abs_tol(backend)
     diff = abs(actual - expected)
     rel_diff = diff / abs(expected) if expected != 0 else diff
     print(
         f"[{backend}] {label}: actual={actual:.6f} expected={expected:.6f} "
-        f"diff={diff:.6f} rel_diff={rel_diff:.4f} tol={tol}"
+        f"diff={diff:.6f} rel_diff={rel_diff:.4f} tol={tol} abs_tol={abs_tol}"
     )
-    assert actual == pytest.approx(expected, rel=tol), (
+    assert actual == pytest.approx(expected, rel=tol, abs=abs_tol), (
         f"[{backend}] {label}: score mismatch — "
         f"actual={actual:.6f}, expected={expected:.6f}, "
-        f"rel_diff={rel_diff:.4f}, tol={tol}"
+        f"rel_diff={rel_diff:.4f}, tol={tol}, abs_tol={abs_tol}"
     )
 
 
diff --git a/tests/evals/mrcr/README.md b/tests/evals/mrcr/README.md
new file mode 100644
index 000000000000..59acc11ac48c
--- /dev/null
+++ b/tests/evals/mrcr/README.md
@@ -0,0 +1,44 @@
+# MRCR Long-Context Accuracy Evaluation
+
+Smoke test for long-context behavior using OpenAI's public [`openai/mrcr`](https://huggingface.co/datasets/openai/mrcr) dataset. The model sees a long chat with several near-duplicate "needles" and must reproduce a specific earlier assistant turn verbatim, prepended with a random anti-guessing string.
+
+**Scoring:** if the response doesn't start with `random_string_to_prepend`, score is 0; otherwise the prefix is stripped and the mean `SequenceMatcher.ratio()` against the reference answer is reported.
+
+## Usage
+
+```bash
+# Pytest (spawns the server)
+pytest -s -v tests/evals/mrcr/test_mrcr_correctness.py \
+    --config-list-file=configs/models-small.txt
+
+# Standalone (server already running; model and context auto-discovered)
+vllm serve Qwen/Qwen3-0.6B --reasoning-parser qwen3 --port 8000
+python tests/evals/mrcr/mrcr_eval.py --port 8000
+```
+
+## Configuration
+
+```yaml
+model_name: "Qwen/Qwen3-0.6B"
+# Per-needle thresholds catch bucket-specific regressions (sliding window,
+# chunked prefill, prefix cache) that an aggregate can hide. A scalar
+# (e.g. `match_ratio_threshold: 0.20`) is also accepted and checked against
+# the mean match ratio.
+match_ratio_threshold:
+  2: 0.30
+  4: 0.15
+  8: 0.10
+num_samples: 30
+needles: [2, 4, 8]
+# max_prompt_tokens: 32768       # Optional; defaults to server max_model_len - max_tokens - 256
+max_tokens: 2048
+concurrency: 8
+server_args: "--max-model-len 32768 --reasoning-parser qwen3"
+```
+
+## Notes
+
+- Samples stream from three parquet shards (`{N}needle/{N}needle_0.parquet`); only the first few row groups are fetched, not the full 1.4 GB repo.
+- `max_prompt_tokens` defaults to `max_model_len - max_tokens - 256`, i.e. fills whatever context the server advertises. Set `--max-model-len` on the server to control the smoke-test context length; override `--max-prompt-tokens` on the client to cap below that.
+- Sample length is pre-filtered by `n_chars × 4 ≤ max_prompt_tokens`, then verified via the server's `/tokenize` endpoint under the actual chat template.
+- Reasoning models: start the server with `--reasoning-parser <name>` (e.g. `qwen3`, `deepseek_r1`) so `<think>` goes to `message.reasoning_content` and doesn't contaminate the scored answer.
diff --git a/tests/evals/mrcr/__init__.py b/tests/evals/mrcr/__init__.py
new file mode 100644
index 000000000000..208f01a7cb5e
--- /dev/null
+++ b/tests/evals/mrcr/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/tests/evals/mrcr/configs/Qwen3.5-4B.yaml b/tests/evals/mrcr/configs/Qwen3.5-4B.yaml
new file mode 100644
index 000000000000..c2fd438dd377
--- /dev/null
+++ b/tests/evals/mrcr/configs/Qwen3.5-4B.yaml
@@ -0,0 +1,7 @@
+model_name: "Qwen/Qwen3.5-4B"
+needles: [2, 4, 8]
+match_ratio_threshold:
+  2: 0.99
+  4: 0.84
+  8: 0.76
+server_args: "--max-model-len 128K --reasoning-parser qwen3"
diff --git a/tests/evals/mrcr/configs/models-small.txt b/tests/evals/mrcr/configs/models-small.txt
new file mode 100644
index 000000000000..b78704fe539f
--- /dev/null
+++ b/tests/evals/mrcr/configs/models-small.txt
@@ -0,0 +1 @@
+Qwen3.5-4B.yaml
diff --git a/tests/evals/mrcr/conftest.py b/tests/evals/mrcr/conftest.py
new file mode 100644
index 000000000000..46f59a56c238
--- /dev/null
+++ b/tests/evals/mrcr/conftest.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from pathlib import Path
+
+
+def pytest_addoption(parser):
+    """Add custom command line options."""
+    parser.addoption(
+        "--config-list-file",
+        default="configs/models-small.txt",
+        help="File containing list of config files to test",
+    )
+
+
+def pytest_generate_tests(metafunc):
+    """Generate test parameters from config files."""
+    if "config_filename" in metafunc.fixturenames:
+        config_list_file = metafunc.config.getoption("--config-list-file")
+
+        config_list_path = Path(config_list_file)
+        if not config_list_path.is_absolute():
+            test_dir_path = Path(__file__).parent / config_list_file
+            if test_dir_path.exists():
+                config_list_path = test_dir_path
+            else:
+                config_list_path = Path.cwd() / config_list_file
+
+        print(f"Looking for config list at: {config_list_path}")
+
+        config_files = []
+        if config_list_path.exists():
+            config_dir = config_list_path.parent
+            with open(config_list_path) as f:
+                for line in f:
+                    line = line.strip()
+                    if line and not line.startswith("#"):
+                        config_path = config_dir / line
+                        if config_path.exists():
+                            config_files.append(config_path)
+                            print(f"  ✓ Found: {config_path}")
+                        else:
+                            print(f"  ✗ Missing: {config_path}")
+        else:
+            print(f"Config list file not found: {config_list_path}")
+
+        if config_files:
+            metafunc.parametrize(
+                "config_filename",
+                config_files,
+                ids=[config_file.stem for config_file in config_files],
+            )
+        else:
+            print("No config files found, test will be skipped")
diff --git a/tests/evals/mrcr/mrcr_eval.py b/tests/evals/mrcr/mrcr_eval.py
new file mode 100644
index 000000000000..3ab87a57d122
--- /dev/null
+++ b/tests/evals/mrcr/mrcr_eval.py
@@ -0,0 +1,333 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""MRCR long-context evaluation for vLLM's OpenAI-compatible server.
+
+Streams samples from `openai/mrcr` on HuggingFace, sends chat completions to
+the server, and scores each response with a prefix-gated SequenceMatcher ratio
+against the reference answer.
+"""
+
+import argparse
+import asyncio
+import json
+import time
+from difflib import SequenceMatcher
+
+import aiohttp
+import numpy as np
+import requests
+from tqdm.asyncio import tqdm
+
+DATASET_REPO = "openai/mrcr"
+NEEDLE_SHARDS = {
+    2: "2needle/2needle_0.parquet",
+    4: "4needle/4needle_0.parquet",
+    8: "8needle/8needle_0.parquet",
+}
+# Reserve headroom for chat-template tokens on top of the messages.
+PROMPT_SAFETY_BUFFER = 256
+# Pre-filter heuristic before the authoritative /tokenize check.
+CHARS_PER_TOKEN = 4
+# Skip chain-of-thought on reasoning models; ignored by non-reasoning templates.
+DEFAULT_EXTRA_BODY: dict = {"chat_template_kwargs": {"enable_thinking": False}}
+
+
+def discover_server_model(base_url: str) -> tuple[str, int | None]:
+    """Return (model_id, max_model_len) from /v1/models."""
+    resp = requests.get(f"{base_url}/v1/models", timeout=30)
+    resp.raise_for_status()
+    data = resp.json().get("data", [])
+    if not data:
+        raise RuntimeError(f"No models advertised at {base_url}/v1/models")
+    entry = data[0]
+    return entry["id"], entry.get("max_model_len")
+
+
+def count_chat_tokens(base_url: str, model: str, messages: list[dict]) -> int:
+    """Return the chat-template-rendered token count via /tokenize."""
+    resp = requests.post(
+        f"{base_url}/tokenize",
+        json={"model": model, "messages": messages, "add_generation_prompt": True},
+        timeout=120,
+    )
+    resp.raise_for_status()
+    return int(resp.json()["count"])
+
+
+def _load_mrcr_samples(
+    needles: list[int],
+    max_prompt_tokens: int,
+    num_samples: int,
+    seed: int,
+    base_url: str,
+    model_name: str,
+) -> list[dict]:
+    """Stream MRCR samples balanced across needle buckets, token-verified."""
+    try:
+        from datasets import load_dataset
+    except ImportError as e:
+        raise ImportError(
+            "MRCR eval requires `datasets`. Install with: uv pip install datasets"
+        ) from e
+
+    max_chars = max_prompt_tokens * CHARS_PER_TOKEN
+    per_bucket = num_samples // len(needles)
+    leftover = num_samples - per_bucket * len(needles)
+
+    samples: list[dict] = []
+    for idx, n in enumerate(needles):
+        if n not in NEEDLE_SHARDS:
+            raise ValueError(f"Unsupported needle count {n}")
+        target = per_bucket + (1 if idx < leftover else 0)
+        if target == 0:
+            continue
+
+        ds = load_dataset(
+            DATASET_REPO,
+            data_files=NEEDLE_SHARDS[n],
+            split="train",
+            streaming=True,
+        ).shuffle(seed=seed + n, buffer_size=16)
+
+        taken = 0
+        for row in ds:
+            if int(row.get("n_chars", 0)) > max_chars:
+                continue
+            prompt = row["prompt"]
+            messages = json.loads(prompt) if isinstance(prompt, str) else list(prompt)
+            n_tokens = count_chat_tokens(base_url, model_name, messages)
+            if n_tokens > max_prompt_tokens:
+                continue
+            samples.append(
+                {
+                    "messages": messages,
+                    "answer": row["answer"],
+                    "random_string_to_prepend": row["random_string_to_prepend"],
+                    "n_needles": int(row["n_needles"]),
+                    "n_tokens": n_tokens,
+                }
+            )
+            taken += 1
+            if taken >= target:
+                break
+
+        if taken < target:
+            print(f"Warning: only {taken}/{target} samples for n_needles={n}")
+
+    if not samples:
+        raise RuntimeError("No MRCR samples fit; loosen max_prompt_tokens.")
+    return samples
+
+
+def score_mrcr(response: str, answer: str, random_prefix: str) -> float:
+    """Prefix-gated SequenceMatcher ratio; 0 if the prefix is missing."""
+    if not response.startswith(random_prefix):
+        return 0.0
+    stripped = response[len(random_prefix) :]
+    return SequenceMatcher(a=answer, b=stripped, autojunk=False).ratio()
+
+
+async def _call_chat(
+    session: aiohttp.ClientSession,
+    url: str,
+    model: str,
+    messages: list[dict],
+    max_tokens: int,
+    temperature: float,
+    seed: int | None,
+    extra_body: dict,
+) -> tuple[str, int]:
+    data = {
+        "model": model,
+        "messages": messages,
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+        **extra_body,
+    }
+    if seed is not None:
+        data["seed"] = seed
+    try:
+        async with session.post(f"{url}/v1/chat/completions", json=data) as resp:
+            resp.raise_for_status()
+            result = await resp.json()
+            text = result["choices"][0]["message"]["content"] or ""
+            return text, result.get("usage", {}).get("completion_tokens", 0)
+    except Exception as e:
+        print(f"chat request failed: {e}")
+        return "", 0
+
+
+def evaluate_mrcr(
+    model_name: str | None = None,
+    num_samples: int = 40,
+    needles: list[int] | None = None,
+    max_prompt_tokens: int | None = None,
+    max_tokens: int = 2048,
+    host: str = "http://127.0.0.1",
+    port: int = 8000,
+    temperature: float = 0.0,
+    seed: int | None = 42,
+    concurrency: int = 8,
+    extra_body: dict | None = None,
+) -> dict:
+    """Run MRCR against a vLLM server; auto-discovers model and context."""
+    needles = needles or [2, 4, 8]
+    extra_body = DEFAULT_EXTRA_BODY if extra_body is None else extra_body
+    base_url = f"{host}:{port}"
+
+    discovered_model, server_max_len = discover_server_model(base_url)
+    if model_name is None:
+        model_name = discovered_model
+    if max_prompt_tokens is None:
+        if server_max_len is None:
+            raise RuntimeError(
+                "Server did not advertise max_model_len; pass --max-prompt-tokens."
+            )
+        max_prompt_tokens = max(512, server_max_len - max_tokens - PROMPT_SAFETY_BUFFER)
+    print(
+        f"Model: {model_name} | max_prompt_tokens={max_prompt_tokens} "
+        f"(server max_model_len={server_max_len}, max_tokens={max_tokens})"
+    )
+
+    samples = _load_mrcr_samples(
+        needles=needles,
+        max_prompt_tokens=max_prompt_tokens,
+        num_samples=num_samples,
+        seed=seed or 0,
+        base_url=base_url,
+        model_name=model_name,
+    )
+    tok_counts = [s["n_tokens"] for s in samples]
+    print(
+        f"Loaded {len(samples)} samples (needles={needles}, "
+        f"tokens={min(tok_counts)}-{max(tok_counts)})"
+    )
+
+    async def run():
+        sem = asyncio.Semaphore(concurrency)
+        responses = [""] * len(samples)
+        out_tokens = [0] * len(samples)
+
+        async def one(session, i):
+            async with sem:
+                text, toks = await _call_chat(
+                    session=session,
+                    url=base_url,
+                    model=model_name,
+                    messages=samples[i]["messages"],
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                    seed=seed,
+                    extra_body=extra_body,
+                )
+                responses[i] = text
+                out_tokens[i] = toks
+
+        timeout = aiohttp.ClientTimeout(total=1800)
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            await tqdm.gather(
+                *[one(session, i) for i in range(len(samples))], desc="MRCR"
+            )
+        return responses, out_tokens
+
+    tic = time.perf_counter()
+    responses, out_tokens = asyncio.run(run())
+    latency = time.perf_counter() - tic
+
+    scores = np.array(
+        [
+            score_mrcr(r, s["answer"], s["random_string_to_prepend"])
+            for r, s in zip(responses, samples)
+        ]
+    )
+    prefix_hits = np.array(
+        [
+            r.startswith(s["random_string_to_prepend"])
+            for r, s in zip(responses, samples)
+        ]
+    )
+    per_needle = {
+        f"match_ratio_n{n}": float(
+            scores[np.array([s["n_needles"] == n for s in samples])].mean()
+        )
+        for n in needles
+        if any(s["n_needles"] == n for s in samples)
+    }
+
+    total_out = int(sum(out_tokens))
+    return {
+        "model": model_name,
+        "match_ratio": float(scores.mean()),
+        "prefix_hit_rate": float(prefix_hits.mean()),
+        "per_needle": per_needle,
+        "num_samples": len(samples),
+        "latency": latency,
+        "total_output_tokens": total_out,
+        "tokens_per_second": total_out / latency if latency > 0 else 0.0,
+        "max_tokens": max_tokens,
+        "needles": needles,
+        "max_prompt_tokens": max_prompt_tokens,
+    }
+
+
+def main() -> None:
+    p = argparse.ArgumentParser(description="MRCR evaluation for vLLM serve")
+    p.add_argument("--model", default=None, help="Default: discovered from /v1/models")
+    p.add_argument("--num-samples", type=int, default=40)
+    p.add_argument(
+        "--needles", type=int, nargs="+", default=[2, 4, 8], choices=[2, 4, 8]
+    )
+    p.add_argument(
+        "--max-prompt-tokens",
+        type=int,
+        default=None,
+        help="Default: server max_model_len - max_tokens - buffer",
+    )
+    p.add_argument("--max-tokens", type=int, default=2048)
+    p.add_argument("--host", default="http://127.0.0.1")
+    p.add_argument("--port", type=int, default=8000)
+    p.add_argument("--temperature", type=float, default=0.0)
+    p.add_argument("--seed", type=int, default=42)
+    p.add_argument("--concurrency", type=int, default=8)
+    p.add_argument(
+        "--extra-body",
+        default=None,
+        help="JSON merged into each request. "
+        "Pass '{}' to disable the default enable_thinking=false.",
+    )
+    p.add_argument("--save-results", default=None)
+    args = p.parse_args()
+
+    extra_body = json.loads(args.extra_body) if args.extra_body else None
+
+    result = evaluate_mrcr(
+        model_name=args.model,
+        num_samples=args.num_samples,
+        needles=args.needles,
+        max_prompt_tokens=args.max_prompt_tokens,
+        max_tokens=args.max_tokens,
+        host=args.host,
+        port=args.port,
+        temperature=args.temperature,
+        seed=args.seed,
+        concurrency=args.concurrency,
+        extra_body=extra_body,
+    )
+
+    print("\nResults:")
+    print(f"  match_ratio:     {result['match_ratio']:.4f}")
+    print(f"  prefix_hit_rate: {result['prefix_hit_rate']:.4f}")
+    for k, v in result["per_needle"].items():
+        print(f"  {k}: {v:.4f}")
+    print(f"  samples:         {result['num_samples']}")
+    print(f"  latency:         {result['latency']:.1f}s")
+    print(f"  output tok/s:    {result['tokens_per_second']:.1f}")
+
+    if args.save_results:
+        with open(args.save_results, "w") as f:
+            json.dump(result, f, indent=2)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/evals/mrcr/test_mrcr_correctness.py b/tests/evals/mrcr/test_mrcr_correctness.py
new file mode 100644
index 000000000000..3adfd4dc8799
--- /dev/null
+++ b/tests/evals/mrcr/test_mrcr_correctness.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+MRCR long-context accuracy test.
+
+Usage:
+    pytest -s -v tests/evals/mrcr/test_mrcr_correctness.py \
+        --config-list-file=configs/models-small.txt
+"""
+
+import shlex
+
+import yaml
+
+from tests.utils import RemoteOpenAIServer
+
+from .mrcr_eval import evaluate_mrcr
+
+
+def _split_host_port(url: str, default_port: int = 8000) -> tuple[str, int]:
+    if "://" in url:
+        url = url.split("://", 1)[1]
+    host_port = url.split("/", 1)[0]
+    if ":" in host_port:
+        host, p = host_port.split(":", 1)
+        return f"http://{host}", int(p)
+    return f"http://{host_port}", default_port
+
+
+def test_mrcr_correctness(config_filename):
+    cfg = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
+
+    server_args = shlex.split(cfg.get("server_args", ""))
+    server_args += ["--trust-remote-code", "--disable-uvicorn-access-log"]
+
+    print(
+        f"MRCR eval for {cfg['model_name']} (threshold {cfg['match_ratio_threshold']})"
+    )
+
+    with RemoteOpenAIServer(
+        cfg["model_name"],
+        server_args,
+        env_dict=cfg.get("env"),
+        max_wait_seconds=cfg.get("startup_max_wait_seconds", 600),
+    ) as server:
+        host, port = _split_host_port(server.url_for("v1"))
+        results = evaluate_mrcr(
+            model_name=cfg.get("model_name"),
+            num_samples=cfg.get("num_samples", 40),
+            needles=cfg.get("needles", [2, 4, 8]),
+            max_prompt_tokens=cfg.get("max_prompt_tokens"),
+            max_tokens=cfg.get("max_tokens", 2048),
+            host=host,
+            port=port,
+            concurrency=cfg.get("concurrency", 8),
+            extra_body=cfg.get("extra_body"),
+        )
+
+    threshold = cfg["match_ratio_threshold"]
+    tol = cfg.get("tolerance", 0.05)
+
+    print(f"  match_ratio:     {results['match_ratio']:.4f}")
+    print(f"  prefix_hit_rate: {results['prefix_hit_rate']:.4f}")
+    for k, v in results["per_needle"].items():
+        print(f"  {k}: {v:.4f}")
+
+    failures: list[str] = []
+    if isinstance(threshold, dict):
+        for n, expected in threshold.items():
+            key = f"match_ratio_n{int(n)}"
+            measured = results["per_needle"].get(key)
+            if measured is None:
+                failures.append(f"{key}: no samples collected")
+            elif measured < expected - tol:
+                failures.append(f"{key}: {measured:.4f} < {expected:.4f} - {tol:.4f}")
+    else:
+        measured = results["match_ratio"]
+        if measured < threshold - tol:
+            failures.append(
+                f"match_ratio: {measured:.4f} < {threshold:.4f} - {tol:.4f}"
+            )
+
+    assert not failures, "MRCR thresholds failed: " + "; ".join(failures)
diff --git a/tests/ir/test_inplace_op.py b/tests/ir/test_inplace_op.py
new file mode 100644
index 000000000000..decc4f51c777
--- /dev/null
+++ b/tests/ir/test_inplace_op.py
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from torch import Tensor
+from torch.fx.experimental.proxy_tensor import make_fx
+
+import vllm.ir.op
+from vllm.ir.op import IrOp, IrOpInplaceOverload
+
+
+@vllm.ir.register_op(allow_inplace=True)
+def _custom_mm2(x: Tensor, w: Tensor) -> Tensor:
+    return x @ w
+
+
+@_custom_mm2.register_impl("regular")
+def _custom_mm2_regular(x: Tensor, w: Tensor) -> Tensor:
+    return x @ w + 1
+
+
+@_custom_mm2.register_impl("inplace", inplace=True)
+def _custom_mm2_inplace(x: Tensor, w: Tensor) -> Tensor:
+    x.copy_(x @ w + 2)
+    return x
+
+
+class TestInplaceOp:
+    def test_registration(self):
+        # Test that the inplace op is registered correctly.
+        assert "_custom_mm2" in IrOp.registry
+        assert IrOp.registry["_custom_mm2"] is _custom_mm2
+        assert _custom_mm2.torch_op is torch.ops.vllm_ir._custom_mm2.default
+        assert isinstance(_custom_mm2.maybe_inplace, IrOpInplaceOverload)
+        assert (
+            _custom_mm2.maybe_inplace.torch_op
+            is torch.ops.vllm_ir._custom_mm2.maybe_inplace
+        )
+
+    def test_inplace_dispatching(self):
+        # check that the correct implementation is dispatched based on priority,
+        # and inplace semantics hold
+        w = torch.randn(3, 3)
+        x = torch.randn(2, 3)
+        x1 = x.clone()
+
+        with _custom_mm2.set_priority(["regular"]):
+            result_regular = _custom_mm2.maybe_inplace(x, w)
+
+        # check that the regular op does not modify x
+        torch.testing.assert_close(x, x1, atol=0, rtol=0)
+
+        with _custom_mm2.set_priority(["inplace"]):
+            result_inplace: Tensor = _custom_mm2.maybe_inplace(x, w)
+
+        # check that the inplace op returns x directly
+        assert result_inplace.data_ptr() == x.data_ptr()
+
+        torch.testing.assert_close(result_inplace, x1 @ w + 2)
+        torch.testing.assert_close(result_regular, x1 @ w + 1)
+
+    def test_default_dispatching(self):
+        # check that the correct implementation is dispatched,
+        # and ops do not modify inputs when using the default overload
+        w = torch.randn(3, 3)
+        x = torch.randn(2, 3)
+        x1 = x.clone()
+
+        with _custom_mm2.set_priority(["regular"]):
+            result_regular = _custom_mm2(x, w)
+
+        with _custom_mm2.set_priority(["inplace"]):
+            result_inplace = _custom_mm2(x, w)
+
+        # check that x was not modified by either impl
+        torch.testing.assert_close(x, x1, atol=0, rtol=0)
+
+        torch.testing.assert_close(result_inplace, x1 @ w + 2)
+        torch.testing.assert_close(result_regular, x1 @ w + 1)
+
+    def test_trace(self):
+        # Test that the inplace op can be used in a graph.
+        def func(x: Tensor, y: Tensor) -> Tensor:
+            return _custom_mm2.maybe_inplace(x, y)
+
+        x = torch.randn(2, 3)
+        y = torch.randn(3, 4)
+        graph = make_fx(func)(x, y)
+        assert any(
+            node.target == torch.ops.vllm_ir._custom_mm2.maybe_inplace
+            for node in graph.graph.nodes
+        )
diff --git a/tests/ir/test_op.py b/tests/ir/test_op.py
index 524497916b6c..3576e5aef8bd 100644
--- a/tests/ir/test_op.py
+++ b/tests/ir/test_op.py
@@ -21,7 +21,7 @@ class CustomError(Exception):
     pass
 
 
-@vllm.ir.register_op
+@vllm.ir.register_op(allow_inplace=True)
 def _custom_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     return x + y
 
@@ -129,11 +129,15 @@ def test_schema_contains_tensor_signature(self):
 
     @pytest.mark.parametrize("enable_torch_wrap", [True, False])
     @pytest.mark.parametrize("symbolic_trace", [True, False])
+    @pytest.mark.parametrize("overload", ["default", "maybe_inplace"])
     def test_trace_sees_single_custom_op(
-        self, symbolic_trace: bool, enable_torch_wrap: bool
+        self, symbolic_trace: bool, enable_torch_wrap: bool, overload: str
     ):
+        op_fn = _custom_add if overload == "default" else _custom_add.maybe_inplace
+        torch_op = getattr(torch.ops.vllm_ir._custom_add, overload)
+
         def fn(x, y):
-            return _custom_add(x, y)
+            return op_fn(x, y)
 
         def find_fn(target: Any, gm: fx.GraphModule):
             return gm.graph.find_nodes(op="call_function", target=target)
@@ -155,7 +159,7 @@ def find_fn(target: Any, gm: fx.GraphModule):
         torch.testing.assert_close(out_fx, out_eager)
 
         # check that IR nodes only appear if enable_torch_wrap=True
-        ir_nodes = find_fn(torch.ops.vllm_ir._custom_add.default, gm)
+        ir_nodes = find_fn(torch_op, gm)
         if enable_torch_wrap:
             assert len(ir_nodes) == 1, gm.code
         else:
@@ -167,7 +171,7 @@ def find_fn(target: Any, gm: fx.GraphModule):
         else:
             gm = make_fx(fn)(torch.randn(2, 2), torch.randn(2, 2))
 
-        ir_nodes = find_fn(torch.ops.vllm_ir._custom_add.default, gm)
+        ir_nodes = find_fn(torch_op, gm)
         assert len(ir_nodes) == 1, gm.code
 
 
@@ -176,9 +180,12 @@ def impl_a(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     return x + y + 10
 
 
-@_custom_add.register_impl("impl_b")
+@_custom_add.register_impl("impl_b", inplace=True)
 def impl_b(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
-    return x + y + 20
+    """Computes x+y+20"""
+    x.add_(y)
+    x.add_(20)
+    return x
 
 
 @_custom_add.register_impl("impl_even", supports_args=lambda x, y: x.size(1) % 2 == 0)
@@ -243,19 +250,23 @@ def test_set_priority_scoped(self):
         # Restored to empty
         assert _custom_add.get_priority() == []
 
-    def test_dispatch_priority_order(self):
+    @pytest.mark.parametrize("overload", ["default", "maybe_inplace"])
+    def test_dispatch_priority_order(self, overload: str):
+        op_fn = _custom_add if overload == "default" else _custom_add.maybe_inplace
+        torch_op = getattr(torch.ops.vllm_ir._custom_add, overload)
+
         x = torch.tensor(1, dtype=torch.int32)
         y = torch.tensor(2, dtype=torch.int32)
 
         with _custom_add.set_priority(["impl_b", "impl_a"]):
             assert _custom_add.dispatch(x, y) is impl_b
-            out1 = _custom_add(x, y)
-            out2 = torch.ops.vllm_ir._custom_add(x, y)
+            out1 = op_fn(x.clone(), y)
+            out2 = torch_op(x.clone(), y)
 
             with _custom_add.set_priority(["impl_a"]):
                 assert _custom_add.dispatch(x, y) is impl_a
-                out3 = _custom_add(x, y)
-                out4 = torch.ops.vllm_ir._custom_add(x, y)
+                out3 = op_fn(x.clone(), y)
+                out4 = torch_op(x.clone(), y)
 
         # impl_b
         assert out1.item() == 1 + 2 + 20
@@ -265,18 +276,18 @@ def test_dispatch_priority_order(self):
         assert out4.item() == 1 + 2 + 10
 
     def test_unsupported_impl_filtered(self):
-        @_custom_add.register_impl("unsupported", supported=False)
-        def impl_bad(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        @_custom_add.register_impl("impl_unsupported", supported=False)
+        def impl_unsupported(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
             return x + y + 999
 
         x = torch.tensor(1, dtype=torch.int32)
         y = torch.tensor(2, dtype=torch.int32)
 
-        with _custom_add.set_priority(["unsupported", "impl_a"]):
+        with _custom_add.set_priority(["impl_unsupported", "impl_a"]):
             assert _custom_add.get_priority() == ["impl_a"]
             out = _custom_add(x, y)
 
-        # impl_bad skipped → impl_a
+        # impl_unsupported skipped → impl_a
         assert out.item() == 1 + 2 + 10
 
     def test_supports_args_runtime_dispatch_and_warning(
diff --git a/tests/kernels/attention/test_cpu_attn.py b/tests/kernels/attention/test_cpu_attn.py
index f7691a90ed8b..ce7eb3088ac0 100644
--- a/tests/kernels/attention/test_cpu_attn.py
+++ b/tests/kernels/attention/test_cpu_attn.py
@@ -20,6 +20,12 @@
     cpu_attn_reshape_and_cache,
 )
 
+# Enable AMX tile data registers so isolated runs (e.g. -k fp8_amx) don't rely
+# on ref_paged_attn's einsum to trigger oneDNN's _init_amx() first.
+if torch.cpu._is_amx_tile_supported():
+    torch.cpu._init_amx()
+
+
 NUM_HEADS = [
     (4, 4),
     (8, 2),
@@ -178,6 +184,10 @@ def ref_paged_attn(
     return torch.cat(outputs, dim=0)
 
 
+_FP8_ATOL = {"fp8_e4m3": 0.2, "fp8_e5m2": 0.3}
+_FP8_RTOL = 0.1
+
+
 @torch.inference_mode()
 def varlen_with_paged_kv(
     seq_lens: list[tuple[int, int]],
@@ -191,6 +201,9 @@ def varlen_with_paged_kv(
     use_alibi: bool,
     use_sink: bool,
     isa: str,
+    kv_cache_dtype: str = "auto",
+    k_scale: float = 1.0,
+    v_scale: float = 1.0,
 ) -> None:
     set_random_seed(0)
     num_seqs = len(seq_lens)
@@ -212,6 +225,10 @@ def varlen_with_paged_kv(
         15 * torch.rand((num_query_heads,), dtype=torch.bfloat16) if use_sink else None
     )
 
+    is_fp8 = kv_cache_dtype != "auto"
+    if is_fp8 and current_platform.get_cpu_architecture() != CpuArchEnum.X86:
+        pytest.skip("FP8 KV cache only supported on x86")
+
     query = tensor_cache(
         elem_num=token_num * num_query_heads * head_size,
         dtype=dtype,
@@ -233,11 +250,17 @@ def varlen_with_paged_kv(
         num_kv_heads,
         head_size,
     )
+    if is_fp8:
+        # Clamp KV to [-1, 1] so FP8 quantization error (<=12.5% for E4M3,
+        # <=25% for E5M2) stays within the test tolerances regardless of
+        # which tensor_cache values happen to be in use.
+        key_value = key_value.clamp(-1, 1)
     key_cache, value_cache = key_value.unbind(0)
 
     # KV cache for CPU attention
+    cache_dtype = torch.uint8 if is_fp8 else dtype
     packed_key_cache = torch.empty(
-        num_blocks, num_kv_heads, block_size, head_size, dtype=dtype
+        num_blocks, num_kv_heads, block_size, head_size, dtype=cache_dtype
     )
     packed_value_cache = torch.empty_like(packed_key_cache)
 
@@ -252,6 +275,11 @@ def varlen_with_paged_kv(
 
     # use reshape_and_cache to pack key_cache and value_cache
     slot_mapping = torch.arange(0, num_blocks * block_size, dtype=torch.int64)
+    fp8_kwargs: dict = (
+        dict(k_scale=k_scale, v_scale=v_scale, kv_cache_dtype=kv_cache_dtype)
+        if is_fp8
+        else {}
+    )
     cpu_attn_reshape_and_cache(
         key=key_cache.view(-1, num_kv_heads, head_size),
         value=value_cache.view(-1, num_kv_heads, head_size),
@@ -259,6 +287,7 @@ def varlen_with_paged_kv(
         value_cache=packed_value_cache,
         slot_mapping=slot_mapping,
         isa=isa,
+        **fp8_kwargs,
     )
 
     metadata = cpu_attn_get_scheduler_metadata(
@@ -291,6 +320,7 @@ def varlen_with_paged_kv(
         softcap=soft_cap if soft_cap is not None else 0,
         scheduler_metadata=metadata,
         s_aux=s_aux,
+        **fp8_kwargs,
     )
 
     metadata = cpu_attn_get_scheduler_metadata(
@@ -323,23 +353,59 @@ def varlen_with_paged_kv(
         softcap=soft_cap if soft_cap is not None else 0,
         scheduler_metadata=metadata,
         s_aux=s_aux,
+        **fp8_kwargs,
     )
 
-    ref_output = ref_paged_attn(
-        query=query,
-        key_cache=key_cache,
-        value_cache=value_cache,
-        query_lens=query_lens,
-        kv_lens=kv_lens,
-        block_tables=block_tables,
-        scale=scale,
-        sliding_window=sliding_window,
-        soft_cap=soft_cap,
-        alibi_slopes=alibi_slopes,
-        s_aux=s_aux,
-    )
+    if is_fp8:
+        # Build a float KV cache via the non-FP8 path and run float attention
+        # to use as the reference.
+        ref_key_cache = torch.empty(
+            num_blocks, num_kv_heads, block_size, head_size, dtype=dtype
+        )
+        ref_value_cache = torch.empty_like(ref_key_cache)
+        cpu_attn_reshape_and_cache(
+            key=key_cache.view(-1, num_kv_heads, head_size),
+            value=value_cache.view(-1, num_kv_heads, head_size),
+            key_cache=ref_key_cache,
+            value_cache=ref_value_cache,
+            slot_mapping=slot_mapping,
+            isa=isa,
+        )
+        ref_output = torch.empty_like(query)
+        cpu_attention_with_kv_cache(
+            query=query,
+            key_cache=ref_key_cache,
+            value_cache=ref_value_cache,
+            output=ref_output,
+            query_start_loc=cu_query_lens,
+            seq_lens=kv_lens_tensor,
+            scale=scale,
+            causal=True,
+            alibi_slopes=alibi_slopes,
+            sliding_window=window_size,
+            block_table=block_tables,
+            softcap=soft_cap if soft_cap is not None else 0,
+            scheduler_metadata=metadata,
+            s_aux=s_aux,
+        )
+        atol = _FP8_ATOL[kv_cache_dtype]
+        rtol = _FP8_RTOL
+    else:
+        ref_output = ref_paged_attn(
+            query=query,
+            key_cache=key_cache,
+            value_cache=value_cache,
+            query_lens=query_lens,
+            kv_lens=kv_lens,
+            block_tables=block_tables,
+            scale=scale,
+            sliding_window=sliding_window,
+            soft_cap=soft_cap,
+            alibi_slopes=alibi_slopes,
+            s_aux=s_aux,
+        )
+        atol, rtol = 1.5e-2, 1e-2
 
-    atol, rtol = 1.5e-2, 1e-2
     (
         torch.testing.assert_close(out_with_split, ref_output, atol=atol, rtol=rtol),
         f"{torch.max(torch.abs(out_with_split - ref_output))}",
@@ -350,6 +416,7 @@ def varlen_with_paged_kv(
     )
 
 
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8_e4m3", "fp8_e5m2"])
 @pytest.mark.parametrize("seq_lens", SEQ_LENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
@@ -373,6 +440,7 @@ def test_varlen_with_paged_kv_normal_vec(
     use_alibi: bool,
     use_sink: bool,
     isa: str,
+    kv_cache_dtype: str,
 ) -> None:
     varlen_with_paged_kv(
         seq_lens=seq_lens,
@@ -386,9 +454,11 @@ def test_varlen_with_paged_kv_normal_vec(
         use_alibi=use_alibi,
         use_sink=use_sink,
         isa=isa,
+        kv_cache_dtype=kv_cache_dtype,
     )
 
 
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8_e4m3", "fp8_e5m2"])
 @pytest.mark.parametrize("seq_lens", SEQ_LENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
@@ -413,6 +483,7 @@ def test_varlen_with_paged_kv_normal_amx(
     use_alibi: bool,
     use_sink: bool,
     isa: str,
+    kv_cache_dtype: str,
 ) -> None:
     varlen_with_paged_kv(
         seq_lens=seq_lens,
@@ -426,6 +497,7 @@ def test_varlen_with_paged_kv_normal_amx(
         use_alibi=use_alibi,
         use_sink=use_sink,
         isa=isa,
+        kv_cache_dtype=kv_cache_dtype,
     )
 
 
@@ -511,6 +583,7 @@ def test_varlen_with_paged_kv_normal_neon(
     )
 
 
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8_e4m3"])
 @pytest.mark.parametrize("seq_lens", SEQ_LENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", [96])
@@ -534,6 +607,7 @@ def test_varlen_with_paged_kv_softcap(
     use_alibi: bool,
     use_sink: bool,
     isa: str,
+    kv_cache_dtype: str,
 ) -> None:
     varlen_with_paged_kv(
         seq_lens=seq_lens,
@@ -547,9 +621,11 @@ def test_varlen_with_paged_kv_softcap(
         use_alibi=use_alibi,
         use_sink=use_sink,
         isa=isa,
+        kv_cache_dtype=kv_cache_dtype,
     )
 
 
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8_e4m3"])
 @pytest.mark.parametrize("seq_lens", SEQ_LENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", [96])
@@ -573,6 +649,7 @@ def test_varlen_with_paged_kv_alibi(
     use_alibi: bool,
     use_sink: bool,
     isa: str,
+    kv_cache_dtype: str,
 ) -> None:
     varlen_with_paged_kv(
         seq_lens=seq_lens,
@@ -586,9 +663,11 @@ def test_varlen_with_paged_kv_alibi(
         use_alibi=use_alibi,
         use_sink=use_sink,
         isa=isa,
+        kv_cache_dtype=kv_cache_dtype,
     )
 
 
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8_e4m3"])
 @pytest.mark.parametrize("seq_lens", SEQ_LENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", [96])
@@ -612,6 +691,7 @@ def test_varlen_with_paged_kv_sink(
     use_alibi: bool,
     use_sink: bool,
     isa: str,
+    kv_cache_dtype: str,
 ) -> None:
     varlen_with_paged_kv(
         seq_lens=seq_lens,
@@ -625,4 +705,5 @@ def test_varlen_with_paged_kv_sink(
         use_alibi=use_alibi,
         use_sink=use_sink,
         isa=isa,
+        kv_cache_dtype=kv_cache_dtype,
     )
diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
index b5f8584015be..87a12c2ff395 100644
--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -5,12 +5,17 @@
 import torch
 
 from tests.kernels.quantization.nvfp4_utils import (
+    dequant_nvfp4_kv_cache,
     dequantize_nvfp4_to_dtype,
     get_nvfp4_global_scale,
 )
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import round_up
-from vllm.utils.torch_utils import set_random_seed
+from vllm.utils.torch_utils import (
+    nvfp4_kv_cache_full_dim,
+    nvfp4_kv_cache_split_views,
+    set_random_seed,
+)
 
 if not current_platform.is_device_capability_family(100):
     pytest.skip(
@@ -33,6 +38,117 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
     return x_scl_sat.to(dtype), scale.float().reciprocal()
 
 
+def build_paged_kv_metadata(
+    seq_lens: torch.Tensor,
+    block_tables: torch.Tensor,
+    block_size: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Build paged-KV indptr/indices/last_page_lens from seq_lens + block_tables."""
+    kv_indptr = [0]
+    kv_indices = []
+    kv_last_page_lens = []
+    for i in range(len(seq_lens)):
+        sl = int(seq_lens[i])
+        assert sl > 0
+        nb = (sl + block_size - 1) // block_size
+        kv_indices.extend(block_tables[i, :nb].tolist())
+        kv_indptr.append(kv_indptr[-1] + nb)
+        kv_last_page_lens.append(sl % block_size or block_size)
+    return (
+        torch.tensor(kv_indptr, dtype=torch.int32),
+        torch.tensor(kv_indices, dtype=torch.int32),
+        torch.tensor(kv_last_page_lens, dtype=torch.int32),
+    )
+
+
+def make_nvfp4_kv_cache(
+    kv_bf16_hnd: torch.Tensor, block_size: int, head_size: int
+) -> tuple:
+    """Quantize bf16 KV cache to nvfp4 via reshape_and_cache_flash.
+
+    Returns (k_data, v_data), (k_scales, v_scales), kv_scale, ref_kv_bf16.
+    """
+    num_blocks, _, num_kv_heads, _, _ = kv_bf16_hnd.shape
+    kv_scale_val = (kv_bf16_hnd.abs().amax() / 448.0).item()
+    kv_scale_tensor = torch.tensor(
+        kv_scale_val, dtype=torch.float32, device=kv_bf16_hnd.device
+    )
+
+    # Allocate in HND physical order, permute to NHD logical order.
+    # hnd_order swaps dims 2↔3; it is its own inverse.
+    full_dim = nvfp4_kv_cache_full_dim(head_size)
+    hnd_order = (0, 1, 3, 2, 4)
+    kv_cache = torch.zeros(
+        (num_blocks, 2, num_kv_heads, block_size, full_dim),
+        dtype=torch.uint8,
+        device=kv_bf16_hnd.device,
+    ).permute(*hnd_order)
+
+    # Flatten NHD [N, T, H, D] → token tensors [N*T, H, D] for the kernel.
+    num_tokens = num_blocks * block_size
+    k_tokens = (
+        kv_bf16_hnd[:, 0]
+        .permute(0, 2, 1, 3)
+        .reshape(num_tokens, num_kv_heads, head_size)
+    )
+    v_tokens = (
+        kv_bf16_hnd[:, 1]
+        .permute(0, 2, 1, 3)
+        .reshape(num_tokens, num_kv_heads, head_size)
+    )
+    slot_mapping = torch.arange(num_tokens, dtype=torch.long, device=kv_bf16_hnd.device)
+
+    # reshape_and_cache_flash: kernel receives kv_cache[:, 0] and [:, 1]
+    # (full K/V buffers containing both data and scale).
+    torch.ops._C_cache_ops.reshape_and_cache_flash(
+        k_tokens,
+        v_tokens,
+        kv_cache[:, 0],
+        kv_cache[:, 1],
+        slot_mapping,
+        "nvfp4",
+        kv_scale_tensor,
+        kv_scale_tensor,
+    )
+
+    # Split in HND order for trtllm kernel (expects HND numTokensPerPage).
+    kv_cache_hnd = kv_cache.permute(*hnd_order)
+    (k_data, v_data), (k_scales, v_scales) = nvfp4_kv_cache_split_views(kv_cache_hnd)
+
+    # Dequantize for the FA2 reference baseline.
+    ref_k = dequant_nvfp4_kv_cache(
+        k_data, k_scales, kv_scale_val, head_size, block_size
+    ).to(torch.bfloat16)
+    ref_v = dequant_nvfp4_kv_cache(
+        v_data, v_scales, kv_scale_val, head_size, block_size
+    ).to(torch.bfloat16)
+    ref_kv_bf16 = torch.stack([ref_k, ref_v], dim=1)  # [N, 2, H, T, D]
+
+    return (k_data, v_data), (k_scales, v_scales), kv_scale_val, ref_kv_bf16
+
+
+def make_quantized_kv_cache(
+    kv_cache: torch.Tensor,
+    kv_quant_dtype: torch.dtype,
+    block_size: int,
+    head_size: int,
+) -> tuple:
+    """Quantize kv_cache based on dtype. Returns (kv_cache, kv_cache_sf,
+    kv_scale, ref_kv_cache, is_nvfp4_kv)."""
+    is_nvfp4_kv = kv_quant_dtype == FP4_DTYPE
+    if is_nvfp4_kv:
+        data, scales, kv_scale, ref = make_nvfp4_kv_cache(
+            kv_cache, block_size, head_size
+        )
+        return data, scales, kv_scale, ref, True
+    elif kv_quant_dtype == FP8_DTYPE:
+        kv_fp8, kv_scale = to_float8(kv_cache)
+        ref = kv_fp8.to(kv_cache.dtype) * kv_scale
+        return kv_fp8, None, kv_scale, ref, False
+    else:
+        return kv_cache, None, 1.0, kv_cache, False
+
+
 DTYPE = [torch.bfloat16]
 QUANT_DTYPES = [
     # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
@@ -41,6 +157,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
     (FP8_DTYPE, FP8_DTYPE, None),
     (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
     (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
+    (FP8_DTYPE, FP4_DTYPE, FP8_DTYPE),  # nvfp4 KV cache
 ]
 BATCH_SIZE = [4, 12]
 MAX_SEQ_LENS = [(1024, 4096)]
@@ -127,35 +244,19 @@ def test_flashinfer_trtllm_decode_with_baseline(
     max_seq_len = torch.max(seq_lens).item()
 
     kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
-    if kv_quant_dtype == FP8_DTYPE:
-        kv_cache, kv_scale = to_float8(kv_cache)
-        ref_kv_cache = kv_cache.to(dtype) * kv_scale
-    else:
-        kv_scale = 1.0
-        ref_kv_cache = kv_cache
+    kv_cache, kv_cache_sf, kv_scale, ref_kv_cache, is_nvfp4_kv = (
+        make_quantized_kv_cache(kv_cache, kv_quant_dtype, block_size, head_size)
+    )
+
     k_scale = v_scale = kv_scale
 
     max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
     block_tables = torch.randint(
         0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32
     )
-    kv_indptr = [0]
-    kv_indices = []
-    kv_last_page_lens = []
-    for i in range(batch_size):
-        seq_len = seq_lens[i]
-        assert seq_len > 0
-        num_blocks = (seq_len + block_size - 1) // block_size
-        kv_indices.extend(block_tables[i, :num_blocks])
-        kv_indptr.append(kv_indptr[-1] + num_blocks)
-        kv_last_page_len = seq_len % block_size
-        if kv_last_page_len == 0:
-            kv_last_page_len = block_size
-        kv_last_page_lens.append(kv_last_page_len)
-
-    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
-    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
-    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+    kv_indptr, kv_indices, kv_last_page_lens = build_paged_kv_metadata(
+        seq_lens, block_tables, block_size
+    )
     workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8)
 
     # Baseline Decode
@@ -225,6 +326,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
         sinks=sinks,
         o_sf_scale=o_sf_scale_float,
         out=output_trtllm,
+        kv_cache_sf=kv_cache_sf,
     )
     if o_quant_dtype == FP8_DTYPE:
         output_trtllm = output_trtllm.to(dtype) * o_scale
@@ -237,7 +339,9 @@ def test_flashinfer_trtllm_decode_with_baseline(
         )
         output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2])
 
-    if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
+    if is_nvfp4_kv:
+        rtol, atol = 1.0, 1.0  # nvfp4 has higher quantization error
+    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
         rtol, atol = 7e-2, 9e-2
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
         rtol, atol = 3e-2, 4e-2
@@ -287,7 +391,12 @@ def test_flashinfer_trtllm_prefill_with_baseline(
     kv_quant_dtype = kv_quant_dtype or dtype
     o_quant_dtype = o_quant_dtype or dtype
 
-    if q_quant_dtype != kv_quant_dtype:
+    # FP8 Q + nvfp4 KV is the required combination for the nvfp4 KV path.
+    # All other mixed Q/KV dtype combinations are unsupported.
+    is_nvfp4_kv = kv_quant_dtype == FP4_DTYPE
+    if q_quant_dtype != kv_quant_dtype and not (
+        q_quant_dtype == FP8_DTYPE and is_nvfp4_kv
+    ):
         pytest.skip("Skipped mixed QKV dtypes for prefill")
 
     max_q_len, max_kv_len = max_seq_lens
@@ -329,35 +438,19 @@ def test_flashinfer_trtllm_prefill_with_baseline(
     max_seq_len = torch.max(seq_lens).item()
 
     kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
-    if kv_quant_dtype == FP8_DTYPE:
-        kv_cache, kv_scale = to_float8(kv_cache)
-        ref_kv_cache = kv_cache.to(dtype) * kv_scale
-    else:
-        kv_scale = 1.0
-        ref_kv_cache = kv_cache
+    kv_cache, kv_cache_sf, kv_scale, ref_kv_cache, is_nvfp4_kv = (
+        make_quantized_kv_cache(kv_cache, kv_quant_dtype, block_size, head_size)
+    )
+
     k_scale = v_scale = kv_scale
 
     max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
     block_tables = torch.randint(
         0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32
     )
-    kv_indptr = [0]
-    kv_indices = []
-    kv_last_page_lens = []
-    for i in range(batch_size):
-        seq_len = seq_lens[i]
-        assert seq_len > 0
-        num_blocks = (seq_len + block_size - 1) // block_size
-        kv_indices.extend(block_tables[i, :num_blocks])
-        kv_indptr.append(kv_indptr[-1] + num_blocks)
-        kv_last_page_len = seq_len % block_size
-        if kv_last_page_len == 0:
-            kv_last_page_len = block_size
-        kv_last_page_lens.append(kv_last_page_len)
-
-    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
-    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
-    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+    kv_indptr, kv_indices, kv_last_page_lens = build_paged_kv_metadata(
+        seq_lens, block_tables, block_size
+    )
     workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8)
 
     # Baseline Prefill
@@ -431,6 +524,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
         sinks=sinks,
         o_sf_scale=o_sf_scale_float,
         out=output_trtllm,
+        kv_cache_sf=kv_cache_sf,
     )
     if o_quant_dtype == FP8_DTYPE:
         output_trtllm = output_trtllm.to(dtype) * o_scale
@@ -443,7 +537,9 @@ def test_flashinfer_trtllm_prefill_with_baseline(
         )
         output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2])
 
-    if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
+    if is_nvfp4_kv:
+        rtol, atol = 1.0, 1.5  # nvfp4 has higher quantization error
+    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
         rtol, atol = 3e-1, 4e-1
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
         rtol, atol = 4e-2, 6e-2
diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py
index 6cdd94fdc865..1cbb5dbd1881 100644
--- a/tests/kernels/core/test_rotary_embedding.py
+++ b/tests/kernels/core/test_rotary_embedding.py
@@ -35,6 +35,7 @@ def rotary_embedding_opcheck(
 @pytest.mark.parametrize("seq_len", [11, 1024])
 @pytest.mark.parametrize("use_key", [True, False])
 @pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
 def test_rotary_embedding_opcheck(
     default_vllm_config,
     dist_init,
@@ -46,19 +47,20 @@ def test_rotary_embedding_opcheck(
     seq_len,
     use_key,
     head_stride_is_contiguous,
+    dtype,
 ):
     batch_size = 1
     base = 10000
     num_heads = 7
     rot = RotaryEmbedding(
-        head_size, rotary_dim, max_position, base, is_neox_style, torch.float32
+        head_size, rotary_dim, max_position, base, is_neox_style, dtype
     )
 
     positions = torch.randint(0, max_position, (batch_size, seq_len), device=device)
     head_stride = head_size + (64 if head_stride_is_contiguous else 0)
 
     query = torch.randn(
-        batch_size, seq_len, num_heads, head_stride, dtype=torch.float32, device=device
+        batch_size, seq_len, num_heads, head_stride, dtype=dtype, device=device
     )
     key = torch.randn_like(query) if use_key else None
     query = query[..., :head_size]
diff --git a/tests/kernels/ir/test_layernorm.py b/tests/kernels/ir/test_layernorm.py
index 7510ae5010fa..e9661f5202f9 100644
--- a/tests/kernels/ir/test_layernorm.py
+++ b/tests/kernels/ir/test_layernorm.py
@@ -28,7 +28,9 @@ def test_rms_norm_registration():
         "native": True,
         "vllm_c": current_platform.is_cuda_alike(),
         "aiter": current_platform.is_rocm(),
-        "oink": False,
+        "oink": current_platform.has_device_capability(100)
+        and hasattr(torch.ops, "oink")
+        and hasattr(torch.ops.oink, "rmsnorm"),
         "xpu_kernels": current_platform.is_xpu(),
     }
 
@@ -67,6 +69,14 @@ def test_native_semantics(self, dtype, n_tokens, hidden_size, epsilon):
         out2 = rms_norm_native(x * 2.0, weight, epsilon=epsilon)
         torch.testing.assert_close(out2, out, rtol=get_default_rtol(out), atol=1e-3)
 
+        # Mean square should be approximately 1 (ignoring epsilon and weight scaling)
+        combined_norm = out.float() / weight.float()
+        variance = combined_norm.pow(2).mean(dim=-1)
+        # After RMS normalization, variance should be close to 1
+        torch.testing.assert_close(
+            variance, torch.ones_like(variance), rtol=1e-2, atol=1e-2
+        )
+
         # Check behavior with and without weight
         weight1 = torch.ones_like(weight)
         out3 = rms_norm_native(x, weight1, epsilon=epsilon)
@@ -129,3 +139,197 @@ def test_aiter_rejects_unsupported_dtypes():
             num_tokens=8, hidden_size=4096, dtype=dtype, epsilon=1e-5
         )
         assert not impl.supports_args(*args), f"aiter should reject dtype={dtype}"
+
+
+fused_add_rms_norm_native = ir.ops.fused_add_rms_norm.impls["native"].impl_fn
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike() and not current_platform.is_xpu(),
+    reason="Currently only kernels on CUDA, ROCm and XPU",
+)
+def test_fused_add_rms_norm_registration():
+    expected = {
+        "native": True,
+        "vllm_c": current_platform.is_cuda_alike(),
+        "aiter": current_platform.is_rocm(),
+        "oink": current_platform.has_device_capability(100)
+        and hasattr(torch.ops, "oink")
+        and hasattr(torch.ops.oink, "fused_add_rms_norm"),
+        "xpu_kernels": current_platform.is_xpu(),
+    }
+
+    actual = {
+        provider: impl.supported
+        for provider, impl in ir.ops.fused_add_rms_norm.impls.items()
+    }
+
+    assert actual == expected
+
+
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
+@pytest.mark.parametrize("n_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", COMMON_HIDDEN_SIZES)
+@pytest.mark.parametrize("epsilon", [1e-6, 1e-5])
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike() and not current_platform.is_xpu(),
+    reason="Currently only kernels on CUDA, ROCm and XPU",
+)
+class TestFusedAddRMSNorm:
+    @classmethod
+    def setup_class(cls, **kwargs):
+        torch.set_default_device(current_platform.device_type)
+
+    def test_native_semantics(self, dtype, n_tokens, hidden_size, epsilon):
+        x, x_residual, weight, eps = ir.ops.fused_add_rms_norm.generate_inputs(
+            num_tokens=4, hidden_size=8, dtype=dtype, epsilon=epsilon
+        )
+        out, residual_out = fused_add_rms_norm_native(x, x_residual, weight, eps)
+
+        # Check shape, dtype, device
+        assert out.shape == x.shape
+        assert out.dtype == x.dtype
+        assert out.device == x.device
+        assert residual_out.shape == x_residual.shape
+        assert residual_out.dtype == x_residual.dtype
+        assert residual_out.device == x_residual.device
+
+        # Check that residual_out = x + x_residual
+        expected_residual = (x.float() + x_residual.float()).to(dtype)
+        torch.testing.assert_close(
+            residual_out, expected_residual, rtol=1e-3, atol=1e-3
+        )
+
+        # Verify that the output is RMS normalized version of (x + x_residual)
+        expected_out = rms_norm_native(expected_residual, weight, epsilon)
+        assert_close(
+            ir.ops.fused_add_rms_norm,
+            (out, residual_out),
+            (expected_out, expected_residual),
+        )
+
+        # Check the scaling property of rms norm
+        out1, _ = fused_add_rms_norm_native(
+            x, torch.zeros_like(x), weight, epsilon=epsilon
+        )
+        out2, _ = fused_add_rms_norm_native(
+            x * 2.0, torch.zeros_like(x), weight, epsilon=epsilon
+        )
+        torch.testing.assert_close(out2, out1, rtol=get_default_rtol(out), atol=1e-3)
+
+        # Check behavior with and without weight
+        weight1 = torch.ones_like(weight)
+        out3, _ = fused_add_rms_norm_native(x, x_residual, weight1, eps)
+        out4, _ = fused_add_rms_norm_native(x, x_residual, None, eps)
+        torch.testing.assert_close(out3, out4)
+
+    @pytest.mark.parametrize("provider", supported_providers(ir.ops.fused_add_rms_norm))
+    def test_impls(self, dtype, n_tokens, hidden_size, epsilon, provider):
+        impl = ir.ops.fused_add_rms_norm.impls[provider]
+        x, x_residual, weight, eps = ir.ops.fused_add_rms_norm.generate_inputs(
+            num_tokens=n_tokens, hidden_size=hidden_size, dtype=dtype, epsilon=epsilon
+        )
+        args = (x, x_residual, weight, eps, None)
+
+        if not impl.supports_args(*args):
+            pytest.skip(f"{provider} does not support args")
+
+        ref_output, ref_residual = fused_add_rms_norm_native(*clone_args(args))
+        output, residual = impl.impl_fn(*clone_args(args))
+        assert_close(ir.ops.fused_add_rms_norm, output, ref_output)
+        assert_close(ir.ops.fused_add_rms_norm, residual, ref_residual)
+
+        # check that dispatched call matches direct call
+        with ir.ops.fused_add_rms_norm.set_priority([provider, "native"]):
+            out_dispatched, residual_dispatched = ir.ops.fused_add_rms_norm(*args[:4])
+        out_direct, residual_direct = impl.impl_fn(*clone_args(args))
+        torch.testing.assert_close(out_dispatched, out_direct, rtol=0.0, atol=0.0)
+        torch.testing.assert_close(
+            residual_dispatched, residual_direct, rtol=0.0, atol=0.0
+        )
+
+        # none of these support variance_size override
+        assert not impl.supports_args(x, x_residual, weight, epsilon, 4)
+        assert not impl.supports_args(x, x_residual, weight, epsilon, variance_size=4)
+
+        # test weight=None behavior
+        out_no_weight, residual_no_weight = impl.impl_fn(
+            x.clone(), x_residual.clone(), None, epsilon
+        )
+        out_unit_weight, residual_unit_weight = impl.impl_fn(
+            x.clone(), x_residual.clone(), torch.ones_like(weight), epsilon
+        )
+        assert_close(ir.ops.fused_add_rms_norm, out_no_weight, out_unit_weight)
+        assert_close(
+            ir.ops.fused_add_rms_norm, residual_no_weight, residual_unit_weight
+        )
+
+    @pytest.mark.parametrize("provider", ["vllm_c"])
+    def test_inplace_semantics(self, dtype, n_tokens, hidden_size, epsilon, provider):
+        """Test that inplace implementations reuse inputs,
+        for maybe_inplace overload but not for default overload."""
+        impl = ir.ops.fused_add_rms_norm.impls[provider]
+        if not impl.supported:
+            pytest.skip(f"{provider} impl not supported on this platform")
+
+        x, x_residual, weight, eps = ir.ops.fused_add_rms_norm.generate_inputs(
+            num_tokens=n_tokens, hidden_size=hidden_size, dtype=dtype, epsilon=epsilon
+        )
+
+        # Test default overload - should NOT modify inputs even with inplace impl
+        x_default = x.clone()
+        x_residual_default = x_residual.clone()
+        x_default_ptr = x_default.data_ptr()
+        x_residual_default_ptr = x_residual_default.data_ptr()
+
+        with ir.ops.fused_add_rms_norm.set_priority([provider, "native"]):
+            out_default, residual_default = ir.ops.fused_add_rms_norm(
+                x_default, x_residual_default, weight, eps
+            )
+
+        # Default should NOT be inplace (even with inplace implementation)
+        assert out_default.data_ptr() != x_default_ptr
+        assert residual_default.data_ptr() != x_residual_default_ptr
+        torch.testing.assert_close(x, x_default, rtol=0.0, atol=0.0)
+        torch.testing.assert_close(x_residual, x_residual_default, rtol=0.0, atol=0.0)
+
+        # Test maybe_inplace overload - should modify inputs with inplace impl
+        x_inplace = x.clone()
+        x_residual_inplace = x_residual.clone()
+        x_inplace_ptr = x_inplace.data_ptr()
+        x_residual_inplace_ptr = x_residual_inplace.data_ptr()
+
+        with ir.ops.fused_add_rms_norm.set_priority([provider, "native"]):
+            out_inplace, residual_inplace = ir.ops.fused_add_rms_norm.maybe_inplace(
+                x_inplace, x_residual_inplace, weight, eps
+            )
+
+        # maybe_inplace should be inplace
+        assert out_inplace.data_ptr() == x_inplace_ptr
+        assert residual_inplace.data_ptr() == x_residual_inplace_ptr
+
+        # Both should produce same results
+        torch.testing.assert_close(out_default, out_inplace, atol=0.0, rtol=0.0)
+        torch.testing.assert_close(
+            residual_default, residual_inplace, atol=0.0, rtol=0.0
+        )
+
+    @pytest.mark.parametrize("provider", supported_providers(ir.ops.fused_add_rms_norm))
+    def test_torch_opcheck(self, dtype, n_tokens, hidden_size, epsilon, provider):
+        args = ir.ops.fused_add_rms_norm.generate_inputs(
+            num_tokens=n_tokens, hidden_size=hidden_size, dtype=dtype, epsilon=epsilon
+        )
+        args = args + (None,)  # Add variance_size parameter
+
+        # When checking the torch op, we have to set priority and use dispatch
+        with ir.ops.fused_add_rms_norm.set_priority([provider, "native"]):
+            torch.library.opcheck(torch.ops.vllm_ir.fused_add_rms_norm.default, args)
+
+            # Only test maybe_inplace with non-inplace implementations
+            # Inplace implementations return aliases of inputs which is not allowed.
+            # We break this invariant, but we also convert maybe_inplace to the default
+            # overload during compilation, so maybe_inplace never reaches Inductor.
+            if not ir.ops.fused_add_rms_norm.impls[provider].inplace:
+                torch.library.opcheck(
+                    torch.ops.vllm_ir.fused_add_rms_norm.maybe_inplace, args
+                )
diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py
index aefc35324d86..8ed7757f6553 100644
--- a/tests/kernels/moe/test_ocp_mx_moe.py
+++ b/tests/kernels/moe/test_ocp_mx_moe.py
@@ -28,6 +28,25 @@
     and has_flashinfer()
 )
 
+# ROCm platform and dependencies
+ROCM_AVAILABLE = current_platform.is_rocm()
+ROCM_TRITON_KERNELS_AVAILABLE = False
+ROCM_AITER_AVAILABLE = False
+ROCM_GFX950 = False
+
+if ROCM_AVAILABLE:
+    from vllm._aiter_ops import rocm_aiter_ops
+    from vllm.platforms.rocm import on_gfx950
+    from vllm.utils.import_utils import has_triton_kernels
+
+    ROCM_TRITON_KERNELS_AVAILABLE = has_triton_kernels()
+    ROCM_GFX950 = on_gfx950()
+    ROCM_AITER_AVAILABLE = rocm_aiter_ops.is_enabled()
+
+    if ROCM_AITER_AVAILABLE:
+        from aiter.ops.triton.moe.quant_moe import upcast_from_mxfp
+        from aiter.ops.triton.quant import dynamic_mxfp4_quant
+
 if TRTLLM_GEN_MXFP4_AVAILABLE:
     from flashinfer import (
         fp4_quantize,
@@ -111,6 +130,7 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
 
 def swiglu(x, alpha: float = 1.702, beta: float = 1.0, limit: float | None = None):
     # Note we add an extra bias of 1 to the linear layer
+    # Uses chunked layout: first half is gate, second half is up
     x_glu, x_linear = torch.chunk(x, 2, dim=-1)
     if limit is not None:
         x_glu = x_glu.clamp(max=limit)
@@ -119,6 +139,16 @@ def swiglu(x, alpha: float = 1.702, beta: float = 1.0, limit: float | None = Non
     return out_glu * (x_linear + beta)
 
 
+def swigluoai(x, alpha: float = 1.702, limit: float = 7.0):
+    # OAI swiglu uses interleaved layout: gate/up alternating
+    # See SwigluOAIAndMul in vllm/model_executor/layers/activation.py
+    gate, up = x[..., ::2], x[..., 1::2]
+    gate = gate.clamp(max=limit)
+    up = up.clamp(min=-limit, max=limit)
+    glu = gate * torch.sigmoid(gate * alpha)
+    return (up + 1) * glu
+
+
 fp4_lookup_table = [0, 0.5, 1, 1.5, 2, 3, 4, 6, -0, -0.5, -1, -1.5, -2, -3, -4, -6]
 
 
@@ -168,8 +198,20 @@ def reference_moe(
     beta,
     limit,
     act_type,
-    is_gated,
+    activation: str = "swiglu",
+    use_interleaved_layout: bool = False,
 ):
+    """
+    Reference MoE implementation for accuracy testing.
+
+    Args:
+        activation: One of "swiglu", "silu", "relu2". Controls the activation
+            function used after the first MLP.
+        use_interleaved_layout: If True, uses interleaved gate/up layout
+            (gate=x[..., ::2], up=x[..., 1::2]) as used by SWIGLUOAI.
+            If False, uses chunked layout (gate, up = chunk(x, 2)) as used
+            by standard swiglu/silu.
+    """
     # renormalize routing
     experts = torch.topk(roouting_logits, k=topk, dim=-1, sorted=True)
     expert_weights = torch.nn.functional.softmax(experts.values, dim=1)
@@ -179,12 +221,21 @@ def reference_moe(
     mlp1_weight = w13[expert_indices, ...]
     mlp1_bias = bias13[expert_indices, ...]
     t = torch.einsum("beck,bk->bec", mlp1_weight, t) + mlp1_bias
-    if is_gated:
-        t = swiglu(t, alpha=alpha, beta=beta, limit=limit)
-    else:
+
+    # Apply activation
+    if activation in ("swiglu", "silu"):
+        if use_interleaved_layout:
+            # SWIGLUOAI: interleaved gate/up layout
+            t = swigluoai(t, alpha=alpha, limit=limit)
+        else:
+            # Standard swiglu/silu: chunked layout
+            t = swiglu(t, alpha=alpha, beta=beta, limit=limit)
+    elif activation == "relu2":
         # RELU2_NO_MUL: relu(x)^2
         t = torch.relu(t)
         t = t * t
+    else:
+        raise ValueError(f"Unknown activation: {activation}")
 
     if act_type == "mxfp8":
         t_quantized, t_scale = mxfp8_quantize(
@@ -585,7 +636,8 @@ def test_trtllm_gen_mxfp4_fused_moe(
             beta,
             limit,
             act_type,
-            is_gated=True,
+            activation="swiglu",
+            use_interleaved_layout=False,
         )
         ref_result[start_idx:end_idx].copy_(chunk_result)
 
@@ -722,7 +774,8 @@ def test_flashinfer_cutlass_mxfp4_fused_moe(
         beta,
         limit,
         "bf16",
-        is_gated=True,
+        activation="swiglu",
+        use_interleaved_layout=False,
     )
 
     from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
@@ -908,7 +961,8 @@ def dequant_mxfp4_batches(mat_fp4: torch.Tensor, scale_tensor: torch.Tensor):
         beta,
         limit,
         "mxfp8",
-        is_gated=True,
+        activation="swiglu",
+        use_interleaved_layout=False,
     )
 
     # Prepare inputs for FlashInfer CUTLASS fused MoE
@@ -1080,7 +1134,8 @@ def test_trtllm_gen_mxfp8_block_scale_moe(
         beta=0.0,
         limit=None,
         act_type="mxfp8",
-        is_gated=is_gated,
+        activation="swiglu" if is_gated else "relu2",
+        use_interleaved_layout=False,
     )
 
     # Shuffle weights/scales with the same indexed layout used by TRTLLM kernels.
@@ -1150,3 +1205,328 @@ def test_trtllm_gen_mxfp8_block_scale_moe(
 
     # Block-scale MXFP8 kernels are approximate; require majority close.
     check_accuracy(ref, out, atol=0.1, rtol=0.85, percent=0.8)
+
+
+# -----------------------------------------------------------------------------
+# ROCm Oracle-based kernel execution tests
+# -----------------------------------------------------------------------------
+# TODO: Further tighten the accuracy threshold.
+# - More accurate ref moe to include activation quantization
+# - Check aiter kernel accuracy. E.g., quant / dequant details.
+ROCM_BACKEND_CONFIGS = {
+    "TRITON": {
+        "activation": "SWIGLUOAI",
+        "rtol": 0.3,
+        "percent": 0.95,
+        "requires_aiter": False,
+        "requires_gfx950": False,
+    },
+    "TRITON_UNFUSED": {
+        "activation": "SWIGLUOAI",
+        "rtol": 0.3,
+        "percent": 0.95,
+        "requires_aiter": False,
+        "requires_gfx950": False,
+    },
+    "AITER_MXFP4_BF16": {
+        "activation": "SILU",
+        "rtol": 1.0,
+        "percent": 0.7,
+        "requires_aiter": True,
+        "requires_gfx950": True,
+    },
+    "AITER_MXFP4_FP8": {
+        "activation": "SWIGLUOAI",
+        "rtol": 0.5,
+        "percent": 0.9,
+        "requires_aiter": True,
+        "requires_gfx950": True,
+    },
+}
+
+
+@pytest.mark.parametrize("backend_name", list(ROCM_BACKEND_CONFIGS.keys()))
+@pytest.mark.parametrize("topk", [4])
+@pytest.mark.parametrize("num_experts", [8])
+@pytest.mark.parametrize("num_tokens,hidden_size,intermediate_size", [(16, 256, 256)])
+@pytest.mark.skipif(
+    not ROCM_AVAILABLE,
+    reason="ROCm is required for this test",
+)
+@torch.inference_mode()
+def test_rocm_mxfp4_moe_oracle(
+    backend_name: str,
+    topk: int,
+    num_experts: int,
+    num_tokens: int,
+    hidden_size: int,
+    intermediate_size: int,
+):
+    """
+    Test ROCm MXFP4 MoE using oracle functions.
+
+    This test validates that the oracle functions work end-to-end:
+    - select_mxfp4_moe_backend() selects a valid backend
+    - convert_to_mxfp4_moe_kernel_format() converts weights without error
+    - make_mxfp4_moe_quant_config() builds a valid quant config
+    - make_mxfp4_moe_kernel() creates a kernel that runs without error
+    - The kernel output is within accuracy tolerance of reference
+    """
+    config = ROCM_BACKEND_CONFIGS[backend_name]
+
+    # Check platform requirements
+    if not ROCM_TRITON_KERNELS_AVAILABLE:
+        pytest.skip("triton_kernels required for quantization")
+    if config["requires_aiter"] and not ROCM_AITER_AVAILABLE:
+        pytest.skip(f"Backend {backend_name} requires AITER")
+    if config["requires_gfx950"] and not ROCM_GFX950:
+        pytest.skip(f"Backend {backend_name} requires GFX950")
+
+    from vllm.config import VllmConfig, set_current_vllm_config
+    from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+    from vllm.model_executor.layers.fused_moe.oracle.mxfp4 import (
+        Mxfp4MoeBackend,
+        backend_to_kernel_cls,
+        convert_to_mxfp4_moe_kernel_format,
+        make_mxfp4_moe_kernel,
+        make_mxfp4_moe_quant_config,
+    )
+    from vllm.v1.worker.workspace import init_workspace_manager
+
+    # Initialize workspace manager (needed for modular kernels)
+    init_workspace_manager(torch.accelerator.current_device_index())
+
+    # Map string to enum
+    backend = Mxfp4MoeBackend[backend_name]
+
+    # Get experts class from oracle
+    experts_cls_list = backend_to_kernel_cls(backend)
+    if experts_cls_list is None or len(experts_cls_list) == 0:
+        pytest.skip(f"Backend {backend_name} not available")
+
+    # Use first experts class
+    experts_cls = experts_cls_list[0]
+
+    torch.manual_seed(42)
+    dtype = torch.bfloat16
+    device = "cuda:0"
+
+    # Create MoE config with Renormalize routing (required by monolithic kernels)
+    from vllm.model_executor.layers.fused_moe import FusedMoEConfig
+    from vllm.model_executor.layers.fused_moe.config import (
+        FusedMoEParallelConfig,
+        RoutingMethodType,
+    )
+
+    moe_config = FusedMoEConfig(
+        num_experts=num_experts,
+        experts_per_token=topk,
+        hidden_dim=hidden_size,
+        intermediate_size_per_partition=intermediate_size,
+        num_local_experts=num_experts,
+        num_logical_experts=num_experts,
+        moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
+        activation=MoEActivation[config["activation"]],
+        in_dtype=dtype,
+        device="cuda",
+        routing_method=RoutingMethodType.Renormalize,
+    )
+
+    # Create float weights in checkpoint format:
+    # w13: [num_experts, 2*intermediate_size, hidden_size]
+    # w2: [num_experts, hidden_size, intermediate_size]
+    w13_float = torch.randn(
+        num_experts, 2 * intermediate_size, hidden_size, dtype=dtype, device=device
+    )
+    w2_float = torch.randn(
+        num_experts, hidden_size, intermediate_size, dtype=dtype, device=device
+    )
+
+    # dynamic_mxfp4_quant expects 2D input, so reshape 3D weights
+    # w13: [E, 2*I, H] -> [E*2*I, H] -> quantize -> [E, 2*I, H//2]
+    # w2: [E, H, I] -> [E*H, I] -> quantize -> [E, H, I//2]
+    w13_2d = w13_float.reshape(-1, hidden_size)
+    w13_quant_2d, w13_scale_2d = dynamic_mxfp4_quant(w13_2d)
+    w13_quant = w13_quant_2d.reshape(num_experts, 2 * intermediate_size, -1)
+    w13_scale = w13_scale_2d.reshape(num_experts, 2 * intermediate_size, -1)
+
+    w2_2d = w2_float.reshape(-1, intermediate_size)
+    w2_quant_2d, w2_scale_2d = dynamic_mxfp4_quant(w2_2d)
+    w2_quant = w2_quant_2d.reshape(num_experts, hidden_size, -1)
+    w2_scale = w2_scale_2d.reshape(num_experts, hidden_size, -1)
+
+    w13_bias = torch.randn(
+        num_experts, 2 * intermediate_size, dtype=dtype, device=device
+    )
+    w2_bias = torch.randn(num_experts, hidden_size, dtype=dtype, device=device)
+
+    # Create static input scales for W4A8 backend (AITER_MXFP4_FP8)
+    w13_input_scale: torch.Tensor | None = None
+    w2_input_scale: torch.Tensor | None = None
+    if backend_name == "AITER_MXFP4_FP8":
+        # Static FP8 scales: one scale per expert
+        w13_input_scale = torch.ones(num_experts, dtype=torch.float32, device=device)
+        w2_input_scale = torch.ones(num_experts, dtype=torch.float32, device=device)
+
+    # Create mock layer for oracle functions
+    class MockLayer:
+        w13_weight: torch.Tensor
+        w2_weight: torch.Tensor
+        w13_weight_scale: torch.Tensor
+        w2_weight_scale: torch.Tensor
+        w13_input_scale: torch.Tensor | None
+        w2_input_scale: torch.Tensor | None
+
+    layer = MockLayer()
+    layer.w13_weight = w13_quant
+    layer.w2_weight = w2_quant
+    layer.w13_weight_scale = w13_scale
+    layer.w2_weight_scale = w2_scale
+    layer.w13_input_scale = w13_input_scale
+    layer.w2_input_scale = w2_input_scale
+
+    # Convert weights using oracle
+    w13_conv, w2_conv, w13_scale_conv, w2_scale_conv, w13_bias_conv, w2_bias_conv = (
+        convert_to_mxfp4_moe_kernel_format(
+            mxfp4_backend=backend,
+            layer=layer,  # type: ignore[arg-type]
+            w13_weight=w13_quant,
+            w2_weight=w2_quant,
+            w13_weight_scale=w13_scale,
+            w2_weight_scale=w2_scale,
+            w13_bias=w13_bias,
+            w2_bias=w2_bias,
+        )
+    )
+
+    # Build quant config using oracle
+    quant_config = make_mxfp4_moe_quant_config(
+        mxfp4_backend=backend,
+        w1_scale=w13_scale_conv,
+        w2_scale=w2_scale_conv,
+        w1_bias=w13_bias_conv,
+        w2_bias=w2_bias_conv,
+        a1_scale=w13_input_scale,
+        a2_scale=w2_input_scale,
+    )
+
+    # Select activation based on backend
+    activation_name = str(config["activation"])
+    activation = MoEActivation[activation_name]
+
+    # Build kernel using oracle
+    assert quant_config is not None, "Failed to create quant config"
+    with set_current_vllm_config(VllmConfig()):
+        kernel = make_mxfp4_moe_kernel(
+            moe_quant_config=quant_config,
+            moe_config=moe_config,
+            mxfp4_backend=backend,
+            experts_cls=experts_cls,
+            routing_tables=None,
+            shared_experts=None,
+        )
+
+        # Create inputs
+        x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+        router_logits = torch.randn(
+            num_tokens, num_experts, dtype=torch.float32, device=device
+        )
+        topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1, sorted=True)
+        topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)
+
+        # Run kernel - use appropriate method based on impl type
+        if kernel.is_monolithic:
+            # Monolithic impl uses router_logits
+            out = kernel.apply_monolithic(
+                hidden_states=x,
+                w1=w13_conv,
+                w2=w2_conv,
+                router_logits=router_logits,
+                activation=activation,
+                global_num_experts=num_experts,
+                expert_map=None,
+                apply_router_weight_on_input=False,
+            )
+        else:
+            # Modular impl uses topk_weights and topk_ids
+            out = kernel.apply(
+                hidden_states=x,
+                w1=w13_conv,
+                w2=w2_conv,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                activation=activation,
+                global_num_experts=num_experts,
+                expert_map=None,
+                apply_router_weight_on_input=False,
+            )
+
+    # Verify output is valid (no NaN/Inf) and has expected shape
+    assert out.shape == (num_tokens, hidden_size), f"Unexpected shape: {out.shape}"
+    assert not torch.any(torch.isnan(out)), "Output contains NaN"
+    assert not torch.any(torch.isinf(out)), "Output contains Inf"
+
+    # Verify output has reasonable magnitude (not all zeros)
+    assert out.abs().max() > 0.01, "Output is effectively zero"
+
+    # Dequantize weights for reference computation
+    w13_dq = upcast_from_mxfp(
+        w13_quant.view(torch.uint8), w13_scale, torch.bfloat16, axis=-1
+    )
+    w2_dq = upcast_from_mxfp(
+        w2_quant.view(torch.uint8), w2_scale, torch.bfloat16, axis=-1
+    )
+
+    # Determine activation type and layout
+    # SWIGLUOAI uses interleaved layout (gate/up alternating)
+    # SILU uses chunked layout (first half gate, second half up)
+    use_interleaved = activation == MoEActivation.SWIGLUOAI
+    if activation in [MoEActivation.SWIGLUOAI, MoEActivation.SILU]:
+        act_name = "swiglu"
+    else:
+        act_name = "relu2"
+
+    ref = reference_moe(
+        router_logits,
+        topk,
+        num_experts,
+        x.to(torch.float32),
+        w13_dq.to(torch.float32),
+        w13_bias.to(torch.float32),
+        w2_dq.to(torch.float32),
+        w2_bias.to(torch.float32),
+        alpha=1.702 if activation == MoEActivation.SWIGLUOAI else 1.0,
+        beta=1.0 if activation == MoEActivation.SWIGLUOAI else 0.0,
+        limit=7.0 if activation == MoEActivation.SWIGLUOAI else None,
+        act_type="bf16",
+        activation=act_name,
+        use_interleaved_layout=use_interleaved,
+    )
+
+    # Compute and print accuracy statistics
+    diff = (ref.float() - out.float()).abs()
+    rel_diff = diff / (ref.float().abs() + 1e-6)
+
+    print(f"\n[{backend_name}] Accuracy statistics:")
+    print(
+        f"  Reference: min={ref.min():.4f}, max={ref.max():.4f}, mean={ref.mean():.4f}"
+    )
+    print(
+        f"  Output:    min={out.min():.4f}, max={out.max():.4f}, mean={out.mean():.4f}"
+    )
+    print(
+        f"  Abs diff:  min={diff.min():.4f}, max={diff.max():.4f}, "
+        f"mean={diff.mean():.4f}"
+    )
+    print(
+        f"  Rel diff:  min={rel_diff.min():.4f}, max={rel_diff.max():.4f}, "
+        f"mean={rel_diff.mean():.4f}"
+    )
+
+    # Check what percentage of values are within various tolerances
+    for rtol in [0.1, 0.5, 1.0, 2.0]:
+        within_tol = (diff <= rtol * out.float().abs()).float().mean()
+        print(f"  Within rtol={rtol}: {within_tol * 100:.1f}%")
+
+    # Check accuracy using per-backend thresholds
+    check_accuracy(ref, out, atol=0.1, rtol=config["rtol"], percent=config["percent"])
diff --git a/tests/kernels/moe/test_topk_softplus_sqrt.py b/tests/kernels/moe/test_topk_softplus_sqrt.py
index 7f5aacb383db..1b68213fafef 100644
--- a/tests/kernels/moe/test_topk_softplus_sqrt.py
+++ b/tests/kernels/moe/test_topk_softplus_sqrt.py
@@ -70,7 +70,8 @@ def test_sqrtsoftplus_bias_uses_deepseek_v4_routing_method():
 
 
 @pytest.mark.skipif(
-    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+    not current_platform.is_cuda_alike(),
+    reason="This test is skipped on non-CUDA platform.",
 )
 @pytest.mark.parametrize("num_tokens", [1, 33, 128])
 @pytest.mark.parametrize("hidden_size", [1024, 2048])
@@ -125,7 +126,8 @@ def test_fused_topk_softplus_sqrt(
 
 
 @pytest.mark.skipif(
-    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+    not current_platform.is_cuda_alike(),
+    reason="This test is skipped on non-CUDA platform.",
 )
 @pytest.mark.parametrize("num_tokens", [1, 33, 128])
 @pytest.mark.parametrize("hidden_size", [1024, 2048])
diff --git a/tests/kernels/moe/test_trtllm_nvfp4_moe.py b/tests/kernels/moe/test_trtllm_nvfp4_moe.py
new file mode 100644
index 000000000000..84fa7247e907
--- /dev/null
+++ b/tests/kernels/moe/test_trtllm_nvfp4_moe.py
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for the FlashInfer TRTLLM NvFP4 MoE backend
+(`TrtLlmNvFp4ExpertsModular`).
+
+Covers the activations the wrapper claims to support — SiLU, RELU^2 (non-gated),
+and GELU — including a Gemma4-shaped case (128 experts, top-k 8,
+intermediate_size 704) that exercises the non-256-aligned padding path.
+"""
+
+import pytest
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from tests.kernels.moe.utils import make_test_quant_config
+from tests.kernels.quantization.nvfp4_utils import (
+    FLOAT4_E2M1_MAX,
+    FLOAT8_E4M3_MAX,
+    dequantize_nvfp4_to_dtype,
+)
+from tests.kernels.utils import torch_moe
+from vllm import _custom_ops as ops
+from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe import fused_topk
+from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEParallelConfig,
+    RoutingMethodType,
+)
+from vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe import (
+    TrtLlmNvFp4ExpertsModular,
+)
+from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe
+from vllm.utils.math_utils import next_power_of_2
+from vllm.utils.torch_utils import set_random_seed
+
+if pytest and (
+    not has_flashinfer_trtllm_fused_moe()
+    or not current_platform.has_device_capability(100)
+):
+    pytest.skip(
+        "Requires flashinfer TRTLLM fused MoE and NvFP4 (SM100)",
+        allow_module_level=True,
+    )
+
+# (m, n, k) = (tokens, intermediate_size_per_partition, hidden_dim).
+# The (64, 704, 4096) row matches Gemma4's MoE shape and exercises the
+# non-256-aligned intermediate (padded inside the wrapper).
+MNK_FACTORS = [
+    (2, 1024, 1024),
+    (64, 2048, 1536),
+    (64, 704, 4096),
+]
+
+
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("e", [128])
+@pytest.mark.parametrize("topk", [8])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize(
+    "activation",
+    [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL, MoEActivation.GELU],
+)
+@torch.inference_mode()
+def test_trtllm_fp4_moe_no_graph(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    activation: MoEActivation,
+    workspace_init,
+):
+    # FlashInfer's trtllm_batched_gemm_runner has no precompiled tile
+    # config for non-gated RELU^2 at non-256-aligned intermediate_size
+    # (e.g. Gemma4's 704). Other activations (SiLU/GELU) work at the
+    # same shape. Tracked upstream in FlashInfer; unrelated to this
+    # PR's GELU enablement (Gemma4 uses GeGLU, not non-gated RELU^2).
+    if activation == MoEActivation.RELU2_NO_MUL and (m, n, k) == (64, 704, 4096):
+        pytest.skip(
+            "FlashInfer trtllm_batched_gemm_runner: no valid tile config "
+            "for non-gated RELU^2 at intermediate_size=704 "
+            "(getValidConfigIndices throws). Tracked upstream."
+        )
+
+    set_random_seed(7)
+    with set_current_vllm_config(
+        VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
+    ):
+        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+
+        quant_blocksize = 16
+        is_gated_act = activation.is_gated
+
+        w1_q, w2_q, quant_config = make_test_quant_config(
+            e,
+            n,
+            k,
+            in_dtype=dtype,
+            quant_dtype="nvfp4",
+            block_shape=None,
+            per_act_token_quant=False,
+            make_gate=is_gated_act,
+            # The TRT-LLM FP4 MoE kernel rejects swizzled (padded) activation
+            # scales — its numel-based vec_size check requires numel == M*K/16.
+            # Match what oracle/nvfp4.py does for this backend.
+            is_nvfp4_scale_swizzled=False,
+        )
+
+        score = torch.randn((m, e), device="cuda", dtype=dtype)
+        topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
+
+        moe_config = FusedMoEConfig(
+            num_experts=e,
+            experts_per_token=topk,
+            hidden_dim=k,
+            intermediate_size_per_partition=n,
+            num_local_experts=e,
+            num_logical_experts=e,
+            activation=activation,
+            device="cuda",
+            moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
+            in_dtype=dtype,
+            is_act_and_mul=is_gated_act,
+            routing_method=RoutingMethodType.TopK,
+            max_num_tokens=next_power_of_2(m),
+        )
+
+        trtllm_experts = mk.FusedMoEKernel(
+            maybe_make_prepare_finalize(
+                moe=moe_config,
+                quant_config=quant_config,
+                allow_new_interface=True,
+                use_monolithic=False,
+            ),
+            TrtLlmNvFp4ExpertsModular(moe_config=moe_config, quant_config=quant_config),
+            inplace=False,
+        )
+
+        trtllm_output = trtllm_experts.apply(
+            hidden_states=a,
+            w1=w1_q,
+            w2=w2_q,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            activation=activation,
+            global_num_experts=e,
+            expert_map=None,
+            apply_router_weight_on_input=False,
+        )
+
+        # Reference: round-trip activations and weights through FP4
+        # quant/dequant so the comparison isolates kernel/activation behavior
+        # from quantization error.
+        a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / a.abs().max()).to(
+            torch.float32
+        )
+        a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(a, a_global_scale)
+        a_in_dtype = dequantize_nvfp4_to_dtype(
+            a_fp4,
+            a_scale_interleaved,
+            a_global_scale,
+            dtype=a.dtype,
+            device=a.device,
+            block_size=quant_blocksize,
+        )
+
+        w1_d = torch.empty(
+            (e, (2 if is_gated_act else 1) * n, k), device="cuda", dtype=dtype
+        )
+        w2_d = torch.empty((e, k, n), device="cuda", dtype=dtype)
+        for idx in range(e):
+            w1_d[idx] = dequantize_nvfp4_to_dtype(
+                w1_q[idx],
+                quant_config.w1_scale[idx],
+                (1 / quant_config.g1_alphas[idx]),
+                dtype=dtype,
+                device=w1_q.device,
+                block_size=quant_blocksize,
+            )
+            w2_d[idx] = dequantize_nvfp4_to_dtype(
+                w2_q[idx],
+                quant_config.w2_scale[idx],
+                (1 / quant_config.g2_alphas[idx]),
+                dtype=dtype,
+                device=w2_q.device,
+                block_size=quant_blocksize,
+            )
+
+        torch_output = torch_moe(
+            a_in_dtype, w1_d, w2_d, score, topk, activation=activation
+        )
+
+        torch.testing.assert_close(torch_output, trtllm_output, atol=2e-1, rtol=2e-1)
+
+
+if __name__ == "__main__":
+    test_trtllm_fp4_moe_no_graph(
+        64, 704, 4096, 128, 8, torch.bfloat16, MoEActivation.GELU, None
+    )
diff --git a/tests/kernels/moe/test_unquantized_backend_selection.py b/tests/kernels/moe/test_unquantized_backend_selection.py
index 73aa54fa579e..2c4cd7b94e78 100644
--- a/tests/kernels/moe/test_unquantized_backend_selection.py
+++ b/tests/kernels/moe/test_unquantized_backend_selection.py
@@ -85,7 +85,7 @@ def test_select_default_backend_by_platform(
 
 
 @patch(
-    "vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer",
+    "vllm.utils.flashinfer.has_flashinfer",
     return_value=False,
 )
 @patch(
diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py
index d4b2350f5c23..a1594634f457 100644
--- a/tests/kernels/moe/utils.py
+++ b/tests/kernels/moe/utils.py
@@ -402,6 +402,7 @@ def make_test_quant_config(
     per_act_token_quant: bool = False,
     block_shape: list[int] | None = None,
     make_gate: bool = True,
+    is_nvfp4_scale_swizzled: bool = True,
 ) -> tuple[torch.Tensor, torch.Tensor, FusedMoEQuantConfig]:
     (_, w1, w1_s, w1_gs), (_, w2, w2_s, w2_gs) = make_test_weights(
         e,
@@ -442,6 +443,7 @@ def make_test_quant_config(
             # TODO: make sure this is handled properly
             g1_alphas=(1 / w1_gs) if w1_gs is not None else None,
             g2_alphas=(1 / w2_gs) if w2_gs is not None else None,
+            is_nvfp4_scale_swizzled=is_nvfp4_scale_swizzled,
         ),
     )
 
diff --git a/tests/kernels/quantization/test_cpu_fp8_scaled_mm.py b/tests/kernels/quantization/test_cpu_fp8_scaled_mm.py
new file mode 100644
index 000000000000..3154e2cb98bb
--- /dev/null
+++ b/tests/kernels/quantization/test_cpu_fp8_scaled_mm.py
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for CPU FP8 W8A16 block-scaled GEMM kernel (fp8_scaled_mm_cpu).
+
+Run `pytest tests/kernels/quantization/test_cpu_fp8_scaled_mm.py -v`.
+"""
+
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+
+if not current_platform.is_cpu():
+    pytest.skip("skipping CPU-only tests", allow_module_level=True)
+
+if not ops._supports_cpu_fp8_w8a16:
+    pytest.skip("fp8_scaled_mm_cpu op not available", allow_module_level=True)
+
+BLOCK_SIZE = [128, 128]
+
+
+def cdiv(a: int, b: int) -> int:
+    return -(a // -b)
+
+
+def quantize_weight_block_fp8(
+    weight: torch.Tensor,
+    block_size: list[int],
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Quantize weight [N, K] to FP8 with block scales.
+
+    Returns:
+        fp8_weight: [N, K] float8_e4m3fn
+        scales: [n_tiles, k_tiles] float32
+    """
+    N, K = weight.shape
+    block_n, block_k = block_size
+    fp8_max = torch.finfo(torch.float8_e4m3fn).max
+
+    n_tiles = cdiv(N, block_n)
+    k_tiles = cdiv(K, block_k)
+
+    # Pad for even blocking
+    pad_N = (block_n - (N % block_n)) % block_n
+    pad_K = (block_k - (K % block_k)) % block_k
+    if pad_N > 0 or pad_K > 0:
+        weight = torch.nn.functional.pad(weight, (0, pad_K, 0, pad_N))
+
+    # Reshape into blocks
+    w_blocks = weight.view(n_tiles, block_n, k_tiles, block_k)
+    w_blocks = w_blocks.permute(0, 2, 1, 3).contiguous()
+
+    # Per-block scale
+    abs_max = w_blocks.abs().amax(dim=(-2, -1), keepdim=True)
+    scales = abs_max / fp8_max
+    scales = torch.where(scales == 0, torch.ones_like(scales), scales)
+
+    # Quantize
+    q_fp8 = (w_blocks / scales).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
+
+    # Reshape back
+    fp8_weight = (
+        q_fp8.permute(0, 2, 1, 3)
+        .contiguous()
+        .view(N + pad_N, K + pad_K)[:N, :K]
+        .contiguous()
+    )
+
+    scales = scales.view(n_tiles, k_tiles)
+    return fp8_weight, scales
+
+
+def dequant_weight_block_fp8(
+    fp8_weight: torch.Tensor,
+    scales: torch.Tensor,
+    block_size: list[int],
+    out_dtype: torch.dtype,
+) -> torch.Tensor:
+    """Dequantize FP8 weight back to float for reference computation."""
+    N, K = fp8_weight.shape
+    block_n, block_k = block_size
+    n_tiles, k_tiles = scales.shape
+
+    pad_N = (block_n - (N % block_n)) % block_n
+    pad_K = (block_k - (K % block_k)) % block_k
+    if pad_N > 0 or pad_K > 0:
+        fp8_padded = torch.nn.functional.pad(fp8_weight.float(), (0, pad_K, 0, pad_N))
+    else:
+        fp8_padded = fp8_weight.float()
+
+    w_blocks = fp8_padded.view(n_tiles, block_n, k_tiles, block_k)
+    w_blocks = w_blocks.permute(0, 2, 1, 3).contiguous()
+    dq = w_blocks * scales.view(n_tiles, k_tiles, 1, 1)
+    dq = dq.permute(0, 2, 1, 3).contiguous().view(N + pad_N, K + pad_K)
+    return dq[:N, :K].to(out_dtype)
+
+
+def ref_fp8_block_scaled_mm(
+    x: torch.Tensor,
+    fp8_weight: torch.Tensor,
+    scales: torch.Tensor,
+    block_size: list[int],
+    bias: torch.Tensor | None,
+    out_dtype: torch.dtype,
+) -> torch.Tensor:
+    """Reference: dequant FP8→float32, matmul in float32, cast to out_dtype."""
+    w_dq = dequant_weight_block_fp8(fp8_weight, scales, block_size, torch.float32)
+    out = torch.mm(x.float(), w_dq.t())
+    if bias is not None:
+        out = out + bias.float()
+    return out.to(out_dtype)
+
+
+# ---------------------------------------------------------------------------
+# Test parameters
+# ---------------------------------------------------------------------------
+M_SIZES = [1, 4, 16, 64, 128]
+# (N, K) — weight shape is [N, K], output has N columns.
+NK_SIZES = [
+    (128, 256),
+    (256, 512),
+    (512, 1024),
+    (1024, 2048),
+    (5120, 5120),
+    (17408, 5120),
+    (5120, 17408),
+]
+
+
+@pytest.mark.parametrize("M", M_SIZES)
+@pytest.mark.parametrize("N,K", NK_SIZES)
+@pytest.mark.parametrize("use_bias", [False, True])
+def test_cpu_fp8_scaled_mm(M: int, N: int, K: int, use_bias: bool):
+    """fp8_scaled_mm_cpu correctness against float reference."""
+    torch.manual_seed(42)
+    out_dtype = torch.bfloat16
+    block_size = BLOCK_SIZE
+
+    x = torch.randn(M, K, dtype=out_dtype) / (K**0.5)
+    w_f32 = torch.randn(N, K, dtype=torch.float32) / (K**0.5)
+    fp8_weight, scales = quantize_weight_block_fp8(w_f32, block_size)
+
+    bias = torch.randn(N, dtype=torch.float32) * 0.1 if use_bias else None
+
+    ref_out = ref_fp8_block_scaled_mm(
+        x, fp8_weight, scales, block_size, bias, out_dtype
+    )
+
+    packed_weight = torch.ops._C.convert_weight_packed(fp8_weight)
+    kernel_out = ops.fp8_scaled_mm_cpu(
+        x,
+        packed_weight,
+        scales,
+        block_size,
+        bias,
+        out_dtype,
+        True,
+    )
+
+    assert kernel_out.dtype == out_dtype
+    torch.testing.assert_close(kernel_out, ref_out, rtol=0.02, atol=0.01)
diff --git a/tests/kernels/quantization/test_nvfp4_emulation.py b/tests/kernels/quantization/test_nvfp4_emulation.py
new file mode 100644
index 000000000000..71072d9e9fff
--- /dev/null
+++ b/tests/kernels/quantization/test_nvfp4_emulation.py
@@ -0,0 +1,308 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import huggingface_hub
+import pytest
+import torch
+from safetensors import safe_open
+
+from vllm.model_executor.layers.quantization.utils import (
+    nvfp4_emulation_utils,
+)
+from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (
+    dequantize_to_dtype,
+    ref_nvfp4_quant_dequant,
+)
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(),
+    reason="Triton NVFP4 kernel requires CUDA.",
+)
+def test_triton_dequantize_nvfp4(monkeypatch) -> None:
+    """Test the Triton dequantization kernel against the CPU reference
+    using real NVFP4 weights from a checkpoint.
+
+    Tests both 2D (attention projection) and 3D (stacked MoE experts).
+    """
+    checkpoint_path = huggingface_hub.snapshot_download(
+        "nvidia/Qwen3-30B-A3B-NVFP4",
+        allow_patterns=["model-00001-of-00004.safetensors"],
+    )
+    shard_path = f"{checkpoint_path}/model-00001-of-00004.safetensors"
+    block_size = 16
+
+    with safe_open(shard_path, framework="pt", device="cpu") as f:
+        all_keys = list(f.keys())
+
+        # 2D case: attention projection
+        tensor_fp4_2d = f.get_tensor("model.layers.9.self_attn.k_proj.weight")
+        tensor_sf_2d = f.get_tensor("model.layers.9.self_attn.k_proj.weight_scale")
+        global_scale_2d = f.get_tensor("model.layers.9.self_attn.k_proj.weight_scale_2")
+
+        # 3D case: stack ALL experts for layer 9 up_proj
+        expert_prefix = "model.layers.9.mlp.experts."
+        expert_indices = sorted(
+            int(key.split(".")[5])
+            for key in all_keys
+            if key.startswith(expert_prefix) and key.endswith(".up_proj.weight")
+        )
+        assert len(expert_indices) > 0
+
+        all_fp4 = []
+        all_sf = []
+        all_global_scale = []
+        for index in expert_indices:
+            name = f"{expert_prefix}{index}.up_proj"
+            all_fp4.append(f.get_tensor(f"{name}.weight"))
+            all_sf.append(f.get_tensor(f"{name}.weight_scale"))
+            all_global_scale.append(f.get_tensor(f"{name}.weight_scale_2"))
+
+    tensor_fp4_3d = torch.stack(all_fp4)
+    tensor_sf_3d = torch.stack(all_sf)
+    global_scale_3d = torch.stack(all_global_scale)
+
+    test_cases = [
+        ("2D base", tensor_fp4_2d, tensor_sf_2d, global_scale_2d),
+        (
+            "2D 2x rows",
+            tensor_fp4_2d.repeat(2, 1),
+            tensor_sf_2d.repeat(2, 1),
+            global_scale_2d,
+        ),
+        (
+            "2D 4x rows",
+            tensor_fp4_2d.repeat(4, 1),
+            tensor_sf_2d.repeat(4, 1),
+            global_scale_2d,
+        ),
+        (
+            "2D 2x cols",
+            tensor_fp4_2d.repeat(1, 2),
+            tensor_sf_2d.repeat(1, 2),
+            global_scale_2d,
+        ),
+        ("3D base", tensor_fp4_3d, tensor_sf_3d, global_scale_3d),
+        (
+            "3D 2x experts",
+            tensor_fp4_3d.repeat(2, 1, 1),
+            tensor_sf_3d.repeat(2, 1, 1),
+            global_scale_3d.repeat(2),
+        ),
+        (
+            "3D 2x rows",
+            tensor_fp4_3d.repeat(1, 2, 1),
+            tensor_sf_3d.repeat(1, 2, 1),
+            global_scale_3d,
+        ),
+        (
+            "3D 2x cols",
+            tensor_fp4_3d.repeat(1, 1, 2),
+            tensor_sf_3d.repeat(1, 1, 2),
+            global_scale_3d,
+        ),
+    ]
+
+    quantiles = [0.5, 0.001, 0.999]
+
+    # Move the E2M1 lookup table to CUDA ahead of time, as would normally
+    # happen during model loading (process_weights_after_loading).  Both the
+    # Triton and PyTorch reference paths run on CUDA.
+    nvfp4_emulation_utils.kE2M1ToFloat_handle.val = (
+        nvfp4_emulation_utils.kE2M1ToFloat_handle.val.cuda()
+    )
+
+    for label, tensor_fp4, tensor_sf, global_scale in test_cases:
+        fp4_cuda = tensor_fp4.cuda()
+        sf_cuda = tensor_sf.cuda()
+        gs_cuda = global_scale.cuda()
+
+        # Triton path
+        triton_result = dequantize_to_dtype(
+            fp4_cuda,
+            sf_cuda,
+            gs_cuda,
+            torch.bfloat16,
+            block_size,
+            swizzle=False,
+        )
+
+        # Reference path (PyTorch ops on CUDA, Triton dispatch disabled)
+        with monkeypatch.context() as m:
+            m.setattr(
+                nvfp4_emulation_utils.current_platform,
+                "is_cuda_alike",
+                lambda: False,
+            )
+            reference = dequantize_to_dtype(
+                fp4_cuda,
+                sf_cuda,
+                gs_cuda,
+                torch.bfloat16,
+                block_size,
+                swizzle=False,
+            )
+
+        torch.testing.assert_close(triton_result, reference, atol=0, rtol=0)
+
+        # Benchmark
+        shape = list(tensor_fp4.shape)
+
+        def _triton_bench(
+            fp4_cuda=fp4_cuda,
+            scale_cuda=sf_cuda,
+            global_scale_cuda=gs_cuda,
+            block_size=block_size,
+        ):
+            return dequantize_to_dtype(
+                fp4_cuda,
+                scale_cuda,
+                global_scale_cuda,
+                torch.bfloat16,
+                block_size,
+                swizzle=False,
+            )
+
+        triton_ms, triton_min, triton_max = triton.testing.do_bench(
+            _triton_bench, quantiles=quantiles
+        )
+
+        def _reference_bench(
+            fp4_cuda=fp4_cuda,
+            scale_cuda=sf_cuda,
+            global_scale_cuda=gs_cuda,
+            block_size=block_size,
+        ):
+            with monkeypatch.context() as m2:
+                m2.setattr(
+                    nvfp4_emulation_utils.current_platform,
+                    "is_cuda_alike",
+                    lambda: False,
+                )
+                dequantize_to_dtype(
+                    fp4_cuda,
+                    scale_cuda,
+                    global_scale_cuda,
+                    torch.bfloat16,
+                    block_size,
+                    swizzle=False,
+                )
+
+        ref_ms, ref_min, ref_max = triton.testing.do_bench(
+            _reference_bench, quantiles=quantiles
+        )
+
+        speedup = ref_ms / triton_ms if triton_ms > 0 else float("inf")
+        print(f"  dequantize {label} {shape}:")
+        print(
+            f"    triton:    median={triton_ms:.3f}ms, "
+            f"min={triton_min:.3f}ms, max={triton_max:.3f}ms"
+        )
+        print(
+            f"    reference: median={ref_ms:.3f}ms, "
+            f"min={ref_min:.3f}ms, max={ref_max:.3f}ms"
+        )
+        print(f"    speedup:   {speedup:.2f}x")
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(),
+    reason="Triton NVFP4 kernel requires CUDA.",
+)
+@pytest.mark.parametrize(
+    "m, k",
+    [
+        (1, 16),
+        (1, 4096),
+        (2, 4096),
+        (4, 4096),
+        (8, 4096),
+        (16, 4096),
+        (24, 4096),
+        (32, 4096),
+        (1, 8192),
+        (2, 8192),
+        (4, 8192),
+        (8, 8192),
+        (16, 8192),
+        (24, 8192),
+        (32, 8192),
+        (1, 32),
+        (2, 48),
+        (7, 64),
+        (16, 128),
+        (33, 160),
+        (128, 256),
+        (256, 512),
+        (1024, 1024),
+        (5120, 2048),
+        (2048, 4096),
+        (4096, 7168),
+        (8192, 8192),
+        (128, 16384),
+    ],
+)
+@pytest.mark.parametrize("global_scale_value", [0.5, 1.0, 0.001])
+def test_triton_nvfp4_quant_dequant(
+    monkeypatch, m: int, k: int, global_scale_value: float
+) -> None:
+    """Test the Triton quant-dequant kernel against the CPU reference."""
+    block_size = 16
+    x = torch.randn(m, k, dtype=torch.bfloat16, device="cuda")
+    global_scale = torch.tensor(global_scale_value, dtype=torch.float32, device="cuda")
+
+    # Triton path
+    triton_result = ref_nvfp4_quant_dequant(x, global_scale, block_size)
+
+    # CPU reference path
+    with monkeypatch.context() as mp:
+        mp.setattr(
+            nvfp4_emulation_utils.current_platform,
+            "is_cuda_alike",
+            lambda: False,
+        )
+        reference = ref_nvfp4_quant_dequant(x.cpu(), global_scale.cpu(), block_size)
+
+    torch.testing.assert_close(triton_result.cpu(), reference, atol=0, rtol=0)
+
+    # Benchmark (both paths on CUDA tensors for fair comparison)
+    quantiles = [0.5, 0.001, 0.999]
+
+    def _triton_bench(
+        input_tensor=x, input_global_scale=global_scale, input_block_size=block_size
+    ):
+        return ref_nvfp4_quant_dequant(
+            input_tensor, input_global_scale, input_block_size
+        )
+
+    triton_ms, triton_min, triton_max = triton.testing.do_bench(
+        _triton_bench, quantiles=quantiles
+    )
+
+    def _reference_bench(
+        input_tensor=x, input_global_scale=global_scale, input_block_size=block_size
+    ):
+        with monkeypatch.context() as mp2:
+            mp2.setattr(
+                nvfp4_emulation_utils.current_platform,
+                "is_cuda_alike",
+                lambda: False,
+            )
+            ref_nvfp4_quant_dequant(input_tensor, input_global_scale, input_block_size)
+
+    ref_ms, ref_min, ref_max = triton.testing.do_bench(
+        _reference_bench, quantiles=quantiles
+    )
+
+    speedup = ref_ms / triton_ms if triton_ms > 0 else float("inf")
+    print(f"  quant_dequant [{m}x{k}] gs={global_scale_value}:")
+    print(
+        f"    triton:    median={triton_ms:.3f}ms, "
+        f"min={triton_min:.3f}ms, max={triton_max:.3f}ms"
+    )
+    print(
+        f"    reference: median={ref_ms:.3f}ms, "
+        f"min={ref_min:.3f}ms, max={ref_max:.3f}ms"
+    )
+    print(f"    speedup:   {speedup:.2f}x")
diff --git a/tests/kernels/quantization/test_per_token_group_quant.py b/tests/kernels/quantization/test_per_token_group_quant.py
index 5447a43ffb98..4089e9bc4688 100644
--- a/tests/kernels/quantization/test_per_token_group_quant.py
+++ b/tests/kernels/quantization/test_per_token_group_quant.py
@@ -73,11 +73,6 @@ def test_per_token_group_quant_fp8(
         # Larger shapes with padding
         (127, 7168, 128),
         (253, 640, 128),
-        # Non-power-of-2 group size
-        (4, 768, 96),  # 768/96=8 groups, no padding
-        (3, 768, 96),  # 768/96=8 groups, MN padding
-        (4, 480, 96),  # 480/96=5 groups, K padding
-        (1, 480, 96),  # both MN and K padding
     ],
 )
 @pytest.mark.parametrize("poisoned_scales", [False, True])
@@ -161,6 +156,188 @@ def test_per_token_group_quant_fp8_packed(
     )
 
 
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="DeepGEMM not available on this platform"
+)
+def test_per_token_group_quant_fp8_packed_all_zero():
+    """All-zero input must produce well-defined UE8M0 scale bytes via the eps
+    floor in the kernel's UE8M0 path. Locks down the all-zero behavior before
+    optimization.
+
+    The CUDA kernel computes:
+        y_s = eps / fp8_max
+        y_s = exp2(ceil(log2(fmax(y_s, 1e-10))))
+    For all-zero input, eps/fp8_max < 1e-10, so the inner fmax clamps back to
+    1e-10, giving exp2(ceil(log2(1e-10))) = exp2(-33) => UE8M0 byte 0x5E (94).
+    """
+
+    device = "cuda"
+    num_tokens, hidden_dim, group_size = 4, 7168, 128
+    x = torch.zeros((num_tokens, hidden_dim), device=device, dtype=torch.bfloat16)
+
+    out_q, out_s_packed = fp8_utils.per_token_group_quant_fp8_packed_for_deepgemm(
+        x,
+        group_size=group_size,
+        use_ue8m0=True,
+    )
+
+    # Quantized values must be all zero.
+    assert torch.equal(
+        out_q.view(torch.uint8),
+        torch.zeros_like(out_q, dtype=torch.uint8),
+    ), "All-zero input should produce all-zero FP8 output"
+
+    # UE8M0 byte produced by the kernel for all-zero input.
+    # The kernel's inner fmax(y_s, 1e-10) clamps eps/fp8_max back to 1e-10.
+    # 1e-10 as float32 has biased exponent 0x5D and a non-zero mantissa, so
+    # the kernel's bit-twiddle (exp_bits + (mant_bits != 0)) rounds up to
+    # 0x5E. This matches exp2(ceil(log2(1e-10))) = exp2(-33).
+    expected_exp_byte = 0x5E
+
+    mn = num_tokens
+    groups_per_row = hidden_dim // group_size
+    k_num_packed = (groups_per_row + 3) // 4
+    tma_aligned_mn = ((mn + 3) // 4) * 4
+    num_scale_elems = mn + (k_num_packed - 1) * tma_aligned_mn
+
+    # All valid scale slots must contain the expected packed value.
+    # Padding slots must be zero.
+    actual = torch.as_strided(out_s_packed, (num_scale_elems,), (1,)).cpu()
+
+    expected = torch.zeros(num_scale_elems, dtype=torch.int32, device="cpu")
+    for row in range(mn):
+        for g in range(groups_per_row):
+            pack_col = g // 4
+            pos = g % 4
+            idx = pack_col * tma_aligned_mn + row
+            expected[idx] |= expected_exp_byte << (pos * 8)
+
+    assert torch.equal(actual, expected), "All-zero scale bytes mismatch"
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="DeepGEMM not available on this platform"
+)
+def test_per_token_group_quant_fp8_packed_mantissa_rounds_up():
+    """Inputs whose absmax/max_8bit produces a non-power-of-2 force the
+    mantissa-rounding-up branch (exp_byte += 1). Locks down this behavior
+    before optimization."""
+
+    device = "cuda"
+    num_tokens, hidden_dim, group_size = 4, 7168, 128
+
+    # Build a tensor whose per-group absmax = 1.5 * fp8_max * 2^k for various k.
+    # fp8_max = torch.finfo(torch.float8_e4m3fn).max = 448.0.
+    # Then absmax/fp8_max = 1.5 * 2^k -> non-zero mantissa, triggers ceil
+    # rounding to 2^(k+1). Use k=0 for simplicity; the bf16 representation of
+    # 1.5*448=672.0 is exact.
+    x = torch.full(
+        (num_tokens, hidden_dim),
+        672.0,
+        device=device,
+        dtype=torch.bfloat16,
+    )
+
+    out_q, out_s_packed = fp8_utils.per_token_group_quant_fp8_packed_for_deepgemm(
+        x,
+        group_size=group_size,
+        use_ue8m0=True,
+    )
+
+    with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
+        ref_q, ref_s = fp8_utils.per_token_group_quant_fp8(
+            x,
+            group_size,
+            use_ue8m0=True,
+        )
+
+    assert torch.equal(out_q, ref_q), "Quantized output mismatch"
+
+    mn = num_tokens
+    groups_per_row = hidden_dim // group_size
+    k_num_packed = (groups_per_row + 3) // 4
+    tma_aligned_mn = ((mn + 3) // 4) * 4
+    num_scale_elems = mn + (k_num_packed - 1) * tma_aligned_mn
+
+    ref_s_flat = ref_s.reshape(mn, groups_per_row)
+    ref_exponents = (ref_s_flat.view(torch.int32) >> 23) & 0xFF
+    expected = torch.zeros(num_scale_elems, dtype=torch.int32, device="cpu")
+    for row in range(mn):
+        for g in range(groups_per_row):
+            pack_col = g // 4
+            pos = g % 4
+            idx = pack_col * tma_aligned_mn + row
+            expected[idx] |= int(ref_exponents[row, g].item()) << (pos * 8)
+
+    actual = torch.as_strided(out_s_packed, (num_scale_elems,), (1,)).cpu()
+    assert torch.equal(actual, expected), "Scale bytes mismatch"
+
+
+@pytest.mark.parametrize(
+    "num_tokens,hidden_dim",
+    [
+        (1, 7168),  # mn padded 1 -> 4
+        (2, 7168),  # mn padded 2 -> 4
+        (3, 7168),  # mn padded 3 -> 4
+        (5, 7168),  # mn padded 5 -> 8
+        (127, 7168),  # mn padded 127 -> 128
+        (253, 640),  # both mn and groups padded
+        (1, 384),  # extreme: 1 group, 1 mn row -> both axes padded
+    ],
+)
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="DeepGEMM not available on this platform"
+)
+def test_per_token_group_quant_fp8_packed_zero_fills_padded_output_q(
+    num_tokens, hidden_dim
+):
+    """When output_q is allocated with shape (tma_aligned_mn, k) instead of
+    (mn, k), the kernel must overwrite the padded mn rows with zeros so
+    callers can use ``torch.empty`` instead of ``torch.zeros``."""
+
+    device = "cuda"
+    group_size = 128
+    torch.manual_seed(42)
+    x = torch.randn((num_tokens, hidden_dim), device=device, dtype=torch.bfloat16) * 8
+
+    mn = num_tokens
+    groups_per_row = hidden_dim // group_size
+    k_num_packed = (groups_per_row + 3) // 4
+    tma_aligned_mn = ((mn + 3) // 4) * 4
+
+    fp8_dtype = torch.float8_e4m3fn
+    finfo = torch.finfo(fp8_dtype)
+    # Allocate output_q with the padded mn extent and pre-fill with 0xFF
+    # so the kernel cannot rely on a clean buffer.
+    out_q = torch.empty((tma_aligned_mn, hidden_dim), device=device, dtype=fp8_dtype)
+    out_q.view(torch.uint8).fill_(0xFF)
+
+    out_s_packed = torch.empty_strided(
+        (mn, k_num_packed),
+        (1, tma_aligned_mn),
+        device=device,
+        dtype=torch.int32,
+    )
+
+    torch.ops._C.per_token_group_fp8_quant_packed(
+        x, out_q, out_s_packed, group_size, 1e-10, finfo.min, finfo.max
+    )
+
+    # Live rows must match the Triton reference.
+    with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
+        ref_q, _ = fp8_utils.per_token_group_quant_fp8(x, group_size, use_ue8m0=True)
+    assert torch.equal(out_q[:mn], ref_q), "Live region mismatch"
+
+    # Padded rows must be all-zero; without this, downstream TMA loads would
+    # see uninitialised data.
+    if tma_aligned_mn > mn:
+        padded_bytes = out_q[mn:tma_aligned_mn].view(torch.uint8)
+        assert padded_bytes.eq(0).all(), (
+            f"Padded rows [{mn}, {tma_aligned_mn}) not zeroed; "
+            f"{padded_bytes.ne(0).sum().item()} non-zero bytes"
+        )
+
+
 @pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)])
 @pytest.mark.parametrize("group_size", [64, 128])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
diff --git a/tests/kernels/test_compressor_kv_cache.py b/tests/kernels/test_compressor_kv_cache.py
index 592b58fbe430..122254bc3c41 100644
--- a/tests/kernels/test_compressor_kv_cache.py
+++ b/tests/kernels/test_compressor_kv_cache.py
@@ -3,12 +3,11 @@
 """
 Round-trip tests for compressor → FP8 quant + KV cache insert → gather + dequant.
 
-Two paths tested:
+Four test functions cover five paths:
   A) DeepseekV4 Attention: head_dim=512 (448 FP8 nope + 64 bf16 rope), quant_block=64
   B) Indexer:       head_dim=128 (all FP8), quant_block=128
-
-These serve as golden references for validating the future fused
-compressor+quant+cache kernel.
+  C) DeepseekV4 Attention magnitude range: correctness across small/large values
+  D) Indexer fused Triton kernel: compress+norm+rope+quant+insert
 """
 
 import math
@@ -21,6 +20,12 @@
     dequantize_and_gather_k_cache,
     quantize_and_insert_k_cache,
 )
+from vllm.v1.attention.ops.deepseek_v4_ops.fused_compress_quant_cache import (
+    _fused_kv_compress_norm_rope_insert_indexer_attn,
+    _fused_kv_compress_norm_rope_insert_indexer_mxfp4_attn,
+)
+
+from .test_fused_indexer_q_rope_quant import quantize_to_mxfp4
 
 
 def _ue8m0_reference(x: torch.Tensor, block_size: int, fp8_max: float):
@@ -309,3 +314,222 @@ def test_deepseek_v4_quant_magnitude_range():
                 f"Token {t}: rel_err={rel_err:.4f}, abs_diff={abs_diff:.6f}, "
                 f"magnitude={magnitude:.4f}"
             )
+
+
+# ── Test D: Indexer fused K-cache insert (Triton kernels) ────────────────────
+#
+# Both kernels share the same Triton signature; use_fp4 selects between them.
+# Full pipeline: state-cache gather → softmax-weighted compress → RMSNorm →
+#   GPT-J RoPE → quant (MXFP4 or FP8) → paged cache insert.
+
+
+def _reference_kv_compress_norm_rope(
+    state_cache: torch.Tensor,
+    block_table: torch.Tensor,
+    positions: torch.Tensor,
+    rms_weight: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    compress_ratio: int = 1,
+    overlap: int = 0,
+    use_fp4: bool = False,
+    rms_eps: float = 1e-6,
+    fp8_max: float = 448.0,
+):
+    """Compress → RMSNorm → GPT-J RoPE → quantize.
+
+    Gathers (1+overlap)*compress_ratio state entries per output token, applies
+    per-element softmax over the scores, and computes the weighted kv sum.
+    Returns (quantized_values, scale) matching the kernel's output layout.
+    """
+    device = state_cache.device
+    head_dim = rms_weight.shape[0]
+    rope_dim = cos_sin_cache.shape[-1]
+    state_block_size = state_cache.shape[1]
+    state_width = state_cache.shape[-1] // 2
+    nope_dim = head_dim - rope_dim
+    total = (1 + overlap) * compress_ratio
+    results = []
+    for pos in positions.tolist():
+        src = torch.arange(pos - total + 1, pos + 1, dtype=torch.int64, device=device)
+        valid = src >= 0
+        idx = src.clamp(min=0)
+        pages = block_table[0, idx // state_block_size]
+        offsets = idx % state_block_size
+        raw = state_cache[pages, offsets].float()  # [total, state_dim]
+
+        # Group 0 (tokens 0..cr-1):   kv[:H],   score[SW:SW+H]
+        # Group 1 (tokens cr..2cr-1): kv[H:2H], score[SW+H:SW+2H]
+        if overlap:
+            sw = state_width
+            g0_kv = raw[:compress_ratio, :head_dim]
+            g1_kv = raw[compress_ratio:, head_dim : 2 * head_dim]
+            g0_scores = raw[:compress_ratio, sw : sw + head_dim]
+            g1_scores = raw[compress_ratio:, sw + head_dim : sw + 2 * head_dim]
+            kv = torch.cat([g0_kv, g1_kv])
+            scores = torch.cat([g0_scores, g1_scores])
+        else:
+            kv = raw[:, :head_dim]
+            scores = raw[:, state_width : state_width + head_dim]
+
+        scores[~valid] = float("-inf")
+        kv[~valid] = 0.0
+        weights = torch.softmax(scores, dim=0)
+        compressed = (kv * weights).sum(dim=0)  # [H]
+        var = (compressed * compressed).mean()
+        normed = compressed * torch.rsqrt(var + rms_eps) * rms_weight.float()
+        compressed_pos = (pos // compress_ratio) * compress_ratio
+        cos, sin = cos_sin_cache[compressed_pos].float().chunk(2)
+        nope, rope = normed.split([nope_dim, rope_dim])
+        rope = torch.stack(
+            [rope[0::2] * cos - rope[1::2] * sin, rope[1::2] * cos + rope[0::2] * sin],
+            dim=-1,
+        ).reshape(rope_dim)
+        results.append(torch.cat([nope, rope]).to(state_cache.dtype))
+    result = torch.stack(results)
+
+    if use_fp4:
+        return quantize_to_mxfp4(result)
+    else:
+        pairs = [
+            _ue8m0_reference(result[t], head_dim, fp8_max) for t in range(len(result))
+        ]
+        quants, scales = zip(*pairs)
+        return torch.stack(quants), torch.cat(scales)
+
+
+@pytest.mark.parametrize("num_tokens", [1, 7, 32])
+@pytest.mark.parametrize("kv_block_size", [16, 32])
+@pytest.mark.parametrize("use_fp4", [False, True])
+def test_fused_kv_insert_indexer(num_tokens: int, kv_block_size: int, use_fp4: bool):
+    """Fused K compress+norm+rope+quant+insert for the indexer KV cache."""
+    HEAD_DIM = 128
+    ROPE_DIM = 64
+    BLOCK_SIZE = 16
+    RMS_EPS = 1e-6
+    FP8_MAX = 448.0
+
+    device = "cuda"
+    torch.manual_seed(42)
+    compress_ratio = 4
+
+    if use_fp4:
+        TOKEN_STRIDE = HEAD_DIM // 2  # packed nibbles: 64 bytes
+        SCALE_DIM = HEAD_DIM // 32  # ue8m0 bytes: 4
+        QUANT_BLOCK = 32
+        kernel = _fused_kv_compress_norm_rope_insert_indexer_mxfp4_attn
+    else:
+        TOKEN_STRIDE = HEAD_DIM  # FP8 bytes: 128
+        SCALE_DIM = 4  # 1 float32: 4 bytes
+        QUANT_BLOCK = HEAD_DIM
+        kernel = _fused_kv_compress_norm_rope_insert_indexer_attn
+
+    # overlap=1 whenever compress_ratio==4, matching DeepseekCompressor logic.
+    overlap = 1 if compress_ratio == 4 else 0
+    coff = 1 + overlap  # multiplier for state_dim per entry
+
+    num_pages = (compress_ratio * num_tokens - 1) // BLOCK_SIZE + 2
+    state_cache = torch.randn(
+        num_pages,
+        BLOCK_SIZE,
+        2 * coff * HEAD_DIM,  # kv_state + score_state, each coff*HEAD_DIM wide
+        dtype=torch.bfloat16,
+        device=device,
+    )
+    block_table = torch.arange(num_pages, dtype=torch.int32, device=device).unsqueeze(0)
+    token_to_req = torch.zeros(num_tokens, dtype=torch.int32, device=device)
+    slot_mapping = torch.arange(num_tokens, dtype=torch.int64, device=device)
+    positions = torch.arange(
+        compress_ratio - 1,
+        compress_ratio * num_tokens,
+        compress_ratio,
+        dtype=torch.int64,
+        device=device,
+    )
+    rms_weight = torch.randn(HEAD_DIM, dtype=torch.bfloat16, device=device)
+    cos_sin_cache = torch.randn(compress_ratio * num_tokens, ROPE_DIM, device=device)
+
+    kv_n_blocks = (num_tokens + kv_block_size - 1) // kv_block_size + 1
+    kv_cache = torch.zeros(
+        kv_n_blocks,
+        kv_block_size * (TOKEN_STRIDE + SCALE_DIM),
+        dtype=torch.uint8,
+        device=device,
+    )
+
+    kernel[(num_tokens,)](
+        state_cache,
+        state_cache.stride(0),
+        state_cache.stride(1),
+        token_to_req,
+        positions,
+        slot_mapping,
+        block_table,
+        block_table.stride(0),
+        BLOCK_SIZE,
+        rms_weight,
+        RMS_EPS,
+        cos_sin_cache,
+        cos_sin_cache.stride(0),
+        kv_cache,
+        slot_mapping,
+        kv_block_size,
+        HEAD_SIZE=HEAD_DIM,
+        TRITON_BLOCK_SIZE=HEAD_DIM,
+        STATE_WIDTH=coff * HEAD_DIM,
+        COMPRESS_RATIO=compress_ratio,
+        OVERLAP=overlap,
+        ROPE_HEAD_DIM=ROPE_DIM,
+        FP8_MAX=FP8_MAX,
+        QUANT_BLOCK=QUANT_BLOCK,
+        TOKEN_STRIDE=TOKEN_STRIDE,
+        SCALE_DIM=SCALE_DIM,
+        KV_BLOCK_STRIDE=kv_cache.stride(0),
+        num_warps=1,
+    )
+
+    k_quant, scale = _reference_kv_compress_norm_rope(
+        state_cache,
+        block_table,
+        positions,
+        rms_weight,
+        cos_sin_cache,
+        compress_ratio,
+        overlap,
+        use_fp4,
+        rms_eps=RMS_EPS,
+        fp8_max=FP8_MAX,
+    )
+
+    if use_fp4:
+        for i in range(num_tokens):
+            blk, pos = i // kv_block_size, i % kv_block_size
+            val_off = pos * TOKEN_STRIDE
+            fp4_actual = kv_cache[blk, val_off : val_off + TOKEN_STRIDE]
+            assert torch.equal(k_quant[i], fp4_actual), (
+                f"token {i}: packed nibbles differ, "
+                f"{(k_quant[i] != fp4_actual).sum()} "
+                f"/ {TOKEN_STRIDE}"
+            )
+
+            scale_off = kv_block_size * TOKEN_STRIDE + pos * SCALE_DIM
+            scale_actual = kv_cache[blk, scale_off : scale_off + SCALE_DIM]
+            assert torch.equal(scale_actual, scale[i]), (
+                f"token {i}: ue8m0 {scale_actual.tolist()} != {scale[i].tolist()}"
+            )
+
+    else:
+        k_quant = k_quant.view(torch.uint8)
+        for i in range(num_tokens):
+            blk, pos = i // kv_block_size, i % kv_block_size
+            val_off = pos * TOKEN_STRIDE
+            assert torch.equal(
+                k_quant[i], kv_cache[blk, val_off : val_off + TOKEN_STRIDE]
+            ), f"token {i}: FP8 bytes differ"
+
+            scale_off = kv_block_size * TOKEN_STRIDE + pos * SCALE_DIM
+            actual_scale = kv_cache[blk, scale_off : scale_off + SCALE_DIM].view(
+                torch.float32
+            )
+            assert torch.equal(actual_scale, scale[i : i + 1]), (
+                f"token {i}: scale {actual_scale.item()} != {scale[i].item()}"
+            )
diff --git a/tests/kernels/test_fused_indexer_q_rope_quant.py b/tests/kernels/test_fused_indexer_q_rope_quant.py
index 03d5ad4c8ac7..be2039ce513e 100644
--- a/tests/kernels/test_fused_indexer_q_rope_quant.py
+++ b/tests/kernels/test_fused_indexer_q_rope_quant.py
@@ -30,6 +30,56 @@
 MAX_POS = 4096
 
 
+def quantize_to_mxfp4(
+    x: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Reference MXFP4 quantization.
+
+    Args:
+        x: [..., head_dim] where head_dim is divisible by 32
+    Returns:
+        packed: [..., head_dim//2]  uint8   2 E2M1 nibbles/byte, low nibble = even index
+        scales: [..., head_dim//32] uint8   1 ue8m0 byte
+    """
+    MXFP4_BLOCK_SIZE = 32
+    orig_shape = x.shape
+    head_dim = orig_shape[-1]
+    n_blocks = head_dim // MXFP4_BLOCK_SIZE
+
+    x_f32 = x.float().reshape(-1, n_blocks, MXFP4_BLOCK_SIZE)
+
+    # Per-block ue8m0 scale: 2^ceil(log2(amax / 6.0)), stored as byte = exp + 127
+    # 6 * 2^-126 is from https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/inference/kernel.py#L163
+    amax = x_f32.abs().amax(dim=-1, keepdim=True).clamp(min=6 * (2**-126))
+    log2_ratio = (amax * (1.0 / 6.0)).log2().ceil().clamp(-127.0, 127.0)
+    scale = log2_ratio.exp2()
+    ue8m0 = (log2_ratio + 127.0).to(torch.uint8)  # [*, n_blocks]
+
+    # E2M1 round-to-nearest-even: midpoints round to the even code.
+    # E2M1 values: [0.00, 0.50, 1.00, 1.50, 2.00, 3.00, 4.00, 6.00]
+    # boundaries:  [   0.25, 0.75, 1.25, 1.75, 2.50, 3.50, 5.00]
+    x_scaled = (x_f32 / scale).clamp(-6.0, 6.0)
+    abs_x = x_scaled.abs()
+    code = torch.zeros_like(abs_x, dtype=torch.int32)
+    code = torch.where(abs_x > 0.25, 1, code)
+    code = torch.where(abs_x >= 0.75, 2, code)
+    code = torch.where(abs_x > 1.25, 3, code)
+    code = torch.where(abs_x >= 1.75, 4, code)
+    code = torch.where(abs_x > 2.5, 5, code)
+    code = torch.where(abs_x >= 3.5, 6, code)
+    code = torch.where(abs_x > 5.0, 7, code)
+    sign = ((x_scaled.view(torch.int32) >> 31) & 1).to(torch.uint8)
+    nibble = code.to(torch.uint8) | (sign << 3)
+
+    # Pack: even-index element → low nibble, odd-index → high nibble
+    nibble_flat = nibble.reshape(-1, head_dim)
+    packed = (nibble_flat[:, 0::2] | (nibble_flat[:, 1::2] << 4)).contiguous()
+    packed = packed.reshape(*orig_shape[:-1], head_dim // 2)
+
+    scales = ue8m0.view(*orig_shape[:-1], n_blocks)
+    return packed, scales
+
+
 def _reference(
     positions: torch.Tensor,
     q: torch.Tensor,
@@ -37,6 +87,7 @@ def _reference(
     weights: torch.Tensor,
     softmax_scale: float,
     head_scale: float,
+    use_fp4: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     q_rot = q.clone()
     ops.rotary_embedding(
@@ -49,22 +100,33 @@ def _reference(
         HEAD_DIM - ROPE_DIM,  # rope_dim_offset → rotate the tail
         False,
     )
-    q_fp8, q_scale = per_token_group_quant_fp8(
-        q_rot.view(-1, HEAD_DIM).contiguous(),
-        HEAD_DIM,
-        use_ue8m0=True,
-    )
-    q_fp8 = q_fp8.view(-1, N_HEAD, HEAD_DIM)
-    q_scale = q_scale.view(-1, N_HEAD)
 
-    weights_out = weights.to(torch.float32) * q_scale * softmax_scale * head_scale
-    return q_fp8, weights_out
+    if use_fp4:
+        q_packed, ue8m0 = quantize_to_mxfp4(q_rot.view(-1, N_HEAD, HEAD_DIM))
+        # Pack 4 ue8m0 bytes into 1 int32
+        q_scale = ue8m0.view(torch.int32).squeeze(-1)
+        # FP4 path: q_scale stays separate (cannot be folded into a per-token scalar)
+        weights_out = weights.to(torch.float32) * softmax_scale * head_scale
+        return (q_packed, q_scale), weights_out
+
+    else:
+        q_fp8, q_scale = per_token_group_quant_fp8(
+            q_rot.view(-1, HEAD_DIM).contiguous(),
+            HEAD_DIM,
+            use_ue8m0=True,
+        )
+        q_fp8 = q_fp8.view(-1, N_HEAD, HEAD_DIM)
+        q_scale = q_scale.view(-1, N_HEAD)
+
+        weights_out = weights.to(torch.float32) * q_scale * softmax_scale * head_scale
+        return q_fp8, weights_out
 
 
 @pytest.mark.parametrize("num_tokens", [1, 7, 32, 257])
 @pytest.mark.parametrize("cache_dtype", [torch.float32, torch.bfloat16])
+@pytest.mark.parametrize("use_fp4", [False, True])
 @torch.inference_mode()
-def test_fused_indexer_q_rope_quant_matches_unfused(num_tokens, cache_dtype):
+def test_fused_indexer_q_rope_quant_matches_unfused(num_tokens, cache_dtype, use_fp4):
     device = "cuda"
     torch.manual_seed(0)
 
@@ -77,21 +139,32 @@ def test_fused_indexer_q_rope_quant_matches_unfused(num_tokens, cache_dtype):
     softmax_scale = HEAD_DIM**-0.5
     head_scale = N_HEAD**-0.5
 
-    q_fp8_ref, weights_ref = _reference(
-        positions, q, cos_sin_cache, weights, softmax_scale, head_scale
+    q_quant_ref, weights_ref = _reference(
+        positions, q, cos_sin_cache, weights, softmax_scale, head_scale, use_fp4
     )
-    q_fp8_fused, weights_fused = fused_indexer_q_rope_quant(
-        positions, q.clone(), cos_sin_cache, weights, softmax_scale, head_scale
+    q_quant_fused, weights_fused = fused_indexer_q_rope_quant(
+        positions, q.clone(), cos_sin_cache, weights, softmax_scale, head_scale, use_fp4
     )
 
+    if use_fp4:
+        q_quant_ref, q_scale_ref = q_quant_ref
+        q_quant_fused, q_scale_fused = q_quant_fused
+
+        assert torch.equal(q_scale_ref, q_scale_fused), (
+            f"q_scale mismatch: "
+            f"{(q_scale_ref != q_scale_fused).sum().item()} "
+            f"/ {q_scale_ref.numel()} bytes differ"
+        )
+
     # fp8 tensors aren't directly comparable via torch.equal — reinterpret as int8.
-    ref_bits = q_fp8_ref.view(torch.int8)
-    fused_bits = q_fp8_fused.view(torch.int8)
+    ref_bits = q_quant_ref.view(torch.int8)
+    fused_bits = q_quant_fused.view(torch.int8)
     assert torch.equal(ref_bits, fused_bits), (
-        f"q_fp8 mismatch: "
+        f"q_quant_fused mismatch: "
         f"{(ref_bits != fused_bits).sum().item()} / {ref_bits.numel()} bytes differ"
     )
 
+    assert weights_fused.dtype == torch.float32
     assert torch.equal(weights_ref, weights_fused), (
         f"weights mismatch: max abs diff "
         f"{(weights_ref - weights_fused).abs().max().item()}"
diff --git a/tests/kernels/test_kda.py b/tests/kernels/test_kda.py
new file mode 100644
index 000000000000..18531fad999a
--- /dev/null
+++ b/tests/kernels/test_kda.py
@@ -0,0 +1,157 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Precision tests for vllm's chunk_kda Triton operator.
+
+Compares chunk_kda against a naive recurrent reference (float32).
+Uses torch.rand for q/k/v to match FLA's test pattern.
+"""
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm.model_executor.layers.fla.ops.kda import chunk_kda
+from vllm.model_executor.layers.fla.ops.l2norm import l2norm_fwd
+
+DEVICE = "cuda"
+
+
+def naive_recurrent_kda(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float | None = None,
+    initial_state: torch.Tensor | None = None,
+    output_final_state: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor | None]:
+    """Naive recurrent KDA reference, ported from FLA's naive.py."""
+    dtype = v.dtype
+    B, T, H, K = q.shape
+    V = v.shape[-1]
+    if scale is None:
+        scale = K**-0.5
+
+    q, k, v, g, beta = (x.to(torch.float) for x in [q, k, v, g, beta])
+    q = q * scale
+
+    S = k.new_zeros(B, H, K, V).to(q)
+    if initial_state is not None:
+        S += initial_state
+    o = torch.zeros_like(v)
+    for i in range(T):
+        q_i, k_i, v_i, g_i, b_i = q[:, i], k[:, i], v[:, i], g[:, i], beta[:, i]
+        S = S * g_i[..., None].exp()
+        S = S + torch.einsum(
+            "bhk,bhv->bhkv",
+            b_i[..., None] * k_i,
+            v_i - (k_i[..., None] * S).sum(-2),
+        )
+        o[:, i] = torch.einsum("bhk,bhkv->bhv", q_i, S)
+    if not output_final_state:
+        S = None
+    return o.to(dtype), S
+
+
+def assert_close(
+    name: str,
+    ref: torch.Tensor,
+    tri: torch.Tensor,
+    ratio: float,
+    err_atol: float = 1e-6,
+):
+    """RMSE-based relative error comparison."""
+    abs_err = (ref.detach() - tri.detach()).flatten().abs().max().item()
+    rmse_diff = (ref.detach() - tri.detach()).flatten().square().mean().sqrt().item()
+    rmse_base = ref.detach().flatten().square().mean().sqrt().item()
+    rel_err = rmse_diff / (rmse_base + 1e-8)
+    print(f"{name:>4} | abs={abs_err:.6f} | rmse={rel_err:.6f} | thr={ratio}")
+    if abs_err <= err_atol:
+        return
+    assert not torch.isnan(ref).any(), f"{name}: NaN detected in ref"
+    assert not torch.isnan(tri).any(), f"{name}: NaN detected in tri"
+    assert rel_err < ratio, (
+        f"{name}: max abs err {abs_err:.6f}, rmse ratio {rel_err:.6f} >= {ratio}"
+    )
+
+
+@pytest.mark.parametrize(
+    ("H", "D", "cu_seqlens", "dtype"),
+    [
+        pytest.param(
+            *test,
+            id="H{}-D{}-cu{}-{}".format(*test),
+        )
+        for test in [
+            (32, 128, [0, 64], torch.float16),
+            (32, 128, [0, 1024], torch.float16),
+            (32, 128, [0, 15], torch.float16),
+            (32, 128, [0, 256, 512, 768, 1024], torch.float16),
+            (32, 128, [0, 15, 100, 300, 1200], torch.float16),
+            (64, 128, [0, 256, 500, 1000], torch.float16),
+            (32, 128, [0, 8192], torch.float16),
+            (32, 128, [0, 256, 500, 1000], torch.bfloat16),
+        ]
+    ],
+)
+@torch.inference_mode()
+def test_chunk_kda(
+    H: int,
+    D: int,
+    cu_seqlens: list[int],
+    dtype: torch.dtype,
+):
+    T = cu_seqlens[-1]
+    torch.manual_seed(42)
+    B = 1
+    cu_seqlens_t = torch.LongTensor(cu_seqlens).to(DEVICE)
+    N = len(cu_seqlens) - 1
+
+    q = torch.rand(B, T, H, D, dtype=dtype, device=DEVICE)
+    k = torch.rand(B, T, H, D, dtype=dtype, device=DEVICE)
+    v = torch.rand(B, T, H, D, dtype=dtype, device=DEVICE)
+    g = F.logsigmoid(torch.randn(B, T, H, D, dtype=torch.float32, device=DEVICE)).to(
+        dtype
+    )
+    beta = torch.rand(B, T, H, dtype=dtype, device=DEVICE).sigmoid()
+    h0 = torch.randn(N, H, D, D, dtype=torch.float32, device=DEVICE)
+
+    # Naive reference with l2norm_fwd (same kernel as chunk_kda)
+    ref_outputs = []
+    ref_states = []
+    for i in range(N):
+        s, e = cu_seqlens[i], cu_seqlens[i + 1]
+        q_i = l2norm_fwd(q[:, s:e].contiguous())
+        k_i = l2norm_fwd(k[:, s:e].contiguous())
+        o_i, ht_i = naive_recurrent_kda(
+            q_i,
+            k_i,
+            v[:, s:e],
+            g[:, s:e],
+            beta[:, s:e],
+            initial_state=h0[i],
+            output_final_state=True,
+        )
+        ref_outputs.append(o_i)
+        ref_states.append(ht_i)
+    ref_o = torch.cat(ref_outputs, dim=1)
+    ref_ht = torch.cat(ref_states, dim=0)
+
+    # h0 transposed to (V, K) layout for the kernel; naive uses (K, V)
+    tri_o, tri_ht = chunk_kda(
+        q=q.clone(),
+        k=k.clone(),
+        v=v.clone(),
+        g=g.clone(),
+        beta=beta.clone(),
+        initial_state=h0.transpose(-1, -2).contiguous().clone(),
+        output_final_state=True,
+        cu_seqlens=cu_seqlens_t,
+        use_qk_l2norm_in_kernel=True,
+    )
+
+    assert not torch.isnan(tri_o).any(), "Triton output o contains NaN"
+    assert not torch.isnan(tri_ht).any(), "Triton output ht contains NaN"
+    assert_close("o", ref_o, tri_o, 0.005)
+    assert_close("ht", ref_ht, tri_ht.transpose(-1, -2).contiguous(), 0.005)
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 1c07dc4ae677..cfe5b46c64c5 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -943,110 +943,3 @@ def test_target_modules_match_packed_runtime_modules(
             ("layer1.dense2", RowParallelLinearWithLoRA),
         ],
     )
-
-
-@pytest.mark.parametrize("device", DEVICES)
-def test_load_adapter_warns_on_unsupported_modules(
-    default_vllm_config, dist_init, dummy_model_gate_up, device, tmp_path
-):
-    """Test that _load_adapter warns when a LoRA adapter contains modules
-    not in the model's supported LoRA target modules."""
-    from unittest.mock import patch
-
-    import vllm.lora.worker_manager as wm_module
-
-    lora_config = LoRAConfig(
-        max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
-    )
-
-    dummy_lora_files = f"{tmp_path}/lora_adapter"
-    os.makedirs(dummy_lora_files, exist_ok=True)
-    create_peft_lora(
-        dummy_model_gate_up,
-        save_dir=dummy_lora_files,
-        target_modules=["layer1.dense1", "dense2"],
-        lora_dtype=DEFAULT_DTYPE,
-    )
-
-    model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
-    vllm_config.scheduler_config.max_num_seqs = 4
-    vllm_config.scheduler_config.max_num_batched_tokens = 2
-
-    worker_manager = WorkerLoRAManager(vllm_config, device, EMBEDDING_MODULES)
-    worker_manager.vocab_size = dummy_model_gate_up.unpadded_vocab_size
-    worker_manager.create_lora_manager(dummy_model_gate_up)
-
-    # Patch from_local_checkpoint to inject an unsupported module
-    original_from_checkpoint = LoRAModel.from_local_checkpoint
-
-    def patched_from_checkpoint(*args, **kwargs):
-        lora = original_from_checkpoint(*args, **kwargs)
-        lora.loras["unsupported_module"] = LoRALayerWeights(
-            module_name="unsupported_module",
-            rank=8,
-            lora_alpha=16,
-            lora_a=torch.randn(8, 10),
-            lora_b=torch.randn(10, 8),
-        )
-        return lora
-
-    lora_request = LoRARequest("test", 1, dummy_lora_files)
-    with (
-        patch.object(LoRAModel, "from_local_checkpoint", patched_from_checkpoint),
-        patch.object(wm_module.logger, "warning_once") as mock_warning,
-    ):
-        worker_manager._load_adapter(lora_request)
-        warning_args = mock_warning.call_args_list
-        found = any("unsupported_module" in str(call) for call in warning_args)
-        assert found, (
-            f"Expected warning about 'unsupported_module', got: {warning_args}"
-        )
-
-
-@pytest.mark.parametrize("device", DEVICES)
-def test_load_adapter_warns_on_target_modules_restriction(
-    default_vllm_config, dist_init, dummy_model_gate_up, device, tmp_path
-):
-    """Test that _load_adapter warns when a LoRA adapter contains modules
-    excluded by the deployment-time target_modules restriction."""
-    from unittest.mock import patch
-
-    import vllm.lora.worker_manager as wm_module
-
-    # Restrict to only dense2 — adapter has dense1 which will be excluded
-    lora_config = LoRAConfig(
-        max_lora_rank=8,
-        max_cpu_loras=4,
-        max_loras=4,
-        lora_dtype=DEFAULT_DTYPE,
-        target_modules=["dense2"],
-    )
-
-    dummy_lora_files = f"{tmp_path}/lora_adapter"
-    os.makedirs(dummy_lora_files, exist_ok=True)
-    create_peft_lora(
-        dummy_model_gate_up,
-        save_dir=dummy_lora_files,
-        target_modules=["layer1.dense1", "dense2"],
-        lora_dtype=DEFAULT_DTYPE,
-    )
-
-    model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
-    vllm_config.scheduler_config.max_num_seqs = 4
-    vllm_config.scheduler_config.max_num_batched_tokens = 2
-
-    worker_manager = WorkerLoRAManager(vllm_config, device, EMBEDDING_MODULES)
-    worker_manager.vocab_size = dummy_model_gate_up.unpadded_vocab_size
-    worker_manager.create_lora_manager(dummy_model_gate_up)
-
-    lora_request = LoRARequest("test", 1, dummy_lora_files)
-    with patch.object(wm_module.logger, "warning_once") as mock_warning:
-        worker_manager._load_adapter(lora_request)
-        warning_args = mock_warning.call_args_list
-        # dense1 is supported by the model but excluded by target_modules
-        found = any("target_modules" in str(call) for call in warning_args)
-        assert found, (
-            f"Expected warning about target_modules restriction, got: {warning_args}"
-        )
diff --git a/tests/model_executor/model_loader/test_reload.py b/tests/model_executor/model_loader/test_reload.py
index 6e3e2d63e144..cf3553bd57de 100644
--- a/tests/model_executor/model_loader/test_reload.py
+++ b/tests/model_executor/model_loader/test_reload.py
@@ -59,6 +59,34 @@ def test_reload_lifecycle():
         assert tensor.__dict__ == materialized_tensor.__dict__
 
 
+def test_materialize_layer_preserves_non_meta_tensors():
+    """Ensure that materialize_layer does not overwrite non meta tensors."""
+    layer = torch.nn.Linear(2, 3, bias=True)
+
+    # Create a non meta bias tensor and meta weight, which can happen with FP8
+    bias_values = torch.ones(3)
+    layer.bias.data.copy_(bias_values)
+    layer.weight = torch.nn.Parameter(layer.weight.data.to("meta"))
+
+    assert layer.weight.is_meta
+    assert not layer.bias.is_meta
+
+    # materialize the layer weights after the bias is initialized
+    info = LayerReloadingInfo(
+        restore_metadata=({}, {}),
+        restore_device=torch.device("cpu"),
+    )
+    materialize_layer(layer, info)
+
+    # Ensure the weight materialized off meta
+    assert not layer.weight.is_meta
+    assert layer.weight.device.type == "cpu"
+
+    # Ensure that the bias is (still) not meta and values are unchanged
+    assert not layer.bias.is_meta
+    assert torch.equal(layer.bias.data, bias_values)
+
+
 def test_model_cleanup(dist_init, default_vllm_config):
     layer = QKVParallelLinear(2, 3, 4)
     assert layer.weight.weight_loader.__self__ is layer
diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py
index fc4f6f6b63f9..490284f43954 100644
--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@@ -23,11 +23,7 @@
     vllm_topk_sigmoid,
     vllm_topk_softmax,
 )
-from vllm.model_executor.layers.layernorm import (
-    RMSNorm,
-    dispatch_rocm_rmsnorm_func,
-    fused_add_rms_norm,
-)
+from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
 
 RMS_NORM_SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
@@ -153,26 +149,3 @@ def test_topk_sigmoid_dispatch(use_rocm_aiter: bool):
         assert topk_func == rocm_aiter_ops.topk_sigmoid
     else:
         assert topk_func == vllm_topk_sigmoid
-
-
-@pytest.mark.parametrize("add_residual", [False])
-@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("use_rocm_aiter", [True, False])
-@pytest.mark.skipif(
-    not current_platform.is_rocm(), reason="AITER is a feature exclusive for ROCm"
-)
-def test_rms_norm_dispatch(
-    add_residual: bool, dtype: torch.dtype, use_rocm_aiter: bool
-):
-    rms_norm_func = dispatch_rocm_rmsnorm_func(dtype, use_rocm_aiter)
-
-    should_use_rocm_aiter = (
-        current_platform.is_rocm()
-        and use_rocm_aiter
-        and dtype in RMS_NORM_SUPPORTED_DTYPES
-    )
-
-    if should_use_rocm_aiter:
-        assert rms_norm_func == rocm_aiter_ops.rms_norm2d_with_add
-    else:
-        assert rms_norm_func == fused_add_rms_norm
diff --git a/tests/model_executor/test_oink_integration.py b/tests/model_executor/test_oink_integration.py
index d7f38fdd5158..2f37472b73ef 100644
--- a/tests/model_executor/test_oink_integration.py
+++ b/tests/model_executor/test_oink_integration.py
@@ -1,60 +1,97 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
+import multiprocessing
 import types
 
 import pytest
-import torch
-
-
-def _load_oink_ops_module():
-    # Import the module normally (vllm is installed as an editable package in CI).
-    from vllm import _oink_ops
-
-    return _oink_ops
-
 
-def test_oink_availability_checks(monkeypatch: pytest.MonkeyPatch):
-    _oink_ops = _load_oink_ops_module()
-
-    # Ensure the ops namespace exists and is mutable for tests.
-    monkeypatch.setattr(
-        torch.ops,
-        "oink",
-        types.SimpleNamespace(rmsnorm=lambda x, w, eps: x),
-        raising=False,
-    )
-
-    # Case 1: CUDA not available.
-    monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
-    assert _oink_ops.is_oink_available_for_device(0) is False
-
-    # Case 2: CUDA available but < SM100.
-    monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
-    monkeypatch.setattr(torch.cuda, "get_device_capability", lambda idx: (9, 0))
-    assert _oink_ops.is_oink_available_for_device(0) is False
-
-    # Case 3: CUDA available and SM100, rmsnorm op registered.
-    monkeypatch.setattr(torch.cuda, "get_device_capability", lambda idx: (10, 0))
-    assert _oink_ops.is_oink_available_for_device(0) is True
-
-    # fused op presence probe
-    assert _oink_ops.has_fused_add_rms_norm() is False
-    monkeypatch.setattr(
-        torch.ops,
-        "oink",
-        types.SimpleNamespace(
-            rmsnorm=lambda x, w, eps: x,
-            fused_add_rms_norm=lambda x, residual, w, eps: None,
+from vllm.platforms import current_platform
+
+
+def _test_oink_availability_impl(
+    device_capability: tuple[int, int],
+    has_rmsnorm: bool,
+    has_fused_add_rms_norm: bool,
+    expected_available: bool,
+    expected_fused: bool,
+) -> None:
+    """Test OINK support detection with mocked state."""
+    import torch
+
+    from vllm import platforms
+
+    # Mock device capability (class method, override on class)
+    dc = platforms.interface.DeviceCapability(*device_capability)
+    platforms.current_platform.__class__.get_device_capability = lambda device_id=0: dc
+
+    # Mock oink ops
+    oink_ops = types.SimpleNamespace()
+    if has_rmsnorm:
+        oink_ops.rmsnorm = lambda x, w, eps: x
+    if has_fused_add_rms_norm:
+        oink_ops.fused_add_rms_norm = lambda x, residual, w, eps: None
+
+    torch.ops.oink = oink_ops
+
+    # Now import vllm modules with mocks in place (fresh import with mocked platform)
+    import vllm.kernels.oink_ops  # noqa: F401
+    from vllm.ir.ops import fused_add_rms_norm, rms_norm
+
+    # Verify support checks
+    assert rms_norm.impls["oink"].supported is expected_available
+    assert fused_add_rms_norm.impls["oink"].supported is expected_fused
+
+
+@pytest.mark.parametrize(
+    "device_capability,has_rmsnorm,has_fused_add_rms_norm,expected_available,expected_fused",
+    [
+        # Case 1: < SM100, ops not supported
+        ((9, 0), True, False, False, False),
+        # Case 2: CUDA available and SM100, rmsnorm op registered
+        ((10, 0), True, False, True, False),
+        # Case 3: SM100 with both rmsnorm and fused_add_rms_norm
+        ((10, 0), True, True, True, True),
+    ],
+)
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test on CUDA")
+def test_oink_availability_checks(
+    device_capability: tuple[int, int],
+    has_rmsnorm: bool,
+    has_fused_add_rms_norm: bool,
+    expected_available: bool,
+    expected_fused: bool,
+):
+    """Test OINK support detection with clean import state for each parameter set."""
+
+    # Use spawn to run function in fresh process with clean imports
+    # TODO migrate to spawn utility:
+    # https://github.com/vllm-project/vllm/issues/41415
+    ctx = multiprocessing.get_context("spawn")
+    process = ctx.Process(
+        target=_test_oink_availability_impl,
+        args=(
+            device_capability,
+            has_rmsnorm,
+            has_fused_add_rms_norm,
+            expected_available,
+            expected_fused,
         ),
-        raising=False,
     )
-    assert _oink_ops.has_fused_add_rms_norm() is True
+    process.start()
+    process.join()
+
+    if process.exitcode != 0:
+        raise AssertionError(
+            f"Subprocess test failed with exit code {process.exitcode}"
+        )
 
 
 def test_can_view_as_2d_stride_guard():
-    # Import the helper from the layernorm module.
-    from vllm.model_executor.layers.layernorm import _can_view_as_2d
+    # No global import
+    import torch
+
+    # Import the helper from the kernels module.
+    from vllm.kernels.oink_ops import _can_view_as_2d
 
     x = torch.zeros((2, 3, 4))
     assert _can_view_as_2d(x) is True
diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index 01d395b1e0d8..e410daf2fcdd 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -881,7 +881,7 @@ def test_apc_common_prefix_same_batch(
         "hello what is one plus one what is one plus one what is one plus one the answer is",  # noqa: E501
         "hello what is one plus one what is one plus one what is one plus one the answer is",  # noqa: E501
     ]
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=20)
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=20)
     outputs = llm.generate(prompts, sampling_params)
     for output in outputs:
         assert "two" in output.outputs[0].text
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index e04d44d4be28..45e693598f1b 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -805,6 +805,32 @@ def _granite4_vision_vllm_to_hf_output(vllm_output, model):
         max_num_seqs=2,
         patch_hf_runner=model_utils.molmo_patch_hf_runner,
     ),
+    "moondream3": VLMTestInfo(
+        models=["moondream/moondream3-preview"],
+        test_type=VLMTestType.IMAGE,
+        prompt_formatter=identity,
+        img_idx_to_prompt=lambda idx: "<|endoftext|><image>",
+        # Common-image coverage here targets query/caption. The native
+        # detect/point skills are not exposed by vLLM.
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<vlm_image><|md_reserved_0|>query<|md_reserved_1|>What is this sign?<|md_reserved_2|>",  # noqa: E501
+                "cherry_blossom": (
+                    "<vlm_image><|md_reserved_0|>query<|md_reserved_1|>What season is shown?<|md_reserved_2|>"  # noqa: E501
+                ),
+            }
+        ),
+        max_model_len=4096,
+        max_num_seqs=2,
+        dtype="bfloat16",
+        hf_processor=model_utils.moondream3_processor,
+        patch_hf_runner=model_utils.moondream3_patch_hf_runner,
+        # Single size factor to avoid GPU OOM when running multiple test
+        # cases sequentially (9B MoE model uses ~18 GiB per instance).
+        image_size_factors=[(1.0,)],
+        # Moondream3 is 9B params with MoE, needs significant GPU memory
+        marks=[large_gpu_mark(min_gb=48)],
+    ),
     "ovis1_6-gemma2": VLMTestInfo(
         models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
@@ -902,6 +928,16 @@ def _granite4_vision_vllm_to_hf_output(vllm_output, model):
             ),
         ],
     ),
+    "qianfan_ocr": VLMTestInfo(
+        models=["baidu/Qianfan-OCR"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<image>",
+        max_model_len=4096,
+        use_tokenizer_eos=True,
+        auto_cls=AutoModelForImageTextToText,
+        hf_model_kwargs=model_utils.qianfan_ocr_hf_model_kwargs("baidu/Qianfan-OCR"),
+    ),
     "qwen_vl": VLMTestInfo(
         models=["Qwen/Qwen-VL"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
diff --git a/tests/models/multimodal/generation/test_moondream3.py b/tests/models/multimodal/generation/test_moondream3.py
new file mode 100644
index 000000000000..e4aaa5a55b55
--- /dev/null
+++ b/tests/models/multimodal/generation/test_moondream3.py
@@ -0,0 +1,176 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Generation tests for Moondream3 query and caption support."""
+
+import pytest
+
+from tests.models.registry import HF_EXAMPLE_MODELS
+from vllm.platforms import current_platform
+
+from ....conftest import IMAGE_ASSETS, ImageTestAssets
+from ....utils import large_gpu_mark, multi_gpu_test
+
+MOONDREAM3_MODEL_ID = "moondream/moondream3-preview"
+MOONDREAM3_TOKENIZER = "moondream/starmie-v1"
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>What color is the stop sign?<|md_reserved_2|>",  # noqa: E501
+        "cherry_blossom": "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>What color are the flowers?<|md_reserved_2|>",  # noqa: E501
+    }
+)
+
+
+def make_query_prompt(question: str) -> str:
+    """Create a direct-answer query prompt for Moondream3."""
+    return (
+        "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>"
+        f"{question}<|md_reserved_2|>"
+    )
+
+
+def make_caption_prompt(length: str = "normal") -> str:
+    """Create a caption prompt for Moondream3."""
+    return (
+        "<|endoftext|><image><|md_reserved_0|>"
+        f"describe<|md_reserved_1|>{length}<|md_reserved_2|>"
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+@large_gpu_mark(min_gb=80)
+def test_tensor_parallel(image_assets: ImageTestAssets):
+    import gc
+
+    from vllm import LLM, SamplingParams
+    from vllm.distributed.parallel_state import destroy_model_parallel
+
+    destroy_model_parallel()
+    gc.collect()
+    current_platform.empty_cache()
+
+    llm = LLM(
+        model=MOONDREAM3_MODEL_ID,
+        tokenizer=MOONDREAM3_TOKENIZER,
+        trust_remote_code=True,
+        dtype="bfloat16",
+        tensor_parallel_size=2,
+        max_model_len=1024,
+        enforce_eager=True,
+        limit_mm_per_prompt={"image": 1},
+        gpu_memory_utilization=0.45,
+    )
+
+    image = image_assets[0].pil_image
+    prompt = make_query_prompt("What color is the stop sign?")
+
+    try:
+        outputs = llm.generate(
+            {"prompt": prompt, "multi_modal_data": {"image": image}},
+            SamplingParams(max_tokens=20, temperature=0),
+        )
+
+        assert len(outputs) > 0
+        assert outputs[0].outputs[0].text is not None
+    finally:
+        del llm
+        gc.collect()
+        current_platform.empty_cache()
+
+
+@pytest.fixture(scope="module")
+def llm():
+    model_info = HF_EXAMPLE_MODELS.get_hf_info("Moondream3ForCausalLM")
+    model_info.check_transformers_version(on_fail="skip")
+
+    from vllm import LLM
+
+    try:
+        return LLM(
+            model=MOONDREAM3_MODEL_ID,
+            tokenizer=MOONDREAM3_TOKENIZER,
+            trust_remote_code=True,
+            dtype="bfloat16",
+            max_model_len=2048,
+            enforce_eager=True,
+            limit_mm_per_prompt={"image": 1},
+            gpu_memory_utilization=0.45,
+        )
+    except Exception as exc:
+        pytest.skip(f"Failed to load {MOONDREAM3_MODEL_ID}: {exc}")
+
+
+@large_gpu_mark(min_gb=48)
+def test_model_loading(llm):
+    assert llm is not None
+
+
+@large_gpu_mark(min_gb=48)
+def test_query_skill(llm, image_assets: ImageTestAssets):
+    from vllm import SamplingParams
+
+    image = image_assets[0].pil_image
+    prompt = make_query_prompt("What color is the stop sign?")
+
+    outputs = llm.generate(
+        {"prompt": prompt, "multi_modal_data": {"image": image}},
+        SamplingParams(max_tokens=50, temperature=0),
+    )
+
+    output_text = outputs[0].outputs[0].text
+    assert output_text is not None
+    assert len(output_text) > 0
+
+
+@large_gpu_mark(min_gb=48)
+def test_caption_skill(llm, image_assets: ImageTestAssets):
+    from vllm import SamplingParams
+
+    image = image_assets[1].pil_image
+    prompt = make_caption_prompt()
+
+    outputs = llm.generate(
+        {"prompt": prompt, "multi_modal_data": {"image": image}},
+        SamplingParams(max_tokens=100, temperature=0),
+    )
+
+    output_text = outputs[0].outputs[0].text
+    assert output_text is not None
+    assert len(output_text) > 0
+
+
+@large_gpu_mark(min_gb=48)
+def test_batched_inference(llm, image_assets: ImageTestAssets):
+    from vllm import SamplingParams
+
+    images = [asset.pil_image for asset in image_assets]
+    prompts = [
+        {"prompt": prompt, "multi_modal_data": {"image": img}}
+        for img, prompt in zip(images, HF_IMAGE_PROMPTS)
+    ]
+
+    outputs = llm.generate(prompts, SamplingParams(max_tokens=50, temperature=0))
+
+    assert len(outputs) == len(images)
+    for output in outputs:
+        assert output.outputs[0].text is not None
+        assert len(output.outputs[0].text) > 0
+
+
+@pytest.mark.parametrize("asset_name", ["stop_sign", "cherry_blossom"])
+@large_gpu_mark(min_gb=48)
+def test_image_assets(llm, image_assets: ImageTestAssets, asset_name: str):
+    from vllm import SamplingParams
+
+    asset_idx = 0 if asset_name == "stop_sign" else 1
+    image = image_assets[asset_idx].pil_image
+    prompt = HF_IMAGE_PROMPTS[asset_idx]
+
+    outputs = llm.generate(
+        {"prompt": prompt, "multi_modal_data": {"image": image}},
+        SamplingParams(max_tokens=50, temperature=0),
+    )
+
+    output_text = outputs[0].outputs[0].text
+    assert output_text is not None
+    assert len(output_text) > 0
diff --git a/tests/models/multimodal/generation/test_qwen2_5_vl.py b/tests/models/multimodal/generation/test_qwen2_5_vl.py
index 3ba665710af4..791bb3b3088f 100644
--- a/tests/models/multimodal/generation/test_qwen2_5_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_5_vl.py
@@ -3,6 +3,7 @@
 
 import pytest
 
+from vllm.assets.image import ImageAsset
 from vllm.multimodal.video import sample_frames_from_video
 
 from ....conftest import VIDEO_ASSETS
@@ -11,6 +12,7 @@
 target_dtype = "bfloat16"
 
 VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
+IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
 
 
 def qwen2_5_vl_chat_template(*query):
@@ -28,6 +30,25 @@ def qwen2_5_vl_chat_template(*query):
 )
 
 
+WINDOW_ATTN_IMAGE_PROMPT = qwen2_5_vl_chat_template(
+    IMAGE_PLACEHOLDER,
+    "Describe the image.",
+)
+
+
+def _window_attention_regression_image():
+    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122
+    image = ImageAsset("hato").pil_image
+    return image.resize((image.width // 2, image.height // 2))
+
+
+def _encoder_cudagraph_config(*, max_vision_items: int) -> dict:
+    return {
+        "cudagraph_mm_encoder": True,
+        "encoder_cudagraph_max_vision_items_per_batch": max_vision_items,
+    }
+
+
 @pytest.mark.core_model
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@@ -146,3 +167,77 @@ def test_qwen2_5_vl_evs_batched_videos(
 
             # Ensure the output is a string
             assert isinstance(output_text, str)
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("use_bytecode_hook", [True, False])
+def test_qwen2_5_vl_window_attention_image(
+    vllm_runner,
+    model,
+    dtype: str,
+    max_tokens: int,
+    use_bytecode_hook: bool,
+    monkeypatch,
+) -> None:
+    """Regression test for Qwen2.5 window-attention image path."""
+    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
+
+    prompt = [WINDOW_ATTN_IMAGE_PROMPT]
+    images = [[_window_attention_regression_image()]]
+
+    with vllm_runner(
+        model,
+        runner="generate",
+        max_model_len=4096,
+        dtype=dtype,
+        limit_mm_per_prompt={"image": 1},
+        compilation_config=_encoder_cudagraph_config(max_vision_items=1),
+    ) as vllm_model:
+        outputs = vllm_model.generate_greedy(prompt, max_tokens, images=images)
+
+        assert len(outputs) == 1
+        output_ids, output_text = outputs[0]
+        assert len(output_ids) > 0
+        assert len(output_text) > 0
+        assert isinstance(output_text, str)
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("use_bytecode_hook", [True, False])
+def test_qwen2_5_vl_window_attention_image_batch(
+    vllm_runner,
+    model,
+    dtype: str,
+    max_tokens: int,
+    use_bytecode_hook: bool,
+    monkeypatch,
+) -> None:
+    """Regression test window-attention with a small image batch."""
+    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
+
+    image = _window_attention_regression_image()
+    prompts = [WINDOW_ATTN_IMAGE_PROMPT, WINDOW_ATTN_IMAGE_PROMPT]
+    images = [[image], [image]]
+
+    with vllm_runner(
+        model,
+        runner="generate",
+        max_model_len=4096,
+        max_num_seqs=2,
+        dtype=dtype,
+        limit_mm_per_prompt={"image": 1},
+        compilation_config=_encoder_cudagraph_config(max_vision_items=2),
+    ) as vllm_model:
+        outputs = vllm_model.generate_greedy(prompts, max_tokens, images=images)
+
+        assert len(outputs) == 2
+        for output_ids, output_text in outputs:
+            assert len(output_ids) > 0
+            assert len(output_text) > 0
+            assert isinstance(output_text, str)
diff --git a/tests/models/multimodal/generation/test_vit_cudagraph.py b/tests/models/multimodal/generation/test_vit_cudagraph.py
index 7adea0771b6d..fb7bdfc8625d 100644
--- a/tests/models/multimodal/generation/test_vit_cudagraph.py
+++ b/tests/models/multimodal/generation/test_vit_cudagraph.py
@@ -54,7 +54,18 @@ def qwen_vl_chat_template(content: str) -> str:
         needs_video_metadata=True,
         marks=[pytest.mark.core_model],
     ),
-    # TODO: Add more models below.
+    "qwen2_5_vl": VitCudagraphTestConfig(
+        model="Qwen/Qwen2.5-VL-3B-Instruct",
+        image_prompt=qwen_vl_chat_template(
+            "<|vision_start|><|image_pad|><|vision_end|>What is in this image?"
+        ),
+        video_prompt=qwen_vl_chat_template(
+            "<|vision_start|><|video_pad|><|vision_end|>"
+            "Describe this video in one sentence."
+        ),
+        needs_video_metadata=False,
+        marks=[pytest.mark.core_model],
+    ),
 }
 
 
diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py
index ae95f39586c0..207d3a3202a1 100644
--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -38,6 +38,7 @@ def run_test(
     limit_mm_per_prompt: dict[str, int],
     vllm_runner_kwargs: dict[str, Any] | None,
     hf_model_kwargs: dict[str, Any] | None,
+    hf_processor: Callable[[str], Any] | None,
     patch_hf_runner: Callable[[HfRunner], HfRunner] | None,
     runner: RunnerOption = "auto",
     distributed_executor_backend: str | None = None,
@@ -116,8 +117,18 @@ def run_test(
             )
             vllm_outputs_per_mm.append(vllm_output)
 
+    hf_runner_kwargs: dict[str, Any] = {}
+    if model_info.tokenizer:
+        hf_runner_kwargs["tokenizer_name"] = model_info.tokenizer
+    if hf_processor is not None:
+        hf_runner_kwargs["processor"] = hf_processor(model)
+
     hf_model = hf_runner(
-        model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
+        model,
+        dtype=dtype,
+        auto_cls=auto_cls,
+        model_kwargs=hf_model_kwargs,
+        **hf_runner_kwargs,
     )
 
     # Some models need to patch things like the model processor, e.g., internvl
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index 0a692387cffc..62ea36061c9c 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -1336,3 +1336,312 @@ def patched_generate(*args, **kwargs):
     hf_model.get_inputs = patched_get_inputs  # type: ignore[method-assign, assignment]
     hf_model.model.generate = patched_generate  # type: ignore[method-assign]
     return hf_model
+
+
+def moondream3_processor(model: str):
+    from vllm.transformers_utils.processors.moondream3 import Moondream3Processor
+
+    return Moondream3Processor.from_pretrained(model, trust_remote_code=True)
+
+
+def moondream3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patch HfRunner for Moondream3."""
+    moondream_processor = hf_model.processor
+
+    def processor(*args, text="", images=None, **kwargs):
+        if images is None:
+            return moondream_processor(text=text, **kwargs)
+
+        images_list = [images] if isinstance(images, Image) else images
+        return moondream_processor(images=images_list, text=text, **kwargs)
+
+    hf_model.processor = processor
+
+    # Expose the LM head for logprob extraction.
+    hf_model.model.get_output_embeddings = lambda: hf_model.model.model.text.lm_head
+
+    native_model = hf_model.model.model  # MoondreamModel instance
+
+    from torch.nn import functional as F
+
+    from vllm.model_executor.models.moondream3 import reconstruct_from_crops
+
+    # Resolve the placeholder tokens from the tokenizer instead of hard-coding.
+    image_placeholder_ids = moondream_processor.tokenizer.encode(
+        "<image>", add_special_tokens=False
+    )
+
+    def _normalize_tiling(tilings):
+        """Extract (h, w) tuple from various tiling container formats."""
+        tiling = tilings
+        if isinstance(tiling, torch.Tensor):
+            tiling = tuple(tiling.squeeze().tolist())
+        elif isinstance(tiling, (list, tuple)):
+            t0 = tiling[0]
+            if isinstance(t0, torch.Tensor):
+                tiling = tuple(t0.tolist())
+            elif isinstance(t0, (list, tuple)):
+                tiling = tuple(t0)
+        return tiling
+
+    def _encode_vision(pixel_values, tilings):
+        """Run preprocessed crops through vision encoder + projection."""
+        device = native_model.device
+        dtype = native_model.vision.pos_emb.dtype
+        config = native_model.config
+
+        pv = pixel_values
+        while pv.dim() > 4:
+            pv = pv.squeeze(0)
+        pv = pv.to(device=device, dtype=dtype)
+
+        features = native_model._vis_enc(pv)
+        grid_size = config.vision.crop_size // config.vision.enc_patch_size
+        global_feat = features[0]
+
+        if features.shape[0] > 1 and tilings is not None:
+            tiling = _normalize_tiling(tilings)
+            local = features[1:].view(-1, grid_size, grid_size, config.vision.enc_dim)
+            reconstructed = reconstruct_from_crops(
+                local,
+                tiling,
+                config.vision.overlap_margin,
+                patch_size=1,
+            )
+        else:
+            reconstructed = global_feat.view(
+                grid_size, grid_size, config.vision.enc_dim
+            )
+
+        return native_model._vis_proj(global_feat, reconstructed)
+
+    def _find_subsequence(seq, subseq):
+        """Find start index of subseq in seq, or None."""
+        n = len(subseq)
+        for i in range(len(seq) - n + 1):
+            if seq[i : i + n] == subseq:
+                return i
+        return None
+
+    def _generate(
+        self,
+        input_ids=None,
+        pixel_values=None,
+        tilings=None,
+        attention_mask=None,
+        **kwargs,
+    ):
+        max_new_tokens = kwargs.get("max_new_tokens", 128)
+        return_dict = kwargs.get("return_dict_in_generate", False)
+        output_hs = kwargs.get("output_hidden_states", False)
+
+        if pixel_values is None:
+            sequences = input_ids
+            if return_dict:
+                return types.SimpleNamespace(
+                    sequences=sequences,
+                    hidden_states=() if output_hs else None,
+                )
+            return sequences
+
+        # Processor may return lists; extract the single element.
+        if isinstance(pixel_values, (list, tuple)):
+            pixel_values = pixel_values[0]
+        if (
+            isinstance(tilings, (list, tuple))
+            and tilings
+            and not isinstance(tilings[0], int)
+        ):
+            tilings = tilings[0]
+
+        hf_model.model._setup_caches()
+        native_model.use_flex_decoding = False
+
+        device = native_model.device
+        config = native_model.config
+
+        with torch.inference_mode():
+            for block in native_model.text.blocks:
+                block.kv_cache.k_cache.zero_()
+                block.kv_cache.v_cache.zero_()
+
+            img_emb = _encode_vision(pixel_values, tilings)
+
+            bos_emb = F.embedding(
+                torch.tensor([[config.tokenizer.bos_id]], device=device),
+                native_model.text.wte,
+            )
+            img_input = torch.cat([bos_emb, img_emb.unsqueeze(0)], dim=1)
+            prefix_len = img_input.size(1)
+
+            mask = native_model.attn_mask[:, :, :prefix_len, :]
+            pos_ids = torch.arange(prefix_len, dtype=torch.long, device=device)
+            native_model._prefill(img_input, mask, pos_ids, None)
+
+            ids = input_ids.squeeze(0).tolist()
+            img_start = _find_subsequence(ids, image_placeholder_ids)
+
+            if img_start is None:
+                sequences = input_ids
+                if return_dict:
+                    return types.SimpleNamespace(
+                        sequences=sequences,
+                        hidden_states=() if output_hs else None,
+                    )
+                return sequences
+
+            prompt_tokens = ids[img_start + len(image_placeholder_ids) :]
+
+            if not prompt_tokens:
+                sequences = input_ids
+                if return_dict:
+                    return types.SimpleNamespace(
+                        sequences=sequences,
+                        hidden_states=() if output_hs else None,
+                    )
+                return sequences
+
+            prompt_tensor = torch.tensor([prompt_tokens], device=device)
+            prompt_emb = F.embedding(prompt_tensor, native_model.text.wte)
+            prompt_len = prompt_emb.size(1)
+
+            mask = native_model.attn_mask[:, :, prefix_len : prefix_len + prompt_len, :]
+            pos_ids = torch.arange(
+                prefix_len,
+                prefix_len + prompt_len,
+                dtype=torch.long,
+                device=device,
+            )
+            hidden = native_model._prefill(prompt_emb, mask, pos_ids, None)
+            pos = prefix_len + prompt_len
+
+            hidden_last = native_model.text.post_ln(hidden[:, -1:, :])
+            logits = native_model.text.lm_head(hidden_last.squeeze(1))
+
+            generated = []
+            all_hidden_states = []
+            # Record the hidden state that predicted each generated token.
+            prev_hs = hidden_last
+            for _ in range(max_new_tokens):
+                next_token = logits.argmax(dim=-1).item()
+                if next_token == 0:
+                    break
+                generated.append(next_token)
+                if output_hs:
+                    all_hidden_states.append((prev_hs,))
+
+                next_emb = F.embedding(
+                    torch.tensor([[next_token]], device=device),
+                    native_model.text.wte,
+                )
+                mask = native_model.attn_mask[:, :, pos : pos + 1, :]
+                pos_ids_step = torch.tensor([pos], dtype=torch.long, device=device)
+                hidden = native_model._prefill(next_emb, mask, pos_ids_step, None)
+                hidden_last = native_model.text.post_ln(hidden[:, -1:, :])
+                prev_hs = hidden_last
+                logits = native_model.text.lm_head(hidden_last.squeeze(1))
+                pos += 1
+
+            result_ids = ids + generated
+            sequences = torch.tensor([result_ids], device=device)
+
+            if return_dict:
+                return types.SimpleNamespace(
+                    sequences=sequences,
+                    hidden_states=tuple(all_hidden_states) if output_hs else None,
+                )
+            return sequences
+
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
+    return hf_model
+
+
+def qianfan_ocr_hf_model_kwargs(model_name: str) -> dict:
+    """Return hf_model_kwargs with a patched config for QianfanOCR."""
+    from vllm.transformers_utils.configs.qianfan_ocr import QianfanOCRConfig
+
+    config = QianfanOCRConfig.from_pretrained(model_name)
+    vc = config.vision_config
+    if isinstance(vc.image_size, int):
+        vc.image_size = (vc.image_size, vc.image_size)
+    if isinstance(vc.patch_size, int):
+        vc.patch_size = (vc.patch_size, vc.patch_size)
+    return {"config": config}
+
+
+def qianfan_ocr_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches an HfRunner instance to run QianfanOCR model inference.
+
+    QianfanOCR shares the same architecture as InternVLChatModel, so the
+    patching logic mirrors ``internvl_patch_hf_runner``.  The only difference
+    is that we load the config via vllm's registered ``QianfanOCRConfig``
+    instead of relying on ``trust_remote_code``.
+    """
+
+    class QianfanOCRProcessor:
+        def __init__(self, hf_runner: HfRunner):
+            self.tokenizer = hf_runner.tokenizer
+
+            from vllm.transformers_utils.configs.qianfan_ocr import QianfanOCRConfig
+
+            self.config = QianfanOCRConfig.from_pretrained(hf_runner.model_name)
+            self.vision_config = self.config.vision_config
+            self.use_thumbnail = self.config.use_thumbnail
+            self.min_num = self.config.min_dynamic_patch
+            self.max_num = self.config.max_dynamic_patch
+            self.image_size = self.vision_config.image_size
+
+            # Compute num_image_token from config instead of model attribute,
+            # since the transformers-native model doesn't expose it.
+            image_size = self.config.force_image_size or self.vision_config.image_size
+            patch_size = self.vision_config.patch_size
+            downsample_ratio = self.config.downsample_ratio
+            self.num_image_token = int(
+                (image_size // patch_size) ** 2 * (downsample_ratio**2)
+            )
+
+        def __call__(
+            self,
+            text: str,
+            images: PIL.Image.Image | list[PIL.Image.Image] = None,
+            **kwargs,
+        ):
+            from vllm.transformers_utils.processors.internvl import (
+                image_to_pixel_values_internvl,
+            )
+
+            IMG_START = "<img>"
+            IMG_END = "</img>"
+            IMG_CONTEXT = "<IMG_CONTEXT>"
+
+            images = [images] if isinstance(images, PIL.Image.Image) else images
+            pixel_values_list = [
+                image_to_pixel_values_internvl(
+                    image,
+                    input_size=self.image_size,
+                    min_num=self.min_num,
+                    max_num=self.max_num,
+                    use_thumbnail=self.use_thumbnail,
+                )
+                for image in images
+            ]
+            num_patches_list = [pv.shape[0] for pv in pixel_values_list]
+            pixel_values = torch.cat(pixel_values_list, dim=0)
+
+            for num_patches in num_patches_list:
+                context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
+                image_tokens = IMG_START + context_tokens + IMG_END
+                text = text.replace("<image>", image_tokens, 1)
+
+            prompt = self.tokenizer(text, return_tensors="pt")
+            prompt.update({"pixel_values": pixel_values})
+            return prompt
+
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
+    hf_model.model.img_context_token_id = img_context_token_id
+    hf_model.processor = QianfanOCRProcessor(hf_model)
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language_model.get_output_embeddings()
+    )
+    hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
+    return hf_model
diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py
index ae2f75481359..af48a1479bad 100644
--- a/tests/models/multimodal/generation/vlm_utils/types.py
+++ b/tests/models/multimodal/generation/vlm_utils/types.py
@@ -133,6 +133,7 @@ class VLMTestInfo(NamedTuple):
 
     # Exposed options for HF runner
     hf_model_kwargs: dict[str, Any] | None = None
+    hf_processor: Callable[[str], Any] | None = None
     # Indicates we should explicitly pass the EOS from the tokenizer
     use_tokenizer_eos: bool = False
     auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
@@ -196,6 +197,7 @@ def get_non_parametrized_runner_kwargs(self):
             "comparator": self.comparator,
             "get_stop_token_ids": self.get_stop_token_ids,
             "hf_model_kwargs": self.hf_model_kwargs,
+            "hf_processor": self.hf_processor,
             "stop_str": self.stop_str,
             "patch_hf_runner": self.patch_hf_runner,
         }
diff --git a/tests/models/multimodal/pooling/test_prithvi_mae.py b/tests/models/multimodal/pooling/test_prithvi_mae.py
index 19154c27da9a..1a466931a0e5 100644
--- a/tests/models/multimodal/pooling/test_prithvi_mae.py
+++ b/tests/models/multimodal/pooling/test_prithvi_mae.py
@@ -1,11 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import importlib.util
+
 import pytest
 import torch
 
 from ....conftest import VllmRunner
 
+pytestmark = pytest.mark.skipif(
+    importlib.util.find_spec("terratorch") is None,
+    reason="terratorch unavailable while PyPI has `lightning` quarantined; see #41376",
+)
+
 
 def _run_test(
     vllm_runner: type[VllmRunner],
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index cce69e15b00b..914e1a38ee80 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -311,6 +311,9 @@ def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
             baseline_processor,
             cached_processor,
             batch_idx,
+            hit_rate,
+            num_batches,
+            simplify_rate,
         )
 
 
@@ -320,6 +323,9 @@ def _test_processing_correctness_one(
     baseline_processor: BaseMultiModalProcessor,
     cached_processor: BaseMultiModalProcessor,
     batch_idx: int,
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
 ):
     model_type = model_config.hf_config.model_type
 
@@ -343,7 +349,11 @@ def _test_processing_correctness_one(
         baseline_tokenized_result,
         cached_tokenized_result,
         ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {token_prompt=}, {mm_data=})",
+        msg=(
+            f"Failed ({batch_idx=}, {hit_rate=}, "
+            f"{num_batches=}, {simplify_rate=}, "
+            f"{text_prompt=}, {token_prompt=}, {mm_data=})"
+        ),
     )
 
     if text_prompt is not None:
@@ -362,21 +372,33 @@ def _test_processing_correctness_one(
             baseline_text_result,
             cached_text_result,
             ignore_mm_keys=ignore_mm_keys,
-            msg=f"Failed ({batch_idx=}, {text_prompt=}, {mm_data=})",
+            msg=(
+                f"Failed ({batch_idx=}, {hit_rate=}, "
+                f"{num_batches=}, {simplify_rate=}, "
+                f"{text_prompt=}, {token_prompt=}, {mm_data=})"
+            ),
         )
 
         _assert_inputs_equal(
             baseline_text_result,
             baseline_tokenized_result,
             ignore_mm_keys=ignore_mm_keys,
-            msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
+            msg=(
+                f"Failed ({batch_idx=}, {hit_rate=}, "
+                f"{num_batches=}, {simplify_rate=}, "
+                f"{text_prompt=}, {token_prompt=}, {mm_data=})"
+            ),
         )
 
         _assert_inputs_equal(
             cached_text_result,
             cached_tokenized_result,
             ignore_mm_keys=ignore_mm_keys,
-            msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
+            msg=(
+                f"Failed ({batch_idx=}, {hit_rate=}, "
+                f"{num_batches=}, {simplify_rate=}, "
+                f"{text_prompt=}, {token_prompt=}, {mm_data=})"
+            ),
         )
 
 
@@ -408,6 +430,8 @@ def test_processing_correctness(
             "correctness test as is. Let's revisit adapting this "
             "test once more realtime models exist."
         )
+    if model_id == "CohereLabs/cohere-transcribe-03-2026":
+        pytest.skip("Fix later")
 
     _test_processing_correctness(
         model_id,
diff --git a/tests/models/multimodal/processing/test_gemma4.py b/tests/models/multimodal/processing/test_gemma4.py
index 808fab6a030f..24a30cae9d47 100644
--- a/tests/models/multimodal/processing/test_gemma4.py
+++ b/tests/models/multimodal/processing/test_gemma4.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
+from PIL import Image as PILImage
 
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
@@ -12,6 +13,119 @@
 GEMMA4_MODEL_ID = "google/gemma-4-E2B-it"
 
 
+@pytest.mark.parametrize(
+    "image_width,image_height,max_soft_tokens",
+    [
+        # Production repro: a 3x900 image (extreme aspect ratio) made the
+        # prompt-side estimator return 289 while the HF Gemma 4 image
+        # processor's vision tower output capped at 280, producing the
+        # "Attempted to assign 280 multimodal tokens to 289 placeholders"
+        # mismatch that crashed EngineCore.
+        (900, 3, 280),
+        (3, 900, 280),
+        # Same pathology should hold for the video-frame budget (70 tokens).
+        (900, 3, 70),
+        # And for any other supported budget.
+        (4000, 2, 1120),
+    ],
+)
+@pytest.mark.parametrize("model_id", [GEMMA4_MODEL_ID])
+def test_compute_num_soft_tokens_does_not_exceed_max_soft_tokens(
+    model_id: str,
+    image_width: int,
+    image_height: int,
+    max_soft_tokens: int,
+):
+    """Regression for the Gemma 3/4 multimodal crash.
+
+    `_compute_num_soft_tokens` must never return a value larger than
+    `max_soft_tokens`. The HF Gemma 4 image processor clamps its vision
+    tower output to that value; if the prompt-side estimator returns more,
+    the prompt has more `image` placeholder tokens than the encoder will
+    fill, and `_merge_multimodal_embeddings` raises `ValueError` deep in
+    the model forward.
+    """
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs={"do_pan_and_scan": True},
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+
+    num_soft_tokens = processor.info._compute_num_soft_tokens(
+        image_width=image_width,
+        image_height=image_height,
+        max_soft_tokens=max_soft_tokens,
+    )
+
+    assert num_soft_tokens <= max_soft_tokens, (
+        f"_compute_num_soft_tokens returned {num_soft_tokens} for "
+        f"image_width={image_width}, image_height={image_height}, "
+        f"max_soft_tokens={max_soft_tokens} — exceeds the cap that the HF "
+        f"image processor enforces on its vision tower output. This is "
+        f"the placeholder/encoder count mismatch that crashes EngineCore."
+    )
+
+
+@pytest.mark.parametrize(
+    ("mm_processor_kwargs", "expected_image_tokens"),
+    [
+        ({}, 280),
+        ({"max_soft_tokens": 70}, 70),
+        ({"max_soft_tokens": 280}, 280),
+        ({"max_soft_tokens": 1120}, 1120),
+        ({"images_kwargs": {"max_soft_tokens": 560}}, 560),
+        ({"images_kwargs": None}, 280),
+        ({"images_kwargs": "not-a-dict"}, 280),
+    ],
+)
+@pytest.mark.parametrize("model_id", [GEMMA4_MODEL_ID])
+def test_get_mm_max_tokens_per_item_respects_configured_max_soft_tokens(
+    model_id: str,
+    mm_processor_kwargs: dict[str, object],
+    expected_image_tokens: int,
+):
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=mm_processor_kwargs,
+        limit_mm_per_prompt={"image": 1, "video": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+
+    tokens = processor.info.get_mm_max_tokens_per_item(
+        seq_len=ctx.model_config.max_model_len,
+        mm_counts={"image": 1, "video": 1},
+    )
+
+    assert tokens is not None
+    assert tokens["image"] == expected_image_tokens
+    assert tokens["video"] == 32 * (70 + 2 + 6)
+
+
+@pytest.mark.parametrize("model_id", [GEMMA4_MODEL_ID])
+def test_get_prompt_updates_respects_nested_max_soft_tokens(model_id: str):
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs={"images_kwargs": {"max_soft_tokens": 560}},
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    image = PILImage.new("RGB", (1000, 1000), color="white")
+    image_size = image.size
+    mm_items = processor.info.parse_mm_data({"image": image})
+
+    prompt_update = processor._get_prompt_updates(mm_items, {}, {})[0]
+    replacement = prompt_update.resolve(0).content.full
+    expected = processor.info.get_image_repl(
+        image_width=image_size[0],
+        image_height=image_size[1],
+        processor=processor.info.get_hf_processor(),
+        max_soft_tokens=560,
+    ).full
+
+    assert replacement == expected
+
+
 @pytest.mark.parametrize("model_id", [GEMMA4_MODEL_ID])
 def test_limit_mm_per_prompt(
     image_assets: ImageTestAssets,
diff --git a/tests/models/multimodal/processing/test_moondream3.py b/tests/models/multimodal/processing/test_moondream3.py
new file mode 100644
index 000000000000..a6284ae45fd1
--- /dev/null
+++ b/tests/models/multimodal/processing/test_moondream3.py
@@ -0,0 +1,553 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for Moondream3 multimodal processing.
+
+Includes:
+- Processor creation and application tests
+- Image tokenization and placeholder expansion tests
+- Tiling and cropping logic tests (CPU-based)
+- Pixel normalization tests
+"""
+
+import numpy as np
+import pytest
+import torch
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from ....conftest import ImageTestAssets
+from ...utils import build_model_context
+
+MOONDREAM3_MODEL_ID = "moondream/moondream3-preview"
+# Expected multimodal prefix: BOS + 729 image tokens.
+EXPECTED_IMAGE_TOKENS = 730
+# Vision encoder constants
+CROP_SIZE = 378
+PATCH_SIZE = 14
+MAX_CROPS = 12
+
+
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_processor_creation(model_id: str):
+    """Test that Moondream3 processor can be created."""
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    assert processor is not None
+
+
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_processor_apply(
+    image_assets: ImageTestAssets,
+    model_id: str,
+):
+    """Test that Moondream3 processor can process inputs.
+
+    NOTE: The prompt includes the leading BOS token because Moondream3
+    pre-fills BOS and image embeddings together.
+    """
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+
+    prompt = "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>What is this?<|md_reserved_2|>"  # noqa: E501
+    mm_data = {"image": [image_assets[0].pil_image]}
+
+    processed_inputs = processor(
+        prompt,
+        mm_items=processor.info.parse_mm_data(mm_data),
+        hf_processor_mm_kwargs={},
+    )
+
+    assert "prompt_token_ids" in processed_inputs
+    image_placeholders = processed_inputs["mm_placeholders"]["image"]
+    assert len(image_placeholders) == 1
+    assert image_placeholders[0].length == EXPECTED_IMAGE_TOKENS
+
+
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_processor_pixel_values(
+    image_assets: ImageTestAssets,
+    model_id: str,
+):
+    """Test that pixel values are correctly produced."""
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+
+    prompt = "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>What is this?<|md_reserved_2|>"  # noqa: E501
+    mm_data = {"image": [image_assets[0].pil_image]}
+
+    processed_inputs = processor(
+        prompt,
+        mm_items=processor.info.parse_mm_data(mm_data),
+        hf_processor_mm_kwargs={},
+    )
+
+    # Check mm_kwargs contains pixel_values
+    mm_kwargs = processed_inputs.get("mm_kwargs")
+    assert mm_kwargs is not None
+    mm_data_result = mm_kwargs.get_data()
+    assert "pixel_values" in mm_data_result
+
+    # Verify pixel_values shape
+    pixel_values = mm_data_result["pixel_values"]
+    assert pixel_values.dim() == 5  # [batch, num_crops, C, H, W]
+    assert pixel_values.shape[2] == 3  # RGB channels
+    assert pixel_values.shape[3] == 378  # crop height
+    assert pixel_values.shape[4] == 378  # crop width
+
+
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_processor_image_token_expansion(
+    image_assets: ImageTestAssets,
+    model_id: str,
+):
+    """Test that <image> placeholder is expanded to correct number of tokens."""
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+
+    prompt = "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>Describe.<|md_reserved_2|>"  # noqa: E501
+    mm_data = {"image": [image_assets[0].pil_image]}
+
+    processed_inputs = processor(
+        prompt,
+        mm_items=processor.info.parse_mm_data(mm_data),
+        hf_processor_mm_kwargs={},
+    )
+    image_placeholders = processed_inputs["mm_placeholders"]["image"]
+    assert len(image_placeholders) == 1
+    assert image_placeholders[0].length == EXPECTED_IMAGE_TOKENS
+
+
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_multi_crop_tiling(
+    model_id: str,
+):
+    """Test that large images produce correct multi-crop tiling."""
+    from PIL import Image
+
+    from vllm.transformers_utils.processors.moondream3 import Moondream3Processor
+
+    processor = Moondream3Processor.from_pretrained(model_id, trust_remote_code=True)
+
+    # Create a large image that requires multiple crops
+    large_image = Image.new("RGB", (1000, 1000), color="blue")
+    pixel_values, tiling = processor.preprocess_image(large_image)
+
+    # Large images should produce more than 1x1 tiling
+    assert tiling[0] >= 1 and tiling[1] >= 1
+    # Check that we have global crop + local crops
+    expected_crops = tiling[0] * tiling[1] + 1
+    assert pixel_values.shape[0] == expected_crops
+
+
+@pytest.mark.parametrize(
+    "image_size",
+    [
+        (500, 500),
+        (800, 600),
+        (1920, 1080),
+    ],
+)
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_tiling_various_sizes(
+    image_size: tuple[int, int],
+    model_id: str,
+):
+    """Test tiling with various image sizes."""
+    from PIL import Image
+
+    from vllm.transformers_utils.processors.moondream3 import Moondream3Processor
+
+    processor = Moondream3Processor.from_pretrained(model_id, trust_remote_code=True)
+
+    width, height = image_size
+    image = Image.new("RGB", (width, height), color="red")
+    pixel_values, tiling = processor.preprocess_image(image)
+
+    # Basic shape checks
+    assert pixel_values.dim() == 4  # [num_crops, C, H, W]
+    assert pixel_values.shape[1] == 3  # RGB
+    assert pixel_values.shape[2] == 378  # crop height
+    assert pixel_values.shape[3] == 378  # crop width
+
+    # Tiling should respect max_crops (12)
+    assert tiling[0] * tiling[1] <= 12
+
+
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_pixel_normalization(
+    model_id: str,
+):
+    """Test that pixel values are normalized to [-1, 1] range."""
+    from PIL import Image
+
+    from vllm.transformers_utils.processors.moondream3 import Moondream3Processor
+
+    processor = Moondream3Processor.from_pretrained(model_id, trust_remote_code=True)
+
+    # Create test image
+    image = Image.new("RGB", (378, 378), color="green")
+    pixel_values, _ = processor.preprocess_image(image)
+
+    # Normalization: (x - 0.5) / 0.5 = 2*x - 1
+    # For input [0, 1], output should be [-1, 1]
+    assert pixel_values.min() >= -1.0
+    assert pixel_values.max() <= 1.0
+
+
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_chat_template_with_image(
+    image_assets: ImageTestAssets,
+    model_id: str,
+):
+    """Test that chat template correctly formats BOS + image + prompt."""
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    tokenizer = ctx.tokenizer
+
+    # Use the chat template format
+    prompt = "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>What is this?<|md_reserved_2|>"  # noqa: E501
+    mm_data = {"image": [image_assets[0].pil_image]}
+
+    processed_inputs = processor(
+        prompt,
+        mm_items=processor.info.parse_mm_data(mm_data),
+        hf_processor_mm_kwargs={},
+    )
+    token_ids = processed_inputs["prompt_token_ids"]
+
+    # BOS token (<|endoftext|>) should be token ID 0
+    bos_token_id = tokenizer.encode("<|endoftext|>", add_special_tokens=False)[0]
+    assert bos_token_id == 0
+
+    # First token should be BOS
+    assert token_ids[0] == bos_token_id
+
+
+@pytest.mark.parametrize(
+    "content",
+    [
+        pytest.param(
+            [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "https://example.invalid/image.png"},
+                },
+                {"type": "text", "text": "What is in this image?"},
+            ],
+            id="image-first",
+        ),
+        pytest.param(
+            [
+                {"type": "text", "text": "What is in this image?"},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "https://example.invalid/image.png"},
+                },
+            ],
+            id="text-first",
+        ),
+    ],
+)
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_chat_template_content_list_uses_moondream_image_prefix(
+    image_assets: ImageTestAssets,
+    content: list[dict[str, object]],
+    model_id: str,
+):
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    hf_processor = processor.info.get_hf_processor()
+
+    prompt = hf_processor.tokenizer.apply_chat_template(
+        [{"role": "user", "content": content}],
+        chat_template=hf_processor.chat_template,
+        tokenize=False,
+    )
+
+    expected_prompt = (
+        "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>"
+        "What is in this image?<|md_reserved_2|>"
+    )
+    assert prompt == expected_prompt
+
+    processed_inputs = processor(
+        prompt,
+        mm_items=processor.info.parse_mm_data({"image": [image_assets[0].pil_image]}),
+        hf_processor_mm_kwargs={},
+    )
+    image_placeholders = processed_inputs["mm_placeholders"]["image"]
+    assert len(image_placeholders) == 1
+    assert image_placeholders[0].length == EXPECTED_IMAGE_TOKENS
+
+
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_bos_token_always_first(
+    image_assets: ImageTestAssets,
+    model_id: str,
+):
+    """Test that BOS token (ID 0) is always at position 0."""
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+
+    # Start with BOS token explicitly
+    prompt = "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>Describe this image.<|md_reserved_2|>"  # noqa: E501
+    mm_data = {"image": [image_assets[0].pil_image]}
+
+    processed_inputs = processor(
+        prompt,
+        mm_items=processor.info.parse_mm_data(mm_data),
+        hf_processor_mm_kwargs={},
+    )
+    token_ids = processed_inputs["prompt_token_ids"]
+
+    # Token ID 0 (<|endoftext|>) should be the first token
+    assert token_ids[0] == 0, (
+        f"Expected BOS token (0) at position 0, got {token_ids[0]}"
+    )
+
+
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_processor_with_small_image(
+    model_id: str,
+):
+    """Test processor with image smaller than crop size."""
+    from PIL import Image
+
+    from vllm.transformers_utils.processors.moondream3 import Moondream3Processor
+
+    processor = Moondream3Processor.from_pretrained(model_id, trust_remote_code=True)
+
+    # Small image (smaller than crop size)
+    small_image = Image.new("RGB", (100, 100), color="yellow")
+    pixel_values, tiling = processor.preprocess_image(small_image)
+
+    # Small images should use 1x1 tiling
+    assert tiling == (1, 1)
+    # Should have 2 crops (global + 1 local)
+    assert pixel_values.shape[0] == 2
+
+
+@pytest.mark.parametrize(
+    "image_kind",
+    [
+        pytest.param("numpy_hwc", id="numpy-hwc"),
+        pytest.param("numpy_chw", id="numpy-chw"),
+        pytest.param("torch_chw", id="torch-chw"),
+    ],
+)
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_preprocess_image_accepts_non_pil_inputs(
+    image_assets: ImageTestAssets,
+    image_kind: str,
+    model_id: str,
+):
+    from vllm.transformers_utils.processors.moondream3 import Moondream3Processor
+
+    processor = Moondream3Processor.from_pretrained(model_id, trust_remote_code=True)
+    pil_image = image_assets[0].pil_image.convert("RGB")
+    hwc_array = np.asarray(pil_image)
+    expected_pixel_values, expected_tiling = processor.preprocess_image(pil_image)
+
+    if image_kind == "numpy_hwc":
+        image = hwc_array
+    elif image_kind == "numpy_chw":
+        image = np.transpose(hwc_array, (2, 0, 1))
+    else:
+        image = torch.from_numpy(np.transpose(hwc_array, (2, 0, 1)).copy())
+
+    pixel_values, tiling = processor.preprocess_image(image)
+
+    assert tiling == expected_tiling
+    assert pixel_values.shape == expected_pixel_values.shape
+    assert pixel_values.dtype == torch.bfloat16
+    assert torch.equal(pixel_values, expected_pixel_values)
+
+
+@pytest.mark.parametrize("image_kind", ["numpy_chw", "torch_chw"])
+@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
+def test_processor_apply_accepts_non_pil_image_inputs(
+    image_assets: ImageTestAssets,
+    image_kind: str,
+    model_id: str,
+):
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+
+    prompt = "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>What is this?<|md_reserved_2|>"  # noqa: E501
+    hwc_array = np.asarray(image_assets[0].pil_image.convert("RGB"))
+    chw_array = np.transpose(hwc_array, (2, 0, 1)).copy()
+    image = chw_array if image_kind == "numpy_chw" else torch.from_numpy(chw_array)
+
+    processed_inputs = processor(
+        prompt,
+        mm_items=processor.info.parse_mm_data({"image": [image]}),
+        hf_processor_mm_kwargs={},
+    )
+
+    image_placeholders = processed_inputs["mm_placeholders"]["image"]
+    assert len(image_placeholders) == 1
+    assert image_placeholders[0].length == EXPECTED_IMAGE_TOKENS
+
+    mm_kwargs = processed_inputs["mm_kwargs"].get_data()
+    assert mm_kwargs["pixel_values"].shape[2:] == (3, 378, 378)
+
+
+class TestMoondream3TilingLogic:
+    """CPU-based tests for Moondream3 tiling selection logic.
+
+    These tests validate the select_tiling() function which determines
+    how images are divided into crops for the vision encoder.
+    """
+
+    def test_small_image_no_tiling(self):
+        """Small images should use 1x1 tiling."""
+        from vllm.transformers_utils.processors.moondream3 import select_tiling
+
+        tiling = select_tiling(
+            height=300, width=300, crop_size=CROP_SIZE, max_crops=MAX_CROPS
+        )
+        assert tiling == (1, 1)
+
+    def test_exact_crop_size(self):
+        """Image exactly at crop size should use 1x1."""
+        from vllm.transformers_utils.processors.moondream3 import select_tiling
+
+        tiling = select_tiling(
+            height=CROP_SIZE, width=CROP_SIZE, crop_size=CROP_SIZE, max_crops=MAX_CROPS
+        )
+        assert tiling == (1, 1)
+
+    def test_large_square_image(self):
+        """Large square image should use multiple tiles."""
+        from vllm.transformers_utils.processors.moondream3 import select_tiling
+
+        tiling = select_tiling(
+            height=800, width=800, crop_size=CROP_SIZE, max_crops=MAX_CROPS
+        )
+        h_tiles, w_tiles = tiling
+        assert h_tiles >= 2
+        assert w_tiles >= 2
+        assert h_tiles * w_tiles <= MAX_CROPS
+
+    def test_wide_image(self):
+        """Wide image should have more width tiles."""
+        from vllm.transformers_utils.processors.moondream3 import select_tiling
+
+        tiling = select_tiling(
+            height=400, width=1200, crop_size=CROP_SIZE, max_crops=MAX_CROPS
+        )
+        h_tiles, w_tiles = tiling
+        assert w_tiles >= h_tiles
+
+    def test_tall_image(self):
+        """Tall image should have more height tiles."""
+        from vllm.transformers_utils.processors.moondream3 import select_tiling
+
+        tiling = select_tiling(
+            height=1200, width=400, crop_size=CROP_SIZE, max_crops=MAX_CROPS
+        )
+        h_tiles, w_tiles = tiling
+        assert h_tiles >= w_tiles
+
+    def test_respects_max_crops(self):
+        """Tiling should not exceed max_crops."""
+        from vllm.transformers_utils.processors.moondream3 import select_tiling
+
+        tiling = select_tiling(
+            height=2000, width=2000, crop_size=CROP_SIZE, max_crops=4
+        )
+        h_tiles, w_tiles = tiling
+        assert h_tiles * w_tiles <= 4
+
+
+class TestMoondream3VisionShapes:
+    """CPU-based tests for vision encoder expected shapes.
+
+    These tests verify the mathematical relationships between
+    crop size, patch size, and token counts.
+    """
+
+    def test_expected_patch_count(self):
+        """Test 378/14 = 27 patches per side, 729 total."""
+        patches_per_side = CROP_SIZE // PATCH_SIZE
+        total_patches = patches_per_side**2
+
+        assert patches_per_side == 27
+        assert total_patches == EXPECTED_IMAGE_TOKENS - 1
+
+    def test_patch_embedding_input_dim(self):
+        """Test patch embedding input dimension."""
+        channels = 3
+        input_dim = PATCH_SIZE * PATCH_SIZE * channels
+
+        assert input_dim == 14 * 14 * 3
+        assert input_dim == 588
+
+
+class TestMoondream3TauAttention:
+    """CPU-based tests for tau attention scaling components.
+
+    These tests validate the tau attention formula used in Moondream3:
+    - Token-based: tok_q = tanh(gelu(qkv) @ tau_wq.T)
+    - Position-based: tau_pos = 1 + (sigmoid(alpha * log(pos+1)) - 0.5)
+    """
+
+    def test_tau_position_range(self):
+        """Test tau position scaling produces values in valid range."""
+        num_heads = 32
+        seq_len = 100
+
+        tau_alpha = torch.randn(num_heads)
+        positions = torch.arange(seq_len)
+
+        pos_float = (positions.float() + 1.0).clamp(min=1e-6)
+        pos_log = pos_float.log()
+        tau_pos = 1.0 + (torch.sigmoid(tau_alpha[:, None] * pos_log[None, :]) - 0.5)
+
+        assert tau_pos.shape == (num_heads, seq_len)
+        # tau_pos should be between 0.5 and 1.5
+        assert tau_pos.min() >= 0.5
+        assert tau_pos.max() <= 1.5
+
+    def test_tau_token_output_range(self):
+        """Test tau token scaling output is bounded by tanh."""
+        import torch.nn.functional as F
+
+        seq_len = 100
+        qkv_dim = 6144  # 2048 * 3
+        num_heads = 32
+
+        qkv = torch.randn(seq_len, qkv_dim)
+        tau_wq = torch.randn(num_heads, qkv_dim)
+
+        tok_feat = F.gelu(qkv)
+        tok_q = torch.tanh(tok_feat @ tau_wq.t())
+
+        assert tok_q.shape == (seq_len, num_heads)
+        # tanh output is bounded by [-1, 1]
+        assert tok_q.min() >= -1.0
+        assert tok_q.max() <= 1.0
diff --git a/tests/models/multimodal/test_nano_nemotron_vl.py b/tests/models/multimodal/test_nano_nemotron_vl.py
new file mode 100644
index 000000000000..6922af79c08e
--- /dev/null
+++ b/tests/models/multimodal/test_nano_nemotron_vl.py
@@ -0,0 +1,114 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.model_executor.models.nano_nemotron_vl import NemotronH_Nano_VL_V2
+
+
+class _TextOnlyMultiModalConfig:
+    def get_limit_per_prompt(self, modality: str) -> int:
+        return 0
+
+
+class _ImageOnlyMultiModalConfig:
+    def get_limit_per_prompt(self, modality: str) -> int:
+        return 1 if modality == "image" else 0
+
+
+class _ModelConfig:
+    multimodal_config = _TextOnlyMultiModalConfig()
+
+
+class _ImageOnlyModelConfig:
+    multimodal_config = _ImageOnlyMultiModalConfig()
+
+
+class _LanguageModel:
+    def __init__(self) -> None:
+        self.loaded_weights: list[tuple[str, object]] = []
+
+    def load_weights(self, weights):
+        self.loaded_weights = list(weights)
+
+
+class _MissingMultiModalModule:
+    def named_parameters(self):
+        raise AssertionError("multimodal weights should not be inspected")
+
+    def load_weights(self, weights):
+        raise AssertionError("multimodal weights should not be loaded")
+
+
+class _AdapterModule:
+    def named_parameters(self):
+        return []
+
+
+class _VisionModel:
+    def __init__(self) -> None:
+        self.loaded_weights: list[tuple[str, object]] = []
+
+    def load_weights(self, weights):
+        self.loaded_weights = list(weights)
+
+
+def test_nano_nemotron_vl_skips_multimodal_weights_in_text_only_mode():
+    model = object.__new__(NemotronH_Nano_VL_V2)
+    language_model = _LanguageModel()
+    object.__setattr__(model, "model_config", _ModelConfig())
+    object.__setattr__(model, "language_model", language_model)
+    object.__setattr__(model, "mlp1", _AdapterModule())
+    object.__setattr__(model, "vision_model", _MissingMultiModalModule())
+    object.__setattr__(model, "sound_encoder", None)
+
+    language_weight = object()
+    model.load_weights(
+        [
+            ("language_model.layers.0.weight", language_weight),
+            ("mlp1.0.weight", object()),
+            ("vision_model.radio_model.encoder.weight", object()),
+            ("sound_encoder.encoder.weight", object()),
+        ]
+    )
+
+    assert language_model.loaded_weights == [("layers.0.weight", language_weight)]
+
+
+def test_nano_nemotron_vl_loads_vision_weights_without_sound_encoder():
+    model = object.__new__(NemotronH_Nano_VL_V2)
+    language_model = _LanguageModel()
+    vision_model = _VisionModel()
+    object.__setattr__(model, "model_config", _ImageOnlyModelConfig())
+    object.__setattr__(model, "language_model", language_model)
+    object.__setattr__(model, "mlp1", _AdapterModule())
+    object.__setattr__(model, "vision_model", vision_model)
+    object.__setattr__(model, "sound_encoder", None)
+
+    language_weight = object()
+    vision_weight = object()
+    model.load_weights(
+        [
+            ("language_model.layers.0.weight", language_weight),
+            ("vision_model.radio_model.encoder.weight", vision_weight),
+        ]
+    )
+
+    assert language_model.loaded_weights == [("layers.0.weight", language_weight)]
+    assert vision_model.loaded_weights == [
+        ("radio_model.encoder.weight", vision_weight)
+    ]
+
+
+def test_nano_nemotron_vl_requires_sound_encoder_for_sound_weights():
+    model = object.__new__(NemotronH_Nano_VL_V2)
+    language_model = _LanguageModel()
+    vision_model = _VisionModel()
+    object.__setattr__(model, "model_config", _ImageOnlyModelConfig())
+    object.__setattr__(model, "language_model", language_model)
+    object.__setattr__(model, "mlp1", _AdapterModule())
+    object.__setattr__(model, "vision_model", vision_model)
+    object.__setattr__(model, "sound_encoder", None)
+
+    with pytest.raises(AssertionError):
+        model.load_weights([("sound_encoder.encoder.weight", object())])
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 6b041c67071d..e50b0a8de4d9 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -238,6 +238,11 @@ def check_available_online(
         "CohereLabs/c4ai-command-r7b-12-2024",
         trust_remote_code=True,
     ),
+    "CohereMoeForCausalLM": _HfExamplesInfo(
+        "/host/engines/cohere-moe",
+        trust_remote_code=True,
+        is_available_online=False,
+    ),
     "CwmForCausalLM": _HfExamplesInfo("facebook/cwm", min_transformers_version="4.58"),
     # FIXME: databricks/dbrx-instruct has been deleted
     "DbrxForCausalLM": _HfExamplesInfo(
@@ -372,6 +377,7 @@ def check_available_online(
     "KimiLinearForCausalLM": _HfExamplesInfo(
         "moonshotai/Kimi-Linear-48B-A3B-Instruct", trust_remote_code=True
     ),
+    "LagunaForCausalLM": _HfExamplesInfo("poolside/Laguna-XS.2"),
     "Lfm2ForCausalLM": _HfExamplesInfo("LiquidAI/LFM2-1.2B"),
     "Lfm2MoeForCausalLM": _HfExamplesInfo(
         "LiquidAI/LFM2-8B-A1B",
@@ -940,13 +946,6 @@ def check_available_online(
     "HCXVisionForCausalLM": _HfExamplesInfo(
         "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
         trust_remote_code=True,
-        max_transformers_version="4.57",
-        transformers_version_reason={
-            "vllm": (
-                "Custom config cannot be loaded with Transformers "
-                "v5 because `text_config` is not always set"
-            )
-        },
     ),
     "HCXVisionV2ForCausalLM": _HfExamplesInfo(
         "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
@@ -1116,6 +1115,16 @@ def check_available_online(
         extras={"olmo": "allenai/Molmo-7B-O-0924"},
         trust_remote_code=True,
     ),
+    "Moondream3ForCausalLM": _HfExamplesInfo(
+        "moondream/moondream3-preview",
+        tokenizer="moondream/starmie-v1",
+        trust_remote_code=True,
+    ),
+    "HfMoondream": _HfExamplesInfo(
+        "moondream/moondream3-preview",
+        tokenizer="moondream/starmie-v1",
+        trust_remote_code=True,
+    ),
     "Molmo2ForConditionalGeneration": _HfExamplesInfo(
         "allenai/Molmo2-8B",
         extras={"olmo": "allenai/Molmo2-O-7B"},
@@ -1132,30 +1141,17 @@ def check_available_online(
     "NemotronH_Nano_VL_V2": _HfExamplesInfo(
         "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
         max_model_len=4096,
-        # NemotronH layers are constructed via `hybrid_override_pattern`:
+        # NemotronH layers are constructed via `hybrid_override_pattern`
         use_original_num_layers=True,
         hf_overrides={
-            "vision_config": PretrainedConfig(
-                args={
-                    "min_num_patches": 1,  # Trigger image dynamic res
-                    "max_num_patches": 12,
-                    "model": "vit_huge_patch16_224",
-                },
-                # Trigger conv3d:
-                video_temporal_patch_size=2,
-            ),
-            "text_config": {
-                "num_hidden_layers": 2,
-                "hybrid_override_pattern": "M*",
-            },
+            "text_config": {"num_hidden_layers": 2, "hybrid_override_pattern": "M*"},
         },
         trust_remote_code=True,
     ),
-    # NemotronH_Nano_Omni_Reasoning_V3 is an alias for NemotronH_Nano_VL_V2
-    # Use the same registry test as NemotronH_Nano_VL_V2 above
     "NemotronH_Nano_Omni_Reasoning_V3": _HfExamplesInfo(
-        "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
+        "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16",
         max_model_len=4096,
+        # NemotronH layers are constructed via `hybrid_override_pattern`
         use_original_num_layers=True,
         hf_overrides={
             "vision_config": PretrainedConfig(
@@ -1165,35 +1161,17 @@ def check_available_online(
                     "model": "vit_huge_patch16_224",
                 },
                 video_temporal_patch_size=2,
+                # TODO(nhaber): This is `true` in the official `config.json`,
+                # but this causes a processor exception in the tests due to a known bug
+                # with mixed-resolution video when `true`. To be resolved.
+                video_maintain_aspect_ratio=False,
             ),
-            "text_config": {
-                "num_hidden_layers": 2,
-                "hybrid_override_pattern": "M*",
-            },
+            "text_config": {"num_hidden_layers": 2, "hybrid_override_pattern": "M*"},
         },
         trust_remote_code=True,
     ),
-    # NemotronH_Super_Omni_Reasoning_V3 is an alias for NemotronH_Nano_VL_V2 as well
-    # Use the same registry test as NemotronH_Nano_VL_V2 above
     "NemotronH_Super_Omni_Reasoning_V3": _HfExamplesInfo(
-        "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
-        max_model_len=4096,
-        use_original_num_layers=True,
-        hf_overrides={
-            "vision_config": PretrainedConfig(
-                args={
-                    "min_num_patches": 1,
-                    "max_num_patches": 12,
-                    "model": "vit_huge_patch16_224",
-                },
-                video_temporal_patch_size=2,
-            ),
-            "text_config": {
-                "num_hidden_layers": 2,
-                "hybrid_override_pattern": "M*",
-            },
-        },
-        trust_remote_code=True,
+        "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16", is_available_online=False
     ),
     "OpenCUAForConditionalGeneration": _HfExamplesInfo(
         "xlangai/OpenCUA-7B",
@@ -1286,6 +1264,10 @@ def check_available_online(
         },
         tokenizer_mode="mistral",
     ),
+    "QianfanOCRForConditionalGeneration": _HfExamplesInfo(
+        "baidu/Qianfan-OCR",
+        min_transformers_version="5.6.0",
+    ),
     "QwenVLForConditionalGeneration": _HfExamplesInfo(
         "Qwen/Qwen-VL",
         extras={"chat": "Qwen/Qwen-VL-Chat"},
@@ -1390,9 +1372,7 @@ def check_available_online(
     ),
     # [Encoder-decoder]
     "CohereAsrForConditionalGeneration": _HfExamplesInfo(
-        "CohereLabs/cohere-transcribe-03-2026",
-        trust_remote_code=True,
-        is_available_online=False,  # TODO (ekagra): revert after asr release
+        "CohereLabs/cohere-transcribe-03-2026", trust_remote_code=True
     ),
     "NemotronParseForConditionalGeneration": _HfExamplesInfo(
         "nvidia/NVIDIA-Nemotron-Parse-v1.1", trust_remote_code=True
@@ -1463,6 +1443,11 @@ def check_available_online(
         speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
         tokenizer="MiniMaxAI/MiniMax-M2",
     ),
+    "EagleMistralForCausalLM": _HfExamplesInfo(
+        "mistralai/Mistral-Medium-3.5-128B",
+        speculative_model="mistralai/Mistral-Medium-3.5-128B-EAGLE",
+        is_available_online=False,
+    ),
     "EagleMistralLarge3ForCausalLM": _HfExamplesInfo(
         "mistralai/Mistral-Large-3-675B-Instruct-2512",
         speculative_model="mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle",
@@ -1509,6 +1494,12 @@ def check_available_online(
         trust_remote_code=True,
         is_available_online=False,
     ),
+    "Gemma4MTPModel": _HfExamplesInfo(
+        "google/gemma-4-E4B-it",
+        speculative_model="google/gemma-4-E4B-it-assistant",
+        trust_remote_code=True,
+        min_transformers_version="5.8.0",
+    ),
     "ErnieMTPModel": _HfExamplesInfo(
         "baidu/ERNIE-4.5-21B-A3B-PT",
         trust_remote_code=True,
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 979c8d31775c..476ad1c7c17f 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -109,6 +109,16 @@ def _initialize_kv_caches_v1(self, vllm_config):
             "which is not configured in test environment"
         )
 
+    if model_arch in ("PrithviGeoSpatialMAE", "Terratorch"):
+        import importlib.util
+
+        if importlib.util.find_spec("terratorch") is None:
+            pytest.skip(
+                "terratorch is not installed; "
+                "temporarily skipped while PyPI has `lightning` quarantined "
+                "(see #41376)"
+            )
+
     if model_arch in ["DeepseekV32ForCausalLM", "GlmMoeDsaForCausalLM"]:
         from vllm.platforms import current_platform
 
diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py
index 81fae02efda1..0715409abda6 100644
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -36,6 +36,17 @@ def test_registry_imports(model_arch):
         check_max_version=False,
         check_version_reason="vllm",
     )
+
+    if model_arch in ("PrithviGeoSpatialMAE", "Terratorch"):
+        import importlib.util
+
+        if importlib.util.find_spec("terratorch") is None:
+            pytest.skip(
+                "terratorch is not installed; "
+                "temporarily skipped while PyPI has `lightning` quarantined "
+                "(see #41376)"
+            )
+
     # Ensure all model classes can be imported successfully
     model_cls = ModelRegistry._try_load_model_cls(model_arch)
     assert model_cls is not None
diff --git a/tests/models/test_terratorch.py b/tests/models/test_terratorch.py
index 71125dbe94f8..6d4d1921a88d 100644
--- a/tests/models/test_terratorch.py
+++ b/tests/models/test_terratorch.py
@@ -1,12 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import importlib.util
+
 import pytest
 import torch
 
 from tests.conftest import VllmRunner
 from tests.utils import create_new_process_for_each_test
 
+pytestmark = pytest.mark.skipif(
+    importlib.util.find_spec("terratorch") is None,
+    reason="terratorch unavailable while PyPI has `lightning` quarantined; see #41376",
+)
+
 
 @create_new_process_for_each_test()  # Hangs otherwise
 @pytest.mark.parametrize(
diff --git a/tests/models/utils.py b/tests/models/utils.py
index b93beee6aa3a..b12ab72d77c7 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -523,6 +523,17 @@ class DummyConfig:
 
     text_config.update(update_dict)
 
+    # Update n_layers and moe configs for Moondream3 model
+    if model_arch in ("Moondream3ForCausalLM", "HfMoondream"):
+        text_config.update(
+            {
+                "n_layers": num_hidden_layers,
+                "moe_num_experts": num_experts,
+                "moe_experts_per_token": 2,
+                "moe_start_layer": num_hidden_layers,
+            }
+        )
+
     if hasattr(hf_config, "vision_config"):
         hf_config.vision_config.update(
             {
@@ -531,6 +542,9 @@ class DummyConfig:
             }
         )
 
+        if model_arch in ("Moondream3ForCausalLM", "HfMoondream"):
+            hf_config.vision_config.update({"enc_n_layers": 1})
+
     # e.g.: ibm-granite/granite-speech-3.3-2b
     if hasattr(hf_config, "encoder_config"):
         hf_config.encoder_config.update(
diff --git a/tests/parser/__init__.py b/tests/parser/__init__.py
new file mode 100644
index 000000000000..208f01a7cb5e
--- /dev/null
+++ b/tests/parser/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/tests/parser/test_streaming.py b/tests/parser/test_streaming.py
new file mode 100644
index 000000000000..d9194d48ed5a
--- /dev/null
+++ b/tests/parser/test_streaming.py
@@ -0,0 +1,165 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
+from vllm.parser.abstract_parser import _WrappedParser
+from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
+from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
+
+
+class ThinkReasoningParser(BaseThinkingReasoningParser):
+    @property
+    def start_token(self) -> str:
+        return "<think>"
+
+    @property
+    def end_token(self) -> str:
+        return "</think>"
+
+
+MODEL_OUTPUT = (
+    "<think>let me think about this</think>"
+    '<tool_call>\n{"name": "get_weather", '
+    '"arguments": {"city": "Dallas"}}\n</tool_call>'
+)
+
+
+@pytest.fixture(scope="module")
+def tokenizer():
+    from vllm.tokenizers import get_tokenizer
+
+    return get_tokenizer("Qwen/Qwen3-32B")
+
+
+@pytest.fixture
+def request_obj():
+    return ChatCompletionRequest(
+        model="test-model",
+        messages=[{"role": "user", "content": "hi"}],
+    )
+
+
+def make_parser(tokenizer, reasoning=False, tool=False):
+    _WrappedParser.reasoning_parser_cls = ThinkReasoningParser if reasoning else None
+    _WrappedParser.tool_parser_cls = Hermes2ProToolParser if tool else None
+    return _WrappedParser(tokenizer)
+
+
+def stream_text(parser, tokenizer, text, request, prompt_token_ids=None):
+    token_ids = tokenizer.encode(text, add_special_tokens=False)
+    results: list[DeltaMessage | None] = []
+    for tid in token_ids:
+        delta_text = tokenizer.decode([tid])
+        result = parser.parse_delta(
+            delta_text, [tid], request, prompt_token_ids=prompt_token_ids
+        )
+        prompt_token_ids = None
+        results.append(result)
+    return results
+
+
+def collect_fields(results):
+    all_reasoning = "".join(r.reasoning for r in results if r and r.reasoning)
+    all_content = "".join(r.content for r in results if r and r.content)
+    all_tool_calls = [tc for r in results if r and r.tool_calls for tc in r.tool_calls]
+    return all_reasoning, all_content, all_tool_calls
+
+
+def test_parse_delta_neither_parser(tokenizer, request_obj):
+    parser = make_parser(tokenizer, reasoning=False, tool=False)
+    results = stream_text(
+        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert reasoning == ""
+    assert len(tool_calls) == 0
+    assert "<think>" in content
+    assert "let me think about this" in content
+    assert "<tool_call>" in content
+    assert "get_weather" in content
+
+
+def test_parse_delta_tool_parser_only(tokenizer, request_obj):
+    parser = make_parser(tokenizer, reasoning=False, tool=True)
+    results = stream_text(
+        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert reasoning == ""
+    assert "<think>" in content
+    assert "let me think about this" in content
+    assert "</think>" in content
+
+    assert len(tool_calls) > 0
+    assert tool_calls[0].function.name == "get_weather"
+    tool_args = "".join(
+        tc.function.arguments for tc in tool_calls if tc.function.arguments
+    )
+    assert json.loads(tool_args) == {"city": "Dallas"}
+
+
+def test_parse_delta_reasoning_parser_only(tokenizer, request_obj):
+    parser = make_parser(tokenizer, reasoning=True, tool=False)
+    results = stream_text(
+        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert "let me think about this" in reasoning
+    assert len(tool_calls) == 0
+    assert "<tool_call>" in content
+    assert "get_weather" in content
+    assert "</tool_call>" in content
+
+
+def test_parse_delta_both_parsers(tokenizer, request_obj):
+    parser = make_parser(tokenizer, reasoning=True, tool=True)
+    results = stream_text(
+        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert "let me think about this" in reasoning
+    assert content == ""
+
+    assert len(tool_calls) > 0
+    assert tool_calls[0].function.name == "get_weather"
+    tool_args = "".join(
+        tc.function.arguments for tc in tool_calls if tc.function.arguments
+    )
+    assert json.loads(tool_args) == {"city": "Dallas"}
+
+
+def test_parse_delta_reasoning_only_thinking_disabled(tokenizer, request_obj):
+    """Regression test for vllm-project/vllm#40466.
+
+    When enable_thinking=False, the chat template places <think>\\n\\n</think>
+    in the prompt. The model then generates pure content (no think tokens).
+    All streaming output must go to delta.content, not delta.reasoning.
+    """
+    parser = make_parser(tokenizer, reasoning=True, tool=False)
+
+    end_token_id = parser._reasoning_parser.end_token_id
+    prompt_token_ids = [1, 2, end_token_id, 3]
+
+    content_text = "Hello! How can I assist you today?"
+    results = stream_text(
+        parser,
+        tokenizer,
+        content_text,
+        request_obj,
+        prompt_token_ids=prompt_token_ids,
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert reasoning == "", f"Expected no reasoning, got: {reasoning!r}"
+    assert "Hello" in content
+    assert "assist" in content
+    assert len(tool_calls) == 0
diff --git a/tests/plugins_tests/test_terratorch_io_processor_plugins.py b/tests/plugins_tests/test_terratorch_io_processor_plugins.py
index 34799b3c42c0..b4c84b30d2ca 100644
--- a/tests/plugins_tests/test_terratorch_io_processor_plugins.py
+++ b/tests/plugins_tests/test_terratorch_io_processor_plugins.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import importlib.util
 import io
 
 import imagehash
@@ -11,6 +12,11 @@
 from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.pooling.protocol import IOProcessorResponse
 
+pytestmark = pytest.mark.skipif(
+    importlib.util.find_spec("terratorch") is None,
+    reason="terratorch unavailable while PyPI has `lightning` quarantined; see #41376",
+)
+
 models_config = {
     "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11": {
         "image_url": "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff",  # noqa: E501
diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py
index 3b58614e58d4..151b5d97ddf3 100644
--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -70,4 +70,5 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
         ["--enforce_eager"],
         ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
+        include_seeded_sampling=False,
     )
diff --git a/tests/quantization/test_cpu_wna16.py b/tests/quantization/test_cpu_wna16.py
index 5520dc1747ad..f6e54fc1473e 100644
--- a/tests/quantization/test_cpu_wna16.py
+++ b/tests/quantization/test_cpu_wna16.py
@@ -13,6 +13,7 @@
     "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4",  # without g_idx
     "RedHatAI/Qwen3-1.7B-quantized.w4a16",  # with zp
     "OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc",
+    "Qwen/Qwen3-0.6B-FP8",  # FP8 W8A16 block-quantized
 ]
 DTYPE = ["bfloat16"]
 
diff --git a/tests/quantization/test_quark_maybe_update_config.py b/tests/quantization/test_quark_maybe_update_config.py
deleted file mode 100644
index 0142e869c22c..000000000000
--- a/tests/quantization/test_quark_maybe_update_config.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Unit tests for QuarkConfig.maybe_update_config.
-
-Fetches real HF configs (metadata only, no model weights) to verify
-that dynamic_mxfp4_quant is only enabled for DeepSeek-V3-family models.
-
-Run: pytest tests/quantization/test_quark_maybe_update_config.py -v
-"""
-
-import pytest
-from transformers import AutoConfig
-
-from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig
-
-
-def _make_quark_config() -> QuarkConfig:
-    """Create a minimal QuarkConfig for testing."""
-    return QuarkConfig(quant_config={}, kv_cache_group=[], pack_method="reorder")
-
-
-# ---------------------------------------------------------------------------
-# Non-deepseek models must not flip dynamic_mxfp4_quant
-# ---------------------------------------------------------------------------
-@pytest.mark.parametrize(
-    "model_name",
-    ["amd/MiniMax-M2.1-MXFP4"],
-)
-def test_non_deepseek_model_stays_false(model_name: str):
-    """Non-deepseek_v3 models must not enable dynamic_mxfp4_quant."""
-    hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-    qcfg = _make_quark_config()
-
-    qcfg.maybe_update_config(model_name, hf_config=hf_config)
-
-    assert qcfg.dynamic_mxfp4_quant is False
-
-
-# ---------------------------------------------------------------------------
-# DeepSeek-V3 family + fp4 must enable dynamic_mxfp4_quant
-# ---------------------------------------------------------------------------
-@pytest.mark.parametrize(
-    "model_name",
-    ["amd/DeepSeek-R1-MXFP4-ASQ"],
-)
-def test_deepseek_family_fp4_enables_flag(model_name: str):
-    hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-    qcfg = _make_quark_config()
-
-    qcfg.maybe_update_config(model_name, hf_config=hf_config)
-
-    assert qcfg.dynamic_mxfp4_quant is True
-
-
-# ---------------------------------------------------------------------------
-# Missing hf_config → warn and stay False
-# ---------------------------------------------------------------------------
-def test_missing_hf_config_stays_false():
-    qcfg = _make_quark_config()
-
-    qcfg.maybe_update_config("some/model")
-
-    assert qcfg.dynamic_mxfp4_quant is False
diff --git a/tests/quantization/test_turboquant.py b/tests/quantization/test_turboquant.py
index f074ce119ae8..b9567195b3a8 100644
--- a/tests/quantization/test_turboquant.py
+++ b/tests/quantization/test_turboquant.py
@@ -182,22 +182,100 @@ def test_all_presets_all_head_dims(self, preset, head_dim):
 
     # ---- Boundary skip layers ----
 
+    @staticmethod
+    def _dense_model_config(num_layers):
+        from types import SimpleNamespace
+
+        return SimpleNamespace(
+            is_hybrid=False,
+            hf_text_config=SimpleNamespace(num_hidden_layers=num_layers),
+        )
+
     def test_boundary_skip_layers_basic(self):
-        layers = TurboQuantConfig.get_boundary_skip_layers(32)
+        mc = self._dense_model_config(32)
+        layers = TurboQuantConfig.get_boundary_skip_layers(mc)
         assert layers == ["0", "1", "30", "31"]
 
     def test_boundary_skip_layers_zero(self):
-        assert TurboQuantConfig.get_boundary_skip_layers(32, 0) == []
+        mc = self._dense_model_config(32)
+        assert TurboQuantConfig.get_boundary_skip_layers(mc, 0) == []
 
     def test_boundary_skip_layers_small_model(self):
-        layers = TurboQuantConfig.get_boundary_skip_layers(4)
+        mc = self._dense_model_config(4)
+        layers = TurboQuantConfig.get_boundary_skip_layers(mc)
         assert layers == ["0", "1", "2", "3"]
 
     def test_boundary_skip_layers_cap_at_half(self):
-        layers = TurboQuantConfig.get_boundary_skip_layers(8, 10)
+        mc = self._dense_model_config(8)
+        layers = TurboQuantConfig.get_boundary_skip_layers(mc, 10)
         assert len(layers) == 8
 
 
+class TestHybridAttentionIndices:
+    """Regression tests for boundary protection on hybrid models.
+
+    Hybrid models (attention + Mamba / linear-attention) identify KV-carrying
+    layers via layer_types / layers_block_type / attn_type_list. The helper
+    must return the *global* layer indices of the full-attention layers so
+    that kv_cache_dtype_skip_layers matches what extract_layer_index(prefix)
+    reports on the Attention layers at runtime.
+    """
+
+    @staticmethod
+    def _fake_model_config(text_cfg=None, hf_cfg=None):
+        from types import SimpleNamespace
+
+        return SimpleNamespace(
+            hf_text_config=text_cfg if text_cfg is not None else SimpleNamespace(),
+            hf_config=hf_cfg if hf_cfg is not None else SimpleNamespace(),
+        )
+
+    def test_layer_types_full_attention(self):
+        from vllm.model_executor.layers.quantization.turboquant.config import (
+            _get_full_attention_layer_indices,
+        )
+
+        cfg = type("C", (), {})()
+        cfg.layer_types = [
+            "linear_attention",
+            "linear_attention",
+            "full_attention",
+            "linear_attention",
+            "full_attention",
+            "full_attention",
+        ]
+        mc = self._fake_model_config(text_cfg=cfg)
+        assert _get_full_attention_layer_indices(mc) == [2, 4, 5]
+
+    def test_layers_block_type_jamba(self):
+        from vllm.model_executor.layers.quantization.turboquant.config import (
+            _get_full_attention_layer_indices,
+        )
+
+        cfg = type("C", (), {})()
+        cfg.layers_block_type = ["mamba", "attention", "mamba", "attention"]
+        mc = self._fake_model_config(text_cfg=cfg)
+        assert _get_full_attention_layer_indices(mc) == [1, 3]
+
+    def test_attn_type_list_minimax(self):
+        from vllm.model_executor.layers.quantization.turboquant.config import (
+            _get_full_attention_layer_indices,
+        )
+
+        hf = type("C", (), {})()
+        hf.attn_type_list = [0, 1, 0, 1, 1]
+        mc = self._fake_model_config(hf_cfg=hf)
+        assert _get_full_attention_layer_indices(mc) == [1, 3, 4]
+
+    def test_no_hybrid_hints_returns_empty(self):
+        from vllm.model_executor.layers.quantization.turboquant.config import (
+            _get_full_attention_layer_indices,
+        )
+
+        mc = self._fake_model_config()
+        assert _get_full_attention_layer_indices(mc) == []
+
+
 # ============================================================================
 # Centroids tests (CPU-only)
 # ============================================================================
diff --git a/tests/reasoning/test_kimi_k2_reasoning_parser.py b/tests/reasoning/test_kimi_k2_reasoning_parser.py
index 0f80bb8854a8..dfce2075c6a9 100644
--- a/tests/reasoning/test_kimi_k2_reasoning_parser.py
+++ b/tests/reasoning/test_kimi_k2_reasoning_parser.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from unittest.mock import MagicMock
+
 import pytest
 
 from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
@@ -12,6 +14,20 @@
 REASONING_MODEL_NAME = "moonshotai/Kimi-K2.5"
 
 
+@pytest.fixture
+def mock_kimi_k2_tokenizer():
+    tokenizer = MagicMock()
+    tokenizer.get_vocab.return_value = {
+        "<think>": 100,
+        "</think>": 101,
+        "<|tool_calls_section_begin|>": 200,
+        "<|tool_calls_section_end|>": 201,
+        "<|tool_call_begin|>": 202,
+        "<|tool_call_end|>": 203,
+    }
+    return tokenizer
+
+
 @pytest.fixture(scope="module")
 def kimi_k2_tokenizer():
     return get_tokenizer(tokenizer_name=REASONING_MODEL_NAME, trust_remote_code=True)
@@ -153,3 +169,50 @@ def test_streaming_tool_section_ends_reasoning(kimi_k2_tokenizer):
     )
     assert isinstance(result, DeltaMessage)
     assert result.content == "<|tool_calls_section_begin|>"
+
+
+def test_streaming_end_token_id_buffered(mock_kimi_k2_tokenizer):
+    """When stop sequences buffer text, </think> ID arrives before its text.
+
+    The token ID is present in delta_token_ids but the actual string is not
+    yet in delta_text (still buffered). The parser must return None to wait
+    for the next delta, instead of calling find() which returns -1 and
+    silently corrupting the text split.
+    """
+    parser = KimiK2ReasoningParser(mock_kimi_k2_tokenizer)
+    think_id = parser._start_token_id
+    end_think_id = parser._end_token_id
+
+    # Simulate: </think> ID arrived but text not yet flushed.
+    # Two token IDs in delta to bypass the single-special-token guard.
+    result = parser.extract_reasoning_streaming(
+        previous_text="some reasoning",
+        current_text="some reasoning extra",
+        delta_text="extra",  # </think> text not yet flushed
+        previous_token_ids=[think_id],
+        current_token_ids=[think_id, end_think_id, 999],
+        delta_token_ids=[end_think_id, 999],
+    )
+    assert result is None
+
+
+def test_streaming_tool_section_id_buffered(mock_kimi_k2_tokenizer):
+    """When stop sequences buffer text, tool section start ID arrives before its text.
+
+    Same buffering scenario as above but for <|tool_calls_section_begin|>.
+    Without the guard, find() returns -1 and delta_text[:tool_index] silently
+    drops the last character of reasoning.
+    """
+    parser = KimiK2ReasoningParser(mock_kimi_k2_tokenizer)
+    think_id = parser._start_token_id
+    tool_begin_id = parser._tool_section_start_token_id
+
+    result = parser.extract_reasoning_streaming(
+        previous_text="some reasoning",
+        current_text="some reasoning extra",
+        delta_text="extra",  # tool section text not yet flushed
+        previous_token_ids=[think_id],
+        current_token_ids=[think_id, tool_begin_id, 999],
+        delta_token_ids=[tool_begin_id, 999],
+    )
+    assert result is None
diff --git a/tests/renderers/test_chat_utils_prompt_embeds.py b/tests/renderers/test_chat_utils_prompt_embeds.py
new file mode 100644
index 000000000000..e33cc304710d
--- /dev/null
+++ b/tests/renderers/test_chat_utils_prompt_embeds.py
@@ -0,0 +1,576 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Offline unit tests for `prompt_embeds` chat-completion content parts."""
+
+from __future__ import annotations
+
+import inspect
+import io
+from typing import Final
+from unittest import mock
+
+import pybase64 as base64
+import pytest
+import regex as re
+import torch
+from transformers import AutoTokenizer
+
+from vllm.entrypoints.chat_utils import (
+    _ENABLE_PROMPT_EMBEDS_ERROR,
+    _PROMPT_EMBEDS_MISSING_DATA_ERROR,
+    _RESERVED_PLACEHOLDER_IN_TEXT_ERROR,
+    MM_PARSER_MAP,
+    MODALITY_PLACEHOLDERS_MAP,
+    PROMPT_EMBEDS_PLACEHOLDER_TOKEN,
+    parse_chat_messages,
+    parse_chat_messages_async,
+)
+from vllm.renderers.hf import (
+    _PROMPT_EMBEDS_PLACEHOLDER_SPAN_MISMATCH_ERROR,
+    _build_mixed_prompt_embeds,
+    _build_prompt_embeds_positions,
+    _build_prompt_embeds_updates,
+    _ensure_prompt_embeds_placeholder_token,
+    _expand_prompt_embeds_placeholders,
+)
+
+# Cover distinct tokenizer families:
+#   GPT2TokenizerFast  (BPE, OpenAI-style)
+#   Qwen2TokenizerFast (SentencePiece BPE variant)
+#   BertTokenizerFast  (WordPiece)
+TOKENIZER_IDS: Final[list[str]] = [
+    "gpt2",
+    "Qwen/Qwen2.5-1.5B-Instruct",
+    "bert-base-uncased",
+]
+
+
+@pytest.fixture(params=TOKENIZER_IDS, ids=TOKENIZER_IDS)
+def tokenizer(request):
+    """A fresh tokenizer instance per tokenizer family."""
+    return AutoTokenizer.from_pretrained(request.param)
+
+
+# Minimal chat template that works with any tokenizer.  Iterates
+# `message.content` as either a string or a list of dicts (openai format).
+_SIMPLE_CHAT_TEMPLATE: Final[str] = (
+    "{% for m in messages %}"
+    "{% if m['content'] is string %}{{m['content']}}"
+    "{% else %}{% for p in m['content'] %}{{p['text']}}{% endfor %}"
+    "{% endif %}\n{% endfor %}"
+)
+
+
+async def _maybe_await(fn, *args, **kwargs):
+    """Call *fn* and `await` the result if it's a coroutine."""
+    result = fn(*args, **kwargs)
+    if inspect.iscoroutine(result):
+        result = await result
+    return result
+
+
+# Parametrize over sync / async parse paths so every end-to-end test
+# exercises both.
+_PARSE_FUNCTIONS = [parse_chat_messages, parse_chat_messages_async]
+
+
+@pytest.fixture(params=_PARSE_FUNCTIONS, ids=["sync", "async"])
+def parse_fn(request):
+    """Either the sync or async `parse_chat_messages` callable."""
+    return request.param
+
+
+def _encode_tensor(t: torch.Tensor) -> str:
+    buf = io.BytesIO()
+    torch.save(t, buf)
+    return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+
+_MOCK_HIDDEN_SIZE: Final[int] = 8
+_MOCK_DTYPE: Final[torch.dtype] = torch.float32
+
+
+def _make_mock_model_config(*, enable_prompt_embeds: bool = True) -> mock.MagicMock:
+    mc = mock.MagicMock()
+    mc.enable_prompt_embeds = enable_prompt_embeds
+    mc.multimodal_config = None
+    mc.allowed_local_media_path = None
+    mc.allowed_media_domains = None
+    # Test text-only code path in `MultiModalItemTracker.resolve_items`.
+    mc.is_multimodal_model = False
+    # `safe_load_prompt_embeds` pins each tensor to the model's hidden_size
+    # and dtype, so the mock must return concrete values.
+    mc.get_hidden_size.return_value = _MOCK_HIDDEN_SIZE
+    mc.dtype = _MOCK_DTYPE
+    return mc
+
+
+def test_prompt_embeds_keys_registered():
+    assert "prompt_embeds" in MODALITY_PLACEHOLDERS_MAP
+    assert MODALITY_PLACEHOLDERS_MAP["prompt_embeds"] == "<##PROMPT_EMBEDS##>"
+    assert "prompt_embeds" in MM_PARSER_MAP
+
+
+def test_ensure_placeholder_token_is_single_token_and_idempotent(tokenizer):
+    """Ensure the placeholder token is a single token and that multiple calls to
+    "ensure" are idempotent, across all tokenizer families."""
+    tid1 = _ensure_prompt_embeds_placeholder_token(tokenizer)
+    tid2 = _ensure_prompt_embeds_placeholder_token(tokenizer)
+    assert tid1 == tid2
+
+    ids = tokenizer.encode(PROMPT_EMBEDS_PLACEHOLDER_TOKEN, add_special_tokens=False)
+    assert ids == [tid1]
+
+    # Repeating it in a string N times must produce exactly that many tokens.
+    N = 5
+    ids_rep = tokenizer.encode(
+        PROMPT_EMBEDS_PLACEHOLDER_TOKEN * N, add_special_tokens=False
+    )
+    assert ids_rep == [tid1] * N
+
+
+def test_parse_chat_messages_openai_format():
+    NUM_TOKENS = 3
+    t = torch.randn(NUM_TOKENS, _MOCK_HIDDEN_SIZE, dtype=_MOCK_DTYPE)
+    b64 = _encode_tensor(t)
+    mc = _make_mock_model_config()
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Hello "},
+                {"type": "prompt_embeds", "data": b64},
+                {"type": "text", "text": " world"},
+            ],
+        }
+    ]
+    conv, mm_data, _ = parse_chat_messages(
+        messages,
+        mc,
+        content_format="openai",
+    )
+    # The middle content part is rewritten to a single placeholder-token
+    # sentinel.
+    texts = [p["text"] for p in conv[0]["content"]]
+    assert texts == [
+        "Hello ",
+        PROMPT_EMBEDS_PLACEHOLDER_TOKEN,
+        " world",
+    ]
+    assert mm_data is not None and "prompt_embeds" in mm_data
+    assert torch.equal(mm_data["prompt_embeds"][0], t)
+
+
+# Each layout entry is one content part:
+#   ("text", "A")  -> {"type": "text", "text": "A"}
+#   ("embed", N)   -> {"type": "prompt_embeds", "data": <base64 of (N, H) tensor>}
+@pytest.mark.parametrize(
+    "layout",
+    [
+        # Case: Single embed only.
+        [("embed", 2)],
+        # Case: Embed at the start of the message.
+        [("embed", 3), ("text", "B")],
+        # Case: Embed at the end of the message.
+        [("text", "A"), ("embed", 1)],
+        # Case: Embed sandwiched between text spans.
+        [("text", "A"), ("embed", 2), ("text", "B")],
+        # Case: Multiple embeds with text in between.
+        [("text", "A"), ("embed", 2), ("text", "B"), ("embed", 3)],
+        # Case: Adjacent embeds with no separating text.
+        [("embed", 1), ("embed", 2)],
+        # Case: Multiple text spans before a trailing embed.
+        [("text", "A"), ("text", "B"), ("embed", 1)],
+        # Case: Long-ish run mixing both kinds.
+        [
+            ("text", "head"),
+            ("embed", 4),
+            ("text", "mid"),
+            ("embed", 1),
+            ("embed", 2),
+            ("text", "tail"),
+        ],
+    ],
+    ids=[
+        "single-embed",
+        "embed-then-text",
+        "text-then-embed",
+        "text-embed-text",
+        "text-embed-text-embed",
+        "adjacent-embeds",
+        "text-text-embed",
+        "long-mixed-run",
+    ],
+)
+@pytest.mark.parametrize(
+    "interleave_mm_strings",
+    # `None`: text-only path where `multimodal_config` is absent.
+    # `False`: non-interleave multimodal path (the common default).
+    # `True`: sentinel-substitution interleave path.
+    # All three must preserve the request ordering of prompt_embeds
+    # relative to surrounding text because prompt_embeds are spliced at the
+    # token offset during rendering.
+    [None, False, True],
+    ids=["text-only", "interleave-off", "interleave-on"],
+)
+def test_parse_chat_messages_string_format_preserves_position(
+    layout, interleave_mm_strings
+):
+    mc = _make_mock_model_config()
+    if interleave_mm_strings is not None:
+        mm_cfg = mock.MagicMock()
+        mm_cfg.interleave_mm_strings = interleave_mm_strings
+        mc.multimodal_config = mm_cfg
+
+    content: list[dict] = []
+    expected_parts: list[str] = []
+    expected_embeds: list[torch.Tensor] = []
+    for kind, value in layout:
+        if kind == "text":
+            content.append({"type": "text", "text": value})
+            expected_parts.append(value)
+        else:  # prompt embeds
+            num_tokens = value
+            t = torch.randn(num_tokens, _MOCK_HIDDEN_SIZE, dtype=_MOCK_DTYPE)
+            expected_embeds.append(t)
+            content.append({"type": "prompt_embeds", "data": _encode_tensor(t)})
+            # Parser emits ONE sentinel per part.
+            expected_parts.append(PROMPT_EMBEDS_PLACEHOLDER_TOKEN)
+
+    messages = [{"role": "user", "content": content}]
+    conv, mm_data, _ = parse_chat_messages(
+        messages,
+        mc,
+        content_format="string",
+    )
+
+    assert conv[0]["content"] == "\n".join(expected_parts)
+    assert mm_data is not None and "prompt_embeds" in mm_data
+    assert len(mm_data["prompt_embeds"]) == len(expected_embeds)
+    for got, want in zip(mm_data["prompt_embeds"], expected_embeds, strict=True):
+        assert torch.equal(got, want)
+
+
+def test_parse_chat_messages_requires_flag():
+    t = torch.randn(2, 4)
+    b64 = _encode_tensor(t)
+    mc = _make_mock_model_config(enable_prompt_embeds=False)
+
+    messages = [
+        {
+            "role": "user",
+            "content": [{"type": "prompt_embeds", "data": b64}],
+        }
+    ]
+    with pytest.raises(ValueError, match=_ENABLE_PROMPT_EMBEDS_ERROR):
+        parse_chat_messages(
+            messages,
+            mc,
+            content_format="openai",
+        )
+
+
+def test_parse_chat_messages_rejects_missing_data():
+    # `data` is marked `Required` on `ChatCompletionContentPartPromptEmbedsParam`;
+    # malformed requests without `data` must surface a clear validation error
+    # rather than being silently dropped.
+    mc = _make_mock_model_config()
+    messages = [
+        {
+            "role": "user",
+            "content": [{"type": "prompt_embeds"}],  # no `data`
+        }
+    ]
+    with pytest.raises(ValueError, match=_PROMPT_EMBEDS_MISSING_DATA_ERROR):
+        parse_chat_messages(
+            messages,
+            mc,
+            content_format="openai",
+        )
+
+
+# Reserved placeholder guard: when `enable_prompt_embeds=True` the tokenizer is
+# mutated to make `<prompt_embeds>` a single unsplittable token. Any user text
+# containing that literal sequence would tokenize to the same sentinel ID and
+# be mistaken for a splice point, so we reject it at parse time.
+_PLACEHOLDER_ERROR_PATTERN: Final[str] = re.sub(
+    r"\\{[^}]*\\}", ".*", re.escape(_RESERVED_PLACEHOLDER_IN_TEXT_ERROR)
+)
+
+
+@pytest.mark.parametrize(
+    "content",
+    [
+        # Case: Top-level string content (wrapped as a single text part).
+        f"hello {PROMPT_EMBEDS_PLACEHOLDER_TOKEN} world",
+        # Case: List with a typed text part containing the placeholder.
+        [{"type": "text", "text": f"leading {PROMPT_EMBEDS_PLACEHOLDER_TOKEN}"}],
+        # Case: List with a plain-string part (no wrapping dict).
+        [f"raw string {PROMPT_EMBEDS_PLACEHOLDER_TOKEN}"],
+    ],
+    ids=["top-level-string", "typed-text-part", "plain-string-part"],
+)
+def test_parse_chat_messages_rejects_placeholder_in_user_text(content):
+    mc = _make_mock_model_config()  # enable_prompt_embeds=True by default
+    messages = [{"role": "user", "content": content}]
+    with pytest.raises(ValueError, match=_PLACEHOLDER_ERROR_PATTERN):
+        parse_chat_messages(messages, mc, content_format="openai")
+
+
+def test_parse_chat_messages_allows_placeholder_in_text_when_feature_disabled():
+    # When `enable_prompt_embeds=False` the tokenizer is never mutated, so the
+    # literal `<prompt_embeds>` is just ordinary text and must pass through.
+    mc = _make_mock_model_config(enable_prompt_embeds=False)
+    messages = [
+        {
+            "role": "user",
+            "content": f"benign mention of {PROMPT_EMBEDS_PLACEHOLDER_TOKEN} here",
+        }
+    ]
+    conv, mm_data, _ = parse_chat_messages(messages, mc, content_format="openai")
+    assert mm_data is None or "prompt_embeds" not in mm_data
+    # Text reaches the rendered conversation unchanged.
+    texts = [p["text"] for p in conv[0]["content"]]
+    assert PROMPT_EMBEDS_PLACEHOLDER_TOKEN in "".join(texts)
+
+
+# Token-stream spec: ints are regular token IDs, tuples `(N,)` expand to
+# a placeholder span of length N (creates corresponding `(N, H)` tensor).
+# `expected` lists the `(start_idx, length)` pairs that
+# `_build_prompt_embeds_positions` should return.
+@pytest.mark.parametrize(
+    "stream, expected",
+    [
+        # Case: Single run in the middle.
+        ([10, 20, (3,), 30], [(2, 3)]),
+        # Case: Single run at the start.
+        ([(2,), 10, 20], [(0, 2)]),
+        # Case: Single run at the end.
+        ([10, 20, (4,)], [(2, 4)]),
+        # Case: Two runs with tokens between.
+        ([1, (2,), 2, 3, (3,), 4], [(1, 2), (5, 3)]),
+        # Case: Adjacent runs (no separating tokens).
+        ([(1,), (2,)], [(0, 1), (1, 2)]),
+        # Case: Three runs.
+        ([5, (2,), 6, (1,), 7, (3,), 8], [(1, 2), (4, 1), (6, 3)]),
+    ],
+    ids=[
+        "single-middle",
+        "single-start",
+        "single-end",
+        "two-runs-separated",
+        "two-runs-adjacent",
+        "three-runs",
+    ],
+)
+def test_build_positions(tokenizer, stream, expected):
+    H = 4
+    tid = _ensure_prompt_embeds_placeholder_token(tokenizer)
+    tensors: list[torch.Tensor] = []
+    token_ids: list[int] = []
+    for item in stream:
+        if isinstance(item, tuple):
+            length = item[0]
+            tensors.append(torch.randn(length, H))
+            token_ids.extend([tid] * length)
+        else:
+            token_ids.append(item)
+    mm_updates = _build_prompt_embeds_updates(tensors, tid)
+    positions = _build_prompt_embeds_positions(token_ids, len(tensors), mm_updates)
+    assert positions == expected
+
+
+def test_build_positions_length_mismatch(tokenizer):
+    N1, H1 = 2, 4
+    N2, H2 = 3, 4
+    tid = _ensure_prompt_embeds_placeholder_token(tokenizer)
+    # 2 tensors expected but only a single placeholder run in the token
+    # stream (simulating dropping the second one).
+    tensors = [torch.randn(N1, H1), torch.randn(N2, H2)]
+    token_ids = [1, tid, tid, 2, 3]
+    mm_updates = _build_prompt_embeds_updates(tensors, tid)
+    # The error constant is a `str.format` template, escape it and turn
+    # the `{field}` placeholders into `.*` so it matches any substitution.
+    pattern = re.sub(
+        r"\\{[^}]*\\}", ".*", re.escape(_PROMPT_EMBEDS_PLACEHOLDER_SPAN_MISMATCH_ERROR)
+    )
+    with pytest.raises(ValueError, match=pattern):
+        _build_prompt_embeds_positions(token_ids, len(tensors), mm_updates)
+
+
+# ints  = regular token IDs (any value)
+# (N,)  = embed span of length N
+@pytest.mark.parametrize(
+    "stream",
+    [
+        [10, 20, (3,), 30],
+        [(2,), 10, 20],
+        [10, 20, (4,)],
+        [1, (2,), 2, 3, (3,), 4],
+        [(1,), (2,)],
+        [5, (2,), 6, (1,), 7, (3,), 8],
+    ],
+    ids=[
+        "single-middle",
+        "single-start",
+        "single-end",
+        "two-spans-separated",
+        "two-spans-adjacent",
+        "three-spans",
+    ],
+)
+def test_build_mixed_prompt_embeds(stream):
+    H = 8
+    _PLACEHOLDER = 0  # sentinel for embed positions in token_ids
+
+    tensors: list[torch.Tensor] = []
+    token_ids: list[int] = []
+    positions: list[tuple[int, int]] = []
+    cursor = 0
+    for item in stream:
+        if isinstance(item, tuple):
+            length = item[0]
+            tensors.append(torch.randn(length, H))
+            positions.append((cursor, length))
+            token_ids.extend([_PLACEHOLDER] * length)
+            cursor += length
+        else:
+            token_ids.append(item)
+            cursor += 1
+
+    embeds, mask = _build_mixed_prompt_embeds(token_ids, tensors, positions)
+
+    assert embeds.shape == (len(token_ids), H)
+    assert len(mask) == len(token_ids)
+
+    # Mask: False exactly at embed positions, True everywhere else.
+    expected_mask = torch.ones(len(token_ids), dtype=torch.bool)
+    for start, length in positions:
+        expected_mask[start : start + length] = False
+    assert mask == expected_mask.tolist()
+
+    # Embed rows match input tensors at the right positions.
+    for tensor, (start, length) in zip(tensors, positions):
+        assert torch.equal(embeds[start : start + length], tensor)
+
+    # Non-embed positions remain zero-filled.
+    assert torch.all(embeds[expected_mask] == 0)
+
+
+# End-to-end tests: each runs both sync and async parse paths via the
+# `parse_fn` fixture.
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("role", ["user", "system"])
+async def test_end_to_end_expand_and_build(tokenizer, parse_fn, role):
+    """Full renderer pipeline: parse -> chat template -> expand -> locate
+    -> build mixed prompt, across tokenizers, roles, and sync/async."""
+    tokenizer.chat_template = _SIMPLE_CHAT_TEMPLATE
+    tid = _ensure_prompt_embeds_placeholder_token(tokenizer)
+
+    LEN_A, LEN_B = 3, 2
+    t_a = torch.randn(LEN_A, _MOCK_HIDDEN_SIZE, dtype=_MOCK_DTYPE)
+    t_b = torch.randn(LEN_B, _MOCK_HIDDEN_SIZE, dtype=_MOCK_DTYPE)
+    NUM_TENSORS = 2
+
+    mc = _make_mock_model_config()
+
+    messages = [
+        {
+            "role": role,
+            "content": [
+                {"type": "text", "text": "Hello "},
+                {"type": "prompt_embeds", "data": _encode_tensor(t_a)},
+                {"type": "text", "text": " world "},
+                {"type": "prompt_embeds", "data": _encode_tensor(t_b)},
+                {"type": "text", "text": "!"},
+            ],
+        }
+    ]
+
+    conv, mm_data, _ = await _maybe_await(
+        parse_fn, messages, mc, content_format="openai"
+    )
+    tensors = list(mm_data["prompt_embeds"])
+    assert len(tensors) == NUM_TENSORS
+
+    # Tokenize: each prompt_embeds part becomes 1 placeholder token.
+    # `return_dict=False` to get a flat `list[int]` on transformers v5
+    # (where the default flipped to True and yields a `BatchEncoding` dict).
+    token_ids = tokenizer.apply_chat_template(conv, tokenize=True, return_dict=False)
+    assert sum(t == tid for t in token_ids) == NUM_TENSORS
+
+    # Expand, locate, and build.
+    mm_updates = _build_prompt_embeds_updates(tensors, tid)
+    expanded = _expand_prompt_embeds_placeholders(token_ids, mm_updates)
+    assert len(expanded) == len(token_ids) + LEN_A + LEN_B - NUM_TENSORS
+
+    positions = _build_prompt_embeds_positions(expanded, len(tensors), mm_updates)
+    assert positions[0][1] == LEN_A
+    assert positions[1][1] == LEN_B
+
+    embeds, mask = _build_mixed_prompt_embeds(expanded, tensors, positions)
+    assert embeds.shape == (len(expanded), _MOCK_HIDDEN_SIZE)
+    assert mask.count(False) == LEN_A + LEN_B
+    assert torch.equal(embeds[positions[0][0] : positions[0][0] + LEN_A], t_a)
+    assert torch.equal(embeds[positions[1][0] : positions[1][0] + LEN_B], t_b)
+
+
+@pytest.mark.asyncio
+async def test_end_to_end_multi_message_conversation(tokenizer, parse_fn):
+    """Full pipeline with prompt_embeds spread across system + user messages,
+    verifying ordering and positioning in the final token stream."""
+    tokenizer.chat_template = _SIMPLE_CHAT_TEMPLATE
+    tid = _ensure_prompt_embeds_placeholder_token(tokenizer)
+
+    LEN_SYS, LEN_USR = 4, 3
+    t_sys = torch.randn(LEN_SYS, _MOCK_HIDDEN_SIZE, dtype=_MOCK_DTYPE)
+    t_usr = torch.randn(LEN_USR, _MOCK_HIDDEN_SIZE, dtype=_MOCK_DTYPE)
+    NUM_TENSORS = 2  # t_sys and t_usr.
+
+    mc = _make_mock_model_config()
+
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are helpful."},
+                {"type": "prompt_embeds", "data": _encode_tensor(t_sys)},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "prompt_embeds", "data": _encode_tensor(t_usr)},
+                {"type": "text", "text": "Summarize."},
+            ],
+        },
+    ]
+
+    conv, mm_data, _ = await _maybe_await(
+        parse_fn, messages, mc, content_format="openai"
+    )
+    tensors = list(mm_data["prompt_embeds"])
+    assert len(tensors) == NUM_TENSORS
+
+    # Tokenize, expand, locate, and build.
+    # `return_dict=False` to get a flat `list[int]` on transformers v5
+    # (where the default flipped to True and yields a `BatchEncoding` dict).
+    token_ids = tokenizer.apply_chat_template(conv, tokenize=True, return_dict=False)
+    mm_updates = _build_prompt_embeds_updates(tensors, tid)
+    expanded = _expand_prompt_embeds_placeholders(token_ids, mm_updates)
+    positions = _build_prompt_embeds_positions(expanded, len(tensors), mm_updates)
+
+    assert positions[0][1] == LEN_SYS
+    assert positions[1][1] == LEN_USR
+    # System embed must appear before user embed in the token stream.
+    assert positions[0][0] < positions[1][0]
+
+    embeds, mask = _build_mixed_prompt_embeds(expanded, tensors, positions)
+    assert embeds.shape == (len(expanded), _MOCK_HIDDEN_SIZE)
+    assert mask.count(False) == LEN_SYS + LEN_USR
+    assert torch.equal(embeds[positions[0][0] : positions[0][0] + LEN_SYS], t_sys)
+    assert torch.equal(embeds[positions[1][0] : positions[1][0] + LEN_USR], t_usr)
diff --git a/tests/renderers/test_completions.py b/tests/renderers/test_completions.py
index ccc806ba137d..00d604afdcf9 100644
--- a/tests/renderers/test_completions.py
+++ b/tests/renderers/test_completions.py
@@ -39,6 +39,11 @@ class MockModelConfig:
     is_encoder_decoder: bool = False
     is_multimodal_model: bool = False
     renderer_num_workers: int = 1
+    hidden_size: int = 768
+    dtype: torch.dtype = torch.float32
+
+    def get_hidden_size(self) -> int:
+        return self.hidden_size
 
 
 @dataclass
@@ -384,12 +389,13 @@ def test_single_prompt_embed(self):
         assert torch.equal(results[0]["prompt_embeds"], tensor_input)
 
     def test_multiple_prompt_embeds(self):
-        renderer = _build_renderer(MockModelConfig())
+        hidden_size = 512
+        renderer = _build_renderer(MockModelConfig(hidden_size=hidden_size))
 
         # Create multiple test tensors
         tensor_inputs = [
-            torch.randn(8, 512, dtype=torch.float32),
-            torch.randn(12, 512, dtype=torch.float32),
+            torch.randn(8, hidden_size, dtype=torch.float32),
+            torch.randn(12, hidden_size, dtype=torch.float32),
         ]
 
         prompts = renderer.render_prompts(
@@ -432,13 +438,15 @@ def test_prompt_embed_truncation(self):
         assert torch.equal(results[0]["prompt_embeds"], expected)
 
     def test_prompt_embed_different_dtypes(self):
-        renderer = _build_renderer(MockModelConfig())
-
+        hidden_size = 256
         # Test different supported dtypes
         dtypes = [torch.float32, torch.float16, torch.bfloat16]
 
         for dtype in dtypes:
-            tensor_input = torch.randn(5, 256, dtype=dtype)
+            renderer = _build_renderer(
+                MockModelConfig(hidden_size=hidden_size, dtype=dtype)
+            )
+            tensor_input = torch.randn(5, hidden_size, dtype=dtype)
 
             prompts = renderer.render_prompts(
                 _preprocess_prompt(
@@ -474,10 +482,11 @@ def test_prompt_embed_squeeze_batch_dim(self):
         assert results[0]["prompt_embeds"].shape == (10, 768)
 
     def test_both_prompts_and_embeds(self):
-        renderer = _build_renderer(MockModelConfig())
+        hidden_size = 256
+        renderer = _build_renderer(MockModelConfig(hidden_size=hidden_size))
 
         text_input = "Hello world"
-        tensor_input = torch.randn(5, 256, dtype=torch.float32)
+        tensor_input = torch.randn(5, hidden_size, dtype=torch.float32)
 
         prompts = renderer.render_prompts(
             _preprocess_prompt(
diff --git a/tests/renderers/test_sparse_tensor_validation.py b/tests/renderers/test_sparse_tensor_validation.py
index 5c51cd30a336..642867086fc9 100644
--- a/tests/renderers/test_sparse_tensor_validation.py
+++ b/tests/renderers/test_sparse_tensor_validation.py
@@ -12,6 +12,7 @@
 import pytest
 import torch
 
+from vllm.exceptions import VLLMValidationError
 from vllm.multimodal.media import AudioEmbeddingMediaIO, ImageEmbeddingMediaIO
 from vllm.renderers.embed_utils import safe_load_prompt_embeds
 
@@ -53,8 +54,14 @@ def _create_malicious_sparse_tensor() -> torch.Tensor:
     values = torch.tensor([1.0])
     shape = (3, 3)
 
-    # Create sparse tensor (this will be invalid)
-    sparse_tensor = torch.sparse_coo_tensor(indices, values, shape, dtype=torch.float32)
+    # Create sparse tensor (this will be invalid). Pass `check_invariants=False`
+    # explicitly so this fixture is robust to process-wide invariant-check state
+    # left enabled by other tests (the global flag isn't thread-local, and
+    # concurrent users of the `check_sparse_tensor_invariants` context manager
+    # can leak the "enabled" state across tests).
+    sparse_tensor = torch.sparse_coo_tensor(
+        indices, values, shape, dtype=torch.float32, check_invariants=False
+    )
     return sparse_tensor
 
 
@@ -117,7 +124,7 @@ def test_extremely_large_indices_rejected(self, model_config):
         shape = (10, 10)
 
         malicious_tensor = torch.sparse_coo_tensor(
-            indices, values, shape, dtype=torch.float32
+            indices, values, shape, dtype=torch.float32, check_invariants=False
         )
         encoded = _encode_tensor(malicious_tensor)
 
@@ -132,13 +139,69 @@ def test_negative_indices_rejected(self, model_config):
         shape = (10, 10)
 
         malicious_tensor = torch.sparse_coo_tensor(
-            indices, values, shape, dtype=torch.float32
+            indices, values, shape, dtype=torch.float32, check_invariants=False
         )
         encoded = _encode_tensor(malicious_tensor)
 
         with pytest.raises((RuntimeError, ValueError)):
             safe_load_prompt_embeds(model_config, encoded)
 
+    def test_hidden_size_mismatch_rejected(self, model_config):
+        """Tensors whose trailing dim doesn't match the model's hidden_size
+        must be rejected at parse time."""
+        # opt-125m has hidden_size=768, passing 512 triggers the check.
+        wrong_hidden = torch.randn(10, 512, dtype=torch.float32)
+        encoded = _encode_tensor(wrong_hidden)
+
+        with pytest.raises(VLLMValidationError, match="hidden_size"):
+            safe_load_prompt_embeds(model_config, encoded)
+
+    def test_float_dtype_mismatch_cast_to_model_dtype(self, model_config):
+        """Tensors whose dtype doesn't match the model's dtype but are still
+        floating-point are cast, since API clients generally can't know the
+        server's `--dtype` setting ahead of time."""
+        # Fixture pins model dtype to float32, upload a bfloat16 tensor.
+        mismatched_float = torch.randn(10, 768, dtype=torch.bfloat16)
+        encoded = _encode_tensor(mismatched_float)
+
+        result = safe_load_prompt_embeds(model_config, encoded)
+
+        assert result.dtype == torch.float32
+        assert result.shape == mismatched_float.shape
+
+    def test_non_float_dtype_rejected(self, model_config):
+        """Non-floating-point dtypes cannot be safely cast for embeddings
+        (e.g. integer tensors almost certainly indicate caller confusion),
+        so they are rejected at parse time."""
+        non_float = torch.randint(0, 100, (10, 768), dtype=torch.int32)
+        encoded = _encode_tensor(non_float)
+
+        with pytest.raises(VLLMValidationError, match="floating-point"):
+            safe_load_prompt_embeds(model_config, encoded)
+
+    def test_non_2d_tensor_rejected(self, model_config):
+        """Tensors that aren't 2D (even after squeezing a leading dim)
+        must be rejected with a clear error."""
+        # A 1D tensor cannot be interpreted as (num_tokens, hidden_size).
+        bad = torch.randn(768, dtype=torch.float32)
+        encoded = _encode_tensor(bad)
+
+        with pytest.raises(VLLMValidationError, match="2D tensor"):
+            safe_load_prompt_embeds(model_config, encoded)
+
+    def test_non_tensor_payload_rejected(self, model_config):
+        """Deserializing to a non-Tensor object must raise a clear error
+        instead of propagating an AssertionError."""
+        # `torch.save` will serialize a plain dict; `weights_only=True` allows
+        # loading built-in containers, so this exercises the isinstance check.
+        buffer = io.BytesIO()
+        torch.save({"not": "a tensor"}, buffer)
+        buffer.seek(0)
+        encoded = base64.b64encode(buffer.read())
+
+        with pytest.raises(VLLMValidationError, match="torch.Tensor"):
+            safe_load_prompt_embeds(model_config, encoded)
+
 
 class TestImageEmbedsValidation:
     """Test sparse tensor validation in image embeddings (Chat API)."""
diff --git a/tests/test_config.py b/tests/test_config.py
index 41d34a6cb06b..57d1e1bc686b 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1215,8 +1215,6 @@ def test_scheduler_config_init():
         ("facebook/opt-125m", 1, False, False),
         # Non-MoE model with DP>1 internal LB should need coordinator
         ("facebook/opt-125m", 2, False, True),
-        # Non-MoE model with DP>1 external LB should not need coordinator
-        ("facebook/opt-125m", 2, True, False),
         # MoE model with DP=1 should not need coordinator
         ("mistralai/Mixtral-8x7B-Instruct-v0.1", 1, False, False),
         # MoE model with DP>1 internal LB should need both coordinator
@@ -1295,11 +1293,14 @@ def test_ir_op_priority_default():
     # Assert default is applied to ops
     priority_config = IrOpPriorityConfig.with_default(["vllm_c", "native"])
     assert priority_config.rms_norm == ["vllm_c", "native"]
+    assert priority_config.fused_add_rms_norm == ["vllm_c", "native"]
 
     # Assert single ops override the default
-    assert IrOpPriorityConfig.with_default(
-        ["vllm_c", "native"], rms_norm=["oink", "native"]
-    ) == IrOpPriorityConfig(rms_norm=["oink", "native"])
+    priority_config = IrOpPriorityConfig.with_default(
+        ["native"], rms_norm=["oink", "native"]
+    )
+    assert priority_config.rms_norm == ["oink", "native"]
+    assert priority_config.fused_add_rms_norm == ["native"]
 
 
 def test_ir_op_priority_str():
@@ -1318,3 +1319,34 @@ def test_ir_op_priority_str():
     with pytest.raises(pydantic.ValidationError):
         # must be list of only strings
         priority_config = IrOpPriorityConfig(rms_norm=["vllm_c", 4, "native"])
+
+
+def test_ir_op_priority_ctx():
+    """Test that the priority-setting context sets priority correctly."""
+    from vllm import ir
+    from vllm.config.kernel import IrOpPriorityConfig
+
+    priority = IrOpPriorityConfig.with_default(["native"], rms_norm=["vllm_c"])
+    priority2 = IrOpPriorityConfig.with_default(
+        ["native"], fused_add_rms_norm=["vllm_c"]
+    )
+    with priority.set_priority():
+        assert ir.ops.rms_norm.get_priority() == ["vllm_c", "native"]
+        assert ir.ops.fused_add_rms_norm.get_priority() == ["native"]
+        with priority2.set_priority():
+            assert ir.ops.rms_norm.get_priority() == ["native"]
+            assert ir.ops.fused_add_rms_norm.get_priority() == ["vllm_c", "native"]
+
+        # context restored
+        assert ir.ops.rms_norm.get_priority() == ["vllm_c", "native"]
+        assert ir.ops.fused_add_rms_norm.get_priority() == ["native"]
+
+        with pytest.raises(ValueError), priority2.set_priority():
+            assert ir.ops.rms_norm.get_priority() == ["native"]
+            assert ir.ops.fused_add_rms_norm.get_priority() == ["vllm_c", "native"]
+
+            raise ValueError
+
+        # context restored even after exception
+        assert ir.ops.rms_norm.get_priority() == ["vllm_c", "native"]
+        assert ir.ops.fused_add_rms_norm.get_priority() == ["native"]
diff --git a/tests/test_jit_monitor.py b/tests/test_jit_monitor.py
new file mode 100644
index 000000000000..a463f4b5faa1
--- /dev/null
+++ b/tests/test_jit_monitor.py
@@ -0,0 +1,240 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+import sys
+from types import SimpleNamespace
+from unittest import mock
+
+import pytest
+
+from vllm.triton_utils import jit_monitor
+
+
+@pytest.fixture(autouse=True)
+def _reset_monitor():
+    """Reset global monitor state between tests."""
+    jit_monitor._active = False
+    yield
+    jit_monitor._active = False
+
+
+# ------------------------------------------------------------------
+# Helpers — lightweight stand-ins for triton.knobs
+# ------------------------------------------------------------------
+
+
+def _make_fake_knobs(*, autotuning_print=False, jit_hook=None):
+    """Build a minimal fake ``triton.knobs`` namespace."""
+    autotuning = SimpleNamespace(print=autotuning_print)
+    runtime = SimpleNamespace(jit_post_compile_hook=jit_hook)
+    return SimpleNamespace(autotuning=autotuning, runtime=runtime)
+
+
+def _patch_triton_knobs(fake_knobs):
+    """Context manager that makes ``from triton import knobs`` return *fake_knobs*."""
+    fake_triton = SimpleNamespace(knobs=fake_knobs)
+    return mock.patch.dict(sys.modules, {"triton": fake_triton})
+
+
+# ------------------------------------------------------------------
+# Unit tests (no GPU required, triton is mocked)
+# ------------------------------------------------------------------
+
+
+class TestActivateBasic:
+    def test_sets_active(self):
+        assert not jit_monitor.is_active()
+        with _patch_triton_knobs(_make_fake_knobs()):
+            jit_monitor.activate()
+        assert jit_monitor.is_active()
+
+    def test_idempotent(self):
+        fake = _make_fake_knobs()
+        with _patch_triton_knobs(fake):
+            jit_monitor.activate()
+            first_hook = fake.runtime.jit_post_compile_hook
+            jit_monitor.activate()
+            assert fake.runtime.jit_post_compile_hook is first_hook
+
+    def test_logs_info_on_activation(self):
+        with (
+            mock.patch.object(jit_monitor.logger, "info") as m,
+            _patch_triton_knobs(_make_fake_knobs()),
+        ):
+            jit_monitor.activate()
+        m.assert_called_once()
+        assert "Kernel JIT monitor activated" in m.call_args[0][0]
+
+
+class TestAutotuningPrint:
+    def test_enables_autotuning_print(self):
+        fake = _make_fake_knobs(autotuning_print=False)
+        with _patch_triton_knobs(fake):
+            jit_monitor.activate()
+        assert fake.autotuning.print is True
+
+    def test_respects_user_opt_out(self):
+        fake = _make_fake_knobs(autotuning_print=False)
+        with (
+            mock.patch.dict(os.environ, {"TRITON_PRINT_AUTOTUNING": "0"}),
+            _patch_triton_knobs(fake),
+        ):
+            jit_monitor.activate()
+        assert fake.autotuning.print is False
+
+    def test_noop_when_user_already_enabled(self):
+        fake = _make_fake_knobs(autotuning_print=True)
+        with (
+            mock.patch.dict(os.environ, {"TRITON_PRINT_AUTOTUNING": "1"}),
+            _patch_triton_knobs(fake),
+        ):
+            jit_monitor.activate()
+        assert fake.autotuning.print is True
+
+
+class TestJitHook:
+    def test_hook_registered(self):
+        fake = _make_fake_knobs()
+        assert fake.runtime.jit_post_compile_hook is None
+        with _patch_triton_knobs(fake):
+            jit_monitor.activate()
+        assert fake.runtime.jit_post_compile_hook is not None
+
+    def test_hook_logs_warning(self):
+        fake = _make_fake_knobs()
+        with _patch_triton_knobs(fake):
+            jit_monitor.activate()
+
+        hook = fake.runtime.jit_post_compile_hook
+        mock_fn = SimpleNamespace(name="test_kernel")
+
+        with mock.patch.object(jit_monitor.logger, "warning") as m:
+            hook(
+                key="some_key",
+                repr="some_repr",
+                fn=mock_fn,
+                compile=lambda: None,
+                is_manual_warmup=False,
+                already_compiled=False,
+            )
+
+        m.assert_called_once()
+        msg = m.call_args[0][0] % m.call_args[0][1:]
+        assert "Triton kernel JIT compilation during inference" in msg
+        assert "test_kernel" in msg
+
+    def test_hook_chains_existing_hook(self):
+        existing = mock.MagicMock(return_value="existing_result")
+        fake = _make_fake_knobs(jit_hook=existing)
+        with _patch_triton_knobs(fake):
+            jit_monitor.activate()
+
+        hook = fake.runtime.jit_post_compile_hook
+        mock_fn = SimpleNamespace(name="chained_kernel")
+        kwargs = dict(
+            key="k",
+            repr="r",
+            fn=mock_fn,
+            compile=lambda: None,
+            is_manual_warmup=False,
+            already_compiled=False,
+        )
+        result = hook(**kwargs)
+
+        existing.assert_called_once()
+        assert result == "existing_result"
+
+    def test_hook_works_without_existing_hook(self):
+        fake = _make_fake_knobs(jit_hook=None)
+        with _patch_triton_knobs(fake):
+            jit_monitor.activate()
+
+        hook = fake.runtime.jit_post_compile_hook
+        mock_fn = SimpleNamespace(name="solo_kernel")
+        result = hook(
+            key="k",
+            repr="r",
+            fn=mock_fn,
+            compile=lambda: None,
+            is_manual_warmup=False,
+            already_compiled=False,
+        )
+        assert result is None
+
+
+class TestNoTritonFallback:
+    def test_activate_without_triton(self):
+        with mock.patch.object(jit_monitor, "HAS_TRITON", False):
+            jit_monitor.activate()
+        assert jit_monitor.is_active()
+
+
+# ------------------------------------------------------------------
+# Integration tests (real Triton + GPU)
+# ------------------------------------------------------------------
+
+try:
+    import torch
+
+    _HAS_CUDA = torch.cuda.is_available()
+except ImportError:
+    _HAS_CUDA = False
+
+try:
+    import triton
+    import triton.language as tl
+
+    _HAS_TRITON = True
+except ImportError:
+    _HAS_TRITON = False
+
+_skip_no_gpu = pytest.mark.skipif(
+    not (_HAS_CUDA and _HAS_TRITON),
+    reason="Requires CUDA GPU and Triton",
+)
+
+
+if _HAS_TRITON:
+
+    @triton.jit
+    def _add_kernel(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr):
+        pid = tl.program_id(0)
+        offs = pid * BLOCK + tl.arange(0, BLOCK)
+        mask = offs < n
+        x = tl.load(x_ptr + offs, mask=mask)
+        y = tl.load(y_ptr + offs, mask=mask)
+        tl.store(out_ptr + offs, x + y, mask=mask)
+
+
+def _run_add_kernel(n: int, block: int = 256) -> None:
+    """Launch ``_add_kernel`` with vectors of length *n*."""
+    x = torch.randn(n, device="cuda")
+    y = torch.randn(n, device="cuda")
+    out = torch.empty(n, device="cuda")
+    grid = ((n + block - 1) // block,)
+    _add_kernel[grid](x, y, out, n, BLOCK=block)
+    torch.accelerator.synchronize()
+
+
+@_skip_no_gpu
+class TestTritonJitHookIntegration:
+    """End-to-end: real Triton kernel, real GPU, real hook."""
+
+    def test_no_warning_on_cached_shape(self):
+        _run_add_kernel(1024)
+
+        jit_monitor.activate()
+        with mock.patch.object(jit_monitor.logger, "warning") as w:
+            _run_add_kernel(1024)
+        w.assert_not_called()
+
+    def test_warning_on_new_constexpr(self):
+        _run_add_kernel(1024, block=256)
+
+        jit_monitor.activate()
+        with mock.patch.object(jit_monitor.logger, "warning") as w:
+            # Different BLOCK (a tl.constexpr) forces recompilation.
+            _run_add_kernel(1024, block=512)
+        w.assert_called()
+        msg = w.call_args[0][0] % w.call_args[0][1:]
+        assert "_add_kernel" in msg
diff --git a/tests/tokenizers_/test_deepseek_v4.py b/tests/tokenizers_/test_deepseek_v4.py
index 9f3b88cf658d..358732eabf40 100644
--- a/tests/tokenizers_/test_deepseek_v4.py
+++ b/tests/tokenizers_/test_deepseek_v4.py
@@ -40,6 +40,7 @@ def _model_config():
         multimodal_config=None,
         allowed_local_media_path="",
         allowed_media_domains=None,
+        enable_prompt_embeds=False,
     )
 
 
@@ -182,7 +183,7 @@ def test_deepseek_v4_renders_parsed_history_tool_arguments():
     assert 'parameter name="arguments"' not in prompt
 
 
-@pytest.mark.parametrize("reasoning_effort", ["none", "low", "medium", "high"])
+@pytest.mark.parametrize("reasoning_effort", ["minimal", "low", "medium", "high"])
 def test_deepseek_v4_accepts_openai_reasoning_effort_values(reasoning_effort):
     prompt = _tokenizer().apply_chat_template(
         [{"role": "user", "content": "Hello"}],
@@ -195,6 +196,58 @@ def test_deepseek_v4_accepts_openai_reasoning_effort_values(reasoning_effort):
     assert "Reasoning Effort: Absolute maximum" not in prompt
 
 
+def test_deepseek_v4_none_reasoning_effort_disables_thinking():
+    prompt = _tokenizer().apply_chat_template(
+        [{"role": "user", "content": "Hello"}],
+        tokenize=False,
+        enable_thinking=True,
+        reasoning_effort="none",
+    )
+
+    assert prompt == ("<｜begin▁of▁sentence｜><｜User｜>Hello<｜Assistant｜></think>")
+
+
+@pytest.mark.parametrize(
+    ("reasoning_effort", "expected_mode", "expected_effort"),
+    [
+        ("none", "chat", None),
+        ("minimal", "thinking", "high"),
+        ("low", "thinking", "high"),
+        ("medium", "thinking", "high"),
+        ("high", "thinking", "high"),
+        ("xhigh", "thinking", "max"),
+        ("max", "thinking", "max"),
+        ("unexpected", "thinking", "high"),
+    ],
+)
+def test_deepseek_v4_maps_compatible_thinking_reasoning_effort_values(
+    monkeypatch: pytest.MonkeyPatch,
+    reasoning_effort,
+    expected_mode,
+    expected_effort,
+):
+    captured_kwargs = []
+
+    def fake_encode_messages(messages, **kwargs):
+        captured_kwargs.append(kwargs)
+        return "prompt"
+
+    monkeypatch.setattr(
+        "vllm.tokenizers.deepseek_v4.encode_messages",
+        fake_encode_messages,
+    )
+
+    _tokenizer().apply_chat_template(
+        [{"role": "user", "content": "Hello"}],
+        tokenize=False,
+        enable_thinking=True,
+        reasoning_effort=reasoning_effort,
+    )
+
+    assert captured_kwargs[-1]["thinking_mode"] == expected_mode
+    assert captured_kwargs[-1]["reasoning_effort"] == expected_effort
+
+
 def test_deepseek_v4_preserves_reference_max_reasoning_effort():
     prompt = _tokenizer().apply_chat_template(
         [{"role": "user", "content": "Hello"}],
@@ -208,6 +261,19 @@ def test_deepseek_v4_preserves_reference_max_reasoning_effort():
     )
 
 
+def test_deepseek_v4_maps_xhigh_to_reference_max_reasoning_effort():
+    prompt = _tokenizer().apply_chat_template(
+        [{"role": "user", "content": "Hello"}],
+        tokenize=False,
+        enable_thinking=True,
+        reasoning_effort="xhigh",
+    )
+
+    assert prompt.startswith(
+        "<｜begin▁of▁sentence｜>Reasoning Effort: Absolute maximum"
+    )
+
+
 @pytest.mark.parametrize(
     ("case_id", "kwargs"),
     [
diff --git a/tests/tokenizers_/test_mistral.py b/tests/tokenizers_/test_mistral.py
index 2b101e8f98d9..2023337e8577 100644
--- a/tests/tokenizers_/test_mistral.py
+++ b/tests/tokenizers_/test_mistral.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import copy
 from typing import Any
 
 import llguidance
@@ -11,353 +12,34 @@
 
 from vllm.tokenizers.mistral import (
     MistralTokenizer,
-    _prepare_apply_chat_template_tools_and_messages,
+    _validate_apply_chat_template_args,
 )
 
 
-@pytest.mark.parametrize(
-    "openai_request,expected_mistral_output",
-    [
-        (
-            {
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": "What is the current local date and time?",
-                    }
-                ],
-                "tools": [
-                    {
-                        "type": "function",
-                        "function": {
-                            "description": "Fetch the current local date and time.",
-                            "name": "get_current_time",
-                        },
-                    }
-                ],
-            },
-            (
-                [
-                    {
-                        "role": "user",
-                        "content": "What is the current local date and time?",
-                    }
-                ],
-                [
-                    {
-                        "type": "function",
-                        "function": {
-                            "description": "Fetch the current local date and time.",
-                            "name": "get_current_time",
-                            "parameters": {},
-                        },
-                    }
-                ],
-            ),
-        ),
-        (
-            {
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": "What is the current local date and time?",
-                    }
-                ],
-                "tools": [
-                    {
-                        "type": "function",
-                        "function": {
-                            "description": "Fetch the current local date and time.",
-                            "name": "get_current_time",
-                            "parameters": {},
-                        },
-                    }
-                ],
-            },
-            (
-                [
-                    {
-                        "role": "user",
-                        "content": "What is the current local date and time?",
-                    }
-                ],
-                [
-                    {
-                        "type": "function",
-                        "function": {
-                            "description": "Fetch the current local date and time.",
-                            "name": "get_current_time",
-                            "parameters": {},
-                        },
-                    }
-                ],
-            ),
-        ),
-        (
-            {
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": "What is the current local date and time?",
-                    }
-                ],
-                "tools": [
-                    {
-                        "type": "function",
-                        "function": {
-                            "description": "Fetch the current local date and time.",
-                            "unsupported_field": False,
-                            "name": "get_current_time",
-                            "parameters": {},
-                        },
-                    },
-                    {
-                        "type": "function",
-                        "function": {
-                            "description": "Fetch the current local date and time.",
-                            "unsupported_field2": False,
-                            "name": "get_current_time",
-                            "parameters": {},
-                        },
-                    },
-                ],
-            },
-            (
-                [
-                    {
-                        "role": "user",
-                        "content": "What is the current local date and time?",
-                    }
-                ],
-                [
-                    {
-                        "type": "function",
-                        "function": {
-                            "description": "Fetch the current local date and time.",
-                            "name": "get_current_time",
-                            "parameters": {},
-                        },
-                    },
-                    {
-                        "type": "function",
-                        "function": {
-                            "description": "Fetch the current local date and time.",
-                            "name": "get_current_time",
-                            "parameters": {},
-                        },
-                    },
-                ],
-            ),
-        ),
-        (
-            {
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": "What is the current local date and time?",
-                    }
-                ],
-                "tools": [
-                    {
-                        "type": "function",
-                        "unsupported_field": False,
-                        "function": {
-                            "description": "Fetch the current local date and time.",
-                            "name": "get_current_time",
-                            "parameters": {},
-                        },
-                    },
-                    {
-                        "type": "function",
-                        "unsupported_field2": False,
-                        "function": {
-                            "description": "Fetch the current local date and time 2.",
-                            "name": "get_current_time2",
-                            "parameters": {"a": "1"},
-                        },
-                    },
-                ],
-            },
-            (
-                [
-                    {
-                        "role": "user",
-                        "content": "What is the current local date and time?",
-                    }
-                ],
-                [
-                    {
-                        "type": "function",
-                        "function": {
-                            "description": "Fetch the current local date and time.",
-                            "name": "get_current_time",
-                            "parameters": {},
-                        },
-                    },
-                    {
-                        "type": "function",
-                        "function": {
-                            "description": "Fetch the current local date and time 2.",
-                            "name": "get_current_time2",
-                            "parameters": {"a": "1"},
-                        },
-                    },
-                ],
-            ),
-        ),
-    ],
-)
-def test_prepare_apply_chat_template_tools_and_messages(
-    openai_request, expected_mistral_output
-):
-    actual_request = _prepare_apply_chat_template_tools_and_messages(
-        openai_request["messages"], openai_request["tools"]
-    )
-    assert actual_request == expected_mistral_output
-
-
-# Tool use with list content and reasoning
-@pytest.mark.parametrize(
-    "openai_request,expected_mistral_output",
-    [
-        (
-            {
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": "What's the weather in Paris?",
-                    },
-                    {
-                        "role": "assistant",
-                        "reasoning": None,
-                        "content": None,
-                        "tool_calls": [
-                            {
-                                "id": "call123",
-                                "type": "function",
-                                "function": {
-                                    "name": "get_weather",
-                                    "arguments": '{"city": "Paris"}',
-                                },
-                            }
-                        ],
-                    },
-                    {
-                        "role": "tool",
-                        "content": [{"type": "text", "text": "Rainy"}],
-                        "name": "get_weather",
-                        "tool_call_id": "call123",
-                    },
-                ],
-                "tools": [
-                    {
-                        "type": "function",
-                        "function": {
-                            "name": "get_weather",
-                            "description": "Gets the current weather in a city.",
-                            "parameters": {
-                                "type": "object",
-                                "properties": {
-                                    "city": {
-                                        "type": "string",
-                                        "description": "The city name",
-                                    }
-                                },
-                                "required": ["city"],
-                            },
-                        },
-                    }
-                ],
-            },
-            (
-                [
-                    {
-                        "role": "user",
-                        "content": "What's the weather in Paris?",
-                    },
-                    {
-                        "role": "assistant",
-                        "content": None,
-                        "tool_calls": [
-                            {
-                                "id": "call123",
-                                "type": "function",
-                                "function": {
-                                    "name": "get_weather",
-                                    "arguments": '{"city": "Paris"}',
-                                },
-                            }
-                        ],
-                    },
-                    {
-                        "role": "tool",
-                        "content": [{"type": "text", "text": "Rainy"}],
-                        "name": "get_weather",
-                        "tool_call_id": "call123",
-                    },
-                ],
-                [
-                    {
-                        "type": "function",
-                        "function": {
-                            "name": "get_weather",
-                            "description": "Gets the current weather in a city.",
-                            "parameters": {
-                                "type": "object",
-                                "properties": {
-                                    "city": {
-                                        "type": "string",
-                                        "description": "The city name",
-                                    }
-                                },
-                                "required": ["city"],
-                            },
-                        },
-                    }
-                ],
-            ),
-        )
-    ],
-)
-def test_prepare_apply_chat_template_tools_and_messages_list_content(
-    openai_request, expected_mistral_output
-):
-    actual_request = _prepare_apply_chat_template_tools_and_messages(
-        openai_request["messages"], openai_request["tools"]
-    )
-    assert actual_request == expected_mistral_output
-
-
-def test_prepare_apply_chat_template_generation_prompt_and_continue():
+def test_validate_apply_chat_template_args():
+    # add_generation_prompt with assistant last message → error
     messages = [{"role": "assistant", "content": "Hello"}]
-    tools: list[dict[str, Any]] = []
     with pytest.raises(ValueError):
-        _prepare_apply_chat_template_tools_and_messages(
-            messages, tools, add_generation_prompt=True
-        )
+        _validate_apply_chat_template_args(messages, add_generation_prompt=True)
 
+    # add_generation_prompt with user last message → ok
     messages = [{"role": "user", "content": "Hello"}]
-    out_messages, _ = _prepare_apply_chat_template_tools_and_messages(
-        messages, tools, add_generation_prompt=True
-    )
-    assert out_messages == [{"role": "user", "content": "Hello"}]
+    _validate_apply_chat_template_args(messages, add_generation_prompt=True)
 
+    # both add_generation_prompt and continue_final_message → error
     with pytest.raises(ValueError):
-        _prepare_apply_chat_template_tools_and_messages(
-            messages, tools, add_generation_prompt=True, continue_final_message=True
+        _validate_apply_chat_template_args(
+            messages, add_generation_prompt=True, continue_final_message=True
         )
 
+    # continue_final_message with assistant last message → ok
     messages = [{"role": "assistant", "content": "Hello"}]
-    out_messages, _ = _prepare_apply_chat_template_tools_and_messages(
-        messages, tools, add_generation_prompt=False, continue_final_message=True
-    )
-    assert out_messages == [{"role": "assistant", "content": "Hello"}]
+    _validate_apply_chat_template_args(messages, continue_final_message=True)
 
+    # continue_final_message with user last message → error
     messages = [{"role": "user", "content": "Hello"}]
     with pytest.raises(ValueError):
-        _prepare_apply_chat_template_tools_and_messages(
-            messages, tools, add_generation_prompt=False, continue_final_message=True
-        )
+        _validate_apply_chat_template_args(messages, continue_final_message=True)
 
 
 @pytest.fixture(scope="module")
@@ -2435,3 +2117,120 @@ def test_llg_tokenizer(self, mistral_tokenizer: MistralTokenizer) -> None:
         # Test caching
         llg_tokenizer_2 = mistral_tokenizer.llg_tokenizer
         assert llg_tokenizer is llg_tokenizer_2
+
+    @pytest.mark.parametrize(
+        "messages,tools,tekken_expected_substrings,spm_expected_substrings",
+        [
+            pytest.param(
+                [{"role": "user", "content": "Hello"}],
+                [{"type": "function", "function": {"name": "do_nothing"}}],
+                ["do_nothing", '"description": ""', '"parameters": {}'],
+                ["do_nothing", '"description":▁""', '"parameters":▁{}'],
+                id="tool_without_description_and_parameters",
+            ),
+            pytest.param(
+                [
+                    {"role": "user", "content": "Do nothing"},
+                    {
+                        "role": "assistant",
+                        "content": "",
+                        "tool_calls": [
+                            {
+                                "id": "123456789",
+                                "type": "function",
+                                "function": {
+                                    "name": "do_nothing",
+                                    "arguments": None,
+                                },
+                            }
+                        ],
+                    },
+                    {
+                        "role": "tool",
+                        "tool_call_id": "123456789",
+                        "content": "done",
+                    },
+                ],
+                [{"type": "function", "function": {"name": "do_nothing"}}],
+                ["do_nothing"],
+                ["do_nothing"],
+                id="tool_call_with_none_arguments",
+            ),
+        ],
+    )
+    def test_apply_chat_template_tool_optional_fields(
+        self,
+        mistral_tokenizer: MistralTokenizer,
+        messages: list[dict[str, Any]],
+        tools: list[dict[str, Any]],
+        tekken_expected_substrings: list[str],
+        spm_expected_substrings: list[str],
+    ) -> None:
+        output = mistral_tokenizer.apply_chat_template(
+            messages, tools=tools, add_generation_prompt=True
+        )
+        decoded = mistral_tokenizer.tokenizer.decode(output, SpecialTokenPolicy.KEEP)
+
+        expected = (
+            tekken_expected_substrings
+            if mistral_tokenizer.is_tekken
+            else spm_expected_substrings
+        )
+        for substring in expected:
+            assert substring in decoded
+
+    def test_apply_chat_template_tools_not_mutated(
+        self, mistral_tokenizer: MistralTokenizer
+    ) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Hello"},
+        ]
+        tools: list[dict[str, Any]] = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_weather",
+                    "description": "Gets weather.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "city": {"type": "string"},
+                        },
+                    },
+                },
+            },
+        ]
+        original_tools = copy.deepcopy(tools)
+
+        mistral_tokenizer.apply_chat_template(
+            messages, tools=tools, add_generation_prompt=True
+        )
+
+        assert tools == original_tools
+
+    @pytest.mark.parametrize(
+        "reasoning_key",
+        ["reasoning", "reasoning_content"],
+    )
+    def test_apply_chat_template_reasoning_assistant(
+        self, mistral_tokenizer: MistralTokenizer, reasoning_key: str
+    ) -> None:
+        if not mistral_tokenizer.is_tekken:
+            pytest.skip("Reasoning tokens only supported on tekken tokenizers")
+
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "What is 2+2?"},
+            {
+                "role": "assistant",
+                "content": "4",
+                reasoning_key: "2+2 equals 4",
+            },
+            {"role": "user", "content": "Are you sure?"},
+        ]
+
+        output = mistral_tokenizer.apply_chat_template(
+            messages, add_generation_prompt=True
+        )
+        decoded = mistral_tokenizer.tokenizer.decode(output, SpecialTokenPolicy.KEEP)
+
+        assert "[THINK]2+2 equals 4[/THINK]" in decoded
diff --git a/tests/tool_parsers/test_deepseekv32_tool_parser.py b/tests/tool_parsers/test_deepseekv32_tool_parser.py
index 6145253d9f90..693cf5caddd5 100644
--- a/tests/tool_parsers/test_deepseekv32_tool_parser.py
+++ b/tests/tool_parsers/test_deepseekv32_tool_parser.py
@@ -188,6 +188,149 @@ def test_multiple_tools(self, parser):
             "location": "NYC"
         }
 
+    def test_type_conversion_in_non_streaming(self):
+        """Non-streaming extraction must convert params using the tool schema."""
+        tool = ChatCompletionToolsParam(
+            function=FunctionDefinition(
+                name="toggle",
+                parameters={
+                    "type": "object",
+                    "properties": {
+                        "enabled": {"type": "boolean"},
+                        "count": {"type": "integer"},
+                    },
+                },
+            ),
+        )
+        parser = make_parser(tools=[tool])
+        model_output = (
+            f"{FC_START}\n"
+            f'{INV_START}toggle">\n'
+            f'{PARAM_START}enabled" string="false">true{PARAM_END}\n'
+            f'{PARAM_START}count" string="false">42{PARAM_END}\n'
+            f"{INV_END}\n"
+            f"{FC_END}"
+        )
+        result = parser.extract_tool_calls(model_output, None)
+        assert result.tools_called
+        assert len(result.tool_calls) == 1
+        args = json.loads(result.tool_calls[0].function.arguments)
+        assert args == {"enabled": True, "count": 42}
+        assert isinstance(args["enabled"], bool)
+        assert isinstance(args["count"], int)
+
+    def test_string_attr_true_preserves_literal_despite_schema(self):
+        """string="true" must keep the value as a string even
+        if the schema says integer."""
+        tool = ChatCompletionToolsParam(
+            function=FunctionDefinition(
+                name="score",
+                parameters={
+                    "type": "object",
+                    "properties": {
+                        "value": {"type": "integer"},
+                    },
+                },
+            ),
+        )
+        parser = make_parser(tools=[tool])
+        model_output = (
+            f"{FC_START}\n"
+            f'{INV_START}score">\n'
+            f'{PARAM_START}value" string="true">42{PARAM_END}\n'
+            f"{INV_END}\n"
+            f"{FC_END}"
+        )
+        result = parser.extract_tool_calls(model_output, None)
+        assert result.tools_called
+        args = json.loads(result.tool_calls[0].function.arguments)
+        assert args == {"value": "42"}
+        assert isinstance(args["value"], str)
+
+    def test_string_attr_false_allows_schema_conversion(self):
+        """string="false" allows the parser to convert via the tool schema."""
+        tool = ChatCompletionToolsParam(
+            function=FunctionDefinition(
+                name="score",
+                parameters={
+                    "type": "object",
+                    "properties": {
+                        "value": {"type": "integer"},
+                    },
+                },
+            ),
+        )
+        parser = make_parser(tools=[tool])
+        model_output = (
+            f"{FC_START}\n"
+            f'{INV_START}score">\n'
+            f'{PARAM_START}value" string="false">42{PARAM_END}\n'
+            f"{INV_END}\n"
+            f"{FC_END}"
+        )
+        result = parser.extract_tool_calls(model_output, None)
+        assert result.tools_called
+        args = json.loads(result.tool_calls[0].function.arguments)
+        assert args == {"value": 42}
+        assert isinstance(args["value"], int)
+
+    def test_arguments_wrapper_repaired(self):
+        """A single 'arguments' wrapper parameter must be unwrapped when it
+        is not part of the tool schema and the inner object matches schema fields."""
+        tool = ChatCompletionToolsParam(
+            function=FunctionDefinition(
+                name="get_weather",
+                parameters={
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string"},
+                    },
+                },
+            ),
+        )
+        parser = make_parser(tools=[tool])
+        model_output = (
+            f"{FC_START}\n"
+            f'{INV_START}get_weather">\n'
+            f'{PARAM_START}arguments" string="false">'
+            f'{{"location":"Beijing"}}'
+            f"{PARAM_END}\n"
+            f"{INV_END}\n"
+            f"{FC_END}"
+        )
+        result = parser.extract_tool_calls(model_output, None)
+        assert result.tools_called
+        args = json.loads(result.tool_calls[0].function.arguments)
+        assert args == {"location": "Beijing"}
+
+    def test_input_wrapper_repaired(self):
+        """A single 'input' wrapper parameter must be unwrapped similarly."""
+        tool = ChatCompletionToolsParam(
+            function=FunctionDefinition(
+                name="get_weather",
+                parameters={
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string"},
+                    },
+                },
+            ),
+        )
+        parser = make_parser(tools=[tool])
+        model_output = (
+            f"{FC_START}\n"
+            f'{INV_START}get_weather">\n'
+            f'{PARAM_START}input" string="true">'
+            f'{{"location":"Beijing"}}'
+            f"{PARAM_END}\n"
+            f"{INV_END}\n"
+            f"{FC_END}"
+        )
+        result = parser.extract_tool_calls(model_output, None)
+        assert result.tools_called
+        args = json.loads(result.tool_calls[0].function.arguments)
+        assert args == {"location": "Beijing"}
+
 
 # ---------------------------------------------------------------------------
 # Tests: extract_tool_calls_streaming
@@ -295,11 +438,45 @@ def test_type_conversion_in_streaming(self):
             ),
         )
         parser = make_parser(tools=[tool])
-        full_text = build_tool_call("add", {"x": "3", "y": "4"})
+        full_text = (
+            f"{FC_START}\n"
+            f'{INV_START}add">\n'
+            f'{PARAM_START}x" string="false">3{PARAM_END}\n'
+            f'{PARAM_START}y" string="false">4{PARAM_END}\n'
+            f"{INV_END}\n"
+            f"{FC_END}"
+        )
         deltas = self._stream(parser, full_text)
         args_str = self._reconstruct_args(deltas)
         assert json.loads(args_str) == {"x": 3, "y": 4}
 
+    def test_string_attr_true_preserves_literal_in_streaming(self):
+        """Streaming: string='true' must keep the value literal despite schema."""
+        tool = ChatCompletionToolsParam(
+            function=FunctionDefinition(
+                name="score",
+                parameters={
+                    "type": "object",
+                    "properties": {
+                        "value": {"type": "integer"},
+                    },
+                },
+            ),
+        )
+        parser = make_parser(tools=[tool])
+        full_text = (
+            f"{FC_START}\n"
+            f'{INV_START}score">\n'
+            f'{PARAM_START}value" string="true">42{PARAM_END}\n'
+            f"{INV_END}\n"
+            f"{FC_END}"
+        )
+        deltas = self._stream(parser, full_text)
+        args_str = self._reconstruct_args(deltas)
+        args = json.loads(args_str)
+        assert args == {"value": "42"}
+        assert isinstance(args["value"], str)
+
     def test_multiple_tools_streaming(self, parser):
         full_text = (
             f"{FC_START}\n"
diff --git a/tests/tool_parsers/test_deepseekv4_tool_parser.py b/tests/tool_parsers/test_deepseekv4_tool_parser.py
index 631d0fb97b33..afcd0573958b 100644
--- a/tests/tool_parsers/test_deepseekv4_tool_parser.py
+++ b/tests/tool_parsers/test_deepseekv4_tool_parser.py
@@ -6,6 +6,15 @@
 import json
 from unittest.mock import MagicMock
 
+import pytest
+from xgrammar import StructuralTag
+
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionNamedFunction,
+    ChatCompletionNamedToolChoiceParam,
+    ChatCompletionRequest,
+    ChatCompletionToolsParam,
+)
 from vllm.tool_parsers import ToolParserManager
 from vllm.tool_parsers.deepseekv4_tool_parser import DeepSeekV4ToolParser
 
@@ -20,6 +29,43 @@
 PARAM_END = "</｜DSML｜parameter>"
 
 
+@pytest.fixture
+def sample_tools() -> list[ChatCompletionToolsParam]:
+    return [
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "get_current_weather",
+                "description": "Get the current weather",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {"type": "string", "description": "The city name"},
+                        "state": {"type": "string", "description": "The state code"},
+                        "unit": {"type": "string", "enum": ["fahrenheit", "celsius"]},
+                    },
+                    "required": ["city", "state"],
+                },
+            },
+        ),
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "calculate_area",
+                "description": "Calculate area of a shape",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "shape": {"type": "string"},
+                        "dimensions": {"type": "object"},
+                        "precision": {"type": "integer"},
+                    },
+                },
+            },
+        ),
+    ]
+
+
 def make_parser(tools=None) -> DeepSeekV4ToolParser:
     return DeepSeekV4ToolParser(MOCK_TOKENIZER, tools=tools)
 
@@ -121,3 +167,72 @@ def test_streaming_extracts_complete_invokes():
     ]
     assert names == ["search"]
     assert json.loads(reconstruct_args(deltas)) == {"query": "deepseek v4"}
+
+
+def test_get_vllm_registry_structural_tag_returns_structural_tag(
+    sample_tools: list[ChatCompletionToolsParam],
+) -> None:
+    parser = make_parser()
+    req = ChatCompletionRequest(
+        messages=[],
+        model="m",
+        tools=sample_tools,
+        tool_choice="auto",
+    )
+    tag = parser.get_structural_tag(req)
+    assert isinstance(tag, StructuralTag)
+
+    req = ChatCompletionRequest(
+        messages=[],
+        model="m",
+        tools=sample_tools,
+        tool_choice="required",
+    )
+    tag = parser.get_structural_tag(req)
+    assert isinstance(tag, StructuralTag)
+
+    if sample_tools:
+        tool = sample_tools[0]
+        req = ChatCompletionRequest(
+            messages=[],
+            model="m",
+            tools=sample_tools,
+        )
+        req.tool_choice = ChatCompletionNamedToolChoiceParam(
+            function=ChatCompletionNamedFunction(name=tool.function.name)
+        )
+        tag = parser.get_structural_tag(req)
+        assert isinstance(tag, StructuralTag)
+
+
+def test_extract_tool_calls_arguments_wrapper():
+    mock_tokenizer = MagicMock()
+    mock_tokenizer.get_vocab.return_value = {}
+
+    tool = ChatCompletionToolsParam(
+        type="function",
+        function={
+            "name": "get_weather",
+            "parameters": {
+                "type": "object",
+                "properties": {"location": {"type": "string"}},
+            },
+        },
+    )
+
+    parser = DeepSeekV4ToolParser(mock_tokenizer, tools=[tool])
+    request = MagicMock()
+    request.tools = [tool]
+
+    model_output = (
+        f"{TC_START}"
+        f'{INV_START}get_weather">'
+        f'{PARAM_START}arguments" string="false">{{"location":"Beijing"}}{PARAM_END}'
+        f"{INV_END}"
+        f"{TC_END}"
+    )
+
+    result = parser.extract_tool_calls(model_output, request)
+    assert result.tools_called
+    args = json.loads(result.tool_calls[0].function.arguments)
+    assert args == {"location": "Beijing"}
diff --git a/tests/tool_parsers/test_mistral_tool_parser.py b/tests/tool_parsers/test_mistral_tool_parser.py
index 42e8cf138b97..f6a5c6bfb265 100644
--- a/tests/tool_parsers/test_mistral_tool_parser.py
+++ b/tests/tool_parsers/test_mistral_tool_parser.py
@@ -590,6 +590,33 @@ def _test_extract_tool_calls_streaming(
     ]
     assert_tool_calls(actual_tool_calls, expected_tool_calls)
 
+    if expected_tool_calls:
+        assert len(tool_parser.streamed_args_for_tool) == len(expected_tool_calls)
+        assert len(tool_parser.prev_tool_call_arr) == len(expected_tool_calls)
+        for i in range(len(expected_tool_calls)):
+            assert (
+                tool_parser.prev_tool_call_arr[i]["arguments"]
+                == tool_parser.streamed_args_for_tool[i]
+            )
+            assert tool_parser.streamed_args_for_tool[i] == function_args_strs[i]
+            assert (
+                tool_parser.prev_tool_call_arr[i]["name"]
+                == expected_tool_calls[i].function.name
+            )
+
+        # Simulate the serving layer's unstreamed-args check
+        index = len(tool_parser.prev_tool_call_arr) - 1
+        args = tool_parser.prev_tool_call_arr[index].get("arguments", {})
+        expected_call = (
+            args if isinstance(args, str) else json.dumps(args, ensure_ascii=False)
+        )
+        actual_call = tool_parser.streamed_args_for_tool[index]
+        remaining_call = expected_call.replace(actual_call, "", 1)
+        assert remaining_call == ""
+    else:
+        assert len(tool_parser.streamed_args_for_tool) == 0
+        assert len(tool_parser.prev_tool_call_arr) == 0
+
 
 @pytest.mark.parametrize(
     ids=[
@@ -855,6 +882,8 @@ def test_extract_tool_calls_streaming_v11_no_tools(
         previous_text = current_text
 
     assert collected_content == model_output
+    assert len(mistral_tool_parser.streamed_args_for_tool) == 0
+    assert len(mistral_tool_parser.prev_tool_call_arr) == 0
 
 
 @pytest.mark.parametrize(
diff --git a/tests/tool_parsers/test_qwen3coder_tool_parser.py b/tests/tool_parsers/test_qwen3coder_tool_parser.py
index c62e95830243..26bbf1a044bc 100644
--- a/tests/tool_parsers/test_qwen3coder_tool_parser.py
+++ b/tests/tool_parsers/test_qwen3coder_tool_parser.py
@@ -6,8 +6,11 @@
 
 import pytest
 from openai.types.responses.function_tool import FunctionTool
+from xgrammar import StructuralTag
 
 from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionNamedFunction,
+    ChatCompletionNamedToolChoiceParam,
     ChatCompletionRequest,
     ChatCompletionToolsParam,
 )
@@ -108,6 +111,27 @@ def sample_tools(request):
         ]
 
 
+def _as_chat_completion_tools(
+    tools: list[ChatCompletionToolsParam | FunctionTool],
+) -> list[ChatCompletionToolsParam]:
+    normalized: list[ChatCompletionToolsParam] = []
+    for tool in tools:
+        if isinstance(tool, ChatCompletionToolsParam):
+            normalized.append(tool)
+        else:
+            normalized.append(
+                ChatCompletionToolsParam(
+                    type="function",
+                    function={
+                        "name": tool.name,
+                        "description": tool.description,
+                        "parameters": tool.parameters,
+                    },
+                )
+            )
+    return normalized
+
+
 def assert_tool_calls(
     actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
 ):
@@ -1146,3 +1170,88 @@ def test_no_double_serialization_string_args(qwen3_tool_parser):
     args = json.loads(raw_arguments)
     assert args["message"] == "hello world"
     assert '\\"hello world\\"' not in raw_arguments
+
+
+def test_get_vllm_registry_structural_tag_returns_structural_tag(
+    qwen3_tool_parser: Qwen3CoderToolParser,
+    sample_tools: list[ChatCompletionToolsParam],
+) -> None:
+    request_tools = _as_chat_completion_tools(sample_tools)
+    req = ChatCompletionRequest(
+        messages=[],
+        model="m",
+        tools=request_tools,
+        tool_choice="auto",
+    )
+    tag = qwen3_tool_parser.get_structural_tag(req)
+    assert isinstance(tag, StructuralTag)
+
+    req = ChatCompletionRequest(
+        messages=[],
+        model="m",
+        tools=request_tools,
+        tool_choice="required",
+    )
+    tag = qwen3_tool_parser.get_structural_tag(req)
+    assert isinstance(tag, StructuralTag)
+
+    if request_tools:
+        tool = request_tools[0]
+        req = ChatCompletionRequest(
+            messages=[],
+            model="m",
+            tools=request_tools,
+        )
+        req.tool_choice = ChatCompletionNamedToolChoiceParam(
+            function=ChatCompletionNamedFunction(name=tool.function.name)
+        )
+        tag = qwen3_tool_parser.get_structural_tag(req)
+        assert isinstance(tag, StructuralTag)
+
+
+@pytest.mark.parametrize("include_reasoning", [True, False])
+def test_adjust_request_auto_uses_vllm_registry_structural_tag(
+    monkeypatch: pytest.MonkeyPatch,
+    qwen3_tool_parser: Qwen3CoderToolParser,
+    sample_tools: list[ChatCompletionToolsParam],
+    include_reasoning: bool,
+) -> None:
+    monkeypatch.setattr(
+        "vllm.tool_parsers.abstract_tool_parser.VLLM_ENFORCE_STRICT_TOOL_CALLING",
+        True,
+    )
+    request_tools = _as_chat_completion_tools(sample_tools)
+    req = ChatCompletionRequest(
+        messages=[],
+        model="m",
+        tools=request_tools,
+        tool_choice="auto",
+        include_reasoning=include_reasoning,
+    )
+    out = qwen3_tool_parser.adjust_request(req)
+    assert out.structured_outputs is not None
+    assert out.structured_outputs.structural_tag is not None
+    assert isinstance(out.structured_outputs.structural_tag, str)
+    loaded = json.loads(out.structured_outputs.structural_tag)
+    assert isinstance(loaded, dict)
+
+
+def test_adjust_request_required_prefers_structural_tag(
+    monkeypatch: pytest.MonkeyPatch,
+    qwen3_tool_parser: Qwen3CoderToolParser,
+    sample_tools: list[ChatCompletionToolsParam],
+) -> None:
+    monkeypatch.setattr(
+        "vllm.tool_parsers.abstract_tool_parser.VLLM_ENFORCE_STRICT_TOOL_CALLING",
+        True,
+    )
+    request_tools = _as_chat_completion_tools(sample_tools)
+    req = ChatCompletionRequest(
+        messages=[],
+        model="m",
+        tools=request_tools,
+        tool_choice="required",
+    )
+    out = qwen3_tool_parser.adjust_request(req)
+    assert out.structured_outputs is not None
+    assert out.structured_outputs.structural_tag is not None
diff --git a/tests/tools/test_docker_build_metadata_args.py b/tests/tools/test_docker_build_metadata_args.py
new file mode 100644
index 000000000000..fa2eac558f53
--- /dev/null
+++ b/tests/tools/test_docker_build_metadata_args.py
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import shlex
+import subprocess
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+HELPER = REPO_ROOT / ".buildkite" / "scripts" / "docker-build-metadata-args.sh"
+
+
+def run_helper(
+    *args: str,
+    env: dict[str, str] | None = None,
+    path: str | None = None,
+) -> list[str]:
+    helper_env = {"PATH": path or os.environ["PATH"]}
+    if env:
+        helper_env.update(env)
+    result = subprocess.run(
+        ["bash", str(HELPER), *args],
+        check=True,
+        env=helper_env,
+        stdout=subprocess.PIPE,
+        text=True,
+    )
+    return shlex.split(result.stdout)
+
+
+def option_values(args: list[str], option: str) -> list[str]:
+    return [args[i + 1] for i, arg in enumerate(args[:-1]) if arg == option]
+
+
+def build_args(args: list[str]) -> dict[str, str]:
+    values = {}
+    for value in option_values(args, "--build-arg"):
+        key, arg_value = value.split("=", 1)
+        values[key] = arg_value
+    return values
+
+
+def test_release_metadata_args_prefer_pipeline_id() -> None:
+    args = run_helper(
+        "cu130-ubuntu2404",
+        env={
+            "BUILDKITE": "1",
+            "BUILDKITE_COMMIT": "abc123",
+            "BUILDKITE_PIPELINE_ID": "pipe-uuid",
+            "BUILDKITE_PIPELINE_SLUG": "release",
+            "BUILDKITE_BUILD_URL": "https://buildkite.example/vllm/builds/1",
+            "RELEASE_VERSION": "v0.20.0",
+        },
+    )
+
+    assert build_args(args) == {
+        "VLLM_BUILD_COMMIT": "abc123",
+        "VLLM_BUILD_PIPELINE": "pipe-uuid",
+        "VLLM_BUILD_URL": "https://buildkite.example/vllm/builds/1",
+        "VLLM_IMAGE_TAG": "vllm/vllm-openai:v0.20.0-cu130-ubuntu2404",
+    }
+    expected_tag = (
+        "public.ecr.aws/q9t5s3a7/vllm-release-repo:"
+        f"abc123-{os.uname().machine}-cu130-ubuntu2404"
+    )
+    assert option_values(args, "--tag") == [expected_tag]
+
+
+def test_nightly_metadata_args_fall_back_to_pipeline_slug() -> None:
+    args = run_helper(
+        "ubuntu2404",
+        env={
+            "BUILDKITE": "1",
+            "BUILDKITE_COMMIT": "def456",
+            "BUILDKITE_PIPELINE_SLUG": "release",
+            "BUILDKITE_BUILD_URL": "https://buildkite.example/vllm/builds/2",
+            "NIGHTLY": "1",
+        },
+    )
+
+    assert build_args(args) == {
+        "VLLM_BUILD_COMMIT": "def456",
+        "VLLM_BUILD_PIPELINE": "release",
+        "VLLM_BUILD_URL": "https://buildkite.example/vllm/builds/2",
+        "VLLM_IMAGE_TAG": "vllm/vllm-openai:nightly-def456-ubuntu2404",
+    }
+    expected_tag = (
+        "public.ecr.aws/q9t5s3a7/vllm-release-repo:"
+        f"def456-{os.uname().machine}-ubuntu2404"
+    )
+    assert option_values(args, "--tag") == [expected_tag]
+
+
+def test_local_metadata_args_use_local_overrides() -> None:
+    args = run_helper(
+        env={
+            "VLLM_IMAGE_TAG": "local/test:dev",
+            "VLLM_BUILD_COMMIT": "localsha",
+            "VLLM_BUILD_PIPELINE": "local-pipeline",
+            "VLLM_BUILD_URL": "https://buildkite.example/local",
+        },
+    )
+
+    assert build_args(args) == {
+        "VLLM_BUILD_COMMIT": "localsha",
+        "VLLM_BUILD_PIPELINE": "local-pipeline",
+        "VLLM_BUILD_URL": "https://buildkite.example/local",
+        "VLLM_IMAGE_TAG": "local/test:dev",
+    }
+    assert option_values(args, "--tag") == ["local/test:dev"]
+
+
+def test_release_version_lookup_failure_falls_back_to_commit(
+    tmp_path: Path,
+) -> None:
+    fake_bin = tmp_path / "bin"
+    fake_bin.mkdir()
+    buildkite_agent = fake_bin / "buildkite-agent"
+    buildkite_agent.write_text("#!/bin/sh\nexit 1\n")
+    buildkite_agent.chmod(0o755)
+
+    args = run_helper(
+        "cu129",
+        env={
+            "BUILDKITE": "1",
+            "BUILDKITE_COMMIT": "fallback123",
+            "BUILDKITE_PIPELINE_SLUG": "release",
+        },
+        path=f"{fake_bin}:{os.environ['PATH']}",
+    )
+
+    assert build_args(args)["VLLM_IMAGE_TAG"] == ("vllm/vllm-openai:vfallback123-cu129")
+
+
+def test_vllm_openai_image_embeds_metadata_contract() -> None:
+    dockerfile = (REPO_ROOT / "docker" / "Dockerfile").read_text()
+
+    for expected in (
+        "ARG VLLM_BUILD_COMMIT",
+        "ARG VLLM_BUILD_PIPELINE",
+        "ARG VLLM_BUILD_URL",
+        "ARG VLLM_IMAGE_TAG",
+        "VLLM_BUILD_COMMIT=${VLLM_BUILD_COMMIT:-unknown}",
+        "VLLM_BUILD_PIPELINE=${VLLM_BUILD_PIPELINE:-local}",
+        "VLLM_BUILD_URL=${VLLM_BUILD_URL:-}",
+        "VLLM_IMAGE_TAG=${VLLM_IMAGE_TAG:-local/vllm-openai:dev}",
+        'ai.vllm.build.commit="${VLLM_BUILD_COMMIT}"',
+        'ai.vllm.build.pipeline="${VLLM_BUILD_PIPELINE}"',
+        'ai.vllm.build.url="${VLLM_BUILD_URL}"',
+        'ai.vllm.image.tag="${VLLM_IMAGE_TAG}"',
+    ):
+        assert expected in dockerfile
diff --git a/tests/utils.py b/tests/utils.py
index 5ccdaa0d64e2..e8fd3f1e8152 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
+import atexit
 import contextlib
 import copy
 import functools
@@ -14,10 +15,11 @@
 import subprocess
 import sys
 import tempfile
+import threading
 import time
 import warnings
-from collections.abc import Callable, Iterable
-from contextlib import ExitStack, contextmanager, suppress
+from collections.abc import Callable, Iterable, Sequence
+from contextlib import ExitStack, contextmanager
 from multiprocessing import Process
 from pathlib import Path
 from typing import Any, Literal
@@ -62,8 +64,6 @@
 FP8_DTYPE = current_platform.fp8_dtype()
 
 if current_platform.is_rocm():
-    import threading
-
     from amdsmi import (
         amdsmi_get_gpu_vram_usage,
         amdsmi_get_processor_handles,
@@ -135,6 +135,11 @@ class RemoteVLLMServer:
     """
 
     DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
+    _active_servers: set["RemoteVLLMServer"] = set()
+    _active_servers_lock = threading.RLock()
+    _cleanup_hooks_registered = False
+    _signal_hooks_registered = False
+    _previous_signal_handlers: dict[int, Any] = {}
     proc: subprocess.Popen
 
     def _create_cli_subcommand(self):
@@ -210,6 +215,7 @@ def __init__(
         )
 
         self._pre_download_model(model, args)
+        self._shutdown_complete = False
 
         # Record GPU memory before server start so we know what
         # "released" looks like.
@@ -222,6 +228,7 @@ def __init__(
             )
 
         self._start_server(model, vllm_serve_args, env_dict)
+        self._register_active_server()
         max_wait_seconds = max_wait_seconds or 480
         try:
             self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds)
@@ -247,6 +254,78 @@ def _shutdown(self) -> None:
         (when the server fails to start). Must be safe to call even if
         the process is already dead.
         """
+        if self._shutdown_complete:
+            return
+
+        self._shutdown_complete = True
+        try:
+            self._terminate_process_tree()
+            self._wait_for_gpu_memory_release()
+        finally:
+            self._unregister_active_server()
+
+    @classmethod
+    def _ensure_cleanup_hooks_registered(cls) -> None:
+        """Register process-exit cleanup for detached server subprocesses."""
+        root_cls = RemoteVLLMServer
+        with root_cls._active_servers_lock:
+            if not root_cls._cleanup_hooks_registered:
+                atexit.register(root_cls._shutdown_active_servers)
+                root_cls._cleanup_hooks_registered = True
+
+            if (
+                threading.current_thread() is threading.main_thread()
+                and not root_cls._signal_hooks_registered
+            ):
+                for signum in (signal.SIGTERM, signal.SIGINT):
+                    root_cls._previous_signal_handlers[signum] = signal.getsignal(
+                        signum
+                    )
+                    signal.signal(signum, root_cls._handle_parent_signal)
+                root_cls._signal_hooks_registered = True
+
+    def _register_active_server(self) -> None:
+        """Track this server so parent-process exits still clean it up."""
+        RemoteVLLMServer._ensure_cleanup_hooks_registered()
+        with RemoteVLLMServer._active_servers_lock:
+            RemoteVLLMServer._active_servers.add(self)
+
+    def _unregister_active_server(self) -> None:
+        with RemoteVLLMServer._active_servers_lock:
+            RemoteVLLMServer._active_servers.discard(self)
+
+    @classmethod
+    def _shutdown_active_servers(cls) -> None:
+        """Best-effort shutdown for all live RemoteVLLMServer instances."""
+        with cls._active_servers_lock:
+            servers = list(cls._active_servers)
+
+        for server in servers:
+            with contextlib.suppress(Exception):
+                server._shutdown()
+
+    @classmethod
+    def _handle_parent_signal(cls, signum, frame) -> None:
+        """Clean up detached servers before letting the signal terminate pytest."""
+        cls._shutdown_active_servers()
+
+        previous_handler = cls._previous_signal_handlers.get(signum, signal.SIG_DFL)
+        if callable(previous_handler):
+            previous_handler(signum, frame)
+        elif previous_handler == signal.SIG_IGN:
+            return
+        elif signum == signal.SIGINT:
+            raise KeyboardInterrupt
+        else:
+            raise SystemExit(128 + signum)
+
+    def _terminate_process_tree(self) -> None:
+        """Kill the server process tree without waiting for GPU memory release.
+
+        Split out from ``_shutdown`` so that ``shutdown_many`` can run this
+        phase in parallel for sibling servers and then wait for GPU memory
+        release once at the end.
+        """
         pid = self.proc.pid
 
         # Get the process group ID. Because we used
@@ -288,9 +367,56 @@ def _shutdown(self) -> None:
         # prevent VRAM from being reclaimed by the driver.
         self._kill_process_group_survivors(pgid)
 
-        # Wait for GPU memory to actually be freed, not just
-        # "stabilized at whatever level it's at".
-        self._wait_for_gpu_memory_release()
+    @classmethod
+    def shutdown_many(cls, servers: Sequence["RemoteVLLMServer"]) -> None:
+        """Shut down multiple sibling servers and wait for GPU memory once.
+
+        Test fixtures that hold several ``RemoteVLLMServer`` instances at
+        once must NOT shut them down by calling each server's ``__exit__``
+        sequentially: every server measures total GPU memory across all
+        visible devices in ``_wait_for_gpu_memory_release``, so the first
+        server's wait blocks the full timeout because later sibling
+        servers are still holding GPU memory.
+
+        Instead, this method terminates every server's process tree in
+        parallel, then runs the GPU-memory-release wait once against the
+        earliest recorded baseline (memory before any server started).
+        """
+        if not servers:
+            return
+
+        for server in servers:
+            server._shutdown_complete = True
+
+        threads = [
+            threading.Thread(
+                target=s._terminate_process_tree,
+                name=f"shutdown-{s.proc.pid}",
+                daemon=True,
+            )
+            for s in servers
+        ]
+        for t in threads:
+            t.start()
+        for t in threads:
+            t.join()
+
+        # Use the smallest pre-server baseline so the wait targets memory
+        # usage before *any* of these sibling servers started, not after
+        # earlier siblings had already allocated.
+        earliest = min(
+            servers,
+            key=lambda s: (
+                float("inf")
+                if s._pre_server_gpu_memory is None
+                else s._pre_server_gpu_memory
+            ),
+        )
+        try:
+            earliest._wait_for_gpu_memory_release()
+        finally:
+            for server in servers:
+                server._unregister_active_server()
 
     def _kill_process_group_survivors(
         self, pgid: int | None, timeout: float = 15.0
@@ -656,6 +782,7 @@ def _test_completion(
     model: str,
     prompt: str,
     token_ids: list[int],
+    include_seeded_sampling: bool = True,
 ):
     results = []
 
@@ -690,33 +817,40 @@ def _test_completion(
         }
     )
 
-    # test seeded random sampling
-    completion = client.completions.create(
-        model=model, prompt=prompt, max_tokens=5, seed=33, temperature=1.0
-    )
+    if include_seeded_sampling:
+        # test seeded random sampling
+        completion = client.completions.create(
+            model=model, prompt=prompt, max_tokens=5, seed=33, temperature=1.0
+        )
 
-    results.append(
-        {
-            "test": "seeded_sampling",
-            "text": completion.choices[0].text,
-            "finish_reason": completion.choices[0].finish_reason,
-            "usage": completion.usage,
-        }
-    )
+        results.append(
+            {
+                "test": "seeded_sampling",
+                "text": completion.choices[0].text,
+                "finish_reason": completion.choices[0].finish_reason,
+                "usage": completion.usage,
+            }
+        )
 
-    # test seeded random sampling with multiple prompts
-    completion = client.completions.create(
-        model=model, prompt=[prompt, prompt], max_tokens=5, seed=33, temperature=1.0
-    )
+        # test seeded random sampling with multiple prompts
+        completion = client.completions.create(
+            model=model,
+            prompt=[prompt, prompt],
+            max_tokens=5,
+            seed=33,
+            temperature=1.0,
+        )
 
-    results.append(
-        {
-            "test": "seeded_sampling",
-            "text": [choice.text for choice in completion.choices],
-            "finish_reason": [choice.finish_reason for choice in completion.choices],
-            "usage": completion.usage,
-        }
-    )
+        results.append(
+            {
+                "test": "seeded_sampling",
+                "text": [choice.text for choice in completion.choices],
+                "finish_reason": [
+                    choice.finish_reason for choice in completion.choices
+                ],
+                "usage": completion.usage,
+            }
+        )
 
     # test simple list
     batch = client.completions.create(
@@ -911,6 +1045,7 @@ def compare_two_settings(
     *,
     method: str = "generate",
     max_wait_seconds: float | None = None,
+    include_seeded_sampling: bool = True,
 ) -> None:
     """
     Launch API server with two different sets of arguments/environments
@@ -922,6 +1057,8 @@ def compare_two_settings(
         arg2: The second set of arguments to pass to the API server.
         env1: The first set of environment variables to pass to the API server.
         env2: The second set of environment variables to pass to the API server.
+        include_seeded_sampling: Whether to include temperature=1.0 seeded
+            sampling checks in the default generate comparison.
     """
 
     compare_all_settings(
@@ -930,6 +1067,7 @@ def compare_two_settings(
         [env1, env2],
         method=method,
         max_wait_seconds=max_wait_seconds,
+        include_seeded_sampling=include_seeded_sampling,
     )
 
 
@@ -940,6 +1078,7 @@ def compare_all_settings(
     *,
     method: str = "generate",
     max_wait_seconds: float | None = None,
+    include_seeded_sampling: bool = True,
 ) -> None:
     """
     Launch API server with several different sets of arguments/environments
@@ -948,6 +1087,8 @@ def compare_all_settings(
         model: The model to test.
         all_args: A list of argument lists to pass to the API server.
         all_envs: A list of environment dictionaries to pass to the API server.
+        include_seeded_sampling: Whether to include temperature=1.0 seeded
+            sampling checks in the default generate comparison.
     """
 
     trust_remote_code = False
@@ -1008,7 +1149,13 @@ def compare_all_settings(
             )
 
             if method == "generate":
-                results += _test_completion(client, model, prompt, token_ids)
+                results += _test_completion(
+                    client,
+                    model,
+                    prompt,
+                    token_ids,
+                    include_seeded_sampling=include_seeded_sampling,
+                )
             elif method == "generate_close":
                 results += _test_completion_close(client, model, prompt)
             elif method == "generate_chat":
@@ -1365,52 +1512,65 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
 
 
 def spawn_new_process_for_each_test(f: Callable[_P, None]) -> Callable[_P, None]:
-    """Decorator to spawn a new process for each test function."""
+    """Decorator to spawn a new process for each test function.
+
+    Uses subprocess with cloudpickle to serialize the test function and
+    propagates exceptions back to the parent, so test failures are never
+    silently swallowed (fixes https://github.com/vllm-project/vllm/issues/41415).
+    """
 
     @functools.wraps(f)
     def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
-        # Check if we're already in a subprocess
-        if os.environ.get("RUNNING_IN_SUBPROCESS") == "1":
-            # If we are, just run the function directly
-            return f(*args, **kwargs)
-
-        import torch.multiprocessing as mp
-
-        with suppress(RuntimeError):
-            mp.set_start_method("spawn")
-
-        # Get the module
-        module_name = f.__module__
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".tb", mode="wb") as tmp:
+            tb_file = tmp.name
 
-        # Create a process with environment variable set
-        env = os.environ.copy()
-        env["RUNNING_IN_SUBPROCESS"] = "1"
-
-        with tempfile.TemporaryDirectory() as tempdir:
-            output_filepath = os.path.join(tempdir, "new_process.tmp")
-
-            # `cloudpickle` allows pickling complex functions directly
-            input_bytes = cloudpickle.dumps((f, output_filepath))
+        try:
+            # Serialize the function + args with cloudpickle so closures work
+            payload = cloudpickle.dumps((f, args, kwargs, tb_file))
+
+            child_script = (
+                "import sys, cloudpickle, traceback\n"
+                "try:\n"
+                "    from _pytest.outcomes import Skipped\n"
+                "except ImportError:\n"
+                "    class Skipped(BaseException): pass\n"
+                "f, args, kwargs, tb_file = "
+                "cloudpickle.loads(sys.stdin.buffer.read())\n"
+                "try:\n"
+                "    f(*args, **kwargs)\n"
+                "except Skipped:\n"
+                "    sys.exit(0)\n"
+                "except BaseException:\n"
+                "    open(tb_file, 'w').write(traceback.format_exc())\n"
+                "    sys.exit(1)\n"
+            )
 
             repo_root = str(VLLM_PATH.resolve())
-
-            env = dict(env or os.environ)
+            env = os.environ.copy()
             env["PYTHONPATH"] = repo_root + os.pathsep + env.get("PYTHONPATH", "")
 
-            cmd = [sys.executable, "-m", f"{module_name}"]
-
-            returned = subprocess.run(
-                cmd, input=input_bytes, capture_output=True, env=env
+            result = subprocess.run(
+                [sys.executable, "-c", child_script],
+                input=payload,
+                capture_output=True,
+                env=env,
             )
 
-            # check if the subprocess is successful
-            try:
-                returned.check_returncode()
-            except Exception as e:
-                # wrap raised exception to provide more information
+            if result.returncode != 0:
+                # Read traceback written by child, fall back to stderr
+                tb = ""
+                if os.path.exists(tb_file) and os.path.getsize(tb_file) > 0:
+                    with open(tb_file) as fp:
+                        tb = fp.read()
+                else:
+                    tb = result.stderr.decode()
                 raise RuntimeError(
-                    f"Error raised in subprocess:\n{returned.stderr.decode()}"
-                ) from e
+                    f"Test subprocess '{f.__name__}' failed "
+                    f"(exit code {result.returncode}):\n{tb}"
+                )
+        finally:
+            with contextlib.suppress(OSError):
+                os.remove(tb_file)
 
     return wrapper
 
diff --git a/tests/utils_/test_spawn_decorator.py b/tests/utils_/test_spawn_decorator.py
new file mode 100644
index 000000000000..1740ea30de94
--- /dev/null
+++ b/tests/utils_/test_spawn_decorator.py
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for spawn_new_process_for_each_test decorator."""
+
+import pytest
+
+from tests.utils import spawn_new_process_for_each_test
+
+
+@spawn_new_process_for_each_test
+def test_spawn_decorator_passing():
+    """Passing function should complete normally."""
+    assert 1 + 1 == 2
+
+
+@pytest.mark.xfail(raises=RuntimeError, strict=True)
+@spawn_new_process_for_each_test
+def test_spawn_decorator_failure_is_caught():
+    """Failing function should raise RuntimeError, never silently pass."""
+    raise ValueError("intentional failure")
+
+
+@spawn_new_process_for_each_test
+def test_spawn_decorator_skip():
+    """pytest.skip inside subprocess should propagate correctly."""
+    pytest.skip("intentional skip")
+
+
+@spawn_new_process_for_each_test
+@pytest.mark.parametrize("x,y,expected", [(1, 2, 3), (0, 0, 0)])
+def test_spawn_decorator_parametrized(x, y, expected):
+    """Args and kwargs must be forwarded correctly to subprocess."""
+    assert x + y == expected
diff --git a/tests/v1/attention/test_kv_head_stride_canonicalization.py b/tests/v1/attention/test_kv_head_stride_canonicalization.py
new file mode 100644
index 000000000000..635f46390cfc
--- /dev/null
+++ b/tests/v1/attention/test_kv_head_stride_canonicalization.py
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for canonicalize_singleton_dim_strides.
+
+Background
+----------
+When num_kv_heads_per_rank == 1 (e.g. Qwen3.5-397B with TP=8 → 1 KV head
+per rank), PyTorch's is_contiguous() returns True for *any* stride on the
+size-1 dimension.  The KV cache allocator can therefore produce a tensor
+where that singleton dim has stride = 1 element (2 bytes for bf16) instead
+of the canonical product-of-remaining-dims value.
+
+CUDA TMA (used by FlashInfer XQA SM90 and Flash-Attention 3/4 on H100+)
+requires all non-outermost strides to be multiples of 16 bytes.  A 2-byte
+stride triggers cudaErrorIllegalInstruction.
+
+canonicalize_singleton_dim_strides() patches degenerate strides on all
+size-1 dimensions via torch.as_strided — zero-copy.
+
+The degenerate stride manifests at different positions in different backends:
+- FlashInfer: stride(-3) after kv_cache.permute() → shape [..., 1, B, D]
+- FlashAttention: stride(-2) after kv_cache.unbind(0) → shape [N, B, 1, D]
+"""
+
+import torch
+
+from vllm.utils.torch_utils import canonicalize_singleton_dim_strides
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _inject_degenerate_stride(t: torch.Tensor, dim: int) -> torch.Tensor:
+    """Return a view of t with a degenerate (stride=1) on a size-1 dim."""
+    assert t.shape[dim] == 1, f"dim {dim} must have size 1"
+    strides = list(t.stride())
+    strides[dim] = 1  # inject the bug
+    return t.as_strided(t.shape, strides)
+
+
+# ---------------------------------------------------------------------------
+# Tests: canonicalize_singleton_dim_strides
+# ---------------------------------------------------------------------------
+
+
+class TestCanonicalizeSingletonDimStrides:
+    def test_flashinfer_layout_dim_neg3(self):
+        """FlashInfer path: degenerate stride at dim -3 (num_kv_heads)."""
+        # Shape after permute: [num_blocks, 2, num_kv_heads, block_size, head_size]
+        num_blocks, block_size, head_size = 64, 16, 128
+        t = torch.zeros(num_blocks, 2, 1, block_size, head_size, dtype=torch.bfloat16)
+        t_deg = _inject_degenerate_stride(t, dim=-3)
+
+        assert t_deg.stride(-3) == 1  # confirm degenerate
+        assert t_deg.is_contiguous()  # PyTorch doesn't notice
+
+        fixed = canonicalize_singleton_dim_strides(t_deg)
+
+        assert fixed.stride(-3) == block_size * head_size  # canonical = 2048
+        assert fixed.stride(-2) == head_size  # inner dims unchanged
+        assert fixed.stride(-1) == 1
+
+    def test_flash_attn_layout_dim_neg2(self):
+        """FlashAttention path: degenerate stride at dim -2 (num_kv_heads)."""
+        # Shape after unbind(0): [num_blocks, block_size, num_kv_heads, head_size]
+        num_blocks, block_size, head_size = 64, 16, 128
+        t = torch.zeros(num_blocks, block_size, 1, head_size, dtype=torch.bfloat16)
+        t_deg = _inject_degenerate_stride(t, dim=-2)
+
+        assert t_deg.stride(-2) == 1
+        assert t_deg.is_contiguous()
+
+        fixed = canonicalize_singleton_dim_strides(t_deg)
+
+        assert fixed.stride(-2) == head_size  # canonical = 128
+        assert fixed.stride(-1) == 1
+
+    def test_canonical_strides_returned_as_is(self):
+        """No degenerate strides → same object returned (no copy, no new view)."""
+        t = torch.zeros(64, 2, 1, 16, 128, dtype=torch.bfloat16)
+        result = canonicalize_singleton_dim_strides(t)
+        assert result is t
+
+    def test_multi_kv_heads_unchanged(self):
+        """num_kv_heads > 1 → strides are already canonical → unchanged."""
+        t = torch.zeros(16, 2, 4, 16, 128, dtype=torch.bfloat16)
+        original_strides = t.stride()
+        result = canonicalize_singleton_dim_strides(t)
+        assert result.stride() == original_strides
+
+    def test_data_pointer_preserved(self):
+        """Fix is zero-copy: same underlying storage."""
+        t = torch.zeros(8, 2, 1, 16, 128, dtype=torch.bfloat16)
+        t_deg = _inject_degenerate_stride(t, dim=-3)
+        fixed = canonicalize_singleton_dim_strides(t_deg)
+        assert fixed.data_ptr() == t_deg.data_ptr()
+        assert fixed.storage_offset() == t_deg.storage_offset()
+
+    def test_multiple_singleton_dims(self):
+        """All size-1 dims with degenerate strides are fixed."""
+        # Shape: [1, 1, 8, 32] — two size-1 dims
+        t = torch.zeros(1, 1, 8, 32, dtype=torch.float16)
+        # Both size-1 dims get degenerate strides
+        t_deg = t.as_strided(t.shape, (1, 1, 32, 1))  # both leading dims = 1
+
+        fixed = canonicalize_singleton_dim_strides(t_deg)
+
+        assert fixed.stride(0) == 1 * 8 * 32  # canonical: 256
+        assert fixed.stride(1) == 1 * 8 * 32  # canonical: 256 (same since size-1)
+        assert fixed.stride(2) == 32
+        assert fixed.stride(3) == 1
+
+    def test_various_shapes_flashinfer(self):
+        """Correctness across different block_size / head_size for FlashInfer layout."""
+        for block_size, head_size in [(16, 64), (16, 128), (32, 128), (16, 256)]:
+            t = torch.zeros(8, 2, 1, block_size, head_size, dtype=torch.bfloat16)
+            t_deg = _inject_degenerate_stride(t, dim=-3)
+            fixed = canonicalize_singleton_dim_strides(t_deg)
+            assert fixed.stride(-3) == block_size * head_size, (
+                f"Failed for block_size={block_size}, head_size={head_size}: "
+                f"got stride(-3)={fixed.stride(-3)}"
+            )
+
+    def test_various_shapes_flash_attn(self):
+        """Correctness across different shapes for FlashAttention layout."""
+        for block_size, head_size in [(16, 64), (16, 128), (32, 128)]:
+            t = torch.zeros(8, block_size, 1, head_size, dtype=torch.bfloat16)
+            t_deg = _inject_degenerate_stride(t, dim=-2)
+            fixed = canonicalize_singleton_dim_strides(t_deg)
+            assert fixed.stride(-2) == head_size, (
+                f"Failed for block_size={block_size}, head_size={head_size}: "
+                f"got stride(-2)={fixed.stride(-2)}"
+            )
+
+    def test_tma_alignment_satisfied_after_fix_bf16(self):
+        """After fix, all strides meet 16-byte TMA alignment for bf16."""
+        t = torch.zeros(64, 2, 1, 16, 128, dtype=torch.bfloat16)
+        t_deg = _inject_degenerate_stride(t, dim=-3)
+        fixed = canonicalize_singleton_dim_strides(t_deg)
+
+        element_size = fixed.element_size()  # 2 bytes for bf16
+        for i, s in enumerate(fixed.stride()):
+            assert (s * element_size) % 16 == 0 or i == len(fixed.stride()) - 1, (
+                f"dim {i} stride {s} * {element_size} bytes not 16-byte aligned"
+            )
+
+    def test_non_contiguous_outer_dims_preserved(self):
+        """Outer (non-size-1) non-contiguous strides are left unchanged."""
+        # Simulate cross-layer unified allocation: num_blocks stride is non-canonical
+        # but the inner dims should be fixed.
+        base = torch.zeros(200, 2, 1, 16, 128, dtype=torch.bfloat16)
+        # Slice every 2nd block → non-canonical outer stride
+        t_sliced = base[::2]  # shape [100, 2, 1, 16, 128], stride[0] = 2*canonical
+        t_deg = _inject_degenerate_stride(t_sliced, dim=-3)
+
+        fixed = canonicalize_singleton_dim_strides(t_deg)
+
+        # Outer stride should be unchanged (not a size-1 dim)
+        assert fixed.stride(0) == t_sliced.stride(0)
+        # Inner degenerate stride should be fixed
+        assert fixed.stride(-3) == 16 * 128
diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py
index e65d1d604029..3807ee69ecfc 100644
--- a/tests/v1/attention/test_mla_backends.py
+++ b/tests/v1/attention/test_mla_backends.py
@@ -30,6 +30,7 @@
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.attention.backend import CommonAttentionMetadata
 from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla
+from vllm.v1.attention.backends.mla.prefill import get_mla_prefill_backend
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported
 from vllm.v1.kv_cache_interface import MLAAttentionSpec
@@ -621,6 +622,19 @@ def run_attention_backend(
             k_scale=k_scale,
         )
 
+        # Attach prefill backend (normally created by MLAAttention.__init__)
+        prefill_scale = (qk_nope_head_dim + qk_rope_head_dim) ** -0.5
+        prefill_backend_cls = get_mla_prefill_backend(vllm_config)
+        mock_layer.prefill_backend = prefill_backend_cls(
+            num_heads=num_heads,
+            scale=prefill_scale,
+            kv_lora_rank=kv_lora_rank,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            vllm_config=vllm_config,
+        )
+
         # Populate static_forward_context with mock attention layers
         for layer_name in layer_names:
             vllm_config.compilation_config.static_forward_context[layer_name] = (
@@ -672,6 +686,7 @@ def run_attention_backend(
 def test_backend_correctness(
     default_vllm_config,
     dist_init,
+    workspace_init,
     batch_spec_name: str,
     model: str,
     tensor_parallel_size: int,
@@ -784,7 +799,9 @@ def test_backend_correctness(
     assert kv_lora_rank + qk_rope_head_dim == head_size, (
         f"MLA dimensions don't match: {total_head_size} != {head_size}"
     )
-    scale = 1.0 / (total_head_size**0.5)
+    decode_scale = 1.0 / (total_head_size**0.5)
+    qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+    prefill_scale = qk_head_dim**-0.5
 
     # 2. Generate data and compute SDPA reference output for MLA
     all_q_vllm, all_kv_c_vllm, all_k_pe_vllm = [], [], []
@@ -901,7 +918,7 @@ def test_backend_correctness(
         v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2)
 
         sdpa_out_i_decode = torch.nn.functional.scaled_dot_product_attention(
-            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale
+            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=decode_scale
         )
         sdpa_out_i_decode = sdpa_out_i_decode.transpose(1, 2).squeeze(
             0
@@ -937,7 +954,7 @@ def test_backend_correctness(
 
         # Single attention call with custom mask
         sdpa_out_i_prefill = torch.nn.functional.scaled_dot_product_attention(
-            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale
+            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=prefill_scale
         )
         sdpa_out_i_prefill = sdpa_out_i_prefill.transpose(1, 2).squeeze(0)
         sdpa_out_i_prefill = sdpa_out_i_prefill.flatten(start_dim=-2)
diff --git a/tests/v1/attention/test_mla_prefill_selector.py b/tests/v1/attention/test_mla_prefill_selector.py
new file mode 100644
index 000000000000..068eb43faf40
--- /dev/null
+++ b/tests/v1/attention/test_mla_prefill_selector.py
@@ -0,0 +1,304 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for MLA prefill backend selector."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+import torch
+
+from vllm.config import AttentionConfig, ModelConfig, VllmConfig
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backends.mla.prefill.registry import MLAPrefillBackendEnum
+from vllm.v1.attention.backends.mla.prefill.selector import (
+    MLAPrefillSelectorConfig,
+    _auto_select_mla_prefill_backend,
+    get_mla_prefill_backend,
+    is_deepseek_r1_mla_compatible,
+)
+
+
+@pytest.fixture(autouse=True)
+def clear_cache():
+    """Clear lru cache to ensure each test case runs without caching."""
+    _auto_select_mla_prefill_backend.cache_clear()
+
+
+def _make_mock_model_config(
+    qk_nope_head_dim: int = 128,
+    qk_rope_head_dim: int = 64,
+    v_head_dim: int = 128,
+    dtype: torch.dtype = torch.bfloat16,
+) -> ModelConfig:
+    mock_config = MagicMock(spec=ModelConfig)
+    mock_config.dtype = dtype
+    mock_config.hf_text_config = MagicMock()
+    mock_config.hf_text_config.qk_nope_head_dim = qk_nope_head_dim
+    mock_config.hf_text_config.qk_rope_head_dim = qk_rope_head_dim
+    mock_config.hf_text_config.v_head_dim = v_head_dim
+    return mock_config
+
+
+def _make_vllm_config(
+    model_config: ModelConfig | None = None,
+    mla_prefill_backend: MLAPrefillBackendEnum | None = None,
+) -> VllmConfig:
+    if model_config is None:
+        model_config = _make_mock_model_config()
+
+    attention_config = AttentionConfig(mla_prefill_backend=mla_prefill_backend)
+    mock_vllm_config = MagicMock(spec=VllmConfig)
+    mock_vllm_config.model_config = model_config
+    mock_vllm_config.attention_config = attention_config
+    return mock_vllm_config
+
+
+class TestGetMLAPrefillBackend:
+    """Tests for get_mla_prefill_backend (public API)."""
+
+    def test_no_device_capability_returns_flash_attn(self):
+        vllm_config = _make_vllm_config()
+
+        with patch("vllm.platforms.current_platform") as mock_platform:
+            mock_platform.get_device_capability.return_value = None
+
+            backend = get_mla_prefill_backend(vllm_config)
+            assert backend.get_name() == "FLASH_ATTN"
+
+    def test_explicit_flash_attn_selection(self):
+        try:
+            flash_attn_cls = MLAPrefillBackendEnum.FLASH_ATTN.get_class()
+        except ImportError:
+            pytest.skip("FLASH_ATTN backend not available")
+            return
+
+        vllm_config = _make_vllm_config(
+            mla_prefill_backend=MLAPrefillBackendEnum.FLASH_ATTN,
+        )
+
+        with patch("vllm.platforms.current_platform") as mock_platform:
+            mock_platform.get_device_capability.return_value = DeviceCapability(
+                major=9, minor=0
+            )
+
+            with patch.object(
+                flash_attn_cls,
+                "validate_configuration",
+                return_value=[],
+            ):
+                backend = get_mla_prefill_backend(vllm_config)
+                assert backend.get_name() == "FLASH_ATTN"
+
+    def test_explicit_backend_invalid_raises_error(self):
+        vllm_config = _make_vllm_config(
+            mla_prefill_backend=MLAPrefillBackendEnum.FLASHINFER,
+        )
+
+        with patch("vllm.platforms.current_platform") as mock_platform:
+            mock_platform.get_device_capability.return_value = DeviceCapability(
+                major=9, minor=0
+            )
+
+            with pytest.raises(ValueError, match="is not valid"):
+                get_mla_prefill_backend(vllm_config)
+
+    def test_explicit_backend_import_error_raises(self):
+        vllm_config = _make_vllm_config(
+            mla_prefill_backend=MLAPrefillBackendEnum.TRTLLM_RAGGED,
+        )
+
+        with patch("vllm.platforms.current_platform") as mock_platform:
+            mock_platform.get_device_capability.return_value = DeviceCapability(
+                major=10, minor=0
+            )
+
+            with (
+                patch.object(
+                    MLAPrefillBackendEnum.TRTLLM_RAGGED,
+                    "get_class",
+                    side_effect=ImportError("trtllm not installed"),
+                ),
+                pytest.raises(ValueError, match="is not valid"),
+            ):
+                get_mla_prefill_backend(vllm_config)
+
+    def test_auto_selection_on_hopper(self):
+        try:
+            flash_attn_cls = MLAPrefillBackendEnum.FLASH_ATTN.get_class()
+        except ImportError:
+            pytest.skip("FLASH_ATTN backend not available")
+            return
+
+        vllm_config = _make_vllm_config()
+
+        with patch("vllm.platforms.current_platform") as mock_platform:
+            mock_platform.get_device_capability.return_value = DeviceCapability(
+                major=9, minor=0
+            )
+
+            with patch.object(
+                flash_attn_cls,
+                "validate_configuration",
+                return_value=[],
+            ):
+                backend = get_mla_prefill_backend(vllm_config)
+                assert backend.get_name() == "FLASH_ATTN"
+
+
+class TestAutoSelectMLAPrefillBackend:
+    """Tests for fallback and error paths in auto-selection."""
+
+    def test_blackwell_falls_back_to_trtllm(self):
+        vllm_config = _make_vllm_config()
+        capability = DeviceCapability(major=10, minor=0)
+        selector_config = MLAPrefillSelectorConfig(
+            dtype=torch.bfloat16,
+            is_r1_compatible=is_deepseek_r1_mla_compatible(vllm_config),
+        )
+
+        try:
+            trtllm_cls = MLAPrefillBackendEnum.TRTLLM_RAGGED.get_class()
+        except ImportError:
+            pytest.skip("TRTLLM_RAGGED backend not available")
+            return
+
+        with (
+            patch.object(
+                MLAPrefillBackendEnum.FLASH_ATTN,
+                "get_class",
+                side_effect=ImportError("FLASH_ATTN not available"),
+            ),
+            patch.object(trtllm_cls, "validate_configuration", return_value=[]),
+        ):
+            backend = _auto_select_mla_prefill_backend(
+                capability,
+                selector_config,
+            )
+            assert backend.get_name() == "TRTLLM_RAGGED"
+
+    def test_all_fail_raises_error(self):
+        vllm_config = _make_vllm_config()
+        capability = DeviceCapability(major=10, minor=0)
+        selector_config = MLAPrefillSelectorConfig(
+            dtype=torch.bfloat16,
+            is_r1_compatible=is_deepseek_r1_mla_compatible(vllm_config),
+        )
+
+        def mock_get_class(backend_enum):  # noqa: ARG001
+            cls = MagicMock()
+            cls.validate_configuration.return_value = ["not available"]
+            return cls
+
+        with patch.object(MLAPrefillBackendEnum, "get_class", mock_get_class):
+            _auto_select_mla_prefill_backend.cache_clear()
+            with pytest.raises(ValueError, match="No valid MLA"):
+                _auto_select_mla_prefill_backend(
+                    capability,
+                    selector_config,
+                )
+
+
+class TestBackendValidation:
+    """Tests for backend validation logic."""
+
+    def test_r1_dimension_requirement(self):
+        try:
+            from vllm.v1.attention.backends.mla.prefill.flashinfer import (
+                FlashInferPrefillBackend,
+            )
+        except ImportError:
+            pytest.skip("FlashInfer prefill backend not available")
+            return
+
+        assert FlashInferPrefillBackend.requires_r1_mla_dimensions is True
+
+        vllm_config = _make_vllm_config(
+            model_config=_make_mock_model_config(
+                qk_nope_head_dim=128,
+                qk_rope_head_dim=64,
+                v_head_dim=128,
+            )
+        )
+        capability = DeviceCapability(major=10, minor=0)
+        selector_config = MLAPrefillSelectorConfig(
+            dtype=torch.bfloat16,
+            is_r1_compatible=is_deepseek_r1_mla_compatible(vllm_config),
+        )
+
+        with patch.object(FlashInferPrefillBackend, "is_available", return_value=True):
+            invalid_reasons = FlashInferPrefillBackend.validate_configuration(
+                capability,
+                selector_config,
+            )
+            assert len(invalid_reasons) == 0
+
+        vllm_config_invalid = _make_vllm_config(
+            model_config=_make_mock_model_config(
+                qk_nope_head_dim=64,
+                qk_rope_head_dim=64,
+                v_head_dim=128,
+            )
+        )
+        selector_config_invalid = MLAPrefillSelectorConfig(
+            dtype=torch.bfloat16,
+            is_r1_compatible=is_deepseek_r1_mla_compatible(vllm_config_invalid),
+        )
+
+        with patch.object(FlashInferPrefillBackend, "is_available", return_value=True):
+            invalid_reasons = FlashInferPrefillBackend.validate_configuration(
+                capability,
+                selector_config_invalid,
+            )
+            assert len(invalid_reasons) == 1
+            assert "DeepSeek R1 MLA dimensions" in invalid_reasons[0]
+
+
+class TestMLAPrefillBackendParsing:
+    """Tests for string-based mla_prefill_backend parsing from CLI args."""
+
+    def test_valid_string_parses_to_enum(self):
+        config = AttentionConfig(
+            mla_prefill_backend="FLASH_ATTN",  # type: ignore[arg-type]
+        )
+        assert config.mla_prefill_backend == MLAPrefillBackendEnum.FLASH_ATTN
+
+    def test_invalid_string_raises_error(self):
+        with pytest.raises(ValueError, match="Unknown MLA prefill backend"):
+            AttentionConfig(
+                mla_prefill_backend="NONEXISTENT",  # type: ignore[arg-type]
+            )
+
+
+class TestDeprecatedFlagMigration:
+    """Tests for _migrate_deprecated_mla_prefill_flags in AttentionConfig."""
+
+    def test_no_deprecated_flags_leaves_backend_none(self):
+        config = AttentionConfig()
+        assert config.mla_prefill_backend is None
+
+    def test_use_trtllm_ragged_migrates_to_trtllm_ragged(self):
+        config = AttentionConfig(use_trtllm_ragged_deepseek_prefill=True)
+        assert config.mla_prefill_backend == MLAPrefillBackendEnum.TRTLLM_RAGGED
+
+    def test_disable_flashinfer_prefill_migrates_to_flash_attn(self):
+        config = AttentionConfig(disable_flashinfer_prefill=True)
+        assert config.mla_prefill_backend == MLAPrefillBackendEnum.FLASH_ATTN
+
+    def test_explicit_backend_ignores_deprecated_flags(self):
+        config = AttentionConfig(
+            mla_prefill_backend=MLAPrefillBackendEnum.FLASH_ATTN,
+            use_cudnn_prefill=True,
+        )
+        assert config.mla_prefill_backend == MLAPrefillBackendEnum.FLASH_ATTN
+
+    def test_cudnn_raises_error(self):
+        match = "cuDNN MLA prefill backend has been removed"
+        with pytest.raises(ValueError, match=match):
+            AttentionConfig(use_cudnn_prefill=True)
+
+    def test_trtllm_takes_priority_over_disable_flashinfer(self):
+        config = AttentionConfig(
+            use_trtllm_ragged_deepseek_prefill=True,
+            disable_flashinfer_prefill=True,
+        )
+        assert config.mla_prefill_backend == MLAPrefillBackendEnum.TRTLLM_RAGGED
diff --git a/tests/v1/attention/test_trtllm_attention_integration.py b/tests/v1/attention/test_trtllm_attention_integration.py
index 12af0773cb39..06c5844508f4 100644
--- a/tests/v1/attention/test_trtllm_attention_integration.py
+++ b/tests/v1/attention/test_trtllm_attention_integration.py
@@ -17,13 +17,13 @@
 from vllm.config import set_current_vllm_config
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
-from vllm.utils.torch_utils import set_random_seed
+from vllm.utils.torch_utils import nvfp4_kv_cache_full_dim, set_random_seed
 from vllm.v1.attention.backends.utils import (
     PerLayerParameters,
     get_kv_cache_layout,
     set_kv_cache_layout,
 )
-from vllm.v1.kv_cache_interface import FullAttentionSpec
+from vllm.v1.kv_cache_interface import FullAttentionSpec, KVQuantMode
 
 if not current_platform.is_device_capability_family(100):
     pytest.skip(
@@ -53,6 +53,7 @@ def __init__(self, device: torch.device):
 
 
 MODEL = "Qwen/Qwen2.5-0.5B"
+MODEL_NVFP4 = "Qwen/Qwen3-4B"  # nvfp4 needs head_dim >= 128 (or 80)
 BLOCK_SIZE = 16
 NUM_GPU_BLOCKS = 8192
 DEVICE_TYPE = current_platform.device_type
@@ -169,19 +170,129 @@ def _create_hnd_kv_cache(
     return kv_cache
 
 
-def _run_trtllm_integration(batch_spec):
+def _create_nvfp4_hnd_kv_cache(
+    k_contexts,
+    v_contexts,
+    block_size,
+    num_kv_heads,
+    head_size,
+    dtype,
+    device,
+    num_blocks,
+    common_attn_metadata,
+    kv_scale_val,
+):
+    """Create an nvfp4 KV cache by quantizing bf16 context via
+    reshape_and_cache_flash, using the same block-table layout as
+    _create_hnd_kv_cache.
+
+    The returned tensor is dtype ``uint8`` with shape
+    ``(num_blocks, 2, block_size, num_kv_heads, full_dim)`` in logical
+    (NHD) order, but physically permuted to HND layout via stride order
+    ``(0, 1, 3, 2, 4)`` (i.e. ``num_kv_heads`` before ``block_size``).
+
+    The last dimension ``full_dim = head_size // 2 + head_size // 16``
+    packs two regions contiguously:
+      - **FP4 data** (``head_size // 2`` bytes): pairs of E2M1 values,
+        two per byte.
+      - **FP8 block scales** (``head_size // 16`` bytes): one E4M3
+        scale per 16-element block.
+
+    Dimension 1 indexes K (``[:, 0]``) and V (``[:, 1]``).
+
+    Args:
+        k_contexts: List of key context tensors, one per sequence.
+        v_contexts: List of value context tensors, one per sequence.
+        block_size: Number of tokens per cache block.
+        num_kv_heads: Number of key/value heads.
+        head_size: Head dimension (must be divisible by 16).
+        dtype: Source data type for the bf16 intermediate cache.
+        device: Target device.
+        num_blocks: Total number of blocks to allocate.
+        common_attn_metadata: Metadata containing block tables and
+            sequence lengths.
+        kv_scale_val: Scalar float used as both k_scale and v_scale
+            during quantization.
+
+    Returns:
+        ``torch.Tensor``: The nvfp4 kv_cache tensor (uint8, HND-strided).
+    """
+    # First create a bf16 HND cache so block tables are populated.
+    bf16_cache = _create_hnd_kv_cache(
+        k_contexts,
+        v_contexts,
+        block_size,
+        num_kv_heads,
+        head_size,
+        dtype,
+        device,
+        num_blocks,
+        common_attn_metadata,
+    )
+
+    # Allocate nvfp4 cache: same shape but with full_dim (data + scale).
+    full_dim = nvfp4_kv_cache_full_dim(head_size)
+    hnd_order = (0, 1, 3, 2, 4)
+    nvfp4_cache = torch.zeros(
+        (num_blocks, 2, num_kv_heads, block_size, full_dim),
+        dtype=torch.uint8,
+        device=device,
+    ).permute(*hnd_order)
+
+    # Flatten bf16 context into tokens and quantize via reshape_and_cache_flash.
+    # bf16_cache is (num_blocks, 2, block_size, num_kv_heads, head_size) logical
+    # with HND physical strides.
+    block_table = common_attn_metadata.block_table_tensor
+    seq_lens = common_attn_metadata.seq_lens.cpu()
+    query_lens = (
+        common_attn_metadata.query_start_loc_cpu[1:]
+        - common_attn_metadata.query_start_loc_cpu[:-1]
+    )
+    kv_scale_t = torch.tensor(kv_scale_val, dtype=torch.float32, device=device)
+
+    for i in range(len(k_contexts)):
+        ctx_len = int(seq_lens[i]) - int(query_lens[i])
+        if ctx_len == 0:
+            continue
+        # Gather context tokens from the bf16 cache using block table.
+        n_ctx_blocks = (ctx_len + block_size - 1) // block_size
+        blocks = block_table[i, :n_ctx_blocks]
+        # bf16_cache[:, kv_idx] is (num_blocks, block_size, num_kv_heads, head_size)
+        k_ctx = bf16_cache[blocks, 0].reshape(-1, num_kv_heads, head_size)[:ctx_len]
+        v_ctx = bf16_cache[blocks, 1].reshape(-1, num_kv_heads, head_size)[:ctx_len]
+        # Build slot mapping for these context tokens.
+        token_offsets = torch.arange(ctx_len, device=device)
+        block_indices = token_offsets // block_size
+        intra_offsets = token_offsets % block_size
+        slots = block_table[i, block_indices] * block_size + intra_offsets
+        torch.ops._C_cache_ops.reshape_and_cache_flash(
+            k_ctx,
+            v_ctx,
+            nvfp4_cache[:, 0],
+            nvfp4_cache[:, 1],
+            slots,
+            "nvfp4",
+            kv_scale_t,
+            kv_scale_t,
+        )
+
+    return nvfp4_cache
+
+
+def _run_trtllm_integration(batch_spec, kv_cache_dtype="auto", model_name=MODEL):
     """Run TRTLLM attention through the full FlashInfer pipeline
     and compare against an SDPA reference."""
     set_random_seed(42)
     device = torch.device(f"{DEVICE_TYPE}:0")
 
     vllm_config = create_vllm_config(
-        model_name=MODEL,
+        model_name=model_name,
         max_model_len=max(batch_spec.seq_lens),
         block_size=BLOCK_SIZE,
         num_gpu_blocks=NUM_GPU_BLOCKS,
     )
     vllm_config.attention_config.use_trtllm_attention = True
+    vllm_config.cache_config.cache_dtype = kv_cache_dtype
 
     num_q_heads = vllm_config.model_config.get_num_attention_heads(
         vllm_config.parallel_config
@@ -248,28 +359,51 @@ def causal_mask_mod(b, h, q_idx, kv_idx, *, context_len):
     common_attn_metadata = create_common_attn_metadata(batch_spec, BLOCK_SIZE, device)
 
     # 2. Create HND KV cache
-    kv_cache = _create_hnd_kv_cache(
-        k_contexts,
-        v_contexts,
-        BLOCK_SIZE,
-        num_kv_heads,
-        head_size,
-        dtype,
-        device,
-        NUM_GPU_BLOCKS,
-        common_attn_metadata,
-    )
+    is_nvfp4 = kv_cache_dtype == "nvfp4"
+    if is_nvfp4:
+        # Compute a global scale from the context data.
+        all_ctx = torch.cat(k_contexts + v_contexts, dim=0)
+        kv_scale_val = (all_ctx.abs().amax() / 448.0).item()
+        kv_cache = _create_nvfp4_hnd_kv_cache(
+            k_contexts,
+            v_contexts,
+            BLOCK_SIZE,
+            num_kv_heads,
+            head_size,
+            dtype,
+            device,
+            NUM_GPU_BLOCKS,
+            common_attn_metadata,
+            kv_scale_val,
+        )
+    else:
+        kv_scale_val = 1.0
+        kv_cache = _create_hnd_kv_cache(
+            k_contexts,
+            v_contexts,
+            BLOCK_SIZE,
+            num_kv_heads,
+            head_size,
+            dtype,
+            device,
+            NUM_GPU_BLOCKS,
+            common_attn_metadata,
+        )
 
     # 3. Run through FlashInfer with TRTLLM enabled
     set_kv_cache_layout("HND")
     get_kv_cache_layout.cache_clear()
 
     try:
+        is_nvfp4 = kv_cache_dtype == "nvfp4"
+        kv_quant_mode = KVQuantMode.NVFP4 if is_nvfp4 else KVQuantMode.NONE
+        spec_dtype = torch.uint8 if is_nvfp4 else dtype
         kv_cache_spec = FullAttentionSpec(
             block_size=BLOCK_SIZE,
             num_kv_heads=num_kv_heads,
             head_size=head_size,
-            dtype=dtype,
+            dtype=spec_dtype,
+            kv_quant_mode=kv_quant_mode,
         )
         layer_names = ["test_layer_0"]
 
@@ -312,10 +446,20 @@ def causal_mask_mod(b, h, q_idx, kv_idx, *, context_len):
                 num_kv_heads=num_kv_heads,
                 alibi_slopes=None,
                 sliding_window=None,
-                kv_cache_dtype="auto",
+                kv_cache_dtype=kv_cache_dtype,
             )
 
             mock_layer = MockAttentionLayer(device)
+            if is_nvfp4:
+                # For nvfp4, k_scale/v_scale are the global quantization
+                # scales (amax/448) used by reshape_and_cache_flash.
+                kv_scale_t = torch.tensor(
+                    kv_scale_val, dtype=torch.float32, device=device
+                )
+                mock_layer._k_scale = kv_scale_t
+                mock_layer._v_scale = kv_scale_t
+                mock_layer._k_scale_float = kv_scale_val
+                mock_layer._v_scale_float = kv_scale_val
             output = torch.empty_like(query_vllm)
 
             impl.do_kv_cache_update(
@@ -326,6 +470,23 @@ def causal_mask_mod(b, h, q_idx, kv_idx, *, context_len):
                 attn_metadata.slot_mapping,
             )
 
+            # nvfp4 trtllm kernel requires FP8 queries. In the real
+            # pipeline the attention layer handles this; here we
+            # quantize manually.
+            if is_nvfp4:
+                finfo = torch.finfo(torch.float8_e4m3fn)
+                q_amax = query_vllm.abs().amax().clamp(min=1e-12)
+                q_s = (finfo.max / q_amax * 0.1).item()
+                query_vllm = (
+                    (query_vllm * q_s)
+                    .clamp(finfo.min, finfo.max)
+                    .to(torch.float8_e4m3fn)
+                )
+                mock_layer._q_scale = torch.tensor(
+                    1.0 / q_s, dtype=torch.float32, device=device
+                )
+                mock_layer._q_scale_float = 1.0 / q_s
+
             output = impl.forward(
                 mock_layer,
                 query_vllm,
@@ -337,12 +498,11 @@ def causal_mask_mod(b, h, q_idx, kv_idx, *, context_len):
             )
 
         # 4. Compare against SDPA reference
-        torch.testing.assert_close(
-            output,
-            sdpa_output,
-            atol=1e-2,
-            rtol=1e-2,
-        )
+        if is_nvfp4:
+            atol, rtol = 1.0, 1.0  # nvfp4 has higher quantization error
+        else:
+            atol, rtol = 1e-2, 1e-2
+        torch.testing.assert_close(output, sdpa_output, atol=atol, rtol=rtol)
 
     finally:
         set_kv_cache_layout(None)
@@ -359,3 +519,18 @@ def test_trtllm_gen_full_attention_integration(batch_spec_name: str):
     MetadataBuilder.build() -> FlashInferImpl.forward() pipeline,
     with real TRTLLM kernels on Blackwell."""
     _run_trtllm_integration(BATCH_SPECS[batch_spec_name])
+
+
+@pytest.mark.parametrize(
+    "batch_spec_name",
+    list(BATCH_SPECS.keys()),
+)
+@torch.inference_mode()
+def test_trtllm_gen_nvfp4_kv_integration(batch_spec_name: str):
+    """Test TRTLLM attention with nvfp4 KV cache through the full
+    FlashInfer MetadataBuilder.build() -> FlashInferImpl.forward() pipeline."""
+    _run_trtllm_integration(
+        BATCH_SPECS[batch_spec_name],
+        kv_cache_dtype="nvfp4",
+        model_name=MODEL_NVFP4,
+    )
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index cfd03c5f687e..985b97c69ca4 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -2074,6 +2074,54 @@ def test_auto_fit_max_model_len_not_triggered():
     assert vllm_config.model_config.max_model_len == 16
 
 
+def test_auto_fit_max_model_len_respects_num_gpu_blocks_override():
+    """Auto-fit must size max_model_len against the override-clamped pool, not
+    the raw `available_memory`. Without this, auto-fit could pick a
+    max_model_len that no longer fits once `num_gpu_blocks_override` is applied.
+    """
+    model_config = ModelConfig(max_model_len=16384)
+    model_config.original_max_model_len = -1  # request auto-fit
+    vllm_config = VllmConfig(model_config=model_config)
+    # Cap the cache to 32 blocks regardless of available memory.
+    vllm_config.cache_config.num_gpu_blocks_override = 32
+
+    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
+    kv_cache_specs = {
+        "layer_1": new_kv_cache_spec(),  # block_size=16
+        "layer_2": new_kv_cache_spec(),
+    }
+    # Plenty of raw memory (1024 blocks per layer would fit max_model_len=16384).
+    large_available_memory = mem_per_block_per_layer * 2 * 1024
+
+    get_kv_cache_configs(vllm_config, [kv_cache_specs], [large_available_memory])
+
+    # 32 blocks * block_size 16 = 512 token slots, so max_model_len must
+    # auto-fit at or below that.
+    assert 0 < vllm_config.model_config.max_model_len <= 32 * 16
+
+
+def test_check_enough_kv_cache_memory_respects_num_gpu_blocks_override():
+    """Admission check must use the override-clamped pool size, not raw
+    `available_memory`. Without this, startup could accept a max_model_len
+    that does not actually fit in `num_gpu_blocks_override` blocks.
+    """
+    model_config = ModelConfig(max_model_len=16384)
+    vllm_config = VllmConfig(model_config=model_config)
+    # 32 blocks is far too small for max_model_len=16384 (would need 1024).
+    vllm_config.cache_config.num_gpu_blocks_override = 32
+
+    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
+    kv_cache_specs = {
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(),
+    }
+    # Plenty of raw memory: a bytes-only check against this would pass.
+    large_available_memory = mem_per_block_per_layer * 2 * 1024
+
+    with pytest.raises(ValueError, match="max seq len"):
+        get_kv_cache_configs(vllm_config, [kv_cache_specs], [large_available_memory])
+
+
 def test_unify_hybrid_kv_cache_specs():
     # 1. has_full_attention and has_sliding_window
     before_spec_1 = new_kv_cache_spec()
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 78617aa1c122..c35c38911a1a 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -2568,7 +2568,9 @@ def test_can_fit_full_sequence_swa_cap_admits_long_prompt():
     prompt_len = 32 * block_size
     req = make_request("long", list(range(prompt_len)), block_size, sha256)
 
-    assert manager.can_fit_full_sequence(req)
+    assert (
+        manager.allocate_slots(req, block_size, full_sequence_must_fit=True) is not None
+    )
 
 
 def test_can_fit_full_sequence_full_attention_still_gates_oversized():
@@ -2619,4 +2621,4 @@ def test_can_fit_full_sequence_full_attention_still_gates_oversized():
     prompt_len = 16 * block_size
     req = make_request("oversized", list(range(prompt_len)), block_size, sha256)
 
-    assert not manager.can_fit_full_sequence(req)
+    assert manager.allocate_slots(req, block_size, full_sequence_must_fit=True) is None
diff --git a/tests/v1/core/test_single_type_kv_cache_manager.py b/tests/v1/core/test_single_type_kv_cache_manager.py
index 08fda7593e28..f59830dcd741 100644
--- a/tests/v1/core/test_single_type_kv_cache_manager.py
+++ b/tests/v1/core/test_single_type_kv_cache_manager.py
@@ -432,3 +432,52 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
         )
         == 15
     )
+
+
+def test_predictor_matches_allocator_blocks_calculation_with_admission_cap():
+    """In forward steps, `get_num_blocks_to_allocate` must return exactly what
+    `allocate_new_blocks` will pull; otherwise `block_pool.get_new_blocks`
+    raises `ValueError: Cannot get N free blocks from the pool`.
+    """
+    block_size = 2
+    sliding_window = 8  # 4-block live window
+    cap = sliding_window // block_size
+
+    spec = SlidingWindowSpec(
+        block_size=block_size,
+        num_kv_heads=1,
+        head_size=1,
+        dtype=torch.float32,
+        sliding_window=sliding_window,
+    )
+    block_pool = BlockPool(
+        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
+    )
+    manager = SlidingWindowManager(
+        spec,
+        block_pool=block_pool,
+        enable_caching=False,
+        kv_cache_group_id=0,
+        max_admission_blocks_per_request=cap,
+    )
+
+    request_id = "req"
+    total_computed = 0
+    # Walk through request forward steps. Check num_blocks returned by
+    # `get_num_blocks_to_allocate` matches what `allocate_new_blocks` pulls
+    for num_tokens in (4, 8, 12, 16):
+        predicted = manager.get_num_blocks_to_allocate(
+            request_id=request_id,
+            num_tokens=num_tokens,
+            new_computed_blocks=[],
+            total_computed_tokens=total_computed,
+            num_tokens_main_model=num_tokens,
+        )
+        new_blocks = manager.allocate_new_blocks(
+            request_id, num_tokens=num_tokens, num_tokens_main_model=num_tokens
+        )
+        assert predicted == len(new_blocks), (
+            f"num_tokens={num_tokens}: predictor returned {predicted} "
+            f"but allocator pulled {len(new_blocks)}"
+        )
+        total_computed = num_tokens
diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py
index 66e6d7dd4605..97b5fd46a2eb 100644
--- a/tests/v1/cudagraph/test_cudagraph_dispatch.py
+++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -371,193 +371,200 @@ def test_bypass_on_mode_none(self):
         assert not wrapper.concrete_cudagraph_entries
 
 
-@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
-class TestCudagraphIntegration:
-    def setup_method(self):
-        # only FULL mode for non-uniform batches
-        self.comp_config = CompilationConfig(
-            mode=CompilationMode.VLLM_COMPILE,
-            cudagraph_mode="FULL",
-            cudagraph_capture_sizes=[10, 20],
-        )
-        self.vllm_config = _create_vllm_config(self.comp_config)
-        self.dispatcher = CudagraphDispatcher(self.vllm_config)
-        self.dispatcher.initialize_cudagraph_keys(
-            self.comp_config.cudagraph_mode, uniform_decode_query_len=1
-        )
-
-    def _run_and_monitor_call(
-        self, wrapper, input_tensor, runtime_mode, batch_descriptor
+def _run_and_monitor_call(
+    wrapper, input_tensor, runtime_mode, batch_descriptor, vllm_config
+):
+    """Helper to run a single call and monitor the action."""
+
+    with (
+        patch("torch.cuda.graph", wraps=torch.cuda.graph) as mock_graph_context,
+        patch.object(wrapper, "runnable", wraps=wrapper.runnable) as mock_runnable,
     ):
-        """Helper to run a single call and monitor the action."""
-
-        with (
-            patch("torch.cuda.graph", wraps=torch.cuda.graph) as mock_graph_context,
-            patch.object(wrapper, "runnable", wraps=wrapper.runnable) as mock_runnable,
-        ):
-            entry = wrapper.concrete_cudagraph_entries.get(batch_descriptor, None)
-
-            context = set_forward_context(
-                attn_metadata=None,
-                vllm_config=self.vllm_config,
-                cudagraph_runtime_mode=runtime_mode,
-                batch_descriptor=batch_descriptor,
-            )
-            mock_replay = MagicMock()
-            if entry and entry.cudagraph:
-                with (
-                    context,
-                    patch.object(
-                        entry.cudagraph, "replay", new_callable=MagicMock
-                    ) as mock_replay,
-                ):
-                    wrapper(input_tensor)
-            else:
-                with context:
-                    wrapper(input_tensor)
-
-            if mock_graph_context.called:
-                # note that this is globally mocked, so it will be detected
-                # even whether called by the inner or outer wrapper
-                return "capture_global"
-            if mock_replay.called:
-                # only for outer wrapper
-                return "replay"
-            if mock_runnable.call_count > 0:
-                # only for outer wrapper
-                return "bypass"
-            return "unknown"
-
-    @create_new_process_for_each_test("spawn")
-    def test_capture_replay_bypass_logic(self):
-        model = SimpleMLP().to(DEVICE_TYPE)
-        full_wrapper = CUDAGraphWrapper(model, self.vllm_config, CUDAGraphMode.FULL)
-        max_bs = 16
-        persistent_input_buffer = torch.zeros(max_bs, 10, device=DEVICE_TYPE)
-        input_1 = persistent_input_buffer[:1]
-        input_2 = persistent_input_buffer[:2]
-        input_3 = persistent_input_buffer[:3]
-
-        desc_1 = BatchDescriptor(num_tokens=1)
-        desc_2 = BatchDescriptor(num_tokens=2)
-        desc_3_unseen = BatchDescriptor(num_tokens=3)
+        entry = wrapper.concrete_cudagraph_entries.get(batch_descriptor, None)
 
-        # 0. global warmup
-        with set_forward_context(
+        context = set_forward_context(
             attn_metadata=None,
-            vllm_config=self.vllm_config,
-            cudagraph_runtime_mode=CUDAGraphMode.NONE,
-            batch_descriptor=None,
-        ):
-            full_wrapper(input_1)
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=runtime_mode,
+            batch_descriptor=batch_descriptor,
+        )
+        mock_replay = MagicMock()
+        if entry and entry.cudagraph:
+            with (
+                context,
+                patch.object(
+                    entry.cudagraph, "replay", new_callable=MagicMock
+                ) as mock_replay,
+            ):
+                wrapper(input_tensor)
+        else:
+            with context:
+                wrapper(input_tensor)
+
+        if mock_graph_context.called:
+            # note that this is globally mocked, so it will be detected
+            # even whether called by the inner or outer wrapper
+            return "capture_global"
+        if mock_replay.called:
+            # only for outer wrapper
+            return "replay"
+        if mock_runnable.call_count > 0:
+            # only for outer wrapper
+            return "bypass"
+        return "unknown"
+
+
+@create_new_process_for_each_test("spawn")
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
+def test_capture_replay_bypass_logic():
+    comp_config = CompilationConfig(
+        mode=CompilationMode.VLLM_COMPILE,
+        cudagraph_mode="FULL",
+        cudagraph_capture_sizes=[1, 2],
+    )
+    vllm_config = _create_vllm_config(comp_config)
+    dispatcher = CudagraphDispatcher(vllm_config)
+    dispatcher.initialize_cudagraph_keys(
+        comp_config.cudagraph_mode, uniform_decode_query_len=1
+    )
+    model = SimpleMLP().to(DEVICE_TYPE)
+    full_wrapper = CUDAGraphWrapper(model, vllm_config, CUDAGraphMode.FULL)
+    max_bs = 16
+    persistent_input_buffer = torch.zeros(max_bs, 10, device=DEVICE_TYPE)
+    input_1 = persistent_input_buffer[:1]
+    input_2 = persistent_input_buffer[:2]
+    input_3 = persistent_input_buffer[:3]
+
+    desc_1 = BatchDescriptor(num_tokens=1)
+    desc_2 = BatchDescriptor(num_tokens=2)
+    desc_3_unseen = BatchDescriptor(num_tokens=3)
+
+    # 0. global warmup
+    with set_forward_context(
+        attn_metadata=None,
+        vllm_config=vllm_config,
+        cudagraph_runtime_mode=CUDAGraphMode.NONE,
+        batch_descriptor=None,
+    ):
+        full_wrapper(input_1)
 
-        rt_mode, key = self.dispatcher.dispatch(num_tokens=desc_1.num_tokens)
-        # 1. Capture first shape
-        action = self._run_and_monitor_call(full_wrapper, input_1, rt_mode, key)
-        assert action == "capture_global"
+    rt_mode, key = dispatcher.dispatch(num_tokens=desc_1.num_tokens)
+    # 1. Capture first shape
+    action = _run_and_monitor_call(full_wrapper, input_1, rt_mode, key, vllm_config)
+    assert action == "capture_global"
 
-        # 2. Replay first shape
-        action = self._run_and_monitor_call(full_wrapper, input_1, rt_mode, key)
-        assert action == "replay"
+    # 2. Replay first shape
+    action = _run_and_monitor_call(full_wrapper, input_1, rt_mode, key, vllm_config)
+    assert action == "replay"
 
-        rt_mode, key = self.dispatcher.dispatch(num_tokens=desc_2.num_tokens)
-        # 3. Capture second shape
-        action = self._run_and_monitor_call(full_wrapper, input_2, rt_mode, key)
-        assert action == "capture_global"
+    rt_mode, key = dispatcher.dispatch(num_tokens=desc_2.num_tokens)
+    # 3. Capture second shape
+    action = _run_and_monitor_call(full_wrapper, input_2, rt_mode, key, vllm_config)
+    assert action == "capture_global"
 
-        # 4. Replay second shape
-        action = self._run_and_monitor_call(
-            full_wrapper, input_2, CUDAGraphMode.FULL, desc_2
+    # 4. Replay second shape
+    action = _run_and_monitor_call(
+        full_wrapper, input_2, CUDAGraphMode.FULL, key, vllm_config
+    )
+    assert action == "replay"
+
+    # 5. Bypass if no key match
+    rt_mode, key = dispatcher.dispatch(num_tokens=desc_3_unseen.num_tokens)
+    assert rt_mode == CUDAGraphMode.NONE
+    action = _run_and_monitor_call(full_wrapper, input_3, rt_mode, key, vllm_config)
+    assert action == "bypass"
+
+    # capture unseen shape is not allowed after disable
+    set_cudagraph_capturing_enabled(False)
+    with pytest.raises(RuntimeError):
+        _run_and_monitor_call(
+            full_wrapper, input_3, CUDAGraphMode.FULL, desc_3_unseen, vllm_config
         )
-        assert action == "replay"
+    set_cudagraph_capturing_enabled(True)
 
-        # 5. Bypass if no key match
-        rt_mode, key = self.dispatcher.dispatch(num_tokens=desc_3_unseen.num_tokens)
-        assert rt_mode == CUDAGraphMode.NONE
-        action = self._run_and_monitor_call(full_wrapper, input_3, rt_mode, key)
-        assert action == "bypass"
-
-        # capture unseen shape is not allowed after disable
-        set_cudagraph_capturing_enabled(False)
-        with pytest.raises(RuntimeError):
-            self._run_and_monitor_call(
-                full_wrapper, input_3, CUDAGraphMode.FULL, desc_3_unseen
-            )
-        set_cudagraph_capturing_enabled(True)
-
-    @create_new_process_for_each_test("spawn")
-    def test_nested_wrappers(self):
-        """Tests a scenario with a PIECEWISE wrapper inside a FULL one."""
-        model = SimpleMLP().to(DEVICE_TYPE)
-        full_wrapper = CUDAGraphWrapper(model, self.vllm_config, CUDAGraphMode.FULL)
-        input_1 = torch.randn(1, 10, device=DEVICE_TYPE)
-
-        # Setup: Inner model is wrapped with PIECEWISE, outer with FULL
-        inner_model = SimpleMLP().to(DEVICE_TYPE)
-        piecewise_wrapper = CUDAGraphWrapper(
-            inner_model, self.vllm_config, CUDAGraphMode.PIECEWISE
-        )
-        inner_model.forward = MagicMock(wraps=inner_model.forward)
-        outer_model = SimpleMLP().to(DEVICE_TYPE)
-        # When outer model is called, it calls the piecewise_wrapper
-        outer_model.forward = MagicMock(
-            wraps=outer_model.forward, side_effect=piecewise_wrapper
-        )
-        full_wrapper = CUDAGraphWrapper(
-            outer_model, self.vllm_config, CUDAGraphMode.FULL
-        )
 
-        desc_1 = BatchDescriptor(num_tokens=1)
+@create_new_process_for_each_test("spawn")
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
+def test_nested_wrappers():
+    """Tests a scenario with a PIECEWISE wrapper inside a FULL one."""
+    comp_config = CompilationConfig(
+        mode=CompilationMode.VLLM_COMPILE,
+        cudagraph_mode="FULL",
+        cudagraph_capture_sizes=[1],
+    )
+    vllm_config = _create_vllm_config(comp_config)
+    dispatcher = CudagraphDispatcher(vllm_config)
+    dispatcher.initialize_cudagraph_keys(
+        comp_config.cudagraph_mode, uniform_decode_query_len=1
+    )
+    model = SimpleMLP().to(DEVICE_TYPE)
+    full_wrapper = CUDAGraphWrapper(model, vllm_config, CUDAGraphMode.FULL)
+    input_1 = torch.randn(1, 10, device=DEVICE_TYPE)
+
+    # Setup: Inner model is wrapped with PIECEWISE, outer with FULL
+    inner_model = SimpleMLP().to(DEVICE_TYPE)
+    piecewise_wrapper = CUDAGraphWrapper(
+        inner_model, vllm_config, CUDAGraphMode.PIECEWISE
+    )
+    inner_model.forward = MagicMock(wraps=inner_model.forward)
+    outer_model = SimpleMLP().to(DEVICE_TYPE)
+    # When outer model is called, it calls the piecewise_wrapper
+    outer_model.forward = MagicMock(
+        wraps=outer_model.forward, side_effect=piecewise_wrapper
+    )
+    full_wrapper = CUDAGraphWrapper(outer_model, vllm_config, CUDAGraphMode.FULL)
 
-        # 0. global warmup
-        with set_forward_context(
-            attn_metadata=None,
-            vllm_config=self.vllm_config,
-            cudagraph_runtime_mode=CUDAGraphMode.NONE,
-            batch_descriptor=None,
-        ):
-            full_wrapper(input_1)
-
-        # --- Test runtime mode FULL---
-        # Run with FULL mode context. Expect outer wrapper to capture.
-        # The inner mock should be called once inside the graph capture.
-        outer_model.forward.reset_mock()
-        inner_model.forward.reset_mock()
-        action = self._run_and_monitor_call(
-            full_wrapper, input_1, CUDAGraphMode.FULL, desc_1
-        )
-        assert action == "capture_global"
-        assert outer_model.forward.call_count == 1
-        assert inner_model.forward.call_count == 1
-
-        # Run again. Expect outer wrapper to replay.
-        # The outer model should NOT be called because the whole graph
-        # is replayed.
-        action = self._run_and_monitor_call(
-            full_wrapper, input_1, CUDAGraphMode.FULL, desc_1
-        )
-        assert action == "replay"
-        assert outer_model.forward.call_count == 1  # No new call
-        assert inner_model.forward.call_count == 1
-
-        # --- Test runtime mode PIECEWISE ---
-        outer_model.forward.reset_mock()
-        inner_model.forward.reset_mock()
-        # Run with PIECEWISE mode context.
-        # Expect outer wrapper to bypass and call inner wrapper.
-        # Inner wrapper should capture.
-        action = self._run_and_monitor_call(
-            full_wrapper, input_1, CUDAGraphMode.PIECEWISE, desc_1
-        )
-        assert action == "capture_global"
-        assert outer_model.forward.call_count == 1
-        assert inner_model.forward.call_count == 1
-
-        # Run again with PIECEWISE.
-        # Outer bypasses, inner replays.
-        action = self._run_and_monitor_call(
-            full_wrapper, input_1, CUDAGraphMode.PIECEWISE, desc_1
-        )
-        assert action == "bypass"
-        assert outer_model.forward.call_count == 2
-        assert inner_model.forward.call_count == 1
+    desc_1 = BatchDescriptor(num_tokens=1)
+
+    # 0. global warmup
+    with set_forward_context(
+        attn_metadata=None,
+        vllm_config=vllm_config,
+        cudagraph_runtime_mode=CUDAGraphMode.NONE,
+        batch_descriptor=None,
+    ):
+        full_wrapper(input_1)
+
+    # --- Test runtime mode FULL---
+    # Run with FULL mode context. Expect outer wrapper to capture.
+    # The inner mock should be called once inside the graph capture.
+    outer_model.forward.reset_mock()
+    inner_model.forward.reset_mock()
+    action = _run_and_monitor_call(
+        full_wrapper, input_1, CUDAGraphMode.FULL, desc_1, vllm_config
+    )
+    assert action == "capture_global"
+    assert outer_model.forward.call_count == 1
+    assert inner_model.forward.call_count == 1
+
+    # Run again. Expect outer wrapper to replay.
+    # The outer model should NOT be called because the whole graph
+    # is replayed.
+    action = _run_and_monitor_call(
+        full_wrapper, input_1, CUDAGraphMode.FULL, desc_1, vllm_config
+    )
+    assert action == "replay"
+    assert outer_model.forward.call_count == 1  # No new call
+    assert inner_model.forward.call_count == 1
+
+    # --- Test runtime mode PIECEWISE ---
+    outer_model.forward.reset_mock()
+    inner_model.forward.reset_mock()
+    # Run with PIECEWISE mode context.
+    # Expect outer wrapper to bypass and call inner wrapper.
+    # Inner wrapper should capture.
+    action = _run_and_monitor_call(
+        full_wrapper, input_1, CUDAGraphMode.PIECEWISE, desc_1, vllm_config
+    )
+    assert action == "capture_global"
+    assert outer_model.forward.call_count == 1
+    assert inner_model.forward.call_count == 1
+
+    # Run again with PIECEWISE.
+    # Outer bypasses, inner replays.
+    action = _run_and_monitor_call(
+        full_wrapper, input_1, CUDAGraphMode.PIECEWISE, desc_1, vllm_config
+    )
+    assert action == "bypass"
+    assert outer_model.forward.call_count == 2
+    assert inner_model.forward.call_count == 1
diff --git a/tests/v1/determinism/test_rms_norm_batch_invariant.py b/tests/v1/determinism/test_rms_norm_batch_invariant.py
index 5c036c1b3802..2e9f77881273 100644
--- a/tests/v1/determinism/test_rms_norm_batch_invariant.py
+++ b/tests/v1/determinism/test_rms_norm_batch_invariant.py
@@ -12,7 +12,7 @@
 from utils import skip_unsupported
 
 from vllm.model_executor.layers.batch_invariant import rms_norm as triton_rms_norm
-from vllm.model_executor.layers.layernorm import RMSNorm, fused_add_rms_norm
+from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
 
 DEVICE_TYPE = current_platform.device_type
@@ -105,6 +105,12 @@ def test_fused_add_rms_norm_batch_invariant_residual_path(
         dim=0,
     )
 
+    def fused_add_rms_norm(x, residual, w, e) -> tuple[torch.Tensor, torch.Tensor]:
+        import vllm._custom_ops as ops
+
+        ops.fused_add_rms_norm(x, residual, w, e)
+        return x, residual
+
     out_single, residual_out_single = fused_add_rms_norm(
         x_single.clone(),
         residual_single.clone(),
diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py
index f9bebec98619..bbef61477232 100644
--- a/tests/v1/determinism/utils.py
+++ b/tests/v1/determinism/utils.py
@@ -26,6 +26,7 @@
 BACKENDS: list[str] = [
     "FLASH_ATTN",
     "TRITON_ATTN",
+    "FLEX_ATTENTION",
 ]
 
 # FlashInfer temporarily disabled due to invariant CTA sizes.
diff --git a/tests/v1/distributed/test_async_llm_dp.py b/tests/v1/distributed/test_async_llm_dp.py
index 1b7739d2f071..70a5136a57ce 100644
--- a/tests/v1/distributed/test_async_llm_dp.py
+++ b/tests/v1/distributed/test_async_llm_dp.py
@@ -365,10 +365,13 @@ async def test_dp_pause_keep_race_staggered_engines():
         async def staggered_pause_keep(method: str, *args) -> Any:
             if method != "pause_scheduler" or not args or args[0] != "keep":
                 return await original_call_utility(method, *args)
-            # Send pause(keep) to engine 0 first
-            await client._call_utility_async(
-                method, *args, engine=client.core_engines[0]
+            # Fire pause(keep) to engine 0 (don't await — with DP
+            # two-phase pause, consensus requires all ranks).
+            pause_0 = asyncio.create_task(
+                client._call_utility_async(method, *args, engine=client.core_engines[0])
             )
+            # Let the event loop send the message to engine 0.
+            await asyncio.sleep(0.5)
             # In the middle: send two requests (race window)
             sp = SamplingParams(max_tokens=5, ignore_eos=True)
 
@@ -384,11 +387,13 @@ async def consume_gen(req_id: str) -> None:
             t2 = asyncio.create_task(consume_gen("race-2"))
             mid_pause_tasks.extend([t1, t2])
             await asyncio.sleep(3)
-            # Then send pause(keep) to engine 1
-            result = await client._call_utility_async(
-                method, *args, engine=client.core_engines[1]
+            # Fire pause(keep) to engine 1, then await both so
+            # consensus can be reached.
+            pause_1 = asyncio.create_task(
+                client._call_utility_async(method, *args, engine=client.core_engines[1])
             )
-            return result
+            results = await asyncio.gather(pause_0, pause_1)
+            return results[0]
 
         client.call_utility_async = staggered_pause_keep
 
@@ -398,3 +403,113 @@ async def consume_gen(req_id: str) -> None:
         assert not await engine.is_paused()
         # Let the two requests we sent mid-pause complete
         await asyncio.gather(*mid_pause_tasks)
+
+
+@pytest.mark.asyncio
+async def test_dp_pause_barrier_request_deadlock():
+    """
+    Test that start_dp_wave is ignored while paused.
+
+    Sequence:
+      1. Pause all engines (PAUSED_ALL).
+      2. Send barrier to engine 0 only — blocks in dist.barrier(dp_group).
+      3. Send a request routed to engine 1.
+      4. Wait for any (buggy) START_DP_WAVE propagation.
+      5. Send barrier to engine 1 — completes in fixed code, deadlocks
+         in buggy code because engine 1 is stuck in EP all-to-all.
+    """
+    if DP_SIZE != 2:
+        pytest.skip("requires DP_SIZE=2")
+
+    with ExitStack() as after:
+        engine_args = _get_dp_pause_engine_args(expert_parallel=True)
+        engine = AsyncLLM.from_engine_args(engine_args)
+        after.callback(engine.shutdown)
+
+        client = engine.engine_core
+
+        # Cache get_supported_tasks so that generate() won't need to
+        # send a utility call to all engines (which would hang once
+        # engine 0 is blocked in the barrier).
+        await engine.get_supported_tasks()
+
+        # Pause all engines normally — no staggering.
+        await engine.pause_generation(mode="keep")
+        assert await engine.is_paused()
+
+        original_call_utility = client.call_utility_async
+        mid_barrier_tasks: list[asyncio.Task] = []
+
+        async def staggered_barrier(method: str, *args) -> Any:
+            if method != "barrier":
+                return await original_call_utility(method, *args)
+
+            # Send barrier to engine 0 only — it blocks in
+            # dist.barrier(dp_group) waiting for engine 1.
+            barrier_0 = asyncio.create_task(
+                client._call_utility_async(method, *args, engine=client.core_engines[0])
+            )
+            await asyncio.sleep(1)
+
+            # While engine 0 is blocked, send a request routed
+            # specifically to engine 1.
+            sp = SamplingParams(max_tokens=5, ignore_eos=True)
+
+            engine_1 = client.core_engines[1]
+            original_get_engine = client.get_core_engine_for_request
+
+            def route_to_engine_1(req):
+                client.reqs_in_flight[req.request_id] = engine_1
+                return engine_1
+
+            client.get_core_engine_for_request = route_to_engine_1
+
+            async def consume_gen(req_id: str) -> None:
+                async for _ in engine.generate(
+                    request_id=req_id,
+                    prompt=DP_PAUSE_PROMPT,
+                    sampling_params=sp,
+                ):
+                    pass
+
+            t1 = asyncio.create_task(consume_gen("race-1"))
+            mid_barrier_tasks.append(t1)
+
+            # Yield so generate() preprocessing completes and
+            # add_request_async is called (which, in buggy code,
+            # would send FIRST_REQ and wake engine 1).
+            for _ in range(200):
+                await asyncio.sleep(0)
+
+            client.get_core_engine_for_request = original_get_engine
+
+            # Wait for any START_DP_WAVE to propagate and for
+            # engine 1 to potentially enter execute_dummy_batch.
+            await asyncio.sleep(5)
+
+            # Now send barrier to engine 1.  In buggy code engine 1
+            # is stuck in execute_dummy_batch (EP all-to-all) while
+            # engine 0 is stuck in dist.barrier(dp_group) — deadlock.
+            result = await client._call_utility_async(
+                method, *args, engine=client.core_engines[1]
+            )
+            await barrier_0
+            return result
+
+        client.call_utility_async = staggered_barrier
+
+        # Drive the staggered barrier.  Old code deadlocks here.
+        try:
+            await asyncio.wait_for(client.call_utility_async("barrier"), timeout=30)
+        except asyncio.TimeoutError:
+            for t in mid_barrier_tasks:
+                t.cancel()
+            pytest.fail(
+                "Staggered barrier deadlocked — FIRST_REQ sent while "
+                "paused caused collective-op mismatch between engines"
+            )
+
+        await engine.resume_generation()
+        assert not await engine.is_paused()
+        # Let the two requests we sent mid-barrier complete.
+        await asyncio.gather(*mid_barrier_tasks)
diff --git a/tests/v1/distributed/test_external_lb_dp.py b/tests/v1/distributed/test_external_lb_dp.py
index 912f8cffe7f6..06e8e574a05d 100644
--- a/tests/v1/distributed/test_external_lb_dp.py
+++ b/tests/v1/distributed/test_external_lb_dp.py
@@ -14,7 +14,7 @@
 from tests.utils import RemoteOpenAIServer
 from vllm.platforms import current_platform
 
-MODEL_NAME = "ibm-research/PowerMoE-3b"
+MODEL_NAME = os.getenv("MODEL_NAME", "ibm-research/PowerMoE-3b")
 
 # Number of data parallel ranks for external LB testing
 DP_SIZE = int(os.getenv("DP_SIZE", "2"))
@@ -111,11 +111,12 @@ def start_server(r: int, sargs: list[str]):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Stop all server instances."""
-        while self.servers:
-            try:
-                self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb)
-            except Exception as e:
-                print(f"Error stopping server: {e}")
+        servers = [s for s, _ in self.servers]
+        self.servers.clear()
+        try:
+            RemoteOpenAIServer.shutdown_many(servers)
+        except Exception as e:
+            print(f"Error stopping servers: {e}")
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/v1/distributed/test_hybrid_lb_dp.py b/tests/v1/distributed/test_hybrid_lb_dp.py
index aa25130752a4..fcd3c69af54a 100644
--- a/tests/v1/distributed/test_hybrid_lb_dp.py
+++ b/tests/v1/distributed/test_hybrid_lb_dp.py
@@ -134,11 +134,12 @@ def start_server(node: int, sargs: list[str]):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Stop all server instances."""
-        while self.servers:
-            try:
-                self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb)
-            except Exception as e:
-                print(f"Error stopping server: {e}")
+        servers = [s for s, _ in self.servers]
+        self.servers.clear()
+        try:
+            RemoteOpenAIServer.shutdown_many(servers)
+        except Exception as e:
+            print(f"Error stopping servers: {e}")
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/v1/distributed/test_internal_lb_dp.py b/tests/v1/distributed/test_internal_lb_dp.py
index efd9fc607dbb..31859ffba4c9 100644
--- a/tests/v1/distributed/test_internal_lb_dp.py
+++ b/tests/v1/distributed/test_internal_lb_dp.py
@@ -228,13 +228,13 @@ def start_server(sidx: int, r: int, sargs: list[str]):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Stop all server instances."""
-        while self.servers:
-            if server := self.servers.pop():
-                try:
-                    server[0].__exit__(exc_type, exc_val, exc_tb)
-                except Exception as e:
-                    print(f"Error stopping server: {e}")
-                    traceback.print_exc()
+        servers = [entry[0] for entry in self.servers if entry is not None]
+        self.servers.clear()
+        try:
+            RemoteOpenAIServer.shutdown_many(servers)
+        except Exception as e:
+            print(f"Error stopping servers: {e}")
+            traceback.print_exc()
 
 
 class APIOnlyServerManager:
@@ -370,13 +370,13 @@ def start_engines_server():
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Stop both server instances."""
-        while self.servers:
-            if server := self.servers.pop():
-                try:
-                    server[0].__exit__(exc_type, exc_val, exc_tb)
-                except Exception as e:
-                    print(f"Error stopping server: {e}")
-                    traceback.print_exc()
+        servers = [entry[0] for entry in self.servers if entry is not None]
+        self.servers.clear()
+        try:
+            RemoteOpenAIServer.shutdown_many(servers)
+        except Exception as e:
+            print(f"Error stopping servers: {e}")
+            traceback.print_exc()
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/v1/e2e/general/test_async_scheduling.py b/tests/v1/e2e/general/test_async_scheduling.py
index 8e1eddb0f64e..c3c4970de382 100644
--- a/tests/v1/e2e/general/test_async_scheduling.py
+++ b/tests/v1/e2e/general/test_async_scheduling.py
@@ -57,6 +57,8 @@ def test_without_spec_decoding(
         dict(bad_words=["the", " the"]),
         dict(logprobs=2),
         dict(logprobs=2, frequency_penalty=-1.0),
+        dict(prompt_logprobs=2),
+        dict(prompt_logprobs=2, logprobs=2),
         dict(structured_outputs=struct_outputs),
         dict(
             structured_outputs=struct_outputs,
@@ -126,6 +128,8 @@ def test_with_eagle3_spec_decoding(sample_json_schema, monkeypatch: pytest.Monke
         dict(bad_words=["the", " the"]),
         dict(logprobs=2),
         dict(logprobs=2, frequency_penalty=-1.0),
+        dict(prompt_logprobs=2),
+        dict(prompt_logprobs=2, logprobs=2),
         dict(structured_outputs=struct_outputs),
         dict(
             structured_outputs=struct_outputs,
@@ -324,10 +328,13 @@ def run_test(
 ):
     spec_decoding = spec_config is not None
     cache_arg: dict[str, Any] = (
-        # Force preemptions
-        dict(num_gpu_blocks_override=32)
+        # Force preemptions: with 32 blocks the cache holds at most a single
+        # max-length request, so the ~34 concurrent prompts contend and trigger
+        # preemption. (Prompts here are << max_model_len, so dropping
+        # max_model_len from 4096 to 512 doesn't change generation behavior.)
+        dict(num_gpu_blocks_override=32, max_model_len=512)
         if test_preemption
-        else dict(gpu_memory_utilization=0.9)
+        else dict(gpu_memory_utilization=0.9, max_model_len=4096)
     )
     spec_mml = (spec_config or {}).get("max_model_len")
     spec_method = (spec_config or {}).get("method", "none")
@@ -343,7 +350,6 @@ def run_test(
 
     with VllmRunner(
         model,
-        max_model_len=4096,
         enable_chunked_prefill=test_prefill_chunking,
         # Force prefill chunking
         max_num_batched_tokens=48 if test_prefill_chunking else None,
@@ -411,7 +417,12 @@ def _all_logprobs_match(req_a, req_b) -> bool:
     )
 
 
-def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> bool:
+def _logprobs_match(
+    lps_a: dict[int, Logprob] | None,
+    lps_b: dict[int, Logprob] | None,
+) -> bool:
+    if lps_a is None or lps_b is None:
+        return lps_a is lps_b
     rel_tol, abs_tol = 1e-3, 1e-6
     return (
         len(lps_a) == len(lps_b)
diff --git a/tests/v1/e2e/general/test_mamba_prefix_cache.py b/tests/v1/e2e/general/test_mamba_prefix_cache.py
index 8b9f7bb6c5ad..6ec9e7656e31 100644
--- a/tests/v1/e2e/general/test_mamba_prefix_cache.py
+++ b/tests/v1/e2e/general/test_mamba_prefix_cache.py
@@ -179,6 +179,7 @@ def fake_allocate_slots_fn(
         num_external_computed_tokens: int = 0,
         delay_cache_blocks: bool = False,
         num_encoder_tokens: int = 0,
+        full_sequence_must_fit: bool = False,
     ):
         ret = original_allocate_slots_fn(
             self,
@@ -190,6 +191,7 @@ def fake_allocate_slots_fn(
             num_external_computed_tokens,
             delay_cache_blocks,
             num_encoder_tokens,
+            full_sequence_must_fit,
         )
         if cur_step_action is not None:
             cur_block_ids = self.coordinator.single_type_managers[0].req_to_blocks[
diff --git a/tests/v1/e2e/spec_decode/test_async_spec_decode.py b/tests/v1/e2e/spec_decode/test_async_spec_decode.py
index c00dbd6d8197..b19f90e2cdc6 100644
--- a/tests/v1/e2e/spec_decode/test_async_spec_decode.py
+++ b/tests/v1/e2e/spec_decode/test_async_spec_decode.py
@@ -82,6 +82,13 @@ def assert_no_sync(self, msg: str = ""):
         2,
         id="eagle-mla-deepseek",
     ),
+    pytest.param(
+        "Qwen/Qwen3.5-0.8B-Base",
+        "Qwen/Qwen3.5-0.8B-Base",
+        "mtp",
+        1,
+        id="mtp-qwen3_5-hybrid",
+    ),
 ]
 
 
@@ -104,6 +111,14 @@ def test_no_sync_with_spec_decode(
     from vllm import LLM, SamplingParams
     from vllm.distributed import cleanup_dist_env_and_memory
 
+    # Qwen3.5 is a VLM; without this, profile_run runs the ViT warmup
+    # and peaks well above the 18GB MIG slice used by one of the CI lanes.
+    # This test only exercises text generation, so the vision tower is
+    # never needed.
+    extra_kwargs: dict = {}
+    if "Qwen3.5" in model:
+        extra_kwargs["limit_mm_per_prompt"] = {"image": 0, "video": 0}
+
     llm = LLM(
         model=model,
         max_model_len=256,
@@ -114,6 +129,7 @@ def test_no_sync_with_spec_decode(
         },
         enforce_eager=True,
         async_scheduling=True,
+        **extra_kwargs,
     )
 
     # Assert async scheduling is actually active before running inference.
diff --git a/tests/v1/e2e/spec_decode/test_spec_decode.py b/tests/v1/e2e/spec_decode/test_spec_decode.py
index 926cdd830bc8..2ab2245b790c 100644
--- a/tests/v1/e2e/spec_decode/test_spec_decode.py
+++ b/tests/v1/e2e/spec_decode/test_spec_decode.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
 import random
 from collections.abc import Iterable
 from dataclasses import dataclass
@@ -723,15 +724,25 @@ def test_eagle_correctness_heavy(
     [
         (("mtp", "XiaomiMiMo/MiMo-7B-Base", 1), False, 0.5),  # ref: 65%-70%
         (("mtp", "ZixiQi/DeepSeek-V3-4layers-MTP-FP8", 1), False, 0.0),  # dummy model
+        (
+            ("mtp", "Qwen/Qwen3.5-0.8B-Base", 1),
+            False,
+            0.20,
+        ),  # hybrid + MTP, ref: ~34%-35%
+        (
+            ("mtp", "google/gemma-4-E4B-it", 1, "google/gemma-4-E4B-it-assistant"),
+            False,
+            0.50,
+        ),  # gemma4 MTP with assistant model, ref: ~62%
     ],
-    ids=["mimo", "deepseek"],
+    ids=["mimo", "deepseek", "qwen3_5-hybrid", "gemma4-e4b"],
 )
 @single_gpu_only
 @large_gpu_mark(min_gb=20)
 def test_mtp_correctness(
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
-    model_setup: tuple[str, str, int],
+    model_setup: tuple[str, str, int] | tuple[str, str, int, str],
     mm_enabled: bool,
     expected_accuracy_threshold: float,
 ):
@@ -747,16 +758,45 @@ def test_mtp_correctness(
     with monkeypatch.context() as m:
         m.setenv("VLLM_MLA_DISABLE", "1")
 
-        method, model_name, tp_size = model_setup
+        if len(model_setup) == 4:
+            method, model_name, tp_size, draft_model = model_setup
+        else:
+            method, model_name, tp_size = model_setup
+            draft_model = None
         _skip_if_insufficient_gpus_for_tp(tp_size)
 
+        if "Qwen3.5" in model_name and os.environ.get("VLLM_USE_V2_MODEL_RUNNER"):
+            pytest.skip(
+                "Model Runner V2 does not yet support hybrid models "
+                "(Qwen3.5 mixes Mamba-style GDN with attention layers)."
+            )
+
         attn_backend = "TRITON_ATTN" if current_platform.is_rocm() else "auto"
+
+        # Skip multimodal profiling for models that don't need it in this test.
+        extra_kwargs: dict[str, Any] = {}
+        if "Qwen3.5" in model_name:
+            extra_kwargs["limit_mm_per_prompt"] = {"image": 0, "video": 0}
+        elif "gemma-4" in model_name:
+            extra_kwargs["limit_mm_per_prompt"] = {"image": 0, "audio": 0}
+
+        if draft_model is not None and "gemma-4" in draft_model:
+            import transformers
+            from packaging.version import Version
+
+            if Version(transformers.__version__) < Version("5.8.0"):
+                pytest.skip(
+                    "Gemma4 MTP assistant requires transformers>=5.8.0, "
+                    f"got {transformers.__version__}"
+                )
+
         ref_llm = LLM(
             model=model_name,
             max_model_len=2048,
             tensor_parallel_size=tp_size,
             trust_remote_code=True,
             attention_backend=attn_backend,
+            **extra_kwargs,
         )
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         evaluate_llm_for_gsm8k(
@@ -766,17 +806,23 @@ def test_mtp_correctness(
         torch.accelerator.empty_cache()
         cleanup_dist_env_and_memory()
 
+        speculative_config: dict[str, Any] = {
+            "method": method,
+            "num_speculative_tokens": 1,
+            "max_model_len": 2048,
+        }
+        if draft_model is not None:
+            speculative_config["model"] = draft_model
+            speculative_config["num_speculative_tokens"] = 2
+
         spec_llm = LLM(
             model=model_name,
             trust_remote_code=True,
             tensor_parallel_size=tp_size,
-            speculative_config={
-                "method": method,
-                "num_speculative_tokens": 1,
-                "max_model_len": 2048,
-            },
+            speculative_config=speculative_config,
             max_model_len=2048,
             attention_backend=attn_backend,
+            **extra_kwargs,
         )
         # MTP supports async scheduling; assert it is active by default.
         assert spec_llm.llm_engine.vllm_config.scheduler_config.async_scheduling
diff --git a/tests/v1/ec_connector/integration/README.md b/tests/v1/ec_connector/integration/README.md
index 2dbcb307fda3..2e122680ca40 100644
--- a/tests/v1/ec_connector/integration/README.md
+++ b/tests/v1/ec_connector/integration/README.md
@@ -122,7 +122,7 @@ Quick sanity check:
 - Encoder cache should enable exact output reproduction
 - Test cleans up all instances and cache files after completion
 - Safe to run multiple times (idempotent)
-- We setup the PD disagg part with NixlConnector. Please read details about EPD in `examples/online_serving/disaggregated_encoder/README.md`
+- We setup the PD disagg part with NixlConnector. Please read details about EPD in `examples/disaggregated/disaggregated_encoder/README.md`
 
 ## Requirements
 
diff --git a/tests/v1/ec_connector/integration/run_epd_correctness_test.sh b/tests/v1/ec_connector/integration/run_epd_correctness_test.sh
index e199a3ecea43..65716444a57c 100644
--- a/tests/v1/ec_connector/integration/run_epd_correctness_test.sh
+++ b/tests/v1/ec_connector/integration/run_epd_correctness_test.sh
@@ -185,7 +185,7 @@ run_epd_1e_1pd() {
 
     # Start proxy
     echo "Starting EPD proxy on port $PROXY_PORT"
-    python "${GIT_ROOT}/examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py" \
+    python "${GIT_ROOT}/examples/disaggregated/disaggregated_encoder/disagg_epd_proxy.py" \
         --host "0.0.0.0" \
         --port "$PROXY_PORT" \
         --encode-servers-urls "http://localhost:$ENCODE_PORT" \
@@ -411,7 +411,7 @@ run_epd_1e_1p_1d() {
     
     # Start proxy
     echo "Starting EPD proxy on port $PROXY_PORT"
-    python "${GIT_ROOT}/examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py" \
+    python "${GIT_ROOT}/examples/disaggregated/disaggregated_encoder/disagg_epd_proxy.py" \
         --host "0.0.0.0" \
         --port "$PROXY_PORT" \
         --encode-servers-urls "http://localhost:$ENCODE_PORT" \
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 21a651c62ab3..92de5a7e9819 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -256,8 +256,10 @@ async def test_multi_abort(output_kind: RequestOutputKind):
                 )
             )
 
-        # Let requests start
-        await asyncio.sleep(0.5)
+        # Let requests start generating, use a longer sleep to ensure all
+        # requests have exited prefill and produced at least one
+        # decode token before we abort.
+        await asyncio.sleep(1.0)
 
         # Use multi-abort to abort multiple requests at once
         abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
@@ -369,9 +371,10 @@ async def test_mid_stream_cancellation(
         # Wait for all tasks to complete
         results = await asyncio.gather(*tasks)
 
-        # Verify all tasks were cancelled at the expected point
+        # Verify all tasks were cancelled at the expected point.
+        # Uses >= because the cancel check is `count >= cancel_after`.
         for num_generated_tokens, request_id in results:
-            assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
+            assert num_generated_tokens >= NUM_EXPECTED_TOKENS, (
                 f"{request_id} generated {num_generated_tokens} tokens but "
                 f"expected to cancel after {NUM_EXPECTED_TOKENS}"
             )
diff --git a/tests/v1/kv_connector/unit/offloading_connector/test_scheduler.py b/tests/v1/kv_connector/unit/offloading_connector/test_scheduler.py
index 8d2c45f7bd20..22eb1a86d4fb 100644
--- a/tests/v1/kv_connector/unit/offloading_connector/test_scheduler.py
+++ b/tests/v1/kv_connector/unit/offloading_connector/test_scheduler.py
@@ -4,6 +4,7 @@
 from unittest.mock import MagicMock
 
 import pytest
+import torch
 
 from tests.v1.kv_connector.unit.offloading_connector.utils import (
     generate_store_output,
@@ -15,7 +16,12 @@
     OffloadingConnectorScheduler,
 )
 from vllm.v1.core.kv_cache_utils import BlockHash
-from vllm.v1.kv_offload.abstract import (
+from vllm.v1.kv_cache_interface import (
+    FullAttentionSpec,
+    KVCacheGroupSpec,
+    SlidingWindowSpec,
+)
+from vllm.v1.kv_offload.base import (
     OffloadingEvent,
     OffloadingManager,
     ReqContext,
@@ -26,32 +32,32 @@
 
 @pytest.mark.parametrize("async_scheduling", [True, False])
 def test_offloading_connector(request_runner, async_scheduling: bool):
-    offloaded_block_size = 12
-    gpu_block_size = 4
+    block_size = 4
+    block_size_factor = 3
+    offloaded_block_size = block_size * block_size_factor
     num_gpu_blocks = 100
-    block_size_factor = offloaded_block_size // gpu_block_size
 
     runner = request_runner(
-        offloaded_block_size=offloaded_block_size,
-        gpu_block_size=gpu_block_size,
+        block_size=block_size,
         num_gpu_blocks=num_gpu_blocks,
         async_scheduling=async_scheduling,
+        block_size_factor=block_size_factor,
     )
 
     # 3 blocks, store just the middle block (skip first and last)
     # blocks = [0, 1, 2], [3, 4, 5], [6, 7, 8]
     runner.new_request(token_ids=[0] * offloaded_block_size * 3)
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output(list(keys)[1:2])
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(list(keys)[1:2])
     )
     runner.run(decoded_tokens=[0])
 
     # add block missing 1 token -> no offload
     runner.run(
         decoded_tokens=[0] * (offloaded_block_size - 1),
-        expected_stored_gpu_block_indexes=(3, 4, 5),
+        expected_stored=(3, 4, 5),
     )
-    runner.manager.prepare_store.assert_not_called()
+    runner.manager.touch.assert_not_called()
 
     # +1 token -> single block, fail prepare_store
     runner.manager.prepare_store.side_effect = lambda keys, req_context: None
@@ -60,19 +66,19 @@ def test_offloading_connector(request_runner, async_scheduling: bool):
 
     # 1 more block (+ token for async scheduling)
     # now set block_hashes_to_store = []
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output([])
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output([])
     )
     runner.run(decoded_tokens=[0] * (offloaded_block_size + 1))
 
     # 1 more block (+ token for kicking off offloading)
     # now check touch was called with all 6 blocks
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output(keys)
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
     )
     runner.run(
         decoded_tokens=[0] * (offloaded_block_size + 1),
-        expected_stored_gpu_block_indexes=(15, 16, 17),
+        expected_stored=(15, 16, 17),
     )
     runner.manager.touch.assert_called()
     block_hashes1 = list(runner.manager.touch.call_args.args[0])
@@ -95,23 +101,23 @@ def test_offloading_connector(request_runner, async_scheduling: bool):
     # terminate request
     runner.run(
         decoded_tokens=[EOS_TOKEN_ID],
-        expected_stored_gpu_block_indexes=tuple(range(6 * block_size_factor)),
+        expected_stored=tuple(range(6 * block_size_factor)),
     )
 
     # full_block_tokens - num_computed_tokens < offloaded_block_size
     runner.new_request(
-        token_ids=[0] * gpu_block_size + [1] * (offloaded_block_size - gpu_block_size)
+        token_ids=[0] * block_size + [1] * (offloaded_block_size - block_size)
     )
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output([])
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output([])
     )
     runner.run(decoded_tokens=[EOS_TOKEN_ID])
     runner.manager.lookup.assert_not_called()
 
     # single block lookup with no hits
     runner.new_request(token_ids=[1] * offloaded_block_size)
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output([])
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output([])
     )
     runner.run(decoded_tokens=[EOS_TOKEN_ID])
     runner.manager.lookup.assert_called_once()
@@ -119,25 +125,21 @@ def test_offloading_connector(request_runner, async_scheduling: bool):
     # single block lookup with a hit
     runner.scheduler.reset_prefix_cache()
     runner.new_request(token_ids=[0] * offloaded_block_size)
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output([])
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output([])
     )
     runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
-    runner.run(
-        decoded_tokens=[EOS_TOKEN_ID], expected_loaded_gpu_block_indexes=(0, 1, 2)
-    )
+    runner.run(decoded_tokens=[EOS_TOKEN_ID], expected_loaded=(0, 1, 2))
 
     # single block lookup with a hit in a middle block
     runner.new_request(
         token_ids=[0] * offloaded_block_size * 2 + [1] * offloaded_block_size
     )
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output([])
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output([])
     )
     runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
-    runner.run(
-        decoded_tokens=[EOS_TOKEN_ID], expected_loaded_gpu_block_indexes=(3, 4, 5)
-    )
+    runner.run(decoded_tokens=[EOS_TOKEN_ID], expected_loaded=(3, 4, 5))
 
     # test take_events
     def to_hashes(int_hashes: list[int]) -> list[BlockHash]:
@@ -167,15 +169,16 @@ def take_events() -> Iterable[OffloadingEvent]:
 
 @pytest.mark.parametrize("async_scheduling", [True, False])
 def test_request_preemption(request_runner, async_scheduling: bool):
-    offloaded_block_size = 12
-    gpu_block_size = 4
+    block_size = 4
+    block_size_factor = 3
+    offloaded_block_size = block_size * block_size_factor
     num_gpu_blocks = 100
 
     runner = request_runner(
-        offloaded_block_size=offloaded_block_size,
-        gpu_block_size=gpu_block_size,
+        block_size=block_size,
         num_gpu_blocks=num_gpu_blocks,
         async_scheduling=async_scheduling,
+        block_size_factor=block_size_factor,
     )
 
     free_block_queue = runner.scheduler.kv_cache_manager.block_pool.free_block_queue
@@ -184,8 +187,8 @@ def test_request_preemption(request_runner, async_scheduling: bool):
     # 2 blocks, store all, without flushing
     # blocks = [0, 1, 2], [3, 4, 5]
     runner.new_request(token_ids=[0] * offloaded_block_size * 2)
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output(keys)
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
     )
     runner.run(
         decoded_tokens=[0],
@@ -193,11 +196,11 @@ def test_request_preemption(request_runner, async_scheduling: bool):
     )
 
     # decode 2 more blocks - 1 gpu block, storing [6, 7, 8] (no flush)
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output(keys)
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
     )
     runner.run(
-        decoded_tokens=[0] * (2 * offloaded_block_size - gpu_block_size),
+        decoded_tokens=[0] * (2 * offloaded_block_size - block_size),
         complete_transfers=False,
     )
 
@@ -208,8 +211,8 @@ def test_request_preemption(request_runner, async_scheduling: bool):
     runner.run(
         decoded_tokens=[],
         complete_transfers=False,
-        expected_flushed_gpu_block_indexes=(0, 1, 2, 3, 4, 5, 6, 7, 8),
-        expected_stored_gpu_block_indexes=(0, 1, 2, 3, 4, 5, 6, 7, 8),
+        expected_flushed=(0, 1, 2, 3, 4, 5, 6, 7, 8),
+        expected_stored=(0, 1, 2, 3, 4, 5, 6, 7, 8),
     )
 
     # restore KV cache space and reset GPU prefix cache
@@ -219,41 +222,45 @@ def test_request_preemption(request_runner, async_scheduling: bool):
     # request should now return from preemption
     # re-load [0, ..., 8] from the CPU and store [9, 10, 11]
     runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 3
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output(keys)
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
     )
     runner.run(
-        decoded_tokens=[0] * gpu_block_size,
-        expected_loaded_gpu_block_indexes=(0, 1, 2, 3, 4, 5, 6, 7, 8),
+        decoded_tokens=[0] * block_size,
+        expected_loaded=(0, 1, 2, 3, 4, 5, 6, 7, 8),
     )
 
     runner.run(
         decoded_tokens=[EOS_TOKEN_ID],
-        expected_stored_gpu_block_indexes=(9, 10, 11),
+        expected_stored=(9, 10, 11),
     )
 
+    # All stores completed before request_finished -> fence index empty.
+    assert runner.connector_scheduler._block_id_to_pending_jobs == {}
+
 
 @pytest.mark.parametrize("async_scheduling", [True, False])
 def test_concurrent_lookups_of_the_same_prefix(request_runner, async_scheduling: bool):
-    offloaded_block_size = 12
-    gpu_block_size = 4
+    block_size = 4
+    block_size_factor = 3
+    offloaded_block_size = block_size * block_size_factor
     num_gpu_blocks = 100
 
     runner = request_runner(
-        offloaded_block_size=offloaded_block_size,
-        gpu_block_size=gpu_block_size,
+        block_size=block_size,
         num_gpu_blocks=num_gpu_blocks,
         async_scheduling=async_scheduling,
+        block_size_factor=block_size_factor,
     )
 
     # store 1 blocks
     runner.new_request(token_ids=[0] * offloaded_block_size)
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output(keys)
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
     )
     runner.run(
         decoded_tokens=[EOS_TOKEN_ID],
-        expected_stored_gpu_block_indexes=(0, 1, 2),
+        expected_stored=(0, 1, 2),
     )
 
     # start a request to load the first block, but don't complete
@@ -281,39 +288,43 @@ def test_concurrent_lookups_of_the_same_prefix(request_runner, async_scheduling:
     assert transfer_jobs == list(runner.offloading_spec.handler.transfer_specs)
 
     # complete transfers
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output([])
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output([])
     )
     runner.run(
         decoded_tokens=[EOS_TOKEN_ID],
-        expected_loaded_gpu_block_indexes=(0, 1, 2),
+        expected_loaded=(0, 1, 2),
     )
 
     # second request will use the GPU prefix cache
     assert transfer_jobs == list(runner.offloading_spec.handler.transfer_specs)
 
+    # Fence index drained: stores completed before request_finished ran.
+    assert runner.connector_scheduler._block_id_to_pending_jobs == {}
+
 
 @pytest.mark.parametrize("async_scheduling", [True, False])
 def test_abort_loading_requests(request_runner, async_scheduling: bool):
-    offloaded_block_size = 12
-    gpu_block_size = 4
+    block_size = 4
+    block_size_factor = 3
+    offloaded_block_size = block_size * block_size_factor
     num_gpu_blocks = 100
 
     runner = request_runner(
-        offloaded_block_size=offloaded_block_size,
-        gpu_block_size=gpu_block_size,
+        block_size=block_size,
         num_gpu_blocks=num_gpu_blocks,
         async_scheduling=async_scheduling,
+        block_size_factor=block_size_factor,
     )
 
     # store 1 blocks
     runner.new_request(token_ids=[0] * offloaded_block_size)
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output(keys)
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
     )
     runner.run(
         decoded_tokens=[EOS_TOKEN_ID],
-        expected_stored_gpu_block_indexes=(0, 1, 2),
+        expected_stored=(0, 1, 2),
     )
 
     # start a request to load the first block, but don't complete
@@ -339,13 +350,264 @@ def test_abort_loading_requests(request_runner, async_scheduling: bool):
     # complete loading request
     runner.run(
         decoded_tokens=[],
-        expected_loaded_gpu_block_indexes=(0, 1, 2),
+        expected_loaded=(0, 1, 2),
     )
 
     # assert request is deleted
     assert req_id not in runner.scheduler.requests
 
 
+@pytest.mark.parametrize("async_scheduling", [True, False])
+def test_two_groups_full_and_sliding_window(request_runner, async_scheduling: bool):
+    block_size = 4
+    num_gpu_blocks = 100
+    # sliding_window=8 -> 2 offloaded blocks (block_size_factor=1)
+    sliding_window = 8
+
+    kv_cache_groups = [
+        KVCacheGroupSpec(
+            ["layer0"],
+            FullAttentionSpec(
+                block_size=block_size,
+                num_kv_heads=1,
+                head_size=1,
+                dtype=torch.float32,
+            ),
+        ),
+        KVCacheGroupSpec(
+            ["layer1"],
+            SlidingWindowSpec(
+                block_size=block_size,
+                num_kv_heads=1,
+                head_size=1,
+                dtype=torch.float32,
+                sliding_window=sliding_window,
+            ),
+        ),
+    ]
+
+    runner = request_runner(
+        block_size=block_size,
+        num_gpu_blocks=num_gpu_blocks,
+        async_scheduling=async_scheduling,
+        kv_cache_groups=kv_cache_groups,
+    )
+
+    # Verify group configs: group 0 = full attention, group 1 = sliding window
+    kv_group_configs = runner.connector_scheduler.config.kv_group_configs
+    assert len(kv_group_configs) == 2
+    assert kv_group_configs[0].sliding_window_size_in_blocks is None
+    assert kv_group_configs[1].sliding_window_size_in_blocks == 2
+
+    # Blocks [0, 1, 2] miss
+    runner.new_request(token_ids=[0] * block_size * 3)
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
+    )
+    runner.run(decoded_tokens=[0])
+    # _touch called from get_num_new_matched_tokens (2 groups) and
+    # _get_reqs_to_store (2 groups) → 4 touch calls total.
+    touch_calls = runner.manager.touch.call_args_list
+    assert len(touch_calls) == 4
+    assert len(touch_calls[0].args[0]) == 3
+    assert len(touch_calls[1].args[0]) == 3
+    assert len(touch_calls[2].args[0]) == 3
+    assert len(touch_calls[3].args[0]) == 3
+
+    # store 3 more block
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
+    )
+    runner.run(
+        decoded_tokens=[0] * (block_size * 3 + 2),
+        expected_stored=(0, 1, 2, 3, 4, 5),
+    )
+
+    # touch called from _get_reqs_to_store * 3 blocks, once for each group
+    touch_calls = runner.manager.touch.call_args_list
+    assert len(touch_calls) == 6
+
+    runner.run(decoded_tokens=[EOS_TOKEN_ID])
+
+    runner.scheduler.reset_prefix_cache()
+
+    # full 3 blocks hit [0, 1, 2]
+    runner.new_request(token_ids=[0] * (block_size * 3 + 1))
+    runner.manager.lookup.return_value = True
+    runner.run(
+        decoded_tokens=[EOS_TOKEN_ID],
+        # Group 0 (full attn): prefix lookup hits 3 → loads blocks 0,1,2
+        # Group 1 (sliding window, window=2): only the last 2 blocks
+        #   are within the window → loads blocks 1,2
+        expected_loaded=((0, 0), (0, 1), (0, 2), (1, 1), (1, 2)),
+    )
+
+    # one touch in get_num_new_matched_tokens x 2 groups
+    touch_calls = runner.manager.touch.call_args_list
+    assert len(touch_calls) == 2
+    # full attention group touched all 3 blocks
+    assert len(touch_calls[0].args[0]) == 3
+    # sliding window group touched just the last 2 blocks
+    assert len(touch_calls[1].args[0]) == 2
+
+    # 3 blocks are hit on GPU [0, 1, 2]
+    # 1 block loaded [3,]
+    runner.new_request(token_ids=[0] * (block_size * 4 + 1))
+    runner.manager.lookup.return_value = True
+    runner.run(
+        decoded_tokens=[EOS_TOKEN_ID],
+        # Group 0 (full attn): prefix lookup hits 3 → loads blocks 0,1,2
+        # Group 1 (sliding window, window=2): only the last 2 blocks
+        #   are within the window → loads blocks 1,2
+        expected_loaded=((0, 3), (1, 3)),
+    )
+
+
+@pytest.mark.parametrize("async_scheduling", [True, False])
+def test_two_groups_different_block_sizes(request_runner, async_scheduling: bool):
+    hash_block_size = 4
+    num_gpu_blocks = 100
+
+    # Group 0: block_size=12 (offloaded_block_size=12)
+    # Group 1: block_size=16 (offloaded_block_size=16)
+    kv_cache_groups = [
+        KVCacheGroupSpec(
+            ["layer0"],
+            FullAttentionSpec(
+                block_size=hash_block_size * 3,
+                num_kv_heads=1,
+                head_size=1,
+                dtype=torch.float32,
+            ),
+        ),
+        KVCacheGroupSpec(
+            ["layer1"],
+            FullAttentionSpec(
+                block_size=hash_block_size * 4,
+                num_kv_heads=1,
+                head_size=1,
+                dtype=torch.float32,
+            ),
+        ),
+    ]
+
+    runner = request_runner(
+        block_size=hash_block_size,
+        num_gpu_blocks=num_gpu_blocks,
+        async_scheduling=async_scheduling,
+        kv_cache_groups=kv_cache_groups,
+    )
+
+    # Verify group configs
+    kv_group_configs = runner.connector_scheduler.config.kv_group_configs
+    assert len(kv_group_configs) == 2
+    assert kv_group_configs[0].gpu_block_size == 12
+    assert kv_group_configs[0].offloaded_block_size == 12
+    assert kv_group_configs[1].gpu_block_size == 16
+    assert kv_group_configs[1].offloaded_block_size == 16
+
+    # Prompt: 25 tokens, unaligned to both block sizes.
+    # Group 0 blocks: [0, 1], ending_token_offset = 24
+    # Group 1 blocks: [0,], ending_token_offset = 16
+    runner.new_request(token_ids=[0] * 25)
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
+    )
+    runner.run(decoded_tokens=[0])
+    # _touch called from get_num_new_matched_tokens (2 groups) and
+    # _get_reqs_to_store (2 groups) → 4 touch calls total.
+    # Group 0 has 2 offload keys, group 1 has 1.
+    touch_calls = runner.manager.touch.call_args_list
+    assert len(touch_calls) == 4
+    assert len(touch_calls[0].args[0]) == 2
+    assert len(touch_calls[1].args[0]) == 1
+    assert len(touch_calls[2].args[0]) == 2
+    assert len(touch_calls[3].args[0]) == 1
+
+    # Get to 31 tokens
+    # No further blocks offloaded
+    runner.run(decoded_tokens=[0] * 6, expected_stored=((0, 0), (0, 1), (1, 0)))
+
+    # Get to 32 tokens
+    # Group 0 blocks: [0, 1], ending_token_offset = 24
+    # Group 1 blocks: [0, 1], ending_token_offset = 32
+    runner.run(decoded_tokens=[0])
+    # _get_reqs_to_store touch: only group 1 has a new block to store
+    touch_calls = runner.manager.touch.call_args_list
+    assert len(touch_calls) == 2
+    assert len(touch_calls[0].args[0]) == 2
+    assert len(touch_calls[1].args[0]) == 2
+
+    # Get to 35 tokens
+    # No further blocks offloaded
+    runner.run(decoded_tokens=[0] * 3, expected_stored=((1, 1),))
+
+    # Get to 36 tokens
+    # Group 0 blocks: [0, 1, 2], ending_token_offset = 36
+    # Group 1 blocks: [0, 1], ending_token_offset = 32
+    runner.run(decoded_tokens=[0])
+    # _get_reqs_to_store touch: only group 0 has a new block to store
+    touch_calls = runner.manager.touch.call_args_list
+    assert len(touch_calls) == 2
+    assert len(touch_calls[0].args[0]) == 3
+    assert len(touch_calls[1].args[0]) == 2
+
+    # Get to 47 tokens
+    # No further blocks offloaded
+    runner.run(decoded_tokens=[0] * 11, expected_stored=((0, 2),))
+
+    # Get to 48 tokens
+    # Group 0 blocks: [0, 1, 2, 3], ending_token_offset = 4
+    # Group 1 blocks: [0, 1, 2], ending_token_offset = 48
+    runner.run(decoded_tokens=[0])
+    # _get_reqs_to_store touch: both groups have a new block, each with 1 key
+    touch_calls = runner.manager.touch.call_args_list
+    assert len(touch_calls) == 2
+    assert len(touch_calls[0].args[0]) == 4
+    assert len(touch_calls[1].args[0]) == 3
+
+    runner.run(decoded_tokens=[0], expected_stored=((0, 3), (1, 2)))
+
+    # Get to 96 tokens
+    runner.run(
+        decoded_tokens=[0] * 47 + [EOS_TOKEN_ID],
+        expected_stored=((0, 4), (0, 5), (0, 6), (0, 7), (1, 3), (1, 4), (1, 5)),
+    )
+
+    runner.scheduler.reset_prefix_cache()
+
+    # Request with 48 matching tokens
+    # will match 48 tokens (4 block) from the first group
+    # 48 tokens (3 block) from the second group
+    # Total 48 tokens can be loaded
+    runner.new_request(token_ids=[0] * 48)
+    runner.manager.lookup.return_value = True
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output([])
+    )
+    runner.run(
+        decoded_tokens=[0],
+        expected_loaded=((0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2)),
+    )
+    runner.run(decoded_tokens=[EOS_TOKEN_ID])
+
+    # Request with 48+37 matching tokens
+    # 48 tokens will be hit on GPU
+    # extra 32 tokens will be loaded
+    # extra tokens [0, 36] (blocks [4, 5, 6]) from the first group
+    # extra tokens [0, 32] (block [3, 4]) from the second group
+    runner.new_request(token_ids=[0] * (48 + 37))
+    runner.manager.lookup.return_value = True
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output([])
+    )
+    runner.run(
+        decoded_tokens=[0],
+        expected_loaded=((0, 4), (0, 5), (0, 6), (1, 3), (1, 4)),
+    )
+    runner.run(decoded_tokens=[EOS_TOKEN_ID])
+
+
 # ---------------------------------------------------------------------------
 # Unit tests for _maximal_prefix_lookup / _sliding_window_lookup
 # ---------------------------------------------------------------------------
@@ -487,26 +749,24 @@ def test_do_remote_decode_stores_all_blocks(request_runner, async_scheduling: bo
 
     This supports P/D disaggregation where the prefill instance offloads the
     complete KV cache so a remote decode node can consume it."""
-    offloaded_block_size = 12
     gpu_block_size = 4
+    block_size_factor = 3
+    offloaded_block_size = gpu_block_size * block_size_factor
     num_gpu_blocks = 100
 
     runner = request_runner(
-        offloaded_block_size=offloaded_block_size,
-        gpu_block_size=gpu_block_size,
+        block_size_factor=block_size_factor,
+        block_size=gpu_block_size,
         num_gpu_blocks=num_gpu_blocks,
         async_scheduling=async_scheduling,
     )
 
     # Store 1 offloaded block (3 GPU blocks) via a normal request.
     runner.new_request(token_ids=[0] * offloaded_block_size)
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output(keys)
-    )
-    runner.run(
-        decoded_tokens=[EOS_TOKEN_ID],
-        expected_stored_gpu_block_indexes=(0, 1, 2),
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
     )
+    runner.run(decoded_tokens=[EOS_TOKEN_ID], expected_stored=(0, 1, 2))
 
     # Reset GPU prefix cache so the next request must load from CPU.
     runner.scheduler.reset_prefix_cache()
@@ -518,19 +778,144 @@ def test_do_remote_decode_stores_all_blocks(request_runner, async_scheduling: bo
         kv_transfer_params={"do_remote_decode": True},
     )
     runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
-    runner.manager.prepare_store.side_effect = (
-        lambda keys, req_context: generate_store_output(keys)
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
     )
 
     # Load the first offloaded block from CPU.
-    runner.run(
-        decoded_tokens=[0],
-        expected_loaded_gpu_block_indexes=(0, 1, 2),
-    )
+    runner.run(decoded_tokens=[0], expected_loaded=(0, 1, 2))
 
     # Store must include ALL 6 GPU blocks (both the loaded prefix and
     # the newly computed block), not just the 3 new ones.
+    runner.run(decoded_tokens=[EOS_TOKEN_ID], expected_stored=(0, 1, 2, 3, 4, 5))
+
+    # All stores completed before request_finished -> fence index empty.
+    assert runner.connector_scheduler._block_id_to_pending_jobs == {}
+
+
+# ---------------------------------------------------------------------------
+# Tests for the per-job-store-completion design and fence invariants.
+# ---------------------------------------------------------------------------
+
+
+def test_loads_do_not_populate_fence_index(request_runner):
+    """Loads don't populate _block_id_to_pending_jobs (protected by
+    delay_free_blocks while in flight)."""
+    runner = request_runner(
+        block_size_factor=3,
+        block_size=4,
+        num_gpu_blocks=100,
+        async_scheduling=False,
+    )
+    runner.new_request(token_ids=[0] * 12)
+    runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
+    runner.run(decoded_tokens=[], complete_transfers=False)
+    assert runner.connector_scheduler._block_id_to_pending_jobs == {}
+
+
+def test_fence_at_update_state_after_alloc(request_runner):
+    """A load reusing a finished request's pending-store block triggers
+    a flush via update_state_after_alloc's fence.
+
+    num_gpu_blocks=2 forces the BlockPool to give req2 the same block
+    req1 just freed.
+    """
+    runner = request_runner(
+        block_size_factor=1,
+        block_size=4,
+        num_gpu_blocks=2,
+        async_scheduling=False,
+    )
+
+    runner.new_request(token_ids=[0] * 4)
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
+    )
+    runner.run(decoded_tokens=[EOS_TOKEN_ID], complete_transfers=False)
+    assert runner.connector_scheduler._block_id_to_pending_jobs
+
+    runner.scheduler.reset_prefix_cache()
+    runner.new_request(token_ids=[0] * 4)
+    runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output([])
+    )
+    runner.run(
+        decoded_tokens=[],
+        complete_transfers=False,
+        expected_stored=(0,),
+        expected_flushed=(0,),
+    )
+    assert runner.connector_scheduler._block_id_to_pending_jobs == {}
+
+
+def test_fence_at_build_store_jobs(request_runner):
+    """A new prefill (no load -> update_state_after_alloc returns early)
+    reusing a finished request's pending-store block is flushed by
+    _build_store_jobs's fence."""
+    runner = request_runner(
+        block_size_factor=1,
+        block_size=4,
+        num_gpu_blocks=2,
+        async_scheduling=False,
+    )
+
+    runner.new_request(token_ids=[0] * 4)
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
+    )
+    runner.run(decoded_tokens=[EOS_TOKEN_ID], complete_transfers=False)
+    assert runner.connector_scheduler._block_id_to_pending_jobs
+
+    runner.scheduler.reset_prefix_cache()
+    runner.new_request(token_ids=[1] * 4)
+    runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 0
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output([])
+    )
     runner.run(
         decoded_tokens=[EOS_TOKEN_ID],
-        expected_stored_gpu_block_indexes=(0, 1, 2, 3, 4, 5),
+        expected_stored=(0,),
+        expected_flushed=(0,),
     )
+    assert runner.connector_scheduler._block_id_to_pending_jobs == {}
+
+
+@pytest.mark.parametrize("async_scheduling", [True, False])
+def test_complete_store_called_per_job(request_runner, async_scheduling: bool):
+    """complete_store fires per-job, not deferred to request finish.
+    Each call carries only that store's keys."""
+    gpu_block_size = 4
+    block_size_factor = 3
+    offloaded_block_size = gpu_block_size * block_size_factor
+    runner = request_runner(
+        block_size_factor=block_size_factor,
+        block_size=gpu_block_size,
+        num_gpu_blocks=100,
+        async_scheduling=async_scheduling,
+    )
+    runner.new_request(token_ids=[0] * offloaded_block_size)
+    runner.manager.prepare_store.side_effect = lambda keys, req_context: (
+        generate_store_output(keys)
+    )
+
+    # First store: fires when block 0 is fully populated.
+    runner.run(decoded_tokens=[0, 0], expected_stored=(0, 1, 2))
+    assert runner.manager.complete_store.call_count == 1
+    first_call_keys = set(runner.manager.complete_store.call_args.args[0])
+    assert len(first_call_keys) == 1
+    runner.manager.complete_store.reset_mock()
+
+    # Second store: fires when block 1 is fully populated, with different keys.
+    runner.run(
+        decoded_tokens=[0] * (offloaded_block_size + 1),
+        expected_stored=(3, 4, 5),
+    )
+    assert runner.manager.complete_store.call_count == 1
+    second_call_keys = set(runner.manager.complete_store.call_args.args[0])
+    assert first_call_keys != second_call_keys
+    runner.manager.complete_store.reset_mock()
+
+    # Finish: no store pending -> no further call.
+    runner.run(decoded_tokens=[EOS_TOKEN_ID])
+    assert runner.manager.complete_store.call_count == 0
diff --git a/tests/v1/kv_connector/unit/offloading_connector/test_worker.py b/tests/v1/kv_connector/unit/offloading_connector/test_worker.py
index 3cfb32b3068f..b2fb0846d912 100644
--- a/tests/v1/kv_connector/unit/offloading_connector/test_worker.py
+++ b/tests/v1/kv_connector/unit/offloading_connector/test_worker.py
@@ -20,7 +20,7 @@
     MLAAttentionSpec,
     UniformTypeKVCacheSpecs,
 )
-from vllm.v1.kv_offload.spec import (
+from vllm.v1.kv_offload.base import (
     CanonicalKVCacheRef,
     CanonicalKVCaches,
     OffloadingSpec,
diff --git a/tests/v1/kv_connector/unit/offloading_connector/test_worker_metadata.py b/tests/v1/kv_connector/unit/offloading_connector/test_worker_metadata.py
new file mode 100644
index 000000000000..ab9d676cb4ae
--- /dev/null
+++ b/tests/v1/kv_connector/unit/offloading_connector/test_worker_metadata.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.distributed.kv_transfer.kv_connector.v1.offloading.common import (
+    OffloadingWorkerMetadata,
+)
+
+pytestmark = pytest.mark.cpu_test
+
+
+def test_aggregate_sums_counts():
+    meta1 = OffloadingWorkerMetadata(completed_jobs={42: 1, 7: 1})
+    meta2 = OffloadingWorkerMetadata(completed_jobs={42: 1, 7: 1})
+    result = meta1.aggregate(meta2)
+    assert result.completed_jobs == {42: 2, 7: 2}
+
+
+def test_aggregate_disjoint_jobs():
+    meta1 = OffloadingWorkerMetadata(completed_jobs={42: 1, 7: 1})
+    meta2 = OffloadingWorkerMetadata(completed_jobs={43: 1, 8: 1})
+    result = meta1.aggregate(meta2)
+    assert result.completed_jobs == {42: 1, 7: 1, 43: 1, 8: 1}
+
+
+def test_aggregate_multiple_workers():
+    meta1 = OffloadingWorkerMetadata(completed_jobs={42: 1, 43: 1, 7: 1})
+    meta2 = OffloadingWorkerMetadata(completed_jobs={42: 1, 7: 1, 8: 1})
+    meta3 = OffloadingWorkerMetadata(completed_jobs={42: 1, 43: 1, 8: 1})
+    result = meta1.aggregate(meta2).aggregate(meta3)
+    assert result.completed_jobs == {42: 3, 43: 2, 7: 2, 8: 2}
diff --git a/tests/v1/kv_connector/unit/offloading_connector/utils.py b/tests/v1/kv_connector/unit/offloading_connector/utils.py
index 60dc11f4ca4b..89f05f1053a1 100644
--- a/tests/v1/kv_connector/unit/offloading_connector/utils.py
+++ b/tests/v1/kv_connector/unit/offloading_connector/utils.py
@@ -1,10 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import copy
 from collections.abc import Iterable, Iterator
 from dataclasses import dataclass
 from typing import Any
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 
 import pytest
 import torch
@@ -19,6 +18,7 @@
 from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole
 from vllm.distributed.kv_transfer.kv_connector.v1.offloading.common import (
     OffloadingConnectorMetadata,
+    OffloadingWorkerMetadata,
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.offloading_connector import (
     OffloadingConnector,
@@ -37,21 +37,20 @@
     KVCacheConfig,
     KVCacheGroupSpec,
 )
-from vllm.v1.kv_offload.abstract import (
+from vllm.v1.kv_offload.base import (
+    GPULoadStoreSpec,
     LoadStoreSpec,
     OffloadingManager,
+    OffloadingSpec,
     OffloadKey,
     PrepareStoreOutput,
     make_offload_key,
 )
-from vllm.v1.kv_offload.mediums import GPULoadStoreSpec
-from vllm.v1.kv_offload.spec import OffloadingSpec
 from vllm.v1.kv_offload.worker.worker import (
     OffloadingHandler,
     TransferResult,
     TransferSpec,
 )
-from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput
 from vllm.v1.request import Request
 from vllm.v1.structured_output import StructuredOutputManager
 
@@ -151,46 +150,61 @@ def get_flushed_transfers(self):
         return specs
 
 
+@dataclass(frozen=True)
+class GPUBlock:
+    group_idx: int
+    request_block_offset: int
+
+
 @dataclass
 class TransferSummary:
-    gpu_block_indices: list[int]
+    gpu_blocks: list[GPUBlock]
     offload_addresses: list[Any]
 
 
 class RequestRunner:
     def __init__(
         self,
-        offloaded_block_size: int,
-        gpu_block_size: int,
+        block_size: int,
         num_gpu_blocks: int,
+        block_size_factor: int = 1,
         async_scheduling: bool = True,
+        kv_cache_groups: list[KVCacheGroupSpec] | None = None,
     ):
-        self.offloaded_block_size: int = offloaded_block_size
-        self.gpu_block_size: int = gpu_block_size
+        assert block_size_factor == 1 or kv_cache_groups is None, (
+            "block_size_factor > 1 requires all groups to have the same "
+            "block size, so kv_cache_groups must be None (use default group)"
+        )
+
+        self.block_size_factor: int = block_size_factor
+        self.block_size: int = block_size
         self.num_gpu_blocks: int = num_gpu_blocks
         self.async_scheduling: bool = async_scheduling
 
         self.req_id: int = -1
 
         vllm_config = create_vllm_config(
-            block_size=gpu_block_size, max_num_batched_tokens=1000
+            block_size=block_size,
+            max_num_batched_tokens=1000,
+            disable_hybrid_kv_cache_manager=False,
         )
         vllm_config.scheduler_config.async_scheduling = async_scheduling
+
+        extra_config: dict[str, Any] = {
+            "spec_name": "MockOffloadingSpec",
+            "spec_module_path": "tests.v1.kv_connector.unit.offloading_connector.utils",  # noqa: E501
+        }
+        if block_size_factor > 1:
+            extra_config["block_size"] = block_size * block_size_factor
+
         vllm_config.kv_transfer_config = KVTransferConfig(
             kv_connector="OffloadingConnector",
             kv_role="kv_both",
-            kv_connector_extra_config={
-                "spec_name": "MockOffloadingSpec",
-                "spec_module_path": "tests.v1.kv_connector.unit.offloading_connector.utils",  # noqa: E501
-                "block_size": offloaded_block_size,
-            },
+            kv_connector_extra_config=extra_config,
         )
 
-        block_size = vllm_config.cache_config.block_size
-        kv_cache_config = KVCacheConfig(
-            num_blocks=num_gpu_blocks,
-            kv_cache_tensors=[],
-            kv_cache_groups=[
+        if kv_cache_groups is None:
+            kv_cache_groups = [
                 KVCacheGroupSpec(
                     ["layer"],
                     FullAttentionSpec(
@@ -200,7 +214,12 @@ def __init__(
                         dtype=torch.float32,
                     ),
                 )
-            ],
+            ]
+
+        kv_cache_config = KVCacheConfig(
+            num_blocks=num_gpu_blocks,
+            kv_cache_tensors=[],
+            kv_cache_groups=kv_cache_groups,
         )
         vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
         self.num_kv_groups = len(kv_cache_config.kv_cache_groups)
@@ -220,11 +239,38 @@ def __init__(
 
         # register worker kv_caches to enable OffloadingWorker creations
         # set_current_vllm_config is needed for get_kv_cache_layout() to work
-        with set_current_vllm_config(vllm_config):
-            self.worker_connector.register_cross_layers_kv_cache(
-                kv_cache=torch.empty(0),
-                attn_backend=FlashAttentionBackend,
-            )
+        # Mock get_layers_from_vllm_config so that mock layer names
+        # resolve to layers whose get_attn_backend() returns
+        # FlashAttentionBackend.
+        def _mock_get_layers(_vllm_config, _layer_type, layer_names):
+            mock_layer = MagicMock()
+            mock_layer.get_attn_backend.return_value = FlashAttentionBackend
+            return {name: mock_layer for name in layer_names}
+
+        kv_caches: dict[str, torch.Tensor] = {}
+        for group in kv_cache_groups:
+            spec = group.kv_cache_spec
+            for layer_name in group.layer_names:
+                # Shape follows FlashAttention layout:
+                # (2, num_blocks, block_size, num_kv_heads, head_size)
+                kv_caches[layer_name] = torch.empty(
+                    2,
+                    num_gpu_blocks,
+                    spec.block_size,
+                    spec.num_kv_heads,
+                    spec.head_size,
+                    dtype=spec.dtype,
+                )
+
+        with (
+            set_current_vllm_config(vllm_config),
+            patch(
+                "vllm.distributed.kv_transfer.kv_connector.v1"
+                ".offloading.worker.get_layers_from_vllm_config",
+                side_effect=_mock_get_layers,
+            ),
+        ):
+            self.worker_connector.register_kv_caches(kv_caches)
 
         # extract connector of scheduler
         scheduler_connector = self.scheduler.connector
@@ -239,10 +285,17 @@ def __init__(
         assert isinstance(manager, MagicMock)
         self.manager: MagicMock = manager
 
-        assert len(self.connector_scheduler.config.kv_group_configs) == 1
-        kv_group_config = self.connector_scheduler.config.kv_group_configs[0]
-        assert kv_group_config.gpu_block_size == gpu_block_size
-        assert kv_group_config.offloaded_block_size == offloaded_block_size
+        num_kv_groups = len(kv_cache_config.kv_cache_groups)
+        assert len(self.connector_scheduler.config.kv_group_configs) == num_kv_groups
+        for group_config, kv_cache_group in zip(
+            self.connector_scheduler.config.kv_group_configs,
+            kv_cache_config.kv_cache_groups,
+        ):
+            gpu_block_size = kv_cache_group.kv_cache_spec.block_size
+            assert group_config.gpu_block_size == gpu_block_size
+            assert (
+                group_config.offloaded_block_size == gpu_block_size * block_size_factor
+            )
 
         # extract OffloadingSpec of worker_connector
         connector_worker = self.worker_connector.connector_worker
@@ -251,18 +304,18 @@ def __init__(
         assert isinstance(offloading_spec, MockOffloadingSpec)
         self.offloading_spec: MockOffloadingSpec = offloading_spec
 
-        # mapping (offloading address) -> gpu_block_index
-        self.offloaded: dict[Any, int] = {}
+        # mapping (offloading address) -> GPUBlock
+        self.offloaded: dict[Any, GPUBlock] = {}
 
         self.completed_loads: list[TransferSummary] = []
         self.completed_stores: list[TransferSummary] = []
-        self.flushed_gpu_block_indexes: set[int] = set()
+        self.flushed_gpu_blocks: set[GPUBlock] = set()
 
-        # maps {block_id: block_offset}
-        self.gpu_block_index: dict[int, int] = {}
+        # block_id -> GPUBlock
+        self.gpu_blocks: dict[int, GPUBlock] = {}
 
         init_none_hash(sha256)
-        self._block_hasher = get_request_block_hasher(gpu_block_size, sha256)
+        self._block_hasher = get_request_block_hasher(block_size, sha256)
 
         self._dummy_ctx: ForwardContext = ForwardContext(
             no_compile_layers={},
@@ -298,11 +351,9 @@ def _parse_transfers(self):
             assert isinstance(src_spec, GPULoadStoreSpec)
 
             for block_id in src_spec.block_ids:
-                self.flushed_gpu_block_indexes.add(
-                    self.gpu_block_index[block_id.item()]
-                )
+                self.flushed_gpu_blocks.add(self.gpu_blocks[block_id.item()])
 
-        block_size_factor = self.offloaded_block_size // self.gpu_block_size
+        block_size_factor = self.block_size_factor
 
         for transfer_spec in self.offloading_spec.get_completed_transfers():
             src_spec, dst_spec = transfer_spec
@@ -318,10 +369,11 @@ def _parse_transfers(self):
 
             assert isinstance(offload_spec, MockLoadStoreSpec)
             assert isinstance(gpu_spec, GPULoadStoreSpec)
+            assert len(gpu_spec.group_sizes) == self.num_kv_groups
 
-            gpu_block_indices: list[int] = []
+            gpu_blocks: list[GPUBlock] = []
             for block_id in gpu_spec.block_ids:
-                gpu_block_indices.append(self.gpu_block_index[block_id.item()])
+                gpu_blocks.append(self.gpu_blocks[block_id.item()])
 
             # list of (offload_key, sub_block_offset)
             offload_addresses: list[Any] = []
@@ -329,30 +381,49 @@ def _parse_transfers(self):
                 for sub_block_idx in range(block_size_factor):
                     offload_addresses.append((offload_key, sub_block_idx))
 
-            if store:
-                assert len(gpu_block_indices) == len(offload_addresses)
+            assert gpu_spec.block_indices is not None
+            assert len(gpu_spec.block_indices) == self.num_kv_groups
 
-                self.completed_stores.append(
-                    TransferSummary(gpu_block_indices, offload_addresses)
-                )
-            else:
-                remainder_sub_block_count = len(offload_addresses) - len(
-                    gpu_block_indices
+            gpu_block_offset = 0
+            offload_address_offset = 0
+            for group_size, logical_offset in zip(
+                gpu_spec.group_sizes, gpu_spec.block_indices
+            ):
+                gpu_block_end_offset = gpu_block_offset + group_size
+                assert gpu_block_end_offset <= len(gpu_blocks)
+
+                offload_addresses_to_skip = logical_offset % block_size_factor
+                offload_addresses_end_offset = (
+                    offload_address_offset + offload_addresses_to_skip + group_size
                 )
-                assert remainder_sub_block_count >= 0
-                assert remainder_sub_block_count < block_size_factor
-                offload_addresses = offload_addresses[remainder_sub_block_count:]
+                assert offload_addresses_end_offset <= len(offload_addresses)
 
-                self.completed_loads.append(
-                    TransferSummary(gpu_block_indices, offload_addresses)
+                offload_addresses = (
+                    offload_addresses[:offload_address_offset]
+                    + offload_addresses[
+                        offload_address_offset + offload_addresses_to_skip :
+                    ]
                 )
 
-    def _update_gpu_block_idx(self):
-        for blocks in self.scheduler.kv_cache_manager.coordinator.single_type_managers[
-            0
-        ].req_to_blocks.values():
-            for block_idx, block in enumerate(blocks):
-                self.gpu_block_index[block.block_id] = block_idx
+                gpu_block_offset += group_size
+                offload_address_offset += group_size
+
+            assert gpu_block_offset == len(gpu_blocks)
+            assert offload_address_offset == len(offload_addresses)
+
+            transfer_summary = TransferSummary(gpu_blocks, offload_addresses)
+            if store:
+                self.completed_stores.append(transfer_summary)
+            else:
+                self.completed_loads.append(transfer_summary)
+
+    def _update_gpu_blocks(self):
+        for group_idx, manager in enumerate(
+            self.scheduler.kv_cache_manager.coordinator.single_type_managers
+        ):
+            for blocks in manager.req_to_blocks.values():
+                for block_idx, block in enumerate(blocks):
+                    self.gpu_blocks[block.block_id] = GPUBlock(group_idx, block_idx)
 
     def _run(self, decoded_tokens: list[int], complete_transfers: bool):
         """
@@ -369,10 +440,15 @@ def _run(self, decoded_tokens: list[int], complete_transfers: bool):
         prev_scheduler_output = None
         prev_model_runner_output = None
         while True:
-            assert self.scheduler.requests
+            # Strict-always-False frees the request immediately on EOS, but
+            # the worker may still have a deferred store queued. In production
+            # the next request's step drains it; in single-request tests we
+            # must keep stepping until the scheduler sees no in-flight jobs.
+            if not self.scheduler.requests and not self.connector_scheduler._jobs:
+                break
 
             scheduler_output = self.scheduler.schedule()
-            self._update_gpu_block_idx()
+            self._update_gpu_blocks()
 
             kv_connector_metadata = scheduler_output.kv_connector_metadata
             assert kv_connector_metadata is not None
@@ -392,6 +468,10 @@ def _run(self, decoded_tokens: list[int], complete_transfers: bool):
             finished_sending, finished_recving = self.worker_connector.get_finished(
                 scheduler_output.finished_req_ids
             )
+            worker_meta = (
+                self.worker_connector.build_connector_worker_meta()
+                or OffloadingWorkerMetadata()
+            )
 
             self.worker_connector.clear_connector_metadata()
 
@@ -400,6 +480,7 @@ def _run(self, decoded_tokens: list[int], complete_transfers: bool):
                 finished_sending=finished_sending,
                 finished_recving=finished_recving,
                 token_id=token_id or 0,
+                kv_connector_worker_meta=worker_meta,
             )
 
             prev_token_id = token_id
@@ -420,7 +501,7 @@ def _run(self, decoded_tokens: list[int], complete_transfers: bool):
             if (
                 prev_token_id == EOS_TOKEN_ID
                 and prev_token_id != token_id
-                and self.scheduler.requests
+                and (self.scheduler.requests or self.connector_scheduler._jobs)
             ):
                 # continue for one more step to allow offloading to kick off
                 continue
@@ -435,33 +516,33 @@ def _run(self, decoded_tokens: list[int], complete_transfers: bool):
 
         self._parse_transfers()
 
-        # run one more step to update finished stored
         if EOS_TOKEN_ID in decoded_tokens:
             assert not self.scheduler.running
 
-            while self.scheduler.requests:
-                scheduler_output = self.scheduler.schedule()
-
-                finished_sending, finished_recving = self.worker_connector.get_finished(
-                    scheduler_output.finished_req_ids
-                )
-
-                assert not finished_recving
-
-                model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-                model_runner_output.kv_connector_output = KVConnectorOutput(
-                    finished_sending=finished_sending
+    def _to_gpu_blocks(
+        self, blocks: tuple[int | tuple[int, int], ...]
+    ) -> list[GPUBlock]:
+        gpu_blocks: list[GPUBlock] = []
+        for block in blocks:
+            if isinstance(block, int):
+                for group_idx in range(self.num_kv_groups):
+                    gpu_blocks.append(
+                        GPUBlock(group_idx=group_idx, request_block_offset=block)
+                    )
+            else:
+                group_idx, offset = block
+                gpu_blocks.append(
+                    GPUBlock(group_idx=group_idx, request_block_offset=offset)
                 )
-
-                self.scheduler.update_from_output(scheduler_output, model_runner_output)
+        return gpu_blocks
 
     def run(
         self,
         decoded_tokens: list[int],
         complete_transfers: bool = True,
-        expected_stored_gpu_block_indexes: tuple[int, ...] = (),
-        expected_loaded_gpu_block_indexes: tuple[int, ...] = (),
-        expected_flushed_gpu_block_indexes: tuple[int, ...] = (),
+        expected_stored: tuple[int | tuple[int, int], ...] = (),
+        expected_loaded: tuple[int | tuple[int, int], ...] = (),
+        expected_flushed: tuple[int | tuple[int, int], ...] = (),
     ):
         """
         Runs multiple engine (scheduler + worker) steps.
@@ -470,41 +551,49 @@ def run(
         Args:
             decoded_tokens: the tokens to yield at each step.
             complete_transfers: complete transfers immediately
-            expected_stored_gpu_block_indexes: GPU block indexes
+            expected_stored: GPU blocks
                 that are expected to be written during the run.
-            expected_loaded_gpu_block_indexes: GPU block indexes
+            expected_loaded: GPU blocks
                 that are expected to be loaded during the run.
-            expected_flushed_gpu_block_indexes: GPU block indexes
+            expected_flushed: GPU blocks
                 that are expected to be flushed during the run.
+
+            A GPU block is either a (group_idx: int, request_block_offset: int)
+            or just request_block_offset: int.
+            The latter case is a convenience for representing all groups.
         """
 
+        expected_stored_gpu_blocks = self._to_gpu_blocks(expected_stored)
+        expected_loaded_gpu_blocks = self._to_gpu_blocks(expected_loaded)
+        expected_flushed_gpu_blocks = self._to_gpu_blocks(expected_flushed)
+
         self.manager.reset_mock()
         self._run(decoded_tokens, complete_transfers)
 
-        loaded_gpu_block_indexes: set[int] = set()
+        loaded_gpu_blocks: set[GPUBlock] = set()
         for transfer in self.completed_loads:
-            for gpu_block_idx, offloaded_address in zip(
-                transfer.gpu_block_indices, transfer.offload_addresses
+            for gpu_block, offloaded_address in zip(
+                transfer.gpu_blocks, transfer.offload_addresses
             ):
-                loaded_gpu_block_indexes.add(gpu_block_idx)
-                assert gpu_block_idx == self.offloaded[offloaded_address]
+                loaded_gpu_blocks.add(gpu_block)
+                assert gpu_block == self.offloaded[offloaded_address]
 
-        assert set(expected_loaded_gpu_block_indexes) == loaded_gpu_block_indexes
+        assert set(expected_loaded_gpu_blocks) == loaded_gpu_blocks
         self.completed_loads.clear()
 
-        stored_gpu_block_indexes: set[int] = set()
+        stored_gpu_blocks: set[GPUBlock] = set()
         for transfer in self.completed_stores:
-            for gpu_block_idx, offloaded_address in zip(
-                transfer.gpu_block_indices, transfer.offload_addresses
+            for gpu_block, offloaded_address in zip(
+                transfer.gpu_blocks, transfer.offload_addresses
             ):
-                stored_gpu_block_indexes.add(gpu_block_idx)
-                self.offloaded[offloaded_address] = gpu_block_idx
+                stored_gpu_blocks.add(gpu_block)
+                self.offloaded[offloaded_address] = gpu_block
 
-        assert set(expected_stored_gpu_block_indexes) == stored_gpu_block_indexes
+        assert set(expected_stored_gpu_blocks) == stored_gpu_blocks
         self.completed_stores.clear()
 
-        assert set(expected_flushed_gpu_block_indexes) == self.flushed_gpu_block_indexes
-        self.flushed_gpu_block_indexes.clear()
+        assert set(expected_flushed_gpu_blocks) == self.flushed_gpu_blocks
+        self.flushed_gpu_blocks.clear()
 
 
 @pytest.fixture
@@ -512,13 +601,18 @@ def request_runner():
     runners = []
 
     def runner_factory(
-        offloaded_block_size, gpu_block_size, num_gpu_blocks, async_scheduling
+        block_size,
+        num_gpu_blocks,
+        async_scheduling,
+        block_size_factor=1,
+        kv_cache_groups=None,
     ):
         runner = RequestRunner(
-            offloaded_block_size=offloaded_block_size,
-            gpu_block_size=gpu_block_size,
+            block_size=block_size,
             num_gpu_blocks=num_gpu_blocks,
+            block_size_factor=block_size_factor,
             async_scheduling=async_scheduling,
+            kv_cache_groups=kv_cache_groups,
         )
         runners.append(runner)
         return runner
diff --git a/tests/v1/kv_connector/unit/test_bidirectional_kv_transfer.py b/tests/v1/kv_connector/unit/test_bidirectional_kv_transfer.py
new file mode 100644
index 000000000000..dc76d61178d8
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_bidirectional_kv_transfer.py
@@ -0,0 +1,915 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for bi-directional KV cache transfer between P and D nodes.
+
+Tests cover the new behaviors added by the bi-directional KV transfer PR:
+1. P-node scheduler lifecycle: P pulls KV from D using remote_block_ids,
+   eliminating redundant prefill computation in multi-turn conversations.
+2. P-node metadata: NixlConnectorMetadata correctly populates recv metadata
+   when P pulls KV from D (do_remote_decode=True + remote_block_ids).
+3. P-node worker: start_load_kv processes reqs_to_recv for KV pull from D.
+4. D-node request_finished: returns kv_transfer_params with remote_block_ids
+   and remote_num_tokens so P can pull KV in future turns.
+5. Edge cases:
+   - No double read after reschedule (_remote_blocks_processed flag)
+   - remote_num_tokens bounded by block capacity (num_computed_tokens)
+   - kv_recompute_threshold skips small transfers
+   - P-node holds blocks for D after finishing
+   - Cache MISS first turn falls back to local prefill
+   - Partial remote coverage: P pulls partial, computes the rest
+   - _remote_blocks_processed flag persists across reschedules
+
+P-node flags: do_remote_prefill=False (prefill locally),
+do_remote_decode=True (don't decode locally, send KV to D).
+P pulls KV from D when remote_block_ids is not None and
+external tokens > 0.
+"""
+
+import copy
+import time
+from unittest.mock import patch
+
+import pytest
+
+from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorRole
+from vllm.distributed.kv_transfer.kv_connector.v1.nixl.connector import (
+    NixlConnector,
+    NixlConnectorMetadata,
+)
+from vllm.forward_context import ForwardContext
+from vllm.v1.outputs import (
+    EMPTY_MODEL_RUNNER_OUTPUT,
+    KVConnectorOutput,
+)
+from vllm.v1.request import RequestStatus
+
+from .test_nixl_connector import FakeNixlConnectorWorker, FakeNixlWrapper
+from .utils import (
+    assert_scheduler_empty,
+    create_model_runner_output,
+    create_request,
+    create_scheduler,
+    create_vllm_config,
+    make_kv_cache_config,
+)
+
+pytestmark = pytest.mark.cpu_test
+
+# Common extra config for all bi-directional KV transfer tests.
+BIDIR_KV_EXTRA_CONFIG = {"bidirectional_kv_xfer": True, "kv_recompute_threshold": 0}
+
+
+# Helpers
+
+
+def _make_p_node_turn2_request(
+    request_id, block_size, num_tokens, num_remote_blocks=3, remote_num_tokens=None
+):
+    """Create a P-node Turn 2 request with remote_block_ids from D."""
+    request = create_request(
+        request_id=request_id,
+        block_size=block_size,
+        num_tokens=num_tokens,
+        do_remote_decode=True,
+    )
+    if remote_num_tokens is None:
+        remote_num_tokens = num_remote_blocks * block_size
+    request.kv_transfer_params["remote_block_ids"] = [list(range(num_remote_blocks))]
+    request.kv_transfer_params["remote_num_tokens"] = remote_num_tokens
+    request.kv_transfer_params["remote_engine_id"] = "decode-engine"
+    request.kv_transfer_params["remote_request_id"] = f"decode-{request_id}"
+    request.kv_transfer_params["remote_host"] = "decode-host"
+    request.kv_transfer_params["remote_port"] = 5678
+    return request
+
+
+def _make_connector_with_fake_worker(
+    hand_shake_latency=0, cycles_before_done=0, do_handshake=True
+):
+    """Create a NixlConnector with FakeNixlConnectorWorker."""
+    vllm_config = create_vllm_config()
+    kv_cache_config = make_kv_cache_config(block_size=16, num_blocks=2)
+    connector = NixlConnector(vllm_config, KVConnectorRole.WORKER, kv_cache_config)
+    connector.connector_worker = FakeNixlConnectorWorker(
+        vllm_config,
+        connector.engine_id,
+        hand_shake_latency=hand_shake_latency,
+        kv_cache_config=kv_cache_config,
+    )
+    worker = connector.connector_worker
+    assert isinstance(worker.nixl_wrapper, FakeNixlWrapper)
+    worker.nixl_wrapper.set_cycles_before_xfer_done(cycles_before_done)
+    worker.kv_cache_layout = "HND"
+    if do_handshake:
+        remote_agents = worker._nixl_handshake(
+            host="localhost",
+            port=1234,
+            remote_tp_size=1,
+            expected_engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+        )
+        worker._remote_agents[FakeNixlConnectorWorker.REMOTE_ENGINE_ID] = remote_agents
+    return connector, worker
+
+
+def _make_p_node_recv_metadata(request_id, local_blocks, remote_blocks):
+    """Build NixlConnectorMetadata for P-node pulling KV from D."""
+    meta = NixlConnectorMetadata()
+    meta.add_new_req_to_recv(
+        request_id=request_id,
+        local_block_ids=(local_blocks,),
+        kv_transfer_params={
+            "do_remote_prefill": False,
+            "do_remote_decode": True,
+            "remote_block_ids": (remote_blocks,),
+            "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+            "remote_request_id": f"decode-{request_id}",
+            "remote_host": "localhost",
+            "remote_port": 1234,
+            "remote_tp_size": 1,
+        },
+    )
+    return meta
+
+
+def _do_load_kv(connector, metadata):
+    """Bind metadata and call start_load_kv."""
+    connector.bind_connector_metadata(metadata)
+    ctx = ForwardContext(no_compile_layers={}, attn_metadata={}, slot_mapping={})
+    connector.start_load_kv(ctx)
+
+
+# 1. P-node scheduler lifecycle tests
+
+
+def test_multiturn_lifecycle():
+    """Full two-turn lifecycle on the P node:
+    Turn 1: P prefills locally (do_remote_prefill=False), sends KV to D
+    (do_remote_decode=True). Finishes LENGTH_CAPPED with remote_block_ids.
+    Turn 2: P receives remote_block_ids from D. P pulls KV from D because
+    remote_block_ids is not None and external tokens > 0. Computes only
+    new tokens, finishes LENGTH_CAPPED."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+
+    t1 = create_request(
+        request_id=100, block_size=BS, num_tokens=int(BS * 2.5), do_remote_decode=True
+    )
+    scheduler.add_request(t1)
+    t1_id = t1.request_id
+    so = scheduler.schedule()
+    mro = create_model_runner_output(reqs=[t1])
+    eco = scheduler.update_from_output(so, mro)
+    assert t1.status == RequestStatus.FINISHED_LENGTH_CAPPED
+    kv = eco[0].outputs[0].kv_transfer_params
+    assert kv and sum(len(g) for g in kv["remote_block_ids"]) > 0
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+
+    t2 = _make_p_node_turn2_request(200, BS, int(BS * 2.5))
+    scheduler.add_request(t2)
+    t2_id = t2.request_id
+    so = scheduler.schedule()
+    assert t2.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_recving={t2_id})
+    scheduler.update_from_output(so, mro)
+    so = scheduler.schedule()
+    mro = create_model_runner_output(reqs=[t2])
+    scheduler.update_from_output(so, mro)
+    assert t2.status == RequestStatus.FINISHED_LENGTH_CAPPED
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_sending={t1_id, t2_id})
+    scheduler.update_from_output(so, mro)
+    assert_scheduler_empty(scheduler)
+
+
+def test_first_turn_no_remote_blocks():
+    """First turn: P has no remote_block_ids from D yet.
+    Standard local prefill, returns kv_transfer_params for future turns."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = create_request(
+        request_id=3, block_size=BS, num_tokens=int(BS * 2.5), do_remote_decode=True
+    )
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    assert req.status != RequestStatus.WAITING_FOR_REMOTE_KVS
+    mro = create_model_runner_output(reqs=[req])
+    eco = scheduler.update_from_output(so, mro)
+    assert req.status == RequestStatus.FINISHED_LENGTH_CAPPED
+    assert eco[0].outputs[0].kv_transfer_params is not None
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_sending={req_id})
+    scheduler.update_from_output(so, mro)
+    assert_scheduler_empty(scheduler)
+
+
+def test_abort_p_side_during_send():
+    """P-side do_remote_decode=True: blocks held until finished_sending."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = create_request(
+        request_id=42, block_size=BS, num_tokens=int(BS * 2.5), do_remote_decode=True
+    )
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    mro = create_model_runner_output(reqs=[req])
+    scheduler.update_from_output(so, mro)
+    assert req_id in scheduler.requests
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    assert req_id in scheduler.requests
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_sending={req_id})
+    scheduler.update_from_output(so, mro)
+    assert_scheduler_empty(scheduler)
+
+
+def test_abort_p_side_non_length_capped():
+    """P-side abort with non-LENGTH_CAPPED → immediate block free."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = create_request(
+        request_id=44, block_size=BS, num_tokens=int(BS * 2.5), do_remote_decode=True
+    )
+    req.sampling_params.max_tokens = 100
+    req.max_tokens = 100
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    mro = create_model_runner_output(reqs=[req])
+    scheduler.update_from_output(so, mro)
+    scheduler.finish_requests([req_id], RequestStatus.FINISHED_ABORTED)
+    conn = scheduler.connector.connector_scheduler
+    assert req_id in conn._reqs_not_processed
+    assert req_id not in scheduler.requests
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    assert_scheduler_empty(scheduler)
+
+
+def test_remote_blocks_exceed_prompt_tokens():
+    """D provides more remote tokens than P's prompt needs.
+    P caps external tokens to prompt length."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    NUM_TOKENS = int(BS * 2.5)
+    req = _make_p_node_turn2_request(
+        300, BS, NUM_TOKENS, num_remote_blocks=5, remote_num_tokens=5 * BS
+    )
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    assert req.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+    assert req.num_computed_tokens == NUM_TOKENS
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_recving={req_id})
+    scheduler.update_from_output(so, mro)
+    so = scheduler.schedule()
+    mro = create_model_runner_output(reqs=[req])
+    scheduler.update_from_output(so, mro)
+    assert req.status == RequestStatus.FINISHED_LENGTH_CAPPED
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_sending={req_id})
+    scheduler.update_from_output(so, mro)
+    assert_scheduler_empty(scheduler)
+
+
+def test_p_node_pulls_partial_last_block_from_d():
+    """D sends remote_block_ids with partially filled last block.
+    remote_num_tokens < len(remote_block_ids) * block_size.
+    P pulls only remote_num_tokens worth of external tokens."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    num_remote_blocks = 3
+    remote_num_tokens = int(BS * 2.5)
+    assert remote_num_tokens < num_remote_blocks * BS
+    NUM_TOKENS = int(BS * 3.5)
+    req = _make_p_node_turn2_request(
+        400,
+        BS,
+        NUM_TOKENS,
+        num_remote_blocks=num_remote_blocks,
+        remote_num_tokens=remote_num_tokens,
+    )
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    assert req.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_recving={req_id})
+    scheduler.update_from_output(so, mro)
+    so = scheduler.schedule()
+    assert len(scheduler.running) == 1
+    mro = create_model_runner_output(reqs=[req])
+    scheduler.update_from_output(so, mro)
+    assert req.status == RequestStatus.FINISHED_LENGTH_CAPPED
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_sending={req_id})
+    scheduler.update_from_output(so, mro)
+    assert_scheduler_empty(scheduler)
+
+
+# 2. P-node metadata tests
+
+
+def test_add_new_req_to_recv_populates_remote_meta():
+    """add_new_req_to_recv correctly populates RemoteMeta for P-node
+    bi-directional KV pull from D."""
+    meta = NixlConnectorMetadata()
+    kv_params = {
+        "remote_block_ids": [[0, 1, 2]],
+        "remote_engine_id": "decode-engine",
+        "remote_request_id": "decode-req-123",
+        "remote_host": "decode-host",
+        "remote_port": 5678,
+    }
+    local_block_ids = ([10, 11, 12],)
+    meta.add_new_req_to_recv(
+        request_id="test-req",
+        local_block_ids=local_block_ids,
+        kv_transfer_params=kv_params,
+    )
+    assert "test-req" in meta.reqs_to_recv
+    rm = meta.reqs_to_recv["test-req"]
+    assert rm.remote is not None
+    assert rm.remote.block_ids == kv_params["remote_block_ids"]
+    assert rm.remote.engine_id == "decode-engine"
+    assert rm.remote.request_id == "decode-req-123"
+    assert rm.remote.host == "decode-host"
+    assert rm.remote.port == 5678
+    assert rm.local_block_ids == local_block_ids
+
+
+def test_build_connector_meta_recv_entries():
+    """P-node scheduler: do_remote_decode=True + remote_block_ids →
+    _reqs_need_recv populated, build_connector_meta produces reqs_to_recv."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = _make_p_node_turn2_request(1, BS, int(BS * 2.5))
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    assert req.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+    meta = so.kv_connector_metadata
+    assert isinstance(meta, NixlConnectorMetadata)
+    assert req_id in meta.reqs_to_recv
+    rm = meta.reqs_to_recv[req_id]
+    assert rm.remote is not None
+    assert rm.remote.engine_id == "decode-engine"
+
+
+def test_build_connector_meta_clears_reqs_need_recv():
+    """After build_connector_meta, _reqs_need_recv is cleared."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = _make_p_node_turn2_request(2, BS, int(BS * 2.5))
+    scheduler.add_request(req)
+    conn = scheduler.connector.connector_scheduler
+    scheduler.schedule()
+    assert len(conn._reqs_need_recv) == 0
+
+
+def test_build_connector_meta_multiple_requests():
+    """Multiple P-node requests all included in reqs_to_recv."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    reqs = [_make_p_node_turn2_request(10 + i, BS, int(BS * 2.5)) for i in range(3)]
+    for r in reqs:
+        scheduler.add_request(r)
+    so = scheduler.schedule()
+    meta = so.kv_connector_metadata
+    assert isinstance(meta, NixlConnectorMetadata)
+    assert len(meta.reqs_to_recv) == 3
+    for r in reqs:
+        assert r.request_id in meta.reqs_to_recv
+
+
+# 3. P-node worker tests (FakeNixlWrapper)
+
+
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl.worker.NixlWrapper",
+    FakeNixlWrapper,
+)
+def test_p_node_pull_kv_from_d(dist_init):
+    """P node pulls KV from D via start_load_kv with reqs_to_recv."""
+    connector, worker = _make_connector_with_fake_worker()
+    meta = _make_p_node_recv_metadata("req-p1", [10, 11, 12], [20, 21, 22])
+    _do_load_kv(connector, meta)
+    assert "req-p1" in worker._recving_metadata
+    _, done_recving = connector.get_finished(finished_req_ids=set())
+    assert "req-p1" in done_recving
+
+
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl.worker.NixlWrapper",
+    FakeNixlWrapper,
+)
+def test_p_node_pull_then_send_kv(dist_init):
+    """Full P-node bi-directional: pull KV from D → prefill →
+    send KV back to D via notification."""
+    connector, worker = _make_connector_with_fake_worker()
+    meta = _make_p_node_recv_metadata("req-p2", [10, 11], [20, 21])
+    _do_load_kv(connector, meta)
+    _, done_recving = connector.get_finished(finished_req_ids=set())
+    assert "req-p2" in done_recving
+    worker._reqs_to_send["req-p2"] = time.perf_counter() + 60
+    worker._reqs_to_process.add("req-p2")
+    notif = f"req-p2:{worker.world_size}".encode()
+    orig = worker.nixl_wrapper.get_new_notifs
+    worker.nixl_wrapper.get_new_notifs = lambda: {"agent": [notif]}
+    done_sending, _ = connector.get_finished(finished_req_ids=set())
+    assert "req-p2" in done_sending
+    worker.nixl_wrapper.get_new_notifs = orig
+
+
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl.worker.NixlWrapper",
+    FakeNixlWrapper,
+)
+def test_p_node_deferred_pull_on_no_handshake(dist_init):
+    """P defers KV pull when no prior handshake exists."""
+    connector, worker = _make_connector_with_fake_worker(
+        hand_shake_latency=0, do_handshake=False
+    )
+    meta = _make_p_node_recv_metadata("req-p3", [10, 11], [20, 21])
+    _do_load_kv(connector, meta)
+    assert "req-p3" in worker._recving_metadata
+    timeout = 3.0
+    start = time.perf_counter()
+    while time.perf_counter() - start < timeout:
+        connector.bind_connector_metadata(NixlConnectorMetadata())
+        ctx = ForwardContext(no_compile_layers={}, attn_metadata={}, slot_mapping={})
+        connector.start_load_kv(ctx)
+        _, done = connector.get_finished(finished_req_ids=set())
+        if "req-p3" in done:
+            return
+        time.sleep(0.2)
+    raise AssertionError("Transfer did not complete after async handshake")
+
+
+# 4. D-node request_finished returns kv_transfer_params (new behavior)
+
+
+def test_d_node_request_finished_returns_kv_params():
+    """D-node request_finished returns kv_transfer_params with
+    do_remote_decode=True, remote_block_ids, remote_num_tokens
+    for P to pull. These params go directly to P node."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = create_request(
+        request_id=1, block_size=BS, num_tokens=int(BS * 2.5), do_remote_prefill=True
+    )
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    scheduler.update_from_output(
+        so, create_model_runner_output(reqs=[], finished_recving={req_id})
+    )
+    so = scheduler.schedule()
+    eco = scheduler.update_from_output(
+        so, create_model_runner_output(reqs=[req], use_eos=True)
+    )
+    assert req.status == RequestStatus.FINISHED_STOPPED
+    kv = eco[0].outputs[0].kv_transfer_params
+    assert kv is not None
+    assert kv["do_remote_decode"] is True
+    assert kv["do_remote_prefill"] is False
+    assert "remote_block_ids" in kv
+    assert "remote_num_tokens" in kv
+    assert kv["remote_num_tokens"] > 0
+
+
+def test_d_node_request_finished_delays_block_free():
+    """D-node holds blocks (delay_free=True) until P reads them."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = create_request(
+        request_id=2, block_size=BS, num_tokens=int(BS * 2.5), do_remote_prefill=True
+    )
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    scheduler.update_from_output(
+        so, create_model_runner_output(reqs=[], finished_recving={req_id})
+    )
+    so = scheduler.schedule()
+    scheduler.update_from_output(
+        so, create_model_runner_output(reqs=[req], use_eos=True)
+    )
+    assert req_id in scheduler.requests
+    conn = scheduler.connector.connector_scheduler
+    assert req_id in conn._reqs_need_send
+
+
+def test_d_node_request_finished_remote_num_tokens():
+    """D-node kv_transfer_params includes correct remote_num_tokens."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = create_request(
+        request_id=3, block_size=BS, num_tokens=int(BS * 2.5), do_remote_prefill=True
+    )
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    scheduler.update_from_output(
+        so, create_model_runner_output(reqs=[], finished_recving={req_id})
+    )
+    so = scheduler.schedule()
+    eco = scheduler.update_from_output(
+        so, create_model_runner_output(reqs=[req], use_eos=True)
+    )
+    kv = eco[0].outputs[0].kv_transfer_params
+    assert kv["remote_num_tokens"] > 0
+    assert sum(len(g) for g in kv["remote_block_ids"]) > 0
+
+
+def test_d_node_partial_last_block_remote_num_tokens():
+    """D-node: remote_num_tokens < len(remote_block_ids) * block_size
+    when last block is partially filled."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = create_request(
+        request_id=5, block_size=BS, num_tokens=int(BS * 2.5), do_remote_prefill=True
+    )
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    scheduler.update_from_output(
+        so, create_model_runner_output(reqs=[], finished_recving={req_id})
+    )
+    so = scheduler.schedule()
+    eco = scheduler.update_from_output(
+        so, create_model_runner_output(reqs=[req], use_eos=True)
+    )
+    kv = eco[0].outputs[0].kv_transfer_params
+    total_blocks = sum(len(g) for g in kv["remote_block_ids"])
+    assert total_blocks == 3
+    assert kv["remote_num_tokens"] < total_blocks * BS
+    assert kv["remote_num_tokens"] > 0
+
+
+# 5. Edge case tests
+
+
+def test_no_double_read_blocks_after_reschedule():
+    """Edge case 1: update_state_after_alloc called twice for the same
+    bidirectional request (once on initial schedule, once after
+    WAITING_FOR_REMOTE_KVS → reschedule). The _remote_blocks_processed
+    flag must prevent the request from being added to _reqs_need_recv
+    twice, which would cause P to read D's blocks twice."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = _make_p_node_turn2_request(500, BS, int(BS * 2.5))
+    scheduler.add_request(req)
+    req_id = req.request_id
+    conn = scheduler.connector.connector_scheduler
+
+    # First schedule: request enters WAITING_FOR_REMOTE_KVS,
+    # _reqs_need_recv populated then cleared by build_connector_meta.
+    so = scheduler.schedule()
+    assert req.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+    meta = so.kv_connector_metadata
+    assert isinstance(meta, NixlConnectorMetadata)
+    assert req_id in meta.reqs_to_recv
+    # _reqs_need_recv should be cleared after build_connector_meta
+    assert len(conn._reqs_need_recv) == 0
+
+    # Simulate recv completion
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_recving={req_id})
+    scheduler.update_from_output(so, mro)
+
+    # Second schedule after recv: update_state_after_alloc called again.
+    # The _remote_blocks_processed flag should prevent re-entry.
+    so = scheduler.schedule()
+    meta2 = so.kv_connector_metadata
+    assert isinstance(meta2, NixlConnectorMetadata)
+    # Must NOT be in reqs_to_recv again
+    assert req_id not in meta2.reqs_to_recv
+
+    # Clean up
+    mro = create_model_runner_output(reqs=[req])
+    scheduler.update_from_output(so, mro)
+    assert req.status == RequestStatus.FINISHED_LENGTH_CAPPED
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_sending={req_id})
+    scheduler.update_from_output(so, mro)
+    assert_scheduler_empty(scheduler)
+
+
+def test_remote_num_tokens_bounded_by_blocks():
+    """Edge case 2: D-node request_finished must return
+    remote_num_tokens <= len(remote_block_ids) * block_size.
+    request.num_tokens includes the last sampled token which has no KV
+    in the cache, so remote_num_tokens must use num_computed_tokens."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = create_request(
+        request_id=501,
+        block_size=BS,
+        num_tokens=int(BS * 2.5),
+        do_remote_prefill=True,
+    )
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    scheduler.update_from_output(
+        so, create_model_runner_output(reqs=[], finished_recving={req_id})
+    )
+    so = scheduler.schedule()
+    eco = scheduler.update_from_output(
+        so, create_model_runner_output(reqs=[req], use_eos=True)
+    )
+    kv = eco[0].outputs[0].kv_transfer_params
+    assert kv is not None
+    total_blocks = sum(len(g) for g in kv["remote_block_ids"])
+    max_tokens_in_blocks = total_blocks * BS
+    assert kv["remote_num_tokens"] <= max_tokens_in_blocks, (
+        f"remote_num_tokens ({kv['remote_num_tokens']}) exceeds "
+        f"block capacity ({max_tokens_in_blocks})"
+    )
+    assert kv["remote_num_tokens"] > 0
+
+
+def test_kv_recompute_threshold_skips_small_transfer():
+    """Edge case 3: When remote tokens are below kv_recompute_threshold,
+    P should skip the remote pull and compute locally instead of
+    entering WAITING_FOR_REMOTE_KVS."""
+    threshold = 256
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config={
+            "bidirectional_kv_xfer": True,
+            "kv_recompute_threshold": threshold,
+        },
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+
+    # Create request where remote tokens (48) < threshold (256)
+    req = _make_p_node_turn2_request(
+        502,
+        BS,
+        int(BS * 2.5),
+        num_remote_blocks=3,
+        remote_num_tokens=3 * BS,
+    )
+    scheduler.add_request(req)
+    so = scheduler.schedule()
+    # Should NOT enter WAITING_FOR_REMOTE_KVS — threshold not met
+    assert req.status != RequestStatus.WAITING_FOR_REMOTE_KVS
+    assert req.status == RequestStatus.RUNNING
+
+    # Clean up
+    mro = create_model_runner_output(reqs=[req])
+    scheduler.update_from_output(so, mro)
+    assert req.status == RequestStatus.FINISHED_LENGTH_CAPPED
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_sending={req.request_id})
+    scheduler.update_from_output(so, mro)
+    assert_scheduler_empty(scheduler)
+
+
+def test_p_node_finished_holds_blocks_for_d():
+    """Edge case 4: P-node finishes with FINISHED_LENGTH_CAPPED and
+    do_remote_decode=True. P must hold blocks (delay_free=True) and
+    return kv_transfer_params with do_remote_prefill=True so D can
+    read P's blocks."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = create_request(
+        request_id=503,
+        block_size=BS,
+        num_tokens=int(BS * 2.5),
+        do_remote_decode=True,
+    )
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    mro = create_model_runner_output(reqs=[req])
+    eco = scheduler.update_from_output(so, mro)
+    assert req.status == RequestStatus.FINISHED_LENGTH_CAPPED
+    kv = eco[0].outputs[0].kv_transfer_params
+    assert kv is not None
+    # P-node finished: should tell D to pull (do_remote_prefill=True)
+    assert kv["do_remote_prefill"] is True
+    assert kv["do_remote_decode"] is False
+    assert "remote_block_ids" in kv
+    assert sum(len(g) for g in kv["remote_block_ids"]) > 0
+    # Blocks should be held (request still tracked)
+    assert req_id in scheduler.requests
+
+    # Clean up: simulate D reading and notifying
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_sending={req_id})
+    scheduler.update_from_output(so, mro)
+    assert_scheduler_empty(scheduler)
+
+
+def test_cache_miss_first_turn_no_remote_pull():
+    """Edge case 5: First turn with do_remote_decode=True but no
+    remote_block_ids (cache MISS). P should prefill locally with
+    num_external_tokens=0 and not enter WAITING_FOR_REMOTE_KVS."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = create_request(
+        request_id=504,
+        block_size=BS,
+        num_tokens=int(BS * 2.5),
+        do_remote_decode=True,
+    )
+    # No remote_block_ids set — this is a cache MISS
+    assert req.kv_transfer_params.get("remote_block_ids") is None
+    scheduler.add_request(req)
+    so = scheduler.schedule()
+    # Should NOT wait for remote KVs
+    assert req.status != RequestStatus.WAITING_FOR_REMOTE_KVS
+    assert req.status == RequestStatus.RUNNING
+
+    # Clean up
+    mro = create_model_runner_output(reqs=[req])
+    scheduler.update_from_output(so, mro)
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_sending={req.request_id})
+    scheduler.update_from_output(so, mro)
+    assert_scheduler_empty(scheduler)
+
+
+def test_partial_remote_tokens_less_than_prompt():
+    """Edge case 6: D's remote_num_tokens covers only part of P's
+    prompt. P should pull remote_num_tokens worth of external tokens
+    and compute the rest locally."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    NUM_TOKENS = int(BS * 4.5)  # 72 tokens
+    # D provides only 2 blocks (32 tokens) out of 72
+    req = _make_p_node_turn2_request(
+        505,
+        BS,
+        NUM_TOKENS,
+        num_remote_blocks=2,
+        remote_num_tokens=2 * BS,
+    )
+    scheduler.add_request(req)
+    req_id = req.request_id
+    so = scheduler.schedule()
+    assert req.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+    # num_computed_tokens should reflect the external tokens pulled
+    # (capped to remote_num_tokens, not full prompt)
+    assert req.num_computed_tokens < NUM_TOKENS
+
+    # Complete the transfer and finish
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_recving={req_id})
+    scheduler.update_from_output(so, mro)
+    so = scheduler.schedule()
+    mro = create_model_runner_output(reqs=[req])
+    scheduler.update_from_output(so, mro)
+    assert req.status == RequestStatus.FINISHED_LENGTH_CAPPED
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_sending={req_id})
+    scheduler.update_from_output(so, mro)
+    assert_scheduler_empty(scheduler)
+
+
+def test_remote_blocks_processed_flag_persists():
+    """Edge case 7: After recv completes and request is rescheduled,
+    the _remote_blocks_processed flag in kv_transfer_params prevents
+    the bidirectional path from re-entering _reqs_need_recv."""
+    vllm_config = create_vllm_config(
+        kv_connector_extra_config=BIDIR_KV_EXTRA_CONFIG,
+    )
+    scheduler = create_scheduler(vllm_config)
+    BS = vllm_config.cache_config.block_size
+    req = _make_p_node_turn2_request(506, BS, int(BS * 2.5))
+    scheduler.add_request(req)
+    req_id = req.request_id
+    conn = scheduler.connector.connector_scheduler
+
+    # First schedule → WAITING_FOR_REMOTE_KVS
+    so = scheduler.schedule()
+    assert req.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+
+    # Recv completes
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_recving={req_id})
+    scheduler.update_from_output(so, mro)
+
+    # Verify the flag is set
+    assert req.kv_transfer_params.get("_remote_blocks_processed") is True
+
+    # Next schedule: update_state_after_alloc is called again.
+    # _reqs_need_recv must NOT contain this request.
+    so = scheduler.schedule()
+    assert req_id not in conn._reqs_need_recv
+    meta = so.kv_connector_metadata
+    assert isinstance(meta, NixlConnectorMetadata)
+    assert req_id not in meta.reqs_to_recv
+
+    # Clean up
+    mro = create_model_runner_output(reqs=[req])
+    scheduler.update_from_output(so, mro)
+    so = scheduler.schedule()
+    scheduler.update_from_output(so, EMPTY_MODEL_RUNNER_OUTPUT)
+    so = scheduler.schedule()
+    mro = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
+    mro.kv_connector_output = KVConnectorOutput(finished_sending={req_id})
+    scheduler.update_from_output(so, mro)
+    assert_scheduler_empty(scheduler)
diff --git a/tests/v1/kv_connector/unit/test_mooncake_stats.py b/tests/v1/kv_connector/unit/test_mooncake_stats.py
new file mode 100644
index 000000000000..a20fcb505330
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_mooncake_stats.py
@@ -0,0 +1,281 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import threading
+from unittest.mock import MagicMock
+
+from vllm.distributed.kv_transfer.kv_connector.v1.mooncake.mooncake_connector import (
+    MooncakeConnector,
+    MooncakeConnectorWorker,
+    SendBlockMeta,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.mooncake.stats import (
+    MooncakeKVConnectorStats,
+)
+
+
+def test_is_empty_on_fresh_stats():
+    stats = MooncakeKVConnectorStats()
+    assert stats.is_empty()
+    assert stats.num_successful_transfers == 0
+
+
+def test_record_transfer_and_reduce():
+    stats = MooncakeKVConnectorStats()
+    # 1 MB transfer in 1 ms -> 1000 MB/s throughput
+    stats.record_transfer(duration_s=0.001, total_bytes=1 * 2**20, num_descs=4)
+    # 2 MB transfer in 2 ms
+    stats.record_transfer(duration_s=0.002, total_bytes=2 * 2**20, num_descs=6)
+    assert not stats.is_empty()
+    assert stats.num_successful_transfers == 2
+
+    reduced = stats.reduce()
+    assert reduced["Num successful transfers"] == 2
+    # avg = (1 + 2) / 2 = 1.5 ms
+    assert reduced["Avg xfer time (ms)"] == 1.5
+    assert reduced["Avg MB per transfer"] == 1.5
+    # 3 MB total / 3 ms total = 1000 MB/s
+    assert reduced["Throughput (MB/s)"] == 1000.0
+    assert reduced["Avg number of descriptors"] == 5.0
+    assert reduced["Num failed transfers"] == 0
+    assert reduced["Num failed recvs"] == 0
+    assert reduced["Num KV expired reqs"] == 0
+
+
+def test_record_failures_keeps_stats_non_empty():
+    stats = MooncakeKVConnectorStats()
+    stats.record_failed_transfer()
+    stats.record_failed_recv()
+    stats.record_kv_expired_req()
+    assert not stats.is_empty()
+
+    reduced = stats.reduce()
+    # No successful transfers -> latency/throughput all zero, but failure
+    # counters still surface.
+    assert reduced["Num successful transfers"] == 0
+    assert reduced["Num failed transfers"] == 1
+    assert reduced["Num failed recvs"] == 1
+    assert reduced["Num KV expired reqs"] == 1
+
+
+def test_aggregate_sums_observations():
+    a = MooncakeKVConnectorStats()
+    b = MooncakeKVConnectorStats()
+    a.record_transfer(duration_s=0.001, total_bytes=1 * 2**20, num_descs=1)
+    b.record_transfer(duration_s=0.002, total_bytes=2 * 2**20, num_descs=2)
+    b.record_failed_transfer()
+
+    a.aggregate(b)
+
+    assert a.num_successful_transfers == 2
+    reduced = a.reduce()
+    assert reduced["Num successful transfers"] == 2
+    assert reduced["Num failed transfers"] == 1
+
+
+def test_aggregate_with_empty_other_is_noop():
+    a = MooncakeKVConnectorStats()
+    a.record_transfer(duration_s=0.001, total_bytes=1, num_descs=1)
+    b = MooncakeKVConnectorStats()
+
+    a.aggregate(b)
+
+    assert a.num_successful_transfers == 1
+
+
+def test_getstate_drops_lock_and_setstate_recreates_it():
+    # KVConnectorStats subclasses must be picklable (worker→scheduler IPC),
+    # but threading.Lock isn't — so __getstate__ strips it and __setstate__
+    # rebuilds a fresh per-process lock.
+    original = MooncakeKVConnectorStats()
+    original.record_transfer(duration_s=0.01, total_bytes=2048, num_descs=3)
+
+    state = original.__getstate__()
+    assert "_lock" not in state
+
+    rebuilt = MooncakeKVConnectorStats.__new__(MooncakeKVConnectorStats)
+    rebuilt.__setstate__(state)
+    assert rebuilt.data == original.data
+    # Lock works on the receiver side.
+    rebuilt.record_transfer(duration_s=0.02, total_bytes=4096, num_descs=5)
+    assert rebuilt.num_successful_transfers == 2
+
+
+def test_concurrent_writers_keep_row_lengths_aligned():
+    # Multiple writers + a snapshot reader must never produce a snapshot
+    # with mismatched column lengths — reduce()'s
+    # len(descs) == num_successful_transfers assertion would fire.
+    stats = MooncakeKVConnectorStats()
+    stop = threading.Event()
+    writer_count = 4
+    snapshots: list[MooncakeKVConnectorStats] = []
+
+    def writer():
+        i = 0
+        while not stop.is_set():
+            stats.record_transfer(
+                duration_s=0.001 + i * 1e-9,
+                total_bytes=1024 + i,
+                num_descs=1 + (i % 8),
+            )
+            i += 1
+
+    def snapper():
+        while not stop.is_set():
+            snap = stats.clone_and_reset()
+            if not snap.is_empty():
+                # Force the same path the logger walks; reduce() will
+                # blow up on torn rows via its internal assert.
+                snap.reduce()
+                snapshots.append(snap)
+
+    threads = [threading.Thread(target=writer) for _ in range(writer_count)]
+    snapshotter = threading.Thread(target=snapper)
+    for t in threads:
+        t.start()
+    snapshotter.start()
+    # Short fixed window — long enough to interleave thousands of ops.
+    threading.Event().wait(0.2)
+    stop.set()
+    for t in threads:
+        t.join()
+    snapshotter.join()
+
+    # Final drain so we don't lose the in-flight tail.
+    final = stats.clone_and_reset()
+    if not final.is_empty():
+        final.reduce()
+        snapshots.append(final)
+
+    # Every snapshot's columns must have identical lengths (the invariant
+    # the lock protects), and the union must contain at least one row.
+    total_rows = 0
+    for snap in snapshots:
+        n = len(snap.data["transfer_duration"])
+        assert len(snap.data["bytes_transferred"]) == n
+        assert len(snap.data["num_descriptors"]) == n
+        total_rows += n
+    assert total_rows > 0
+
+
+def test_clone_and_reset_hands_off_old_data():
+    stats = MooncakeKVConnectorStats()
+    stats.record_transfer(duration_s=0.001, total_bytes=1, num_descs=1)
+    stats.record_failed_recv()
+
+    snapshot = stats.clone_and_reset()
+
+    assert snapshot.num_successful_transfers == 1
+    assert not snapshot.is_empty()
+    # Original is now empty.
+    assert stats.is_empty()
+    assert stats.num_successful_transfers == 0
+    # Recording on the original does not mutate the snapshot.
+    stats.record_transfer(duration_s=0.005, total_bytes=2, num_descs=2)
+    assert snapshot.num_successful_transfers == 1
+
+
+def test_build_kv_connector_stats_none_returns_empty_instance():
+    out = MooncakeConnector.build_kv_connector_stats()
+    assert isinstance(out, MooncakeKVConnectorStats)
+    assert out.is_empty()
+
+
+def test_build_kv_connector_stats_with_data_round_trips():
+    original = MooncakeKVConnectorStats()
+    original.record_transfer(duration_s=0.01, total_bytes=1024, num_descs=3)
+    original.record_failed_transfer()
+
+    # Serialized form is the .data dict; build should reconstruct an instance
+    # that behaves the same.
+    rebuilt = MooncakeConnector.build_kv_connector_stats(data=original.data)
+
+    assert isinstance(rebuilt, MooncakeKVConnectorStats)
+    assert rebuilt.num_successful_transfers == 1
+    assert rebuilt.reduce()["Num failed transfers"] == 1
+
+
+def _bare_worker() -> MooncakeConnectorWorker:
+    """Construct a MooncakeConnectorWorker skipping __init__ (full init requires
+    a live TransferEngine). Only the attributes touched by the methods under
+    test are populated; role flags and async_zmq_ctx keep __del__'s shutdown
+    path a no-op."""
+    worker = MooncakeConnectorWorker.__new__(MooncakeConnectorWorker)
+    worker.xfer_stats = MooncakeKVConnectorStats()
+    worker.engine = MagicMock()
+    worker.async_zmq_ctx = MagicMock()
+    worker.is_kv_consumer = True
+    worker.is_kv_producer = True
+    return worker
+
+
+def test_send_blocks_records_success():
+    worker = _bare_worker()
+    worker.engine.batch_transfer_sync_write.return_value = 0
+
+    ret = worker._send_blocks(
+        "host:1234",
+        src_ptrs=[0x1000, 0x2000],
+        dst_ptrs=[0x3000, 0x4000],
+        lengths=[1024, 2048],
+    )
+
+    assert ret == 0
+    assert worker.xfer_stats.num_successful_transfers == 1
+    data = worker.xfer_stats.data
+    assert data["bytes_transferred"] == [1024 + 2048]
+    assert data["num_descriptors"] == [2]
+    assert data["num_failed_transfers"] == []
+
+
+def test_send_blocks_records_failure():
+    worker = _bare_worker()
+    worker.engine.batch_transfer_sync_write.return_value = 1  # non-zero = fail
+
+    ret = worker._send_blocks("host:1234", [0x1000], [0x2000], [4096])
+
+    assert ret == 1
+    assert worker.xfer_stats.num_successful_transfers == 0
+    assert worker.xfer_stats.data["num_failed_transfers"] == [1]
+
+
+def test_get_kv_connector_stats_returns_none_when_empty():
+    worker = _bare_worker()
+
+    assert worker.get_kv_connector_stats() is None
+
+
+def test_get_kv_connector_stats_returns_and_resets():
+    worker = _bare_worker()
+    worker.engine.batch_transfer_sync_write.return_value = 0
+    worker._send_blocks("host:1234", [0x1000], [0x2000], [4096])
+
+    snapshot = worker.get_kv_connector_stats()
+    assert isinstance(snapshot, MooncakeKVConnectorStats)
+    assert snapshot.num_successful_transfers == 1
+
+    # Second call returns None because the worker's stats were reset.
+    assert worker.get_kv_connector_stats() is None
+
+
+def test_expired_request_bumps_counter():
+    import asyncio
+
+    worker = _bare_worker()
+    worker.reqs_need_send = {
+        "tid1": SendBlockMeta(
+            p_req_id="req1",
+            transfer_id="tid1",
+            local_block_ids=[0, 1],
+            ready=asyncio.Event(),
+            expire_time=-1.0,  # Already expired.
+            sending=0,
+        ),
+    }
+    worker.finished_sending_reqs = set()
+
+    asyncio.run(worker.fetch_finished_sending_reqs())
+
+    assert worker.xfer_stats.data["num_kv_expired_reqs"] == [1]
+    # Expired transfer also cleaned out of reqs_need_send.
+    assert "tid1" not in worker.reqs_need_send
diff --git a/tests/v1/kv_connector/unit/test_moriio_connector.py b/tests/v1/kv_connector/unit/test_moriio_connector.py
index 902957e18309..16d34d90896b 100644
--- a/tests/v1/kv_connector/unit/test_moriio_connector.py
+++ b/tests/v1/kv_connector/unit/test_moriio_connector.py
@@ -3,6 +3,7 @@
 import importlib.util
 import os
 import subprocess
+import uuid
 from unittest.mock import MagicMock, patch
 
 import msgspec
@@ -99,6 +100,11 @@ def _setup_kv_transfer_request(
             "remote_engine_id": "test_engine",
         }
     )
+    zmq_addr = f"host:{remote_host},handshake:{fake_port},notify:{fake_port}"
+    fake_uuid = uuid.uuid4().hex
+    request.request_id = (
+        f"___prefill_addr_{zmq_addr}___decode_addr_{zmq_addr}_{fake_uuid}"
+    )
     return request
 
 
@@ -254,13 +260,14 @@ def test_write_mode_saves_local_block_ids():
         do_remote_decode=True,
         do_remote_prefill=False,
     )
+
+    # Setup KV transfer params and embed ZMQ addrs in request_id before
+    # adding to scheduler so the ID is consistent everywhere.
+    request = _setup_kv_transfer_request(request)
     request_id = request.request_id
 
     scheduler.add_request(request)
 
-    # Fake Config
-    request = _setup_kv_transfer_request(request)
-
     # Remote Prefill, triggers MoRIIOConnectorMetadata.
     scheduler_output = scheduler.schedule()
     kv_connector_metadata = scheduler_output.kv_connector_metadata
@@ -312,13 +319,14 @@ def test_write_mode_with_chunked_prefill_saves_local_block_ids():
         do_remote_decode=True,
         do_remote_prefill=False,
     )
+
+    # Setup KV transfer params and embed ZMQ addrs in request_id before
+    # adding to scheduler so the ID is consistent everywhere.
+    request = _setup_kv_transfer_request(request)
     request_id = request.request_id
 
     scheduler.add_request(request)
 
-    # Fake Config
-    request = _setup_kv_transfer_request(request)
-
     # Remote Prefill with chunked prefill, triggers multiple schedules.
     expected_counts = [(0, 0, 0), (0, 0, 0), (1, 0, 0)]
     kv_connector_metadata = None
@@ -363,6 +371,10 @@ def test_read_mode_loads_remote_block_ids(moriio_read_mode):
         do_remote_decode=False,
         do_remote_prefill=True,
     )
+
+    # Setup KV transfer params and embed ZMQ addrs in request_id before
+    # adding to scheduler so the ID is consistent everywhere.
+    request = _setup_kv_transfer_request(request)
     request_id = request.request_id
 
     scheduler.add_request(request)
@@ -370,8 +382,6 @@ def test_read_mode_loads_remote_block_ids(moriio_read_mode):
         0
     ].req_to_blocks[request_id]
 
-    request = _setup_kv_transfer_request(request)
-
     # Set remote block ids to be fetched.
     request.kv_transfer_params["remote_block_ids"] = block_list
 
diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index 855c34117137..51024fb92171 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -8,13 +8,18 @@
 from unittest.mock import MagicMock
 
 import pytest
+import torch
 
 from tests.v1.kv_connector.unit.utils import create_vllm_config
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole
-from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorBase_V1
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorBase_V1,
+    SupportsHMA,
+    supports_hma,
+)
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
 from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import (
     MultiConnector,
@@ -83,8 +88,43 @@ def update_state_after_alloc(self, request, blocks, num_tokens) -> None:
         pass
 
 
-# Register the mock connector
+class MockHMAConnector(KVConnectorBase_V1, SupportsHMA):
+    """Mock connector that supports HMA for testing."""
+
+    def __new__(cls, *args, **kwargs):
+        mock = MagicMock(spec_set=cls)
+        return mock
+
+    def start_load_kv(self, forward_context, **kwargs):
+        pass
+
+    def wait_for_layer_load(self, layer_name):
+        pass
+
+    def save_kv_layer(self, layer_name, kv_layer, attn_metadata, **kwargs):
+        pass
+
+    def wait_for_save(self):
+        pass
+
+    def build_connector_meta(self, scheduler_output):
+        return None
+
+    def get_num_new_matched_tokens(self, request, num_computed_tokens):
+        return (0, False)
+
+    def update_state_after_alloc(self, request, blocks, num_tokens) -> None:
+        pass
+
+    def request_finished_all_groups(self, request, block_ids):
+        return (False, None)
+
+
+# Register mock connectors
 KVConnectorFactory.register_connector("MockConnector", __name__, MockConnector.__name__)
+KVConnectorFactory.register_connector(
+    "MockHMAConnector", __name__, MockHMAConnector.__name__
+)
 
 
 @pytest.fixture
@@ -920,3 +960,133 @@ def assert_update_connector_output_called(mc: MultiConnector):
     mc.update_connector_output(kv_connector_output)
     assert_update_connector_output_called(mc)
     assert kv_connector_output.kv_connector_worker_meta == mc_worker_meta_01a_01b
+
+
+def _make_multi_connector(connector_names: list[str]) -> MultiConnector:
+    """Build a MultiConnector wrapping the given registered connectors."""
+    vllm_config = create_vllm_config()
+    connectors = [
+        {
+            "kv_connector": name,
+            "kv_role": "kv_both",
+            "kv_connector_module_path": "tests.v1.kv_connector.unit.test_multi_connector",  # noqa: E501
+        }
+        for name in connector_names
+    ]
+    vllm_config.kv_transfer_config = KVTransferConfig(
+        kv_connector="MultiConnector",
+        kv_role="kv_both",
+        kv_connector_extra_config={"connectors": connectors},
+    )
+    kv_cache_config = KVCacheConfig(
+        num_blocks=0,
+        kv_cache_tensors=[],
+        kv_cache_groups=[],
+    )
+    return MultiConnector(
+        vllm_config=vllm_config,
+        role=KVConnectorRole.WORKER,
+        kv_cache_config=kv_cache_config,
+    )
+
+
+def test_multi_connector_hma_opt_in():
+    """
+    MultiConnector currently assumes HMA is opt-in: it needs
+    --no-disable-hybrid-kv-cache-manager to be enabled.
+
+    At runtime, _all_support_hma is True only when every sub-connector
+    implements SupportsHMA. Test all combinations of HMA / non-HMA
+    sub-connectors.
+    """
+
+    assert supports_hma(MultiConnector)
+
+    # -- All non-HMA connectors => _all_support_hma is False --
+    mc_none = _make_multi_connector(["MockConnector", "MockConnector"])
+    assert not supports_hma(mc_none._connectors[0])
+    assert not supports_hma(mc_none._connectors[1])
+    assert mc_none._all_support_hma is False
+
+    # -- All HMA connectors => _all_support_hma is True --
+    mc_all = _make_multi_connector(["MockHMAConnector", "MockHMAConnector"])
+    assert supports_hma(mc_all._connectors[0])
+    assert supports_hma(mc_all._connectors[1])
+    assert mc_all._all_support_hma is True
+
+    # -- Mixed: first HMA, second non-HMA => _all_support_hma is False --
+    mc_mixed1 = _make_multi_connector(["MockHMAConnector", "MockConnector"])
+    assert supports_hma(mc_mixed1._connectors[0])
+    assert not supports_hma(mc_mixed1._connectors[1])
+    assert mc_mixed1._all_support_hma is False
+
+    # -- Mixed: first non-HMA, second HMA => _all_support_hma is False --
+    mc_mixed2 = _make_multi_connector(["MockConnector", "MockHMAConnector"])
+    assert not supports_hma(mc_mixed2._connectors[0])
+    assert supports_hma(mc_mixed2._connectors[1])
+    assert mc_mixed2._all_support_hma is False
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available(), reason="Requires GPU to instantiate LLM"
+)
+def test_multi_connector_mixed_hma_disables_hybrid_kv_cache(monkeypatch):
+    """
+    When MultiConnector wraps a mix of HMA (NixlConnector) and non-HMA
+    (MockConnector) sub-connectors, verify that:
+    1. The scheduler's MultiConnector has _all_support_hma == False.
+    2. vLLM auto-disables the hybrid KV cache manager (no preference expressed by user)
+    """
+    from unittest.mock import patch
+
+    from tests.v1.kv_connector.unit.test_nixl_connector import FakeNixlWrapper
+
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    kv_transfer_config = KVTransferConfig(
+        kv_connector="MultiConnector",
+        kv_role="kv_both",
+        kv_connector_extra_config={
+            "connectors": [
+                {
+                    "kv_connector": "NixlConnector",
+                    "kv_role": "kv_both",
+                },
+                {
+                    "kv_connector": "MockConnector",
+                    "kv_role": "kv_both",
+                    "kv_connector_module_path": (
+                        "tests.v1.kv_connector.unit.test_multi_connector"
+                    ),
+                },
+            ],
+        },
+    )
+
+    with patch(
+        "vllm.distributed.kv_transfer.kv_connector.v1.nixl.worker.NixlWrapper",
+        FakeNixlWrapper,
+    ):
+        llm = LLM(
+            model="Qwen/Qwen3-0.6B",
+            enforce_eager=True,
+            gpu_memory_utilization=0.3,
+            max_model_len=128,
+            max_num_seqs=1,
+            max_num_batched_tokens=128,
+            kv_transfer_config=kv_transfer_config,
+        )
+        try:
+            # HMA should be auto-disabled when user has not expressed a preference.
+            assert (
+                llm.llm_engine.vllm_config.scheduler_config.disable_hybrid_kv_cache_manager
+                is True
+            )
+            # The scheduler-side MultiConnector should detect the mixed
+            # HMA support among its sub-connectors.
+            scheduler = llm.llm_engine.engine_core.engine_core.scheduler
+            mc = scheduler.connector
+            assert isinstance(mc, MultiConnector)
+            assert mc._all_support_hma is False
+        finally:
+            llm.llm_engine.engine_core.shutdown()
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index 50e83aa2ef20..3803e4fd3869 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -527,6 +527,7 @@ def _nixl_handshake(
                     block_size=self.block_size,
                     ssm_sizes=(0, 0),
                     attn_backend_name=self.backend_name,
+                    physical_blocks_per_logical_kv_block=1,
                 ),
                 remote_tp_rank=remote_tp_rank,
                 remote_tp_size=remote_tp_size,
@@ -726,6 +727,7 @@ def test_prefill_tp_size_greater_than_decode_tp_size(
         worker.num_blocks = 1
         worker.dst_num_blocks[worker.engine_id] = worker.num_blocks
         worker.src_blocks_data = [(0, worker.block_len_per_layer[0], worker.tp_rank)]
+        worker.num_descs = len(worker.src_blocks_data)
 
         def check_handshake(remote_tp_size: int):
             tp_ratio = remote_tp_size // local_tp_size
@@ -978,6 +980,7 @@ def test_handshake_fails_on_kv_cache_layout_mismatch(
                 block_size=worker.block_size,
                 ssm_sizes=(0, 0),
                 attn_backend_name=worker.backend_name,
+                physical_blocks_per_logical_kv_block=1,
             )
 
             with pytest.raises(RuntimeError):
@@ -1035,6 +1038,7 @@ def test_handshake_succeed_on_kv_cache_layout_mismatch_with_experimental(
                 block_size=worker.block_size,
                 ssm_sizes=(0, 0),
                 attn_backend_name=worker.backend_name,
+                physical_blocks_per_logical_kv_block=1,
             )
 
             # We don't check layout for homogeneous TP and MLA for now, as the
@@ -2354,6 +2358,7 @@ def test_compatibility_hash_validation(
         block_size=prefill_block_size,
         ssm_sizes=(0, 0),
         attn_backend_name=decode_worker.backend_name,
+        physical_blocks_per_logical_kv_block=1,
     )
     handshake_payload = NixlHandshakePayload(
         compatibility_hash=remote_hash,
@@ -2479,3 +2484,122 @@ def test_handshake_decode_errors(default_vllm_config, dist_init, error_scenario)
                 remote_tp_size=1,
                 expected_engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
             )
+
+    @patch(
+        "vllm.distributed.kv_transfer.kv_connector.v1.nixl.worker.NixlWrapper",
+        FakeNixlWrapper,
+    )
+    def test_mla_broadcast_notif_uses_remote_request_id(
+        self, default_vllm_config, dist_init
+    ):
+        """MLA + remote TP > local TP: the broadcast notification sent to
+        non-read prefill ranks must be keyed by the prefill-side request
+        id (``meta.remote.request_id``), not the local decode request id.
+
+        Prefill ranks key ``_reqs_to_send`` by their own request id, so a
+        broadcast keyed by the decode id is rejected in
+        ``_get_new_notifs`` with "Potentially invalid KV blocks for
+        unrecognized request" and the blocks only release via the abort
+        timeout. See ``_read_blocks_for_req`` in
+        ``vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py``.
+        """
+        decode_tp_size = 1
+        prefill_tp_size = 4
+
+        vllm_config = create_vllm_config()
+        vllm_config.parallel_config.tensor_parallel_size = decode_tp_size
+
+        connector = NixlConnector(
+            vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16)
+        )
+        connector.connector_worker = FakeNixlConnectorWorker(
+            vllm_config, connector.engine_id, hand_shake_latency=0
+        )
+        worker = connector.connector_worker
+
+        # Force the MLA path; only `self.use_mla` gates the branches we
+        # exercise inside `_read_blocks_for_req`.
+        worker.use_mla = True
+
+        # Manually register the remote (P) engine and pre-populate the
+        # per-rank state the handshake would normally fill in. The real
+        # `_nixl_handshake` is unnecessary here — we only need
+        # `transfer_topo` to know `remote_tp_size`, and `_remote_agents`
+        # / `dst_xfer_side_handles` to be keyed by remote rank.
+        remote_engine_id = "remote_engine"
+        worker.transfer_topo.register_remote_engine(
+            remote_engine_id=remote_engine_id,
+            remote_tp_size=prefill_tp_size,
+            remote_block_size=worker.block_size,
+            remote_block_len=worker.block_size * 4096,
+            remote_physical_blocks_per_logical=1,
+            local_block_len=worker.block_size * 4096,
+        )
+        worker._remote_agents[remote_engine_id] = {
+            rank: f"agent_p{rank}" for rank in range(prefill_tp_size)
+        }
+        worker.dst_xfer_side_handles = {
+            remote_engine_id: {rank: 100 + rank for rank in range(prefill_tp_size)}
+        }
+        # Sanity: D TP=1, P TP=4 => tp_ratio = -4 (P > D).
+        assert worker.transfer_topo.tp_ratio(prefill_tp_size) == -prefill_tp_size
+
+        # Distinct ids on each side — that's the whole point of the bug.
+        decode_req_id = "decode-req-AAAA"
+        prefill_req_id = "prefill-req-BBBB"
+        assert decode_req_id != prefill_req_id
+
+        metadata = NixlConnectorMetadata()
+        metadata.add_new_req_to_recv(
+            request_id=decode_req_id,
+            local_block_ids=([0, 1, 2],),
+            kv_transfer_params={
+                "remote_block_ids": ([10, 11, 12],),
+                "remote_engine_id": remote_engine_id,
+                "remote_request_id": prefill_req_id,
+                "remote_host": "localhost",
+                "remote_port": 1234,
+                "remote_tp_size": prefill_tp_size,
+            },
+        )
+        meta = metadata.reqs_to_recv[decode_req_id]
+
+        # Capture broadcast send_notif calls; stub `_read_blocks` so we
+        # don't need a working xfer path. Real `_read_blocks` emits its
+        # auto-notif via `make_prepped_xfer`, not via `send_notif`, so
+        # any captured `send_notif` here is a broadcast.
+        send_notif_calls: list[tuple[str, bytes]] = []
+        worker.nixl_wrapper.send_notif = (  # type: ignore[method-assign]
+            lambda agent_name, notif_msg: send_notif_calls.append(
+                (agent_name, notif_msg)
+            )
+        )
+        worker._read_blocks = MagicMock()  # type: ignore[method-assign]
+
+        worker._read_blocks_for_req(decode_req_id, meta)
+
+        # MLA: read once from rank 0 and broadcast to the other ranks.
+        worker._read_blocks.assert_called_once()
+        assert worker._read_blocks.call_args.kwargs["remote_rank"] == 0
+        assert (
+            worker._read_blocks.call_args.kwargs["remote_request_id"] == prefill_req_id
+        )
+
+        # Broadcast goes to ranks {1, 2, 3} only, never to the read target.
+        expected_recipients = {
+            worker._remote_agents[remote_engine_id][r]
+            for r in range(1, prefill_tp_size)
+        }
+        assert {agent for agent, _ in send_notif_calls} == expected_recipients
+
+        # Every broadcast notif must be keyed by the prefill request id.
+        # Pre-fix this used the *decode* request id, which prefill ranks
+        # didn't recognize.
+        expected_notif = f"{prefill_req_id}:{decode_tp_size}".encode()
+        bad_notif = f"{decode_req_id}:{decode_tp_size}".encode()
+        for agent, notif in send_notif_calls:
+            assert notif == expected_notif, (
+                f"Broadcast notif to {agent!r} must use prefill_req_id; "
+                f"got {notif!r} (expected {expected_notif!r}, "
+                f"buggy form would be {bad_notif!r})"
+            )
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py
index 3f5a9b9cc031..edc591e06090 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py
@@ -93,75 +93,82 @@ def test_logical_to_kernel_block_ids_with_hma():
 
 @pytest.mark.cpu_test
 @pytest.mark.parametrize(
-    "has_mamba,swa_enabled,mamba_enabled,remote_ratio,"
-    "remote_block_ids,expected_remote_block_ids",
+    "group_spec_types,expansion_stride,remote_block_ids,expected_remote_block_ids",
     [
-        # Non-mamba (FA+SWA): both groups expanded via _logical_to_kernel_block_ids.
-        # Regression for https://github.com/vllm-project/vllm/pull/39724
-        (
-            False,
-            True,
-            False,
-            1,
+        pytest.param(
+            ("FullAttentionSpec", "SlidingWindowSpec"),
+            2,
             ([0, 1, 2], [3, 4]),
             [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9]],
+            id="dense_fa_swa",
         ),
-        # Mamba (FA+Mamba): FA expanded via _logical_to_remote_kernel_block_ids,
-        # Mamba passed through unchanged.
-        # remote_ratio=261 (Nemotron 30B TP=1) != local_ratio=2 so that using
-        # the wrong conversion method produces different FA results.
-        (
-            True,
-            False,
-            True,
+        pytest.param(
+            ("FullAttentionSpec", "MambaSpec"),
             261,
             ([0, 1, 2], [10, 11]),
             [[0, 1, 261, 262, 522, 523], [10, 11]],
+            id="mamba_fa_ssm",
         ),
     ],
-    ids=["non_mamba_fa_swa", "mamba_fa_ssm"],
 )
 def test_read_blocks_for_req_expands_remote_ids(
-    has_mamba,
-    swa_enabled,
-    mamba_enabled,
-    remote_ratio,
+    group_spec_types,
+    expansion_stride,
     remote_block_ids,
     expected_remote_block_ids,
 ):
     """_read_blocks_for_req must expand remote logical block IDs to kernel
     block IDs when kernel block size != logical block size.
 
-    Non-mamba path uses _logical_to_kernel_block_ids (all groups expanded).
-    Mamba path uses _logical_to_remote_kernel_block_ids (FA expanded, Mamba
-    passed through).
+    The hot path always calls _logical_to_remote_kernel_block_ids with
+    remote_info.remote_physical_blocks_per_logical (model-agnostic).
     """
     from unittest.mock import MagicMock
 
     from vllm.distributed.kv_transfer.kv_connector.v1.nixl.metadata import (
         NixlConnectorMetadata,
     )
+    from vllm.distributed.kv_transfer.kv_connector.v1.nixl.tp_mapping import (
+        TPMapping,
+    )
     from vllm.distributed.kv_transfer.kv_connector.v1.nixl.worker import (
         NixlConnectorWorker,
     )
+    from vllm.v1.kv_cache_interface import (
+        FullAttentionSpec,
+        MambaSpec,
+        SlidingWindowSpec,
+    )
+
+    spec_name_to_type = {
+        "FullAttentionSpec": FullAttentionSpec,
+        "SlidingWindowSpec": SlidingWindowSpec,
+        "MambaSpec": MambaSpec,
+    }
+    resolved_types = tuple(spec_name_to_type[n] for n in group_spec_types)
 
     worker = object.__new__(NixlConnectorWorker)
-    worker._has_mamba = has_mamba
     worker._physical_blocks_per_logical_kv_block = 2
+
+    has_mamba = any(t is MambaSpec for t in resolved_types)
+    has_swa = any(t is SlidingWindowSpec for t in resolved_types)
     worker.kv_cache_config = make_kv_cache_config(
-        block_size=16, swa_enabled=swa_enabled, mamba_enabled=mamba_enabled
+        block_size=16, swa_enabled=has_swa, mamba_enabled=has_mamba
     )
 
     remote_engine_id = "remote-engine"
-    if has_mamba:
-        worker._physical_blocks_per_logical = {remote_engine_id: remote_ratio}
 
-    # Mock transfer_topo: empty remote ranks skips the transfer machinery
-    # entirely, isolating the block-ID expansion logic.
     worker.transfer_topo = MagicMock()
-    worker.transfer_topo.target_remote_ranks.return_value = []
-    worker.transfer_topo.get_engine_info.return_value = MagicMock(remote_tp_size=1)
     worker.transfer_topo.tp_ratio.return_value = 1
+    remote_info = MagicMock()
+    remote_info.remote_physical_blocks_per_logical = expansion_stride
+    worker.transfer_topo.get_engine_info.return_value = remote_info
+    worker.use_mla = False
+
+    mock_plan = MagicMock(spec=TPMapping)
+    mock_plan.all_source_ranks = ()
+    mock_plan.source_ranks_per_group = ()
+    worker.tp_mappings = {remote_engine_id: mock_plan}
 
     metadata = NixlConnectorMetadata()
     metadata.add_new_req_to_recv(
@@ -306,75 +313,82 @@ def test_nixl_metadata_hma_block_ids_structure():
     assert list(req_meta.remote.block_ids[1]) == [18, 19, 20, 21]
 
 
-@pytest.mark.cpu_test
-def test_get_block_descs_ids_hybrid_ssm():
-    """Test _get_block_descs_ids uses per-group strides for hybrid FA+SSM
-    when ratio=1 (no kernel block size mismatch)."""
+def _make_mock_worker_for_desc_ids(
+    num_regions: int,
+    has_mamba: bool,
+    group_spec_types: tuple,
+    block_len_per_layer: list[int] | None = None,
+):
+    """Build a mock NixlConnectorWorker with attrs needed by _compute_desc_ids."""
+    from unittest.mock import MagicMock
+
     from vllm.distributed.kv_transfer.kv_connector.v1.nixl.worker import (
         NixlConnectorWorker,
     )
 
-    worker = object.__new__(NixlConnectorWorker)
+    worker = MagicMock(spec=NixlConnectorWorker)
+    worker.num_regions = num_regions
+    worker._has_mamba = has_mamba
+    worker._group_spec_types = group_spec_types
+    worker.block_len_per_layer = block_len_per_layer or [100]
+    worker._compute_desc_ids = NixlConnectorWorker._compute_desc_ids.__get__(
+        worker, NixlConnectorWorker
+    )
+    return worker
 
-    num_blocks = 100
-    engine_id = "test-engine"
-    worker.num_regions = 2
-    worker.dst_num_blocks = {engine_id: num_blocks}
-    worker._has_mamba = True
-    worker._is_mamba_group = [False, True]
-    worker._physical_blocks_per_logical_kv_block = 1
-    worker._physical_blocks_per_logical = {engine_id: 1}
-    worker.block_len_per_layer = [100]
-    # num_descs = num_regions * num_blocks (no blocks_first doubling)
-    worker.num_descs = 2 * num_blocks
+
+@pytest.mark.cpu_test
+def test_get_block_descs_ids_hybrid_ssm():
+    """Test _compute_desc_ids uses per-group strides for hybrid
+    FA+SSM when ratio=1 (no kernel block size mismatch)."""
+    from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
+
+    worker = _make_mock_worker_for_desc_ids(
+        num_regions=2,
+        has_mamba=True,
+        group_spec_types=(FullAttentionSpec, MambaSpec),
+        block_len_per_layer=[100],
+    )
 
     fa_blocks = [3, 5]
     ssm_blocks = [1, 2]
-    result = worker._get_block_descs_ids(engine_id, (fa_blocks, ssm_blocks))
-
-    # FA group: stride=num_blocks=100, offset=0
-    #   region0: [3, 5],  region1: [103, 105]
-    # SSM group: stride=logical_blocks=100 (=num_blocks/ratio=100/1),
-    #   offset=num_fa_descs=200, 4 regions per Mamba layer (x, B, C, ssm)
-    #   region0: [201, 202], region1: [301, 302],
-    #   region2: [401, 402], region3: [501, 502]
+    result = worker._compute_desc_ids(
+        block_ids=(fa_blocks, ssm_blocks),
+        dst_num_blocks=100,
+        block_size_ratio=None,
+        physical_blocks_per_logical=1,
+    )
+
     expected = [3, 5, 103, 105, 201, 202, 301, 302, 401, 402, 501, 502]
     assert list(result) == expected, f"Expected {expected}, got {list(result)}"
 
 
 @pytest.mark.cpu_test
 def test_get_block_descs_ids_kernel_block_mismatch():
-    """Test _get_block_descs_ids uses different strides for FA (kernel blocks)
-    vs SSM (logical blocks) when ratio > 1."""
-    from vllm.distributed.kv_transfer.kv_connector.v1.nixl.worker import (
-        NixlConnectorWorker,
-    )
-
-    worker = object.__new__(NixlConnectorWorker)
+    """Test _compute_desc_ids uses different strides for FA
+    (kernel blocks) vs SSM (logical blocks) when ratio > 1."""
+    from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
 
     ratio = 4
     logical_blocks = 100
     num_blocks = logical_blocks * ratio  # 400 kernel blocks
-    engine_id = "test-engine"
-    worker.num_regions = 2
-    worker.dst_num_blocks = {engine_id: num_blocks}
-    worker._has_mamba = True
-    worker._is_mamba_group = [False, True]
-    worker._physical_blocks_per_logical_kv_block = ratio
-    worker._physical_blocks_per_logical = {engine_id: ratio}
-    worker.block_len_per_layer = [100]
-    worker.num_descs = 2 * num_blocks  # 800
-
-    fa_blocks = [3, 7]  # kernel-level block IDs
-    ssm_blocks = [1, 2]  # logical block IDs
-    result = worker._get_block_descs_ids(engine_id, (fa_blocks, ssm_blocks))
-
-    # FA group: stride=num_blocks=400, offset=0
-    #   region0: [3, 7],  region1: [403, 407]
-    # SSM group: stride=logical_blocks=400//4=100, offset=num_fa_descs=800,
-    #   4 regions per Mamba layer (x, B, C, ssm)
-    #   region0: [801, 802], region1: [901, 902],
-    #   region2: [1001, 1002], region3: [1101, 1102]
+
+    worker = _make_mock_worker_for_desc_ids(
+        num_regions=2,
+        has_mamba=True,
+        group_spec_types=(FullAttentionSpec, MambaSpec),
+        block_len_per_layer=[100],
+    )
+
+    fa_blocks = [3, 7]
+    ssm_blocks = [1, 2]
+    result = worker._compute_desc_ids(
+        block_ids=(fa_blocks, ssm_blocks),
+        dst_num_blocks=num_blocks,
+        block_size_ratio=None,
+        physical_blocks_per_logical=ratio,
+    )
+
     expected = [3, 7, 403, 407, 801, 802, 901, 902, 1001, 1002, 1101, 1102]
     assert list(result) == expected, f"Expected {expected}, got {list(result)}"
 
diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_connector/unit/test_offloading_connector.py
similarity index 56%
rename from tests/v1/kv_offload/test_cpu_offloading.py
rename to tests/v1/kv_connector/unit/test_offloading_connector.py
index b2ab21d4be75..555daea50e38 100644
--- a/tests/v1/kv_offload/test_cpu_offloading.py
+++ b/tests/v1/kv_connector/unit/test_offloading_connector.py
@@ -14,13 +14,36 @@
 from vllm.distributed.kv_events import BlockStored, KVEventBatch
 from vllm.platforms import current_platform
 
-CPU_BLOCK_SIZES = [48]
-ATTN_BACKENDS = []
-
+_ATTN_BACKENDS: list[str] = []
 if current_platform.is_cuda():
-    ATTN_BACKENDS = ["FLASH_ATTN", "FLASHINFER", "TRITON_ATTN"]
+    _ATTN_BACKENDS = ["FLASH_ATTN", "FLASHINFER", "TRITON_ATTN"]
 elif current_platform.is_rocm():
-    ATTN_BACKENDS = ["TRITON_ATTN"]
+    _ATTN_BACKENDS = ["TRITON_ATTN"]
+
+# (model, attn_backend | None, block_size | None, uses_hma)
+#
+# - Llama: tested with each attention backend and a custom block_size.
+# - Gemma-3: HMA (sliding window + full attention), default backend.
+# - Mamba-130m: HMA (attention-free, acts like sliding_window=1),
+#   default backend.  Prefix caching must be force-enabled.
+# - Falcon-H1-0.5B-Instruct: HMA (parallel SSM/attention in every layer).
+#   After page-size unification the mamba and attention groups have
+#   different block sizes.
+MODEL_PARAMS: list[tuple[str, str | None, int | None, bool]] = [
+    ("meta-llama/Llama-3.2-1B-Instruct", backend, 48, False)
+    for backend in _ATTN_BACKENDS
+]
+# HMA / Mamba models are only tested on CUDA (not ROCm).
+if current_platform.is_cuda():
+    MODEL_PARAMS += [
+        ("google/gemma-3-1b-it", None, 48, True),
+        ("state-spaces/mamba-130m-hf", None, 48, True),
+        # Falcon-H1: parallel hybrid (every layer has both attention and SSM).
+        # The mamba and attention groups end up with different GPU block sizes
+        # after page-size unification, so we leave cpu_block_size=None
+        # (block_size_factor stays 1).
+        ("tiiuae/Falcon-H1-0.5B-Instruct", None, None, True),
+    ]
 
 # Maximum time (seconds) to wait for the async CPU offload transfer
 # to complete before giving up.
@@ -110,7 +133,7 @@ def _wait_for_prefix_cache_reset(llm: LLM) -> None:
         )
 
 
-def _latency_test(llm: LLM, subscriber: MockSubscriber):
+def _latency_test(llm: LLM, subscriber: MockSubscriber | None):
     sampling_params = SamplingParams(max_tokens=1)
 
     num_times_cpu_better_than_cold = 0
@@ -118,7 +141,10 @@ def _latency_test(llm: LLM, subscriber: MockSubscriber):
     total_cold_time = 0.0
     total_gpu_hit_time = 0.0
     total_cpu_hit_time = 0.0
-    prompt_token_ids = [0] * 10001
+    max_model_len = llm.llm_engine.vllm_config.model_config.max_model_len
+    # Use a long prompt that fits within the model's context window.
+    prompt_len = min(10001, max_model_len - 1)
+    prompt_token_ids = [0] * prompt_len
     for i in tqdm(range(num_tests), desc="Running tests"):
         prompt_token_ids[0] = i
         prompts = [TokensPrompt(prompt_token_ids=prompt_token_ids)]
@@ -141,10 +167,11 @@ def _latency_test(llm: LLM, subscriber: MockSubscriber):
 
         # Verify CPU stored events arrived (offload is done before we
         # attempt to load from CPU).
-        assert subscriber.get_new_cpu_stored_events(), (
-            f"No CPU stored events received on iteration {i}; "
-            "async offload may not have completed in time"
-        )
+        if subscriber is not None:
+            assert subscriber.get_new_cpu_stored_events(), (
+                f"No CPU stored events received on iteration {i}; "
+                "async offload may not have completed in time"
+            )
 
         # run generation again - this should trigger loading from CPU
         start_time = time.time()
@@ -163,84 +190,105 @@ def _latency_test(llm: LLM, subscriber: MockSubscriber):
     assert num_times_cpu_better_than_cold >= 0.8 * num_tests
 
 
-def _accuracy_test(llm: LLM, subscriber: MockSubscriber):
+def _accuracy_test(llm: LLM, subscriber: MockSubscriber | None):
     sampling_params = SamplingParams(max_tokens=1)
-    cpu_block_size = (
-        llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config[
-            "block_size"
-        ]
+    extra_config = (
+        llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config
     )
-
-    subscriber.get_new_cpu_stored_events()
-
-    # prepend prompt to be cpu block aligned
+    cpu_block_size = extra_config.get("block_size")
+    if cpu_block_size is None:
+        # No custom offloaded block_size: offloaded blocks match GPU blocks.
+        # Use the hash block_size (cache_config.block_size) for alignment.
+        cpu_block_size = llm.llm_engine.vllm_config.cache_config.block_size
+
+    if subscriber is not None:
+        subscriber.get_new_cpu_stored_events()
+
+    # Pad prompt so its token count is a multiple of cpu_block_size.
+    # Use the tokenizer directly to avoid expensive llm.generate() calls.
+    tokenizer = llm.get_tokenizer()
     prompt = "Let's count to 10. One, two, three, four,"
-    while (
-        len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % cpu_block_size
-        != 0
-    ):
+    while len(tokenizer.encode(prompt)) % cpu_block_size != 0:
         prompt = ". " + prompt
 
-    assert subscriber.get_new_cpu_stored_events()
+    # Seed the CPU cache with the prompt.
+    llm.generate(prompt, sampling_params, use_tqdm=False)
 
-    test_count = 100
-    success_count = 0
-    for i in range(test_count):
-        if (
-            llm.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].text
-            == " five"
-        ):
-            success_count += 1
+    if subscriber is not None:
+        assert subscriber.get_new_cpu_stored_events()
 
+    test_count = 20
+    results = llm.generate([prompt] * test_count, sampling_params, use_tqdm=False)
+    success_count = sum(1 for r in results if r.outputs[0].text == " five")
     assert success_count >= 0.5 * test_count
 
 
-@pytest.mark.parametrize("cpu_block_size", CPU_BLOCK_SIZES)
-@pytest.mark.parametrize("attn_backend", ATTN_BACKENDS)
-def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None:
+@pytest.mark.parametrize("model, attn_backend, cpu_block_size, uses_hma", MODEL_PARAMS)
+def test_cpu_offloading(
+    model: str,
+    attn_backend: str | None,
+    cpu_block_size: int | None,
+    uses_hma: bool,
+) -> None:
     """
     Tests OffloadingConnector with CPUOffloadingSpec.
     """
-
     # configure OffloadingConnector (spec_name=CPUOffloadingSpec by default)
+    extra_config: dict = {"cpu_bytes_to_use": 500 << 20}
+    if cpu_block_size is not None:
+        extra_config["block_size"] = cpu_block_size
     kv_transfer_config = KVTransferConfig(
         kv_connector="OffloadingConnector",
         kv_role="kv_both",
-        kv_connector_extra_config={
-            "cpu_bytes_to_use": 500 << 20,
-            "block_size": cpu_block_size,
-        },
+        kv_connector_extra_config=extra_config,
     )
 
-    port: int
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.bind(("0.0.0.0", 0))
-        port = s.getsockname()[1]
-
-    events_endpoint = f"tcp://*:{port}"
-    kv_events_config = KVEventsConfig(
-        enable_kv_cache_events=True,
-        publisher="zmq",
-        endpoint=events_endpoint,
-        topic="test",
-    )
+    # KV events are incompatible with HMA (setting kv_events_config
+    # would force HMA off), so only enable them for non-HMA models.
+    subscriber: MockSubscriber | None = None
+    kv_events_config: KVEventsConfig | None = None
+    if not uses_hma:
+        port: int
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("0.0.0.0", 0))
+            port = s.getsockname()[1]
+
+        events_endpoint = f"tcp://*:{port}"
+        kv_events_config = KVEventsConfig(
+            enable_kv_cache_events=True,
+            publisher="zmq",
+            endpoint=events_endpoint,
+            topic="test",
+        )
+
+    # Attention-free / hybrid models disable prefix caching by default
+    # (ModelConfig.is_prefix_caching_supported returns False).  Without it,
+    # mamba_block_size falls back to max_model_len, making GPU blocks too
+    # large for any reasonable offloaded block_size.  Force-enable it.
+    force_prefix_caching = uses_hma
 
     llm = LLM(
-        model="meta-llama/Llama-3.2-1B-Instruct",
+        model=model,
+        max_model_len=4096,
         gpu_memory_utilization=0.5,
         kv_events_config=kv_events_config,
         kv_transfer_config=kv_transfer_config,
-        attention_config={"backend": attn_backend},
+        **({"attention_config": {"backend": attn_backend}} if attn_backend else {}),
+        # HMA models need explicit opt-in when kv_transfer_config is set
+        **({"disable_hybrid_kv_cache_manager": False} if uses_hma else {}),
+        **({"enable_prefix_caching": True} if force_prefix_caching else {}),
         # ROCm: batch size 1 to reduce variability
         **({"max_num_seqs": 1} if current_platform.is_rocm() else {}),
     )
 
-    events_endpoint = events_endpoint.replace("*", "127.0.0.1")
-    subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic)
+    if kv_events_config is not None:
+        events_endpoint = events_endpoint.replace("*", "127.0.0.1")
+        subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic)
 
     try:
         _latency_test(llm, subscriber)
         _accuracy_test(llm, subscriber)
     finally:
-        subscriber.close()
+        if subscriber is not None:
+            subscriber.close()
         del llm
diff --git a/tests/v1/kv_connector/unit/test_tp_mapping.py b/tests/v1/kv_connector/unit/test_tp_mapping.py
new file mode 100644
index 000000000000..e57244a31f79
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_tp_mapping.py
@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for TP mapping and transfer plan utilities.
+
+These tests verify that TP mapping produces correct outputs
+(source ranks, split handles, desc IDs).
+No GPU or NIXL required.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from vllm.distributed.kv_transfer.kv_connector.v1.nixl.tp_mapping import (
+    TPMapping,
+    compute_tp_mapping,
+)
+from vllm.distributed.kv_transfer.kv_connector.v1.nixl.worker import (
+    NixlConnectorWorker,
+)
+from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
+
+# ======================================================================
+# Test fixtures / helpers
+# ======================================================================
+
+
+def _compute_mapping(
+    tp_rank: int = 0,
+    tp_size: int = 1,
+    remote_tp_size: int = 1,
+    is_mla: bool = False,
+    num_kv_heads: int = 8,
+    group_spec_types: tuple[type, ...] = (FullAttentionSpec,),
+) -> TPMapping:
+    return compute_tp_mapping(
+        tp_rank=tp_rank,
+        tp_size=tp_size,
+        remote_tp_size=remote_tp_size,
+        is_mla=is_mla,
+        total_num_kv_heads=num_kv_heads,
+        group_spec_types=group_spec_types,
+    )
+
+
+# ======================================================================
+# TP mapping structure tests
+# ======================================================================
+
+
+class TestTPMappingStructure:
+    def test_source_ranks_homogeneous(self):
+        m = _compute_mapping(tp_size=2, tp_rank=1, remote_tp_size=2)
+        assert m.all_source_ranks == (1,)
+
+    def test_source_ranks_d_gt_p(self):
+        m = _compute_mapping(tp_size=4, tp_rank=2, remote_tp_size=2)
+        assert m.all_source_ranks == (1,)
+
+    def test_source_ranks_p_gt_d(self):
+        m = _compute_mapping(tp_size=1, tp_rank=0, remote_tp_size=2)
+        assert m.all_source_ranks == (0, 1)
+
+
+# ======================================================================
+# Split handle tests
+# ======================================================================
+
+
+def _make_mock_worker_for_splits(group_spec_types):
+    """Build a mock NixlConnectorWorker with _group_spec_types for split tests."""
+    worker = object.__new__(NixlConnectorWorker)
+    worker._group_spec_types = group_spec_types
+    return worker
+
+
+class TestBuildSrcSplitHandles:
+    @pytest.mark.parametrize("remote_tp_size", [2, 4])
+    def test_build_src_split_handles(self, remote_tp_size):
+        tp_rank = 0
+        tp_size = 1
+
+        plan = _compute_mapping(
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+            remote_tp_size=remote_tp_size,
+        )
+
+        worker = _make_mock_worker_for_splits((FullAttentionSpec,))
+        src_blocks_data = [(0x2000 + i * 1024, 1024, 0) for i in range(8)]
+        num_descs = len(src_blocks_data)
+        splits = list(
+            worker._build_local_splits_from_plan(
+                plan,
+                src_blocks_data,
+                num_descs,
+            )
+        )
+
+        assert len(splits) == remote_tp_size
+        for handle in splits:
+            assert len(handle) == len(src_blocks_data)
+            for _, length, _ in handle:
+                assert length == 1024 // remote_tp_size
+
+
+class TestMambaPlanSplitHandles:
+    """Verify split handles for Mamba with FA/SSM distinction."""
+
+    def test_fa_and_ssm_different_split_factors(self):
+        """Section 0 split by num_attn_reads, section 1 by abs_tp."""
+        fa_readers = (0,)
+        ssm_readers = (0, 1)
+        plan = TPMapping(
+            source_ranks_per_group=(fa_readers, ssm_readers),
+            all_source_ranks=(0, 1),
+            rank_to_attention_slot={0: 0, 1: 0},
+            rank_offset_factor=0,
+        )
+
+        worker = _make_mock_worker_for_splits((FullAttentionSpec, MambaSpec))
+        # 2 FA descs + 1 SSM desc
+        src_blocks_data = [
+            (1000, 200, 0),  # FA desc 0
+            (2000, 200, 0),  # FA desc 1
+            (3000, 400, 0),  # SSM desc 0
+        ]
+
+        splits = list(worker._build_local_splits_from_plan(plan, src_blocks_data, 2))
+
+        assert len(splits) == 2  # 2 source ranks
+
+        # Rank 0 (FA source, p_idx=0):
+        # FA: chunk=200//1=200, slot=0 → (1000, 200, 0), (2000, 200, 0)
+        # SSM: chunk=400//2=200, idx=0 → (3000, 200, 0)
+        assert splits[0] == [(1000, 200, 0), (2000, 200, 0), (3000, 200, 0)]
+
+        # Rank 1 (not FA source, p_idx=1):
+        # FA: chunk=200//1=200, slot=0 (skip_fa) → (1000, 200, 0), (2000, 200, 0)
+        # SSM: chunk=400//2=200, idx=1 → (3200, 200, 0)
+        assert splits[1] == [(1000, 200, 0), (2000, 200, 0), (3200, 200, 0)]
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 5f0036807b0c..8e4e1cae0676 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -24,6 +24,7 @@
     KVConnectorBase_V1,
     KVConnectorMetadata,
     KVConnectorRole,
+    KVConnectorWorkerMetadata,
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import (  # noqa
     ExampleConnector,
@@ -102,6 +103,7 @@ def create_vllm_config(
     kv_load_failure_policy: Literal["recompute", "fail"] = "fail",
     kv_connector: str = "NixlConnector",
     kv_role: str = "kv_both",
+    disable_hybrid_kv_cache_manager: bool | None = None,
 ) -> VllmConfig:
     """Initialize VllmConfig For Testing."""
     model_config = ModelConfig(
@@ -117,6 +119,7 @@ def create_vllm_config(
         max_model_len=max_model_len,
         enable_chunked_prefill=enable_chunked_prefill,
         is_encoder_decoder=model_config.is_encoder_decoder,
+        disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
     )
     # Cache config, optionally force APC
     cache_config = CacheConfig(
@@ -249,6 +252,7 @@ def create_model_runner_output(
     invalid_block_ids: set[int] | None = None,
     use_eos: bool = False,
     token_id: int = 0,
+    kv_connector_worker_meta: KVConnectorWorkerMetadata | None = None,
 ) -> ModelRunnerOutput:
     """Make dummy model runner output for testing."""
 
@@ -266,11 +270,13 @@ def create_model_runner_output(
             finished_sending is None
             and finished_recving is None
             and invalid_block_ids is None
+            and kv_connector_worker_meta is None
         )
         else KVConnectorOutput(
             finished_sending=finished_sending,
             finished_recving=finished_recving,
             invalid_block_ids=invalid_block_ids or set(),
+            kv_connector_worker_meta=kv_connector_worker_meta,
         )
     )
 
diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/cpu/test_gpu_worker.py
similarity index 98%
rename from tests/v1/kv_offload/test_cpu_gpu.py
rename to tests/v1/kv_offload/cpu/test_gpu_worker.py
index db851edbccbb..e4ed635b9b70 100644
--- a/tests/v1/kv_offload/test_cpu_gpu.py
+++ b/tests/v1/kv_offload/cpu/test_gpu_worker.py
@@ -9,14 +9,15 @@
 
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import set_random_seed
-from vllm.v1.kv_offload.cpu.shared_offload_region import SharedOffloadRegion
-from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
-from vllm.v1.kv_offload.spec import (
+from vllm.v1.kv_offload.base import (
     CanonicalKVCacheRef,
     CanonicalKVCaches,
     CanonicalKVCacheTensor,
+    GPULoadStoreSpec,
 )
-from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers
+from vllm.v1.kv_offload.cpu.common import CPULoadStoreSpec
+from vllm.v1.kv_offload.cpu.gpu_worker import CpuGpuOffloadingHandlers
+from vllm.v1.kv_offload.cpu.shared_offload_region import SharedOffloadRegion
 
 NUM_GPU_BLOCKS = [64]
 NUM_CPU_BLOCKS = [256]
diff --git a/tests/v1/kv_offload/test_cpu_manager.py b/tests/v1/kv_offload/cpu/test_manager.py
similarity index 98%
rename from tests/v1/kv_offload/test_cpu_manager.py
rename to tests/v1/kv_offload/cpu/test_manager.py
index 733f9bf519e5..e043590a4184 100644
--- a/tests/v1/kv_offload/test_cpu_manager.py
+++ b/tests/v1/kv_offload/cpu/test_manager.py
@@ -6,7 +6,7 @@
 import numpy as np
 import pytest
 
-from vllm.v1.kv_offload.abstract import (
+from vllm.v1.kv_offload.base import (
     LoadStoreSpec,
     OffloadingEvent,
     OffloadKey,
@@ -14,9 +14,9 @@
     ReqContext,
     make_offload_key,
 )
+from vllm.v1.kv_offload.cpu.common import CPULoadStoreSpec
 from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager
 from vllm.v1.kv_offload.cpu.policies.arc import ARCCachePolicy
-from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
 from vllm.v1.kv_offload.reuse_manager import FilterReusedOffloadingManager
 
 
@@ -163,9 +163,9 @@ def test_cpu_manager():
         ),
     )
 
-    # lookup [1, 2] -> not ready
-    assert cpu_manager.lookup(to_key(1), _EMPTY_REQ_CTX) is False
-    assert cpu_manager.lookup(to_key(2), _EMPTY_REQ_CTX) is False
+    # lookup [1, 2] -> write in-flight, not yet ready
+    assert cpu_manager.lookup(to_key(1), _EMPTY_REQ_CTX) is None
+    assert cpu_manager.lookup(to_key(2), _EMPTY_REQ_CTX) is None
 
     # no events so far
     assert list(cpu_manager.take_events()) == []
@@ -296,9 +296,9 @@ def test_basic(self):
             ),
         )
 
-        # lookup [1, 2] -> not ready
-        assert cpu_manager.lookup(to_key(1), _EMPTY_REQ_CTX) is False
-        assert cpu_manager.lookup(to_key(2), _EMPTY_REQ_CTX) is False
+        # lookup [1, 2] -> write in-flight, not yet ready
+        assert cpu_manager.lookup(to_key(1), _EMPTY_REQ_CTX) is None
+        assert cpu_manager.lookup(to_key(2), _EMPTY_REQ_CTX) is None
 
         # no events so far
         assert list(cpu_manager.take_events()) == []
diff --git a/tests/v1/kv_offload/test_shared_offload_region.py b/tests/v1/kv_offload/cpu/test_shared_offload_region.py
similarity index 100%
rename from tests/v1/kv_offload/test_shared_offload_region.py
rename to tests/v1/kv_offload/cpu/test_shared_offload_region.py
diff --git a/tests/v1/kv_offload/test_worker.py b/tests/v1/kv_offload/test_worker.py
index fbdac5f9dc7c..b291fcf1b857 100644
--- a/tests/v1/kv_offload/test_worker.py
+++ b/tests/v1/kv_offload/test_worker.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.v1.kv_offload.abstract import LoadStoreSpec
+from vllm.v1.kv_offload.base import LoadStoreSpec
 from vllm.v1.kv_offload.worker.worker import (
     OffloadingHandler,
     OffloadingWorker,
diff --git a/tests/v1/logits_processors/test_correctness.py b/tests/v1/logits_processors/test_correctness.py
index 9ee6a70abe4c..1326b96346a7 100644
--- a/tests/v1/logits_processors/test_correctness.py
+++ b/tests/v1/logits_processors/test_correctness.py
@@ -30,10 +30,13 @@
     MinPLogitsProcessor,
     MinTokensLogitsProcessor,
     MoveDirectionality,
-    ThinkingTokenBudgetLogitsProcessor,
     build_logitsprocs,
 )
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.sample.thinking_budget_state import (
+    ThinkingBudgetStateHolder,
+    maybe_create_thinking_budget_state_holder,
+)
 
 PIN_MEMORY_AVAILABLE = is_pin_memory_available()
 MAX_NUM_REQS = 256
@@ -48,8 +51,10 @@
 MIN_TOKENS_LEN_THRESHOLD = 5
 REQS_PER_LOGITPROC = 50
 STR_NO_LOGITPROC = "none"
+# Thinking budget uses ``ThinkingBudgetStateHolder`` (not a logits processor).
+STR_THINKING_BUDGET = "thinking_budget"
 
-# ThinkingTokenBudgetLogitsProcessor testing constants
+# Thinking token budget testing constants
 THINKING_TOKEN_BUDGET = 5
 THINK_START_TOKEN_ID = 999
 THINK_END_TOKEN_ID = 998
@@ -80,15 +85,8 @@ def __init__(self, workload_index: int, logitproc_type: LogitprocType):
         if num_tokens > 0:
             # Use diverse random tokens
             self.out_tokens = [random.randint(1, 950) for _ in range(num_tokens)]
-            # Set first token for ThinkingTokenBudget testing
-            is_thinking_processor = (
-                logitproc_type is ThinkingTokenBudgetLogitsProcessor
-                or (
-                    hasattr(logitproc_type, "__name__")
-                    and logitproc_type.__name__ == "ThinkingTokenBudgetLogitsProcessor"
-                )
-            )
-            if is_thinking_processor:
+            # Think-start seed for ``STR_THINKING_BUDGET`` rows.
+            if logitproc_type == STR_THINKING_BUDGET:
                 self.out_tokens[0] = THINK_START_TOKEN_ID
         else:
             self.out_tokens = []
@@ -102,7 +100,7 @@ def __str__(self):
 
 
 class MockReasoningConfig:
-    """Mock reasoning config for testing ThinkingTokenBudgetLogitsProcessor."""
+    """Minimal reasoning config for ``ThinkingBudgetStateHolder`` tests."""
 
     reasoning_start_token_ids = [THINK_START_TOKEN_ID]
     reasoning_end_token_ids = [THINK_END_TOKEN_ID]
@@ -137,6 +135,18 @@ def _generate_fake_sampling_metadata(
         is_pin_memory=PIN_MEMORY_AVAILABLE,
         is_pooling_model=False,
     )
+    num_spec = (
+        vllm_config.speculative_config.num_speculative_tokens
+        if vllm_config.speculative_config
+        else 0
+    )
+    thinking_holder = maybe_create_thinking_budget_state_holder(
+        vllm_config.reasoning_config,
+        vllm_config.scheduler_config.max_num_seqs,
+        num_spec,
+        device,
+        PIN_MEMORY_AVAILABLE,
+    )
     fake_sampling_metadata = SamplingMetadata(
         temperature=torch.full((batch_size,), 0.0),
         all_greedy=True,
@@ -156,6 +166,7 @@ def _generate_fake_sampling_metadata(
         allowed_token_ids_mask=None,
         bad_words_token_ids={},
         logitsprocs=logitsprocs,
+        thinking_budget_state_holder=thinking_holder,
     )
     return fake_sampling_metadata
 
@@ -187,7 +198,7 @@ def _sampling_params_from_logitproc(logitproc_type: LogitprocType) -> SamplingPa
 
 def _generate_mixed_logitsprocs_batch_params(
     reqs_per_logitproc: int,
-    logitsprocs_types: list[str],
+    logitsprocs_types: list[LogitprocType],
 ) -> list[LogitsProcsRequestParams]:
     """Define key params for a batch of requests with a different
     logitproc enabled per request.
@@ -450,23 +461,21 @@ def _thinking_budget_validate(
     request_params: LogitsProcsRequestParams,
     step_idx: int,
 ) -> None:
-    """Validate thinking token budget processor behavior"""
-    # Get the ThinkingTokenBudgetLogitsProcessor instance
-    tb_processor: ThinkingTokenBudgetLogitsProcessor = next(
-        test_fakes.get_logitsprocs_by_cls(ThinkingTokenBudgetLogitsProcessor)
-    )
+    """Validate ``ThinkingBudgetStateHolder`` thinking-budget behavior.
 
-    # Get current request state
-    state = tb_processor._state.get(batch_index)
+    State is keyed by **batch slot** (same index space as logits rows), matching
+    ``sync_batch`` / sampler integration (see PR #34668 discussion).
+    """
+    holder = test_fakes.sampling_metadata.thinking_budget_state_holder
+    assert holder is not None
+    state = holder._state.get(batch_index)
     params = request_params.params
 
-    # Validate thinking token budget configuration
     if hasattr(params, "thinking_token_budget") and params.thinking_token_budget:
-        # State should exist for requests with thinking_token_budget
         if state is None:
             _raise_error_invalid(
                 msg_suffix=(
-                    f"Expected state for batch {batch_index} "
+                    f"Expected holder state for batch slot {batch_index} "
                     f"with thinking_token_budget={params.thinking_token_budget}"
                 ),
                 batch_index=batch_index,
@@ -474,10 +483,8 @@ def _thinking_budget_validate(
                 step_idx=step_idx,
             )
 
-        # Validate budget matches what was set
         expected_budget = params.thinking_token_budget
         actual_budget = state["thinking_token_budget"]
-
         if actual_budget != expected_budget:
             _raise_error_invalid(
                 msg_suffix=(
@@ -488,13 +495,9 @@ def _thinking_budget_validate(
                 step_idx=step_idx,
             )
 
-        # Check if we're in thinking mode and validate token counting
         output_tokens = request_params.out_tokens
-
-        # Find if thinking has started in output tokens
+        start_tokens = holder.think_start_token_ids
         thinking_started = False
-        start_tokens = tb_processor.reasoning_start_token_ids
-
         if len(start_tokens) > 0:
             for i in range(len(output_tokens) - len(start_tokens) + 1):
                 if output_tokens[i : i + len(start_tokens)] == start_tokens:
@@ -502,61 +505,42 @@ def _thinking_budget_validate(
                     break
 
         if thinking_started:
-            # If budget is exceeded, validate end token forcing
             think_count = state["think_count"]
             budget = state["thinking_token_budget"]
+            if think_count >= budget and not state["in_end"]:
+                _raise_error_invalid(
+                    msg_suffix=(
+                        f"Budget exceeded ({think_count} >= {budget}) but "
+                        "in_end is false"
+                    ),
+                    batch_index=batch_index,
+                    request_params=request_params,
+                    step_idx=step_idx,
+                )
 
-            if think_count >= budget:
-                if not state["in_end"]:
+            end_tokens = holder.think_end_token_ids
+            if (
+                think_count >= budget
+                and state["in_end"]
+                and len(end_tokens) > 0
+                and holder.has_tracked_requests()
+            ):
+                expected_end_token_id = end_tokens[
+                    min(state["end_count"], len(end_tokens) - 1)
+                ]
+                # Holder bumps forced vocab positions to 1e9 (does not -inf others).
+                forced_logit = float(logits_new[batch_index, expected_end_token_id])
+                if forced_logit < 1.0e8:
                     _raise_error_invalid(
                         msg_suffix=(
-                            f"Budget exceeded ({think_count} >= "
-                            f"{budget}) but not "
-                            "forcing end tokens"
+                            f"Expected forced end token {expected_end_token_id} "
+                            f"with large logit, got {forced_logit}"
                         ),
                         batch_index=batch_index,
                         request_params=request_params,
                         step_idx=step_idx,
                     )
 
-                # Validate that only end tokens are allowed
-                end_tokens = tb_processor.reasoning_end_token_ids
-                if len(end_tokens) > 0:
-                    expected_end_token_id = end_tokens[
-                        min(state["end_count"], len(end_tokens) - 1)
-                    ]
-
-                    # Check logits masking
-                    batch_logits = logits_new[batch_index]
-                    for token_id in range(len(batch_logits)):
-                        logit_value = batch_logits[token_id]
-
-                        if token_id == expected_end_token_id:
-                            # End token should not be masked
-                            if logit_value == -float("inf"):
-                                _raise_error_invalid(
-                                    msg_suffix=(
-                                        f"End token {token_id} should not be "
-                                        "masked but is"
-                                    ),
-                                    batch_index=batch_index,
-                                    request_params=request_params,
-                                    step_idx=step_idx,
-                                )
-                        else:
-                            # All other tokens should be masked when forcing end
-                            if logit_value != -float("inf"):
-                                _raise_error_invalid(
-                                    msg_suffix=(
-                                        f"Token {token_id} should be masked "
-                                        f"when forcing end tokens, but "
-                                        f"logit={logit_value}"
-                                    ),
-                                    batch_index=batch_index,
-                                    request_params=request_params,
-                                    step_idx=step_idx,
-                                )
-
 
 def _none_validate(
     test_fakes: LogitsprocsTestFakes,
@@ -604,7 +588,7 @@ class LogitsprocTestHelpers(NamedTuple):
     MinTokensLogitsProcessor: LogitsprocTestHelpers(
         gen_request_fxn=_min_tokens_params, eval_fxn=_min_tokens_validate
     ),
-    ThinkingTokenBudgetLogitsProcessor: LogitsprocTestHelpers(
+    STR_THINKING_BUDGET: LogitsprocTestHelpers(
         gen_request_fxn=_thinking_budget_params, eval_fxn=_thinking_budget_validate
     ),
 }
@@ -614,20 +598,17 @@ def _get_test_cases() -> list[list[str]]:
     """Each test case is a set of logitsprocs"""
     logitsprocs_types = list(logitsprocs_test_mapping.keys())
 
-    # Isolate ThinkingTokenBudgetLogitsProcessor from all other processors
-    # to avoid unexpected modification of logits interference
-    thinking_processor = ThinkingTokenBudgetLogitsProcessor
+    # Isolate thinking-budget handling from other processors to avoid cross-talk.
+    thinking_id: LogitprocType = STR_THINKING_BUDGET
     other_processors = [
-        p
-        for p in logitsprocs_types
-        if p != STR_NO_LOGITPROC and p != thinking_processor
+        p for p in logitsprocs_types if p != STR_NO_LOGITPROC and p != thinking_id
     ]
 
     return (
         [[STR_NO_LOGITPROC]]
         + [[logitproc_type, STR_NO_LOGITPROC] for logitproc_type in other_processors]
         + [other_processors]
-        + [[thinking_processor]]
+        + [[thinking_id]]
     )
 
 
@@ -802,12 +783,23 @@ def _assert_valid(
         )
 
 
+def _slot_outputs_for_metadata(
+    persistent_batch: list[LogitsProcsRequestParams], pad_len: int
+) -> list[list[int]]:
+    """Per-batch-slot output token ids aligned with ``SamplingMetadata`` rows."""
+    rows: list[list[int]] = [[] for _ in range(pad_len)]
+    for i, req in enumerate(persistent_batch):
+        if i < pad_len:
+            rows[i] = list(req.out_tokens)
+    return rows
+
+
 @create_new_process_for_each_test()
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("reqs_per_logitproc", [REQS_PER_LOGITPROC])
 @pytest.mark.parametrize("logitsprocs_under_test", _get_test_cases())
 def test_logitsprocs(
-    device: str, reqs_per_logitproc: int, logitsprocs_under_test: list[str]
+    device: str, reqs_per_logitproc: int, logitsprocs_under_test: list[LogitprocType]
 ):
     random.seed(40)
     torch.set_default_device(device)
@@ -855,9 +847,10 @@ def test_logitsprocs(
         # Apply fake batch update to logitsprocs
         fake_update_logitsprocs_state(test_fakes, batch_update)
 
-        # Emulate application of logits processors in engine
+        # Emulate application of logits processors + thinking holder (sampler order).
         slice_idxs = [req.workload_index for req in persistent_batch]
-        logits_w_lp = fake_apply_logitsprocs(test_fakes, slice_idxs).cpu()
+        slot_rows = _slot_outputs_for_metadata(persistent_batch, workload_size)
+        logits_w_lp = fake_apply_logitsprocs(test_fakes, slice_idxs, slot_rows).cpu()
 
         _assert_valid(
             batch_size=batch_size,
@@ -869,3 +862,263 @@ def test_logitsprocs(
         )
 
         step_idx += 1
+
+
+class MockReasoningNoEndTokens:
+    """Reasoning config with no end token ids (disables enforcement in holder)."""
+
+    reasoning_start_token_ids = [THINK_START_TOKEN_ID]
+    reasoning_end_token_ids: list[int] = []
+
+
+def test_maybe_create_thinking_budget_holder_without_reasoning():
+    cfg = VllmConfig()
+    assert cfg.reasoning_config is None
+    assert (
+        maybe_create_thinking_budget_state_holder(
+            None,
+            cfg.scheduler_config.max_num_seqs,
+            0,
+            torch.device("cpu"),
+            False,
+        )
+        is None
+    )
+
+
+def test_thinking_budget_holder_has_tracked_after_sync_add():
+    vc = VllmConfig()
+    vc.reasoning_config = MockReasoningConfig()
+    h = ThinkingBudgetStateHolder(
+        vc.reasoning_config,
+        vc.scheduler_config.max_num_seqs,
+        0,
+        torch.device("cpu"),
+        False,
+    )
+    assert not h.has_tracked_requests()
+    h.sync_batch(
+        BatchUpdate(
+            batch_size=1,
+            removed=(),
+            added=[
+                (
+                    0,
+                    SamplingParams(thinking_token_budget=3),
+                    None,
+                    [THINK_START_TOKEN_ID],
+                )
+            ],
+            moved=(),
+        )
+    )
+    assert h.has_tracked_requests()
+    assert h._state[0]["thinking_token_budget"] == 3
+
+
+def test_thinking_budget_holder_sync_remove_clears_state():
+    vc = VllmConfig()
+    vc.reasoning_config = MockReasoningConfig()
+    h = ThinkingBudgetStateHolder(
+        vc.reasoning_config,
+        vc.scheduler_config.max_num_seqs,
+        0,
+        torch.device("cpu"),
+        False,
+    )
+    h.sync_batch(
+        BatchUpdate(
+            batch_size=1,
+            removed=(),
+            added=[
+                (
+                    0,
+                    SamplingParams(thinking_token_budget=3),
+                    None,
+                    [],
+                )
+            ],
+            moved=(),
+        )
+    )
+    assert h.has_tracked_requests()
+    h.sync_batch(BatchUpdate(batch_size=0, removed=(0,), added=(), moved=()))
+    assert not h.has_tracked_requests()
+
+
+def test_thinking_budget_holder_sync_add_without_budget_drops_row():
+    vc = VllmConfig()
+    vc.reasoning_config = MockReasoningConfig()
+    h = ThinkingBudgetStateHolder(
+        vc.reasoning_config,
+        vc.scheduler_config.max_num_seqs,
+        0,
+        torch.device("cpu"),
+        False,
+    )
+    h.sync_batch(
+        BatchUpdate(
+            batch_size=1,
+            removed=(),
+            added=[(0, SamplingParams(), None, [])],
+            moved=(),
+        )
+    )
+    assert not h.has_tracked_requests()
+
+
+def test_thinking_budget_holder_swap_exchanges_state():
+    vc = VllmConfig()
+    vc.reasoning_config = MockReasoningConfig()
+    h = ThinkingBudgetStateHolder(
+        vc.reasoning_config,
+        vc.scheduler_config.max_num_seqs,
+        0,
+        torch.device("cpu"),
+        False,
+    )
+    h.sync_batch(
+        BatchUpdate(
+            batch_size=2,
+            removed=(),
+            added=[
+                (
+                    0,
+                    SamplingParams(thinking_token_budget=3),
+                    None,
+                    [],
+                ),
+                (
+                    1,
+                    SamplingParams(thinking_token_budget=7),
+                    None,
+                    [],
+                ),
+            ],
+            moved=(),
+        )
+    )
+    b0, b1 = h._state[0]["thinking_token_budget"], h._state[1]["thinking_token_budget"]
+    h.sync_batch(
+        BatchUpdate(
+            batch_size=2,
+            removed=(),
+            added=(),
+            moved=[(0, 1, MoveDirectionality.SWAP)],
+        )
+    )
+    assert h._state[0]["thinking_token_budget"] == b1
+    assert h._state[1]["thinking_token_budget"] == b0
+
+
+def test_thinking_budget_holder_unidirectional_move():
+    vc = VllmConfig()
+    vc.reasoning_config = MockReasoningConfig()
+    h = ThinkingBudgetStateHolder(
+        vc.reasoning_config,
+        vc.scheduler_config.max_num_seqs,
+        0,
+        torch.device("cpu"),
+        False,
+    )
+    h.sync_batch(
+        BatchUpdate(
+            batch_size=2,
+            removed=(),
+            added=[
+                (
+                    1,
+                    SamplingParams(thinking_token_budget=4),
+                    None,
+                    [],
+                ),
+            ],
+            moved=(),
+        )
+    )
+    assert 1 in h._state and 0 not in h._state
+    h.sync_batch(
+        BatchUpdate(
+            batch_size=2,
+            removed=(),
+            added=(),
+            moved=[(1, 0, MoveDirectionality.UNIDIRECTIONAL)],
+        )
+    )
+    assert 0 in h._state and 1 not in h._state
+    assert h._state[0]["thinking_token_budget"] == 4
+
+
+def test_thinking_budget_holder_update_state_repeat_indices_last_row_wins():
+    vc = VllmConfig()
+    vc.reasoning_config = MockReasoningConfig()
+    h = ThinkingBudgetStateHolder(
+        vc.reasoning_config,
+        vc.scheduler_config.max_num_seqs,
+        0,
+        torch.device("cpu"),
+        False,
+    )
+    h.sync_batch(
+        BatchUpdate(
+            batch_size=1,
+            removed=(),
+            added=[
+                (
+                    0,
+                    SamplingParams(thinking_token_budget=5),
+                    None,
+                    [THINK_START_TOKEN_ID],
+                )
+            ],
+            moved=(),
+        )
+    )
+    out_lists = [[THINK_START_TOKEN_ID], [THINK_START_TOKEN_ID, 10, 11, 12, 13, 14]]
+    h.update_state(
+        out_lists,
+        None,
+        torch.tensor([0, 0], dtype=torch.long),
+    )
+    assert h._state[0]["output_tok_ids"] == out_lists[1]
+
+
+def test_thinking_budget_holder_spec_mode_tensor_layout():
+    h = ThinkingBudgetStateHolder(
+        MockReasoningConfig(),
+        8,
+        2,
+        torch.device("cpu"),
+        False,
+    )
+    assert h.in_spec_mode
+    assert h.mask.shape[0] == 8 * (2 + 1)
+
+
+def test_thinking_budget_holder_empty_end_tokens_disables_row():
+    vc = VllmConfig()
+    vc.reasoning_config = MockReasoningNoEndTokens()
+    h = ThinkingBudgetStateHolder(
+        vc.reasoning_config,
+        vc.scheduler_config.max_num_seqs,
+        0,
+        torch.device("cpu"),
+        False,
+    )
+    h.sync_batch(
+        BatchUpdate(
+            batch_size=1,
+            removed=(),
+            added=[
+                (
+                    0,
+                    SamplingParams(thinking_token_budget=5),
+                    None,
+                    [THINK_START_TOKEN_ID],
+                )
+            ],
+            moved=(),
+        )
+    )
+    h.update_state([[THINK_START_TOKEN_ID, 1]], None, None)
+    assert h._state[0]["thinking_token_budget"] == -1
diff --git a/tests/v1/logits_processors/test_custom_online.py b/tests/v1/logits_processors/test_custom_online.py
index 3dc6b8979015..93825c65bc92 100644
--- a/tests/v1/logits_processors/test_custom_online.py
+++ b/tests/v1/logits_processors/test_custom_online.py
@@ -120,12 +120,11 @@ async def client(server):
 
 
 @create_new_process_for_each_test()
-@pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
     [MODEL_NAME],
 )
-async def test_custom_logitsprocs(client: openai.AsyncOpenAI, model_name: str):
+def test_custom_logitsprocs(server, model_name: str):
     """Test custom logitsprocs when starting OpenAI server from CLI
 
     Launch vLLM OpenAI-compatible server, configured to load a custom logitproc
@@ -139,36 +138,45 @@ async def test_custom_logitsprocs(client: openai.AsyncOpenAI, model_name: str):
     token
     """
 
-    use_dummy_logitproc = True
-    for prompt in prompts:
-        # Build request arguments
-        request_keyword_args: dict[str, Any] = {
-            **api_keyword_args,
-        }
-        if use_dummy_logitproc:
-            # 50% of requests pass target_token custom arg
-            target_token = random.choice([128, 67])
-            # For requests which activate the dummy logitproc, choose one of
-            # two `target_token` values which are known not to be EOS tokens
-            request_keyword_args["extra_body"] = {
-                "vllm_xargs": {DUMMY_LOGITPROC_ARG: target_token}
-            }
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            **request_keyword_args,
-        )
+    import asyncio
 
-        if use_dummy_logitproc:
-            # Only for requests which activate dummy logitproc - validate that
-            # output token is repeated
-            choices: openai.types.CompletionChoice = batch.choices
-            toks = choices[0].logprobs.tokens
-            if not all([x == toks[0] for x in toks]):
-                raise AssertionError(f"Generated {toks} should all be {toks[0]}")
+    async def _async_main(srv, mn):
+        async with srv.get_async_client() as client:
+            await _run(client)
 
-        # Alternate whether to activate dummy logitproc for each request
-        use_dummy_logitproc = not use_dummy_logitproc
+    async def _run(client):
+        use_dummy_logitproc = True
+        for prompt in prompts:
+            # Build request arguments
+            request_keyword_args: dict[str, Any] = {
+                **api_keyword_args,
+            }
+            if use_dummy_logitproc:
+                # 50% of requests pass target_token custom arg
+                target_token = random.choice([128, 67])
+                # For requests which activate the dummy logitproc, choose one of
+                # two `target_token` values which are known not to be EOS tokens
+                request_keyword_args["extra_body"] = {
+                    "vllm_xargs": {DUMMY_LOGITPROC_ARG: target_token}
+                }
+            batch = await client.completions.create(
+                model=model_name,
+                prompt=prompt,
+                **request_keyword_args,
+            )
+
+            if use_dummy_logitproc:
+                # Only for requests which activate dummy logitproc - validate that
+                # output token is repeated
+                choices: openai.types.CompletionChoice = batch.choices
+                toks = choices[0].logprobs.tokens
+                if not all([x == toks[0] for x in toks]):
+                    raise AssertionError(f"Generated {toks} should all be {toks[0]}")
+
+            # Alternate whether to activate dummy logitproc for each request
+            use_dummy_logitproc = not use_dummy_logitproc
+
+    asyncio.run(_async_main(server, model_name))
 
 
 @pytest.mark.asyncio
diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py
index f08d9f684921..6bad1299b61e 100644
--- a/tests/v1/metrics/test_ray_metrics.py
+++ b/tests/v1/metrics/test_ray_metrics.py
@@ -1,13 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from unittest.mock import MagicMock
+
 import pytest
 import ray
 
 from vllm.config.model import ModelDType
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
-from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger
+from vllm.v1.metrics.ray_wrappers import (
+    RayCounterWrapper,
+    RayGaugeWrapper,
+    RayHistogramWrapper,
+    RayPrometheusMetric,
+    RayPrometheusStatLogger,
+)
 
 MODELS = [
     "distilbert/distilgpt2",
@@ -94,3 +102,148 @@ def test_sanitized_opentelemetry_name():
 
     # Test empty string
     assert RayPrometheusMetric._get_sanitized_opentelemetry_name("") == ""
+
+
+def _install_mock_metric(wrapper: RayPrometheusMetric) -> MagicMock:
+    """Swap the wrapper's underlying Ray metric for a MagicMock while
+    preserving the real metric's ``_tag_keys`` (labels() reads them to
+    validate arity)."""
+    real_metric = wrapper.metric
+    mock = MagicMock()
+    mock._tag_keys = real_metric._tag_keys
+    wrapper.metric = mock
+    return mock
+
+
+def test_ray_counter_labels_returns_independent_children():
+    """RayCounterWrapper.labels() must return distinct labeled children that
+    each carry their own tag set."""
+    base = RayCounterWrapper(
+        name="vllm_test_finish_reason",
+        documentation="",
+        labelnames=["reason"],
+    )
+
+    stop_child = base.labels("stop")
+    rep_child = base.labels("repetition")
+
+    assert stop_child is not rep_child
+    assert stop_child._tags["reason"] == "stop"
+    assert rep_child._tags["reason"] == "repetition"
+    # Mutating one child's tags must not leak into another.
+    stop_child._tags["reason"] = "mutated"
+    assert rep_child._tags["reason"] == "repetition"
+
+
+def test_ray_counter_inc_forwards_per_child_tags():
+    """.inc() on a labeled counter must forward that child's tags to the
+    underlying Ray metric (not rely on a shared set_default_tags)."""
+    wrapper = RayCounterWrapper(
+        name="vllm_test_counter_tag_forward",
+        documentation="",
+        labelnames=["reason"],
+    )
+    mock = _install_mock_metric(wrapper)
+
+    wrapper.labels("stop").inc()
+    wrapper.labels("repetition").inc(3)
+    wrapper.labels("stop").inc(0)  # zero increment must be a no-op.
+
+    # The zero-increment call should not reach the underlying metric.
+    assert mock.inc.call_count == 2
+    first, second = mock.inc.call_args_list
+    assert first.args == (1.0,)
+    assert first.kwargs["tags"]["reason"] == "stop"
+    assert second.args == (3,)
+    assert second.kwargs["tags"]["reason"] == "repetition"
+
+
+def test_ray_gauge_labels_returns_independent_children_and_forwards_tags():
+    wrapper = RayGaugeWrapper(
+        name="vllm_test_gauge_tag_forward",
+        documentation="",
+        labelnames=["kind"],
+    )
+    mock = _install_mock_metric(wrapper)
+
+    a = wrapper.labels("a")
+    b = wrapper.labels("b")
+    assert a is not b
+
+    a.set(1)
+    b.set(2)
+    assert mock.set.call_args_list[0].args == (1,)
+    assert mock.set.call_args_list[0].kwargs["tags"]["kind"] == "a"
+    assert mock.set.call_args_list[1].args == (2,)
+    assert mock.set.call_args_list[1].kwargs["tags"]["kind"] == "b"
+
+
+def test_ray_histogram_labels_returns_independent_children_and_forwards_tags():
+    wrapper = RayHistogramWrapper(
+        name="vllm_test_histogram_tag_forward",
+        documentation="",
+        labelnames=["bucket"],
+        buckets=[1.0, 2.0, 5.0],
+    )
+    mock = _install_mock_metric(wrapper)
+
+    x = wrapper.labels("x")
+    y = wrapper.labels("y")
+    assert x is not y
+
+    x.observe(0.5)
+    y.observe(4.0)
+    assert mock.observe.call_args_list[0].args == (0.5,)
+    assert mock.observe.call_args_list[0].kwargs["tags"]["bucket"] == "x"
+    assert mock.observe.call_args_list[1].args == (4.0,)
+    assert mock.observe.call_args_list[1].kwargs["tags"]["bucket"] == "y"
+
+
+def test_ray_counter_labels_accepts_non_string_label_values():
+    """RayPrometheusStatLogger passes ``str(idx)`` for engine indexes; this
+    covers the coercion path for any caller that passes a non-string label
+    value positionally."""
+    wrapper = RayCounterWrapper(
+        name="vllm_test_nonstr_label",
+        documentation="",
+        labelnames=["engine", "reason"],
+    )
+    child = wrapper.labels(0, "stop")
+    assert child._tags["engine"] == "0"
+    assert child._tags["reason"] == "stop"
+
+
+def test_ray_counter_labels_arity_validation():
+    wrapper = RayCounterWrapper(
+        name="vllm_test_arity",
+        documentation="",
+        labelnames=["a", "b"],
+    )
+    with pytest.raises(ValueError, match="Number of labels must match"):
+        wrapper.labels("only-one")
+
+
+def test_unlabeled_inc_carries_replica_id():
+    """Recording on an unlabeled metric must still pass ReplicaId — it's a
+    declared tag_key and Ray rejects updates that omit any declared key."""
+    wrapper = RayCounterWrapper(
+        name="vllm_test_unlabeled_replica_id",
+        documentation="",
+        labelnames=None,
+    )
+    mock = _install_mock_metric(wrapper)
+    wrapper.inc()
+    assert mock.inc.call_args.kwargs["tags"] == {"ReplicaId": ""}
+
+
+def test_double_labels_raises():
+    """labels() on an already-labeled child should raise, mirroring the
+    prometheus_client contract."""
+    wrapper = RayCounterWrapper(
+        name="vllm_test_double_labels",
+        documentation="",
+        labelnames=["reason"],
+    )
+    child = wrapper.labels("stop")
+    with pytest.raises(ValueError, match="already-labeled"):
+        child.labels("repetition")
diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py
index 28fb2931b229..460e0d685649 100644
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -33,11 +33,10 @@
 SAMPLE_PROMPT = BatchLogprobsComposition.SAMPLE_PROMPT
 
 # On ROCm, floating-point reductions in attention and GEMM kernels are
-# non-associative and sensitive to batch geometry. The ref LLM (no spec
-# decode, default scheduling) and the spec-decode LLM (chunked prefill,
-# different effective batch sizes) follow different reduction orders,
-# producing numerically divergent logprobs that get misattributed to
-# spec-decode incorrectness.
+# non-associative and sensitive to batch geometry. If the ref LLM and
+# spec-decode LLM use different scheduling or batch geometry, they can
+# follow different reduction orders and produce numerically divergent
+# logprobs that get misattributed to spec-decode incorrectness.
 #
 # Force LLM instances into an identical, deterministic execution
 # mode so the test isolates spec-decode correctness only:
@@ -1086,18 +1085,25 @@ def test_spec_decode_logprobs(
     )
 
     max_model_len = 256
-
-    # Run base LLM.
-    ref_llm = LLM(
-        model=model_name,
+    llm_kwargs = dict(
         max_logprobs=5,
         max_model_len=max_model_len,
         seed=42,
         logprobs_mode=logprobs_mode,
         gpu_memory_utilization=0.4,
+        # Force the same prefill chunking for both the base model and
+        # spec decode model so the comparison isolates spec decode.
+        enable_chunked_prefill=True,
+        max_num_batched_tokens=32,
         enable_prefix_caching=False,
         **ROCM_DETERMINISM_KWARGS,
     )
+
+    # Run base LLM.
+    ref_llm = LLM(
+        model=model_name,
+        **llm_kwargs,
+    )
     ref_results = ref_llm.generate(
         [prompt, prompt], [sampling_params, penalty_sampling_params]
     )
@@ -1117,16 +1123,7 @@ def test_spec_decode_logprobs(
     spec_llm = LLM(
         model_name,
         speculative_config=spec_config_with_len,
-        max_logprobs=5,
-        max_model_len=max_model_len,
-        seed=42,
-        logprobs_mode=logprobs_mode,
-        gpu_memory_utilization=0.4,
-        # Force prefill chunking
-        enable_chunked_prefill=True,
-        max_num_batched_tokens=32,
-        enable_prefix_caching=False,
-        **ROCM_DETERMINISM_KWARGS,
+        **llm_kwargs,
     )
     spec_results = spec_llm.generate(
         [prompt, prompt], [sampling_params, penalty_sampling_params]
diff --git a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py
index 23f1f1c1f98a..659577b754f6 100644
--- a/tests/v1/sample/test_topk_topp_sampler.py
+++ b/tests/v1/sample/test_topk_topp_sampler.py
@@ -13,6 +13,30 @@
 VOCAB_SIZE = 128 * 1024
 
 
+def _flashinfer_topk_topp_supported() -> bool:
+    """True iff the FlashInfer top-k/top-p sampler is usable on this host.
+
+    Mirrors the gate in `TopKTopPSampler.__init__`: CUDA + flashinfer
+    importable + GPU compute capability supported by the FlashInfer
+    backend.
+    """
+    if not current_platform.is_cuda():
+        return False
+    try:
+        import flashinfer  # noqa: F401
+
+        from vllm.v1.attention.backends.flashinfer import FlashInferBackend
+    except ImportError:
+        return False
+    capability = current_platform.get_device_capability()
+    if capability is None:
+        return False
+    return FlashInferBackend.supports_compute_capability(capability)
+
+
+FLASHINFER_TOPK_TOPP_SUPPORTED = _flashinfer_topk_topp_supported()
+
+
 @pytest.fixture(autouse=True)
 def reset_default_device():
     """
@@ -568,3 +592,280 @@ def test_mixed_neginf_and_normal_rows(self):
             finite_in = (logits[i] > float("-inf")).sum().item()
             if finite_in > 0:
                 assert kept > 0, f"Row {i}: no tokens kept"
+
+
+# =============================================================================
+# FlashInfer top-k/top-p robustness tests
+# =============================================================================
+
+
+@pytest.mark.skipif(
+    not FLASHINFER_TOPK_TOPP_SUPPORTED,
+    reason="FlashInfer top-k/top-p sampler requires CUDA "
+    "and a GPU with FlashInfer support.",
+)
+class TestFlashInferTopkToppRobustness:
+    """Robustness of FlashInfer top-k / top-p sampling to NaN / Inf logits.
+
+    The FlashInfer sampler is enabled by default on supported GPUs. A
+    single poisoned request (NaN / +Inf / -Inf in row 0) must not:
+
+    1. crash or hang the process;
+    2. produce out-of-range token ids (anything outside ``[0, vocab)``);
+    3. corrupt other batch rows — neighbours of a poisoned row must
+       still receive valid token ids (regression for cross-row
+       corruption in a DP batch where one bad request would otherwise
+       poison its peers).
+
+    The reference is "no crash + valid token ids", not bit-exact equality
+    against the PyTorch-native path.
+    """
+
+    BATCH = 8
+    VOCAB = 32768
+    TOPK = 50
+    TOPP = 0.9
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        torch.set_default_device(DEVICE_TYPE)
+        self.generator = Generator(device=DEVICE_TYPE).manual_seed(1234)
+
+    def _make_logits(self, pattern: str) -> torch.Tensor:
+        """Build (BATCH, VOCAB) logits with `pattern` applied to row 0
+        (rows 1..B-1 stay clean so we can detect cross-row corruption)."""
+        logits = (
+            torch.randn(
+                self.BATCH,
+                self.VOCAB,
+                generator=self.generator,
+                dtype=torch.float32,
+            )
+            * 5.0
+        )
+        if pattern == "clean":
+            return logits
+        if pattern == "nan_one_row":
+            logits[0, :] = float("nan")
+        elif pattern == "nan_few":
+            # Scatter 16 NaNs across row 0, keep the rest finite.
+            idx = torch.randperm(self.VOCAB, generator=self.generator)[:16]
+            logits[0, idx] = float("nan")
+        elif pattern == "nan_at_top":
+            # Poison the top-32 highest-scoring positions of row 0 — worst
+            # case for top-k since these are exactly the tokens that would
+            # otherwise be selected. Use argsort instead of topk to avoid
+            # a known compute-sanitizer false positive in mbtopk.
+            top_idx = logits[0].argsort(descending=True)[:32]
+            logits[0, top_idx] = float("nan")
+        elif pattern == "nan_all_rows":
+            logits[:, :] = float("nan")
+        elif pattern == "pos_inf_one_row":
+            logits[0, :] = float("inf")
+        elif pattern == "neg_inf_one_row":
+            logits[0, :] = float("-inf")
+        elif pattern == "mixed_inf_nan":
+            assert self.BATCH >= 3
+            logits[0, :] = float("nan")
+            logits[1, :] = float("inf")
+            logits[2, :] = float("-inf")
+        elif pattern == "degenerate_flat":
+            logits[:, :] = 1.0
+        else:
+            raise ValueError(f"unknown pattern: {pattern}")
+        return logits
+
+    def _check_tokens(self, tokens: torch.Tensor, ctx: str):
+        assert tokens.dim() == 1, f"{ctx}: expected 1-D output, got {tokens.shape}"
+        assert tokens.shape[0] == self.BATCH, (
+            f"{ctx}: expected batch size {self.BATCH}, got {tokens.shape[0]}"
+        )
+        ids = tokens.tolist()
+        min_id, max_id = min(ids), max(ids)
+        assert 0 <= min_id < self.VOCAB and 0 <= max_id < self.VOCAB, (
+            f"{ctx}: token id(s) outside [0, {self.VOCAB}): min={min_id}, max={max_id}"
+        )
+
+    @pytest.mark.parametrize(
+        "pattern",
+        [
+            "clean",
+            "nan_one_row",
+            "nan_few",
+            "nan_at_top",
+            "nan_all_rows",
+            "pos_inf_one_row",
+            "neg_inf_one_row",
+            "mixed_inf_nan",
+            "degenerate_flat",
+        ],
+    )
+    @pytest.mark.parametrize("path", ["topk_only", "topp_only", "topk_topp"])
+    def test_flashinfer_handles_pathological_logits(self, pattern: str, path: str):
+        """flashinfer_sample must return valid ids even on poisoned logits.
+
+        Direct call into ``flashinfer_sample`` — exactly the code path
+        ``TopKTopPSampler.forward_cuda`` takes when FI is enabled.
+        """
+        from vllm.v1.sample.ops.topk_topp_sampler import flashinfer_sample
+
+        logits = self._make_logits(pattern)
+        k = (
+            torch.full(
+                (self.BATCH,),
+                self.TOPK,
+                device=DEVICE_TYPE,
+                dtype=torch.int32,
+            )
+            if path in ("topk_only", "topk_topp")
+            else None
+        )
+        p = (
+            torch.full(
+                (self.BATCH,),
+                self.TOPP,
+                device=DEVICE_TYPE,
+                dtype=torch.float32,
+            )
+            if path in ("topp_only", "topk_topp")
+            else None
+        )
+
+        # flashinfer_sample may mutate its input in-place; pass a clone so
+        # the parametrize iterations stay independent.
+        tokens = flashinfer_sample(logits.clone().contiguous(), k, p, {})
+        # Surface any async CUDA error synchronously (e.g. illegal memory
+        # access from a malformed FlashInfer call) so it's attributed to
+        # this test rather than a later, unrelated GPU op.
+        torch.accelerator.synchronize()
+        self._check_tokens(tokens, ctx=f"pattern={pattern}, path={path}")
+
+
+# =============================================================================
+# FlashInfer top-k/top-p distribution-match tests
+# =============================================================================
+
+
+@pytest.mark.skipif(
+    not FLASHINFER_TOPK_TOPP_SUPPORTED,
+    reason="FlashInfer top-k/top-p sampler requires CUDA "
+    "and a GPU with FlashInfer support.",
+)
+class TestFlashInferDistributionMatch:
+    """Chi-square goodness-of-fit: FlashInfer and PyTorch-native samplers
+    both reproduce the expected token distribution after top-k / top-p.
+
+    Regression guard against historical FlashInfer distribution-shift.
+    Each impl is compared to the theoretical distribution (softmax of
+    filtered logits); if both pass they are statistically equivalent
+    to each other by transitivity.
+    """
+
+    VOCAB = 32
+    N_SAMPLES = 50_000
+    ALPHA = 1e-6
+    SEED = 0
+
+    @pytest.mark.parametrize(
+        "topk,topp",
+        [
+            (8, None),
+            (16, None),
+            (None, 0.5),
+            (None, 0.7),
+            (None, 0.99),
+            (8, 0.9),
+            (4, 0.5),
+        ],
+    )
+    def test_distribution_matches_theoretical(self, topk, topp):
+        from scipy.stats import chisquare
+
+        from vllm.v1.sample.ops.topk_topp_sampler import (
+            apply_top_k_top_p,
+            flashinfer_sample,
+            random_sample,
+        )
+
+        torch.set_default_device(DEVICE_TYPE)
+        torch.manual_seed(self.SEED)
+
+        # Same logits row used for both impls so the comparison is fair.
+        logits_one = (
+            torch.randn(
+                (1, self.VOCAB),
+                dtype=torch.float32,
+            )
+            * 2.0
+        )
+
+        # Theoretical expected distribution from PyTorch-native filter.
+        k_one = torch.tensor([topk], dtype=torch.int32) if topk is not None else None
+        p_one = torch.tensor([topp], dtype=torch.float32) if topp is not None else None
+        masked = apply_top_k_top_p_pytorch(logits_one.clone(), k_one, p_one)
+        expected_probs = masked.softmax(dim=-1).flatten().cpu().numpy()
+        expected_counts = expected_probs * self.N_SAMPLES
+
+        # Build a batch of N identical rows for both impls.
+        batch = logits_one.expand(self.N_SAMPLES, self.VOCAB).contiguous()
+        k_batch = (
+            torch.full((self.N_SAMPLES,), topk, dtype=torch.int32)
+            if topk is not None
+            else None
+        )
+        p_batch = (
+            torch.full((self.N_SAMPLES,), topp, dtype=torch.float32)
+            if topp is not None
+            else None
+        )
+
+        # FlashInfer dispatch path.
+        fi_tokens = flashinfer_sample(batch.contiguous(), k_batch, p_batch, {})
+        fi_counts = torch.bincount(fi_tokens, minlength=self.VOCAB).cpu().numpy()
+        self._chi2_check(
+            fi_counts,
+            expected_counts,
+            chisquare,
+            label=f"flashinfer top-k={topk} top-p={topp}",
+        )
+
+        # PyTorch-native dispatch path (Triton-routed filter + Gumbel sample).
+        processed = apply_top_k_top_p(batch.clone(), k_batch, p_batch)
+        probs = processed.softmax(dim=-1, dtype=torch.float32)
+        pt_tokens = random_sample(probs, {})
+        pt_counts = torch.bincount(pt_tokens, minlength=self.VOCAB).cpu().numpy()
+        self._chi2_check(
+            pt_counts,
+            expected_counts,
+            chisquare,
+            label=f"native top-k={topk} top-p={topp}",
+        )
+
+    def _chi2_check(self, empirical, expected, chisquare_fn, *, label):
+        import numpy as np
+
+        # Hard check: the sampler must never produce a token outside the
+        # expected support (zero theoretical probability).
+        outside = (expected == 0) & (empirical > 0)
+        assert not outside.any(), (
+            f"{label}: sampled out-of-support tokens "
+            f"(zero expected prob): indices={outside.nonzero()[0].tolist()}"
+        )
+        # Skip chi-square in the degenerate case where the support
+        # collapses to a single token (e.g. very restrictive joint
+        # top-k + top-p): all samples must land there and the hard
+        # check above already verified they do.
+        in_support = expected > 0
+        if int(in_support.sum()) <= 1:
+            return
+        # Soft check: chi-square goodness-of-fit on in-support tokens.
+        # Cast to float64 so the rescaling step below stays within
+        # scipy.chisquare's strict 1.5e-8 sum-equality tolerance.
+        emp = empirical[in_support].astype(np.float64)
+        exp = expected[in_support].astype(np.float64)
+        exp = exp * (emp.sum() / exp.sum())
+        chi2, p_value = chisquare_fn(emp, exp)
+        assert p_value > self.ALPHA, (
+            f"{label}: distribution differs from theoretical: "
+            f"chi2={chi2:.2f} p_value={p_value:.2e} alpha={self.ALPHA}"
+        )
diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py
index a0abb3b4c6ce..907be3614b9c 100644
--- a/tests/v1/sample/utils.py
+++ b/tests/v1/sample/utils.py
@@ -198,22 +198,43 @@ def get_logitsprocs(self) -> Iterator[LogitsProcessor]:
 
 def fake_update_logitsprocs_state(
     test_fakes: LogitsprocsTestFakes,
-    batch_update: BatchUpdate,
+    batch_update: BatchUpdate | None,
 ) -> None:
     """Imitate logits processors persistent batch state update
     in engine core"""
     for logitproc in test_fakes.get_logitsprocs():
         logitproc.update_state(batch_update)
+    holder = test_fakes.sampling_metadata.thinking_budget_state_holder
+    if holder is not None:
+        holder.sync_batch(batch_update)
 
 
 def fake_apply_logitsprocs(
     test_fakes: LogitsprocsTestFakes,
     slice_indices: list[int],
+    slot_output_token_ids: list[list[int]] | None = None,
 ) -> torch.Tensor:
-    """Imitate application of logits processors in engine core"""
+    """Imitate application of logits processors in engine core.
+
+    When ``thinking_budget_state_holder`` has tracked requests, this mirrors
+    :meth:`Sampler.apply_logits_processors` by refreshing per-slot
+    ``output_token_ids`` (if ``slot_output_token_ids`` is provided), then
+    ``update_state`` + ``apply_to_logits`` on the holder after built-in logits
+    processors.
+    """
     logits = test_fakes.logits[torch.tensor(slice_indices, dtype=torch.long)].clone()
     for processor in test_fakes.get_logitsprocs():
         logits = processor.apply(logits)
+
+    md = test_fakes.sampling_metadata
+    holder = md.thinking_budget_state_holder
+    if holder is not None and holder.has_tracked_requests():
+        if slot_output_token_ids is not None:
+            for i, toks in enumerate(slot_output_token_ids):
+                if i < len(md.output_token_ids):
+                    md.output_token_ids[i] = list(toks)
+        holder.update_state(md.output_token_ids, md.spec_token_ids, None)
+        logits = holder.apply_to_logits(logits, False, md.spec_token_ids)
     return logits
 
 
diff --git a/tests/v1/simple_kv_offload/test_integration.py b/tests/v1/simple_kv_offload/test_integration.py
index 29399516be18..02f6360e08e8 100644
--- a/tests/v1/simple_kv_offload/test_integration.py
+++ b/tests/v1/simple_kv_offload/test_integration.py
@@ -10,8 +10,8 @@
 from vllm.config import KVTransferConfig
 from vllm.platforms import current_platform
 
-if not current_platform.is_cuda():
-    pytest.skip("Requires CUDA", allow_module_level=True)
+if not current_platform.is_cuda_alike():
+    pytest.skip("Requires CUDA or ROCm", allow_module_level=True)
 
 # Small models for default CI / local runs (accuracy only).
 SMALL_MODELS = [
diff --git a/tests/v1/simple_kv_offload/test_scheduler.py b/tests/v1/simple_kv_offload/test_scheduler.py
index 132f52fe3b36..4d685103df60 100644
--- a/tests/v1/simple_kv_offload/test_scheduler.py
+++ b/tests/v1/simple_kv_offload/test_scheduler.py
@@ -186,7 +186,13 @@ def make_request(
     if request_id is None:
         request_id = f"req-{_req_counter}"
 
-    num_tokens = num_blocks * BLOCK_SIZE
+    # Add one extra token beyond the last full block so that
+    # ``max_cache_hit_length = num_tokens - 1`` (see
+    # KVCacheManager.get_computed_blocks) does not truncate the final
+    # full block: ``find_longest_cache_hit`` uses
+    # ``max_length // block_size`` and would otherwise drop one block
+    # when the prompt is an exact multiple of block_size.
+    num_tokens = num_blocks * BLOCK_SIZE + 1
     start = _req_counter * 10000
     prompt_token_ids = list(range(start, start + num_tokens))
     sampling_params = SamplingParams(max_tokens=1)
diff --git a/tests/v1/spec_decode/test_acceptance_length.py b/tests/v1/spec_decode/test_acceptance_length.py
index ec65e20cbde1..62ff100fdbf8 100644
--- a/tests/v1/spec_decode/test_acceptance_length.py
+++ b/tests/v1/spec_decode/test_acceptance_length.py
@@ -43,7 +43,8 @@ class Eagle3ModelConfig:
 
 # Model configurations for EAGLE3 acceptance length tests.
 # Expected acceptance lengths are determined by running baseline benchmarks
-# using examples/offline_inference/spec_decode.py with the MT-Bench dataset.
+# using examples/features/speculative_decoding/spec_decode_offline.py
+# with the MT-Bench dataset.
 EAGLE3_MODEL_CONFIGS = [
     Eagle3ModelConfig(
         verifier="meta-llama/Llama-3.1-8B-Instruct",
diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py
index 1e1c6745191d..77c041d84a94 100644
--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
@@ -6,6 +6,7 @@
 
 from tests.utils import get_attn_backend_list_based_on_platform
 from vllm import LLM, SamplingParams
+from vllm.config import ModelConfig, ParallelConfig, SpeculativeConfig
 from vllm.platforms import current_platform
 from vllm.sampling_params import StructuredOutputsParams
 
@@ -77,3 +78,23 @@ def test_eagle_max_len(
             "is longer than the eagle max length"
         )
         assert o.outputs[0].text == "a b c d e " * 15
+
+
+@pytest.mark.parametrize("spec_max_model_len", [80, 150])
+def test_mtp_speculative_config_max_model_len(spec_max_model_len: int):
+    """Regression test for #41456: max_model_len in speculative config
+    should be respected for the draft model."""
+    model_config = ModelConfig(
+        model="XiaomiMiMo/MiMo-7B-Base",
+        runner="generate",
+        max_model_len=200,
+        trust_remote_code=True,
+    )
+    spec_config = SpeculativeConfig(
+        target_model_config=model_config,
+        target_parallel_config=ParallelConfig(),
+        method="mtp",
+        num_speculative_tokens=1,
+        max_model_len=spec_max_model_len,
+    )
+    assert spec_config.draft_model_config.max_model_len == spec_max_model_len
diff --git a/tests/v1/streaming_input/test_gpu_model_runner_streaming.py b/tests/v1/streaming_input/test_gpu_model_runner_streaming.py
index 0ed7b6cb3efc..946ca99507df 100644
--- a/tests/v1/streaming_input/test_gpu_model_runner_streaming.py
+++ b/tests/v1/streaming_input/test_gpu_model_runner_streaming.py
@@ -39,7 +39,6 @@ def mock_model_runner_with_input_batch():
         vocab_size=32000,
         block_sizes=[16],
         kernel_block_sizes=[16],
-        is_spec_decode=False,
         logitsprocs=None,
         is_pooling_model=False,
     )
diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py
index 98a25e41dfe0..a12b36b8a44e 100644
--- a/tests/v1/structured_output/test_reasoning_structured_output.py
+++ b/tests/v1/structured_output/test_reasoning_structured_output.py
@@ -8,11 +8,16 @@
 import pytest
 
 from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
-from vllm.reasoning import ReasoningParser
 from vllm.v1.request import Request
 from vllm.v1.structured_output import StructuredOutputManager
 
 
+class MockReasoner:
+    def __init__(self, tokenizer):
+        self.is_reasoning_end = Mock(return_value=False)
+        self.is_reasoning_end_streaming = Mock(return_value=False)
+
+
 class TestReasoningStructuredOutput:
     """Test reasoning-aware structured output functionality."""
 
@@ -50,13 +55,6 @@ def mock_vllm_config(self, mock_model_config, mock_scheduler_config):
         config.speculative_config = None
         return config
 
-    @pytest.fixture
-    def mock_reasoning_parser(self):
-        """Create a mock ReasoningParser."""
-        parser = Mock(spec=ReasoningParser)
-        parser.is_reasoning_end = Mock(return_value=False)
-        return parser
-
     @pytest.fixture
     def mock_request_with_structured_output(self):
         """Create a mock request with structured output."""
@@ -64,6 +62,8 @@ def mock_request_with_structured_output(self):
         request.structured_output_request = Mock()
         request.structured_output_request.reasoning_ended = None
         request.structured_output_request.grammar = Mock()
+        request.structured_output_request.reasoning_parser_kwargs = None
+        request.structured_output_request.reasoner = None
         request.structured_output_request.grammar.is_terminated = Mock(
             return_value=False
         )
@@ -74,6 +74,13 @@ def mock_request_with_structured_output(self):
         request.num_output_placeholders = 0
         return request
 
+    @pytest.fixture
+    def manager_with_reasoner(self, mock_vllm_config):
+        manager = StructuredOutputManager(mock_vllm_config)
+        manager.reasoner_cls = MockReasoner
+        manager.tokenizer = Mock()
+        return manager
+
     def test_should_fill_bitmask_with_enable_in_reasoning(
         self, mock_vllm_config, mock_request_with_structured_output
     ):
@@ -89,22 +96,17 @@ def test_should_fill_bitmask_with_enable_in_reasoning(
 
     def test_should_fill_bitmask_without_enable_in_reasoning(
         self,
-        mock_vllm_config,
+        manager_with_reasoner,
         mock_request_with_structured_output,
-        mock_reasoning_parser,
     ):
         """Test should_fill_bitmask when enable_in_reasoning is False."""
         # Keep enable_in_reasoning as False (default)
-        config = mock_vllm_config.structured_outputs_config
+        config = manager_with_reasoner.vllm_config.structured_outputs_config
         assert config.enable_in_reasoning is False
 
-        manager = StructuredOutputManager(mock_vllm_config)
-        manager.reasoner = mock_reasoning_parser
-
-        # Mock reasoning not ended
-        mock_reasoning_parser.is_reasoning_end.return_value = False
-
-        result = manager.should_fill_bitmask(mock_request_with_structured_output)
+        result = manager_with_reasoner.should_fill_bitmask(
+            mock_request_with_structured_output
+        )
 
         # Should set reasoning_ended and return its value
         assert (
@@ -118,68 +120,92 @@ def test_should_fill_bitmask_no_reasoner(
     ):
         """Test should_fill_bitmask when no reasoner is configured."""
         manager = StructuredOutputManager(mock_vllm_config)
-        manager.reasoner = None
 
         result = manager.should_fill_bitmask(mock_request_with_structured_output)
 
         # Should default to True when no reasoner
         assert result is True
 
+    def test_should_fill_bitmask_uses_request_reasoning_parser_kwargs(
+        self, mock_vllm_config, mock_request_with_structured_output
+    ):
+        """Test request-level parser kwargs override the default reasoner."""
+
+        class KwargReasoner:
+            def __init__(self, tokenizer, chat_template_kwargs=None):
+                self.chat_template_kwargs = chat_template_kwargs or {}
+
+            def is_reasoning_end(self, input_ids):
+                return not self.chat_template_kwargs.get("enable_thinking", False)
+
+        manager = StructuredOutputManager(mock_vllm_config)
+        manager.reasoner_cls = KwargReasoner
+        manager.tokenizer = Mock()
+
+        structured_req = mock_request_with_structured_output.structured_output_request
+        structured_req.reasoning_parser_kwargs = {
+            "chat_template_kwargs": {"enable_thinking": True}
+        }
+
+        result = manager.should_fill_bitmask(mock_request_with_structured_output)
+
+        assert result is False
+        assert (
+            mock_request_with_structured_output.structured_output_request.reasoner
+            is not None
+        )
+
     def test_should_advance_with_enable_in_reasoning(
         self,
-        mock_vllm_config,
+        manager_with_reasoner,
         mock_request_with_structured_output,
-        mock_reasoning_parser,
     ):
         """Test should_advance when enable_in_reasoning is True."""
         # Enable enable_in_reasoning
-        mock_vllm_config.structured_outputs_config.enable_in_reasoning = True
-
-        manager = StructuredOutputManager(mock_vllm_config)
-        manager.reasoner = mock_reasoning_parser
+        manager_with_reasoner.enable_in_reasoning = True
 
         # Should always return True when enable_in_reasoning is enabled
-        result = manager.should_advance(mock_request_with_structured_output)
+        result = manager_with_reasoner.should_advance(
+            mock_request_with_structured_output
+        )
         assert result is True
 
     def test_should_advance_reasoning_not_ended(
         self,
-        mock_vllm_config,
+        manager_with_reasoner,
         mock_request_with_structured_output,
-        mock_reasoning_parser,
     ):
         """Test should_advance when reasoning has not ended."""
-        manager = StructuredOutputManager(mock_vllm_config)
-        manager.reasoner = mock_reasoning_parser
-
         # Set reasoning as not ended
         (
             mock_request_with_structured_output.structured_output_request
         ).reasoning_ended = False
-        mock_reasoning_parser.is_reasoning_end.return_value = False
 
-        result = manager.should_advance(mock_request_with_structured_output)
+        result = manager_with_reasoner.should_advance(
+            mock_request_with_structured_output
+        )
 
         # Should return False since reasoning hasn't ended
         assert result is False
 
     def test_should_advance_reasoning_just_ended(
         self,
-        mock_vllm_config,
+        manager_with_reasoner,
         mock_request_with_structured_output,
-        mock_reasoning_parser,
     ):
         """Test should_advance when reasoning ends in current step."""
-        manager = StructuredOutputManager(mock_vllm_config)
-        manager.reasoner = mock_reasoning_parser
-
         # Set reasoning as not ended initially, but ends in this step
         (
             mock_request_with_structured_output.structured_output_request
         ).reasoning_ended = False
-        mock_reasoning_parser.is_reasoning_end.return_value = True
+        reasoner = MockReasoner(tokenizer=Mock())
+        reasoner.is_reasoning_end_streaming.return_value = True
+        structured_req = mock_request_with_structured_output.structured_output_request
+        structured_req.reasoner = reasoner
 
-        result = manager.should_advance(mock_request_with_structured_output)
+        result = manager_with_reasoner.should_advance(
+            mock_request_with_structured_output
+        )
 
         # Should set reasoning_ended to True but return False for this step
         assert (
@@ -190,20 +216,18 @@ def test_should_advance_reasoning_just_ended(
 
     def test_should_advance_reasoning_already_ended(
         self,
-        mock_vllm_config,
+        manager_with_reasoner,
         mock_request_with_structured_output,
-        mock_reasoning_parser,
     ):
         """Test should_advance when reasoning has already ended."""
-        manager = StructuredOutputManager(mock_vllm_config)
-        manager.reasoner = mock_reasoning_parser
-
         # Set reasoning as already ended
         (
             mock_request_with_structured_output.structured_output_request
         ).reasoning_ended = True
 
-        result = manager.should_advance(mock_request_with_structured_output)
+        result = manager_with_reasoner.should_advance(
+            mock_request_with_structured_output
+        )
 
         # Should return True since reasoning has ended
         assert result is True
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 0de443858c98..4942390cdd38 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -7,6 +7,7 @@
 import pytest
 import torch
 
+import vllm.v1.worker.gpu_model_runner as gpu_model_runner_module
 from vllm.config import (
     AttentionConfig,
     CacheConfig,
@@ -162,6 +163,34 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
     )
 
 
+def _schedule_cached_requests(
+    req_ids: list[str],
+    num_scheduled_tokens: dict[str, int],
+    new_token_ids: list[list[int]],
+    num_computed_tokens: list[int],
+    num_output_tokens: list[int],
+) -> SchedulerOutput:
+    return SchedulerOutput(
+        scheduled_new_reqs=[],
+        scheduled_cached_reqs=CachedRequestData(
+            req_ids=req_ids,
+            resumed_req_ids=set(),
+            new_token_ids=new_token_ids,
+            all_token_ids={},
+            new_block_ids=[None] * len(req_ids),
+            num_computed_tokens=num_computed_tokens,
+            num_output_tokens=num_output_tokens,
+        ),
+        num_scheduled_tokens=num_scheduled_tokens,
+        total_num_scheduled_tokens=sum(num_scheduled_tokens.values()),
+        scheduled_spec_decode_tokens={},
+        scheduled_encoder_inputs={},
+        num_common_prefix_blocks=[],
+        finished_req_ids=set(),
+        free_encoder_mm_hashes=[],
+    )
+
+
 def _is_req_scheduled(model_runner, req_id: str) -> bool:
     return req_id in model_runner.input_batch.req_id_to_index
 
@@ -219,6 +248,58 @@ def test_select_common_block_size_uses_largest_shared_int():
     assert selected_size == 64
 
 
+@pytest.mark.skip_global_cleanup
+@pytest.mark.parametrize(
+    ("world_size", "is_last_rank", "expected_calls"),
+    [(1, True, 0), (2, True, 0), (2, False, 1)],
+)
+def test_sample_tokens_receives_pp_sampled_ids_only_on_non_last_rank(
+    monkeypatch: pytest.MonkeyPatch,
+    world_size: int,
+    is_last_rank: bool,
+    expected_calls: int,
+):
+    runner = GPUModelRunner.__new__(GPUModelRunner)
+    runner.execute_model_state = None
+    runner.kv_connector_output = None
+    runner.use_async_scheduling = True
+    receive_calls = 0
+
+    def receive_prev_sampled_token_ids():
+        nonlocal receive_calls
+        receive_calls += 1
+
+    runner._pp_receive_prev_sampled_token_ids_to_input_batch = (
+        receive_prev_sampled_token_ids
+    )
+    monkeypatch.setattr(
+        gpu_model_runner_module,
+        "get_pp_group",
+        lambda: SimpleNamespace(world_size=world_size, is_last_rank=is_last_rank),
+    )
+
+    assert GPUModelRunner.sample_tokens(runner, None) is None
+    assert receive_calls == expected_calls
+
+
+@pytest.mark.skip_global_cleanup
+def test_sample_tokens_skips_pp_group_lookup_without_async_scheduling(
+    monkeypatch: pytest.MonkeyPatch,
+):
+    runner = GPUModelRunner.__new__(GPUModelRunner)
+    runner.execute_model_state = None
+    runner.kv_connector_output = None
+    runner.use_async_scheduling = False
+
+    monkeypatch.setattr(
+        gpu_model_runner_module,
+        "get_pp_group",
+        pytest.fail,
+    )
+
+    assert GPUModelRunner.sample_tokens(runner, None) is None
+
+
 def test_select_common_block_size_no_valid_option():
     backend_a = _make_mock_backend_for_kernel_block_size([64])
     backend_b = _make_mock_backend_for_kernel_block_size([MultipleOf(16)])
@@ -457,6 +538,135 @@ def test_update_states_request_unscheduled(model_runner, dist_init):
     assert not _is_req_scheduled(model_runner, req_ids[1])
 
 
+def test_update_states_pp_non_async_multi_request_keeps_token_buffers_consistent(
+    model_runner, model_runner_2, dist_init, monkeypatch
+):
+    req_ids = ["req_0", "req_1"]
+    non_last_runner = model_runner
+    last_runner = model_runner_2
+    non_last_runner.use_async_scheduling = False
+    last_runner.use_async_scheduling = False
+
+    # Both ranks start from the same request set.
+    monkeypatch.setattr(
+        "vllm.v1.worker.gpu_model_runner.get_pp_group",
+        lambda: SimpleNamespace(is_last_rank=False, world_size=2),
+    )
+    non_last_runner._update_states(_schedule_new_request(*req_ids))
+    last_runner._update_states(_schedule_new_request(*req_ids))
+
+    sampled_by_last_rank = {req_ids[0]: 101, req_ids[1]: 201}
+    # Emulate last-rank bookkeeping result from previous step:
+    # sampled tokens already cached in CPU token buffers.
+    for req_id, token_id in sampled_by_last_rank.items():
+        req_index = last_runner.input_batch.req_id_to_index[req_id]
+        start_idx = int(last_runner.input_batch.num_tokens_no_spec[req_index])
+        end_idx = start_idx + 1
+        last_runner.input_batch.token_ids_cpu[req_index, start_idx:end_idx] = [token_id]
+        last_runner.input_batch.is_token_ids[req_index, start_idx:end_idx] = True
+        last_runner.input_batch.num_tokens_no_spec[req_index] = end_idx
+        last_runner.requests[req_id].output_token_ids.append(token_id)
+
+    scheduler_output = _schedule_cached_requests(
+        req_ids=req_ids,
+        num_scheduled_tokens={req_ids[0]: 1, req_ids[1]: 1},
+        new_token_ids=[[101], [201]],
+        num_computed_tokens=[3, 3],  # prompt tokens only
+        num_output_tokens=[1, 1],
+    )
+    # non-last rank appends new_token_ids in _update_states.
+    monkeypatch.setattr(
+        "vllm.v1.worker.gpu_model_runner.get_pp_group",
+        lambda: SimpleNamespace(is_last_rank=False, world_size=2),
+    )
+    non_last_runner._update_states(scheduler_output)
+    # last rank should keep its already-bookkept CPU buffers unchanged.
+    monkeypatch.setattr(
+        "vllm.v1.worker.gpu_model_runner.get_pp_group",
+        lambda: SimpleNamespace(is_last_rank=True, world_size=2),
+    )
+    last_runner._update_states(scheduler_output)
+
+    # Verify consistency between PP ranks after _update_states.
+    for req_id in req_ids:
+        non_last_idx = non_last_runner.input_batch.req_id_to_index[req_id]
+        last_idx = last_runner.input_batch.req_id_to_index[req_id]
+        non_last_len = int(non_last_runner.input_batch.num_tokens_no_spec[non_last_idx])
+        last_len = int(last_runner.input_batch.num_tokens_no_spec[last_idx])
+        assert non_last_len == last_len
+        assert (
+            non_last_runner.input_batch.token_ids_cpu[
+                non_last_idx, :non_last_len
+            ].tolist()
+            == last_runner.input_batch.token_ids_cpu[last_idx, :last_len].tolist()
+        )
+
+
+def test_update_states_pp_async_multi_request_keeps_rank_state_consistent(
+    model_runner, model_runner_2, dist_init, monkeypatch
+):
+    req_ids = ["req_0", "req_1"]
+    non_last_runner = model_runner
+    last_runner = model_runner_2
+    non_last_runner.use_async_scheduling = True
+    last_runner.use_async_scheduling = True
+
+    # Both ranks start from the same request set.
+    monkeypatch.setattr(
+        "vllm.v1.worker.gpu_model_runner.get_pp_group",
+        lambda: SimpleNamespace(is_last_rank=False, world_size=2),
+    )
+    non_last_runner._update_states(_schedule_new_request(*req_ids))
+    last_runner._update_states(_schedule_new_request(*req_ids))
+
+    # Simulate async previous-step sampled tokens known on both ranks.
+    # non-last rank may receive them via PP communication; last rank has
+    # them from local sampling/bookkeeping.
+    sampled_by_last_rank = {req_ids[0]: 111, req_ids[1]: 222}
+    for runner in (non_last_runner, last_runner):
+        for req_id, token_id in sampled_by_last_rank.items():
+            req_index = runner.input_batch.req_id_to_index[req_id]
+            start_idx = int(runner.input_batch.num_tokens_no_spec[req_index])
+            end_idx = start_idx + 1
+            runner.input_batch.token_ids_cpu[req_index, start_idx:end_idx] = [token_id]
+            runner.input_batch.is_token_ids[req_index, start_idx:end_idx] = True
+            runner.input_batch.num_tokens_no_spec[req_index] = end_idx
+            runner.requests[req_id].output_token_ids.append(token_id)
+
+    scheduler_output = _schedule_cached_requests(
+        req_ids=req_ids,
+        num_scheduled_tokens={req_ids[0]: 1, req_ids[1]: 1},
+        new_token_ids=[],
+        num_computed_tokens=[4, 4],
+        num_output_tokens=[1, 1],
+    )
+    # non-last rank: async PP branch (new_token_ids empty).
+    monkeypatch.setattr(
+        "vllm.v1.worker.gpu_model_runner.get_pp_group",
+        lambda: SimpleNamespace(is_last_rank=False, world_size=2),
+    )
+    non_last_runner._update_states(scheduler_output)
+    # last rank: keep already-bookkept state aligned with scheduler view.
+    monkeypatch.setattr(
+        "vllm.v1.worker.gpu_model_runner.get_pp_group",
+        lambda: SimpleNamespace(is_last_rank=True, world_size=2),
+    )
+    last_runner._update_states(scheduler_output)
+
+    for req_id in req_ids:
+        non_last_idx = non_last_runner.input_batch.req_id_to_index[req_id]
+        last_idx = last_runner.input_batch.req_id_to_index[req_id]
+        non_last_len = int(non_last_runner.input_batch.num_tokens_no_spec[non_last_idx])
+        last_len = int(last_runner.input_batch.num_tokens_no_spec[last_idx])
+        assert non_last_len == last_len
+        assert (
+            non_last_runner.input_batch.token_ids_cpu[
+                non_last_idx, :non_last_len
+            ].tolist()
+            == last_runner.input_batch.token_ids_cpu[last_idx, :last_len].tolist()
+        )
+
+
 def test_kv_cache_stride_order(monkeypatch, model_runner):
     # This test checks if GPUModelRunner initializes correctly when an attention
     # backend enforces a non-default KV cache stride order.
diff --git a/tools/pre_commit/generate_attention_backend_docs.py b/tools/pre_commit/generate_attention_backend_docs.py
index 1b7ef27949b8..c0503fd69712 100644
--- a/tools/pre_commit/generate_attention_backend_docs.py
+++ b/tools/pre_commit/generate_attention_backend_docs.py
@@ -30,7 +30,6 @@
 RELEVANT_PATTERNS = [
     "vllm/v1/attention/backends/*.py",
     "vllm/v1/attention/backends/**/*.py",
-    "vllm/v1/attention/backends/fa_utils.py",
     "vllm/model_executor/layers/attention/mla_attention.py",
     "vllm/platforms/cuda.py",
     "tools/pre_commit/generate_attention_backend_docs.py",
@@ -68,6 +67,11 @@ def is_relevant_file(filepath: str) -> bool:
     return any(fnmatch.fnmatch(path_str, pattern) for pattern in RELEVANT_PATTERNS)
 
 
+MLA_PREFILL_DIR = BACKENDS_DIR / "mla" / "prefill"
+MLA_PREFILL_REGISTRY_FILE = MLA_PREFILL_DIR / "registry.py"
+MLA_PREFILL_SELECTOR_FILE = MLA_PREFILL_DIR / "selector.py"
+
+
 # ---------------------------------------------------------------------------
 # AST utility helpers
 # ---------------------------------------------------------------------------
@@ -293,6 +297,242 @@ def get_file_from_class_path(class_path: str) -> Path | None:
     return py_file if py_file.exists() else None
 
 
+def parse_mla_prefill_registry() -> dict[str, str]:
+    """Parse MLAPrefillBackendEnum from the prefill registry.
+
+    Returns:
+        A dict mapping backend names to their class paths.
+    """
+    if not MLA_PREFILL_REGISTRY_FILE.exists():
+        return {}
+
+    try:
+        tree = ast.parse(MLA_PREFILL_REGISTRY_FILE.read_text())
+    except Exception:
+        return {}
+
+    for node in ast.walk(tree):
+        if isinstance(node, ast.ClassDef) and node.name == "MLAPrefillBackendEnum":
+            return _extract_enum_values(node)
+    return {}
+
+
+def parse_mla_prefill_priorities() -> dict[str, list[str]]:
+    """Parse MLA prefill backend priorities from selector.py.
+
+    Returns:
+        A dict with keys like 'blackwell' and 'default' containing
+        lists of backend enum names in priority order.
+    """
+    if not MLA_PREFILL_SELECTOR_FILE.exists():
+        return {}
+
+    try:
+        tree = ast.parse(MLA_PREFILL_SELECTOR_FILE.read_text())
+    except Exception:
+        return {}
+
+    priorities: dict[str, list[str]] = {}
+
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.FunctionDef):
+            continue
+        if node.name != "_get_mla_prefill_backend_priorities":
+            continue
+
+        # Look for if statements checking device_capability.major
+        for stmt in ast.walk(node):
+            if not isinstance(stmt, ast.If):
+                continue
+
+            # Check if it's a capability.major == 10 check (Blackwell)
+            is_blackwell = (
+                isinstance(stmt.test, ast.Compare)
+                and isinstance(stmt.test.left, ast.Attribute)
+                and stmt.test.left.attr == "major"
+                and stmt.test.comparators
+                and isinstance(stmt.test.comparators[0], ast.Constant)
+                and stmt.test.comparators[0].value == 10
+            )
+
+            # Extract backends from return statements
+            for body_stmt in stmt.body:
+                if isinstance(body_stmt, ast.Return) and isinstance(
+                    body_stmt.value, ast.List
+                ):
+                    backends = []
+                    for elt in body_stmt.value.elts:
+                        if isinstance(elt, ast.Attribute):
+                            backends.append(elt.attr)
+                    if is_blackwell:
+                        priorities["blackwell"] = backends
+                    else:
+                        priorities["default"] = backends
+
+            # Extract from else branch
+            for else_stmt in stmt.orelse:
+                if isinstance(else_stmt, ast.Return) and isinstance(
+                    else_stmt.value, ast.List
+                ):
+                    backends = []
+                    for elt in else_stmt.value.elts:
+                        if isinstance(elt, ast.Attribute):
+                            backends.append(elt.attr)
+                    priorities["default"] = backends
+
+    return priorities
+
+
+def parse_mla_prefill_backend_file(class_path: str) -> dict[str, Any] | None:
+    """Parse a single MLA prefill backend file to extract its properties.
+
+    Args:
+        class_path: The fully qualified class path.
+
+    Returns:
+        A dict with backend properties, or None if parsing fails.
+    """
+    file_path = get_file_from_class_path(class_path)
+    if file_path is None:
+        return None
+
+    try:
+        tree = ast.parse(file_path.read_text())
+    except Exception:
+        return None
+
+    class_name = class_path.rsplit(".", 1)[1]
+    class_node = find_class_in_ast(tree, class_name)
+    if class_node is None:
+        return None
+
+    info: dict[str, Any] = {
+        "compute_capability": "Any",
+        "requires_r1_dims": False,
+        "dtypes": "fp16, bf16",  # Default from base class
+    }
+
+    # Parse class variables
+    for item in class_node.body:
+        if isinstance(item, ast.Assign):
+            for target in item.targets:
+                if (
+                    isinstance(target, ast.Name)
+                    and target.id == "requires_r1_mla_dimensions"
+                    and isinstance(item.value, ast.Constant)
+                ):
+                    info["requires_r1_dims"] = item.value.value
+
+        # Parse supported_dtypes class variable
+        if (
+            isinstance(item, ast.AnnAssign)
+            and isinstance(item.target, ast.Name)
+            and item.target.id == "supported_dtypes"
+            and isinstance(item.value, ast.List)
+        ):
+            dtype_map = {"float16": "fp16", "bfloat16": "bf16", "float32": "fp32"}
+            dtypes = []
+            for elt in item.value.elts:
+                if isinstance(elt, ast.Attribute):
+                    dtypes.append(dtype_map.get(elt.attr, elt.attr))
+            if dtypes:
+                info["dtypes"] = ", ".join(dtypes)
+
+    # Parse get_name static method
+    get_name_method = find_method(class_node, "get_name")
+    if get_name_method:
+        for n in ast.walk(get_name_method):
+            if isinstance(n, ast.Return) and isinstance(n.value, ast.Constant):
+                info["name"] = n.value.value
+
+    # Parse supports_compute_capability classmethod
+    cc_method = find_method(class_node, "supports_compute_capability")
+    if cc_method:
+        for n in ast.walk(cc_method):
+            # Look for capability.major == 10 style checks
+            if (
+                isinstance(n, ast.Compare)
+                and isinstance(n.left, ast.Attribute)
+                and n.left.attr == "major"
+                and n.comparators
+                and isinstance(n.comparators[0], ast.Constant)
+            ):
+                major = n.comparators[0].value
+                info["compute_capability"] = f"{major}.x"
+
+    return info
+
+
+def parse_mla_prefill_backends() -> list[dict[str, Any]]:
+    """Parse MLA prefill backend options from the prefill registry.
+
+    MLA uses different backends for prefill vs decode. The decode backends are
+    registered in the main registry, but prefill backends have their own
+    registry at vllm/v1/attention/backends/mla/prefill/registry.py.
+
+    Returns a list of prefill backend info dicts with their requirements.
+    """
+    registry = parse_mla_prefill_registry()
+    priorities = parse_mla_prefill_priorities()
+
+    if not registry:
+        return []
+
+    # Get the priority order (Blackwell order shows all backends)
+    priority_order = priorities.get("blackwell", list(registry.keys()))
+
+    prefill_backends: list[dict[str, Any]] = []
+
+    # Backend-specific metadata that can't be easily parsed from code
+    backend_metadata = {
+        "TRTLLM_RAGGED": {
+            "description": "TensorRT-LLM ragged attention",
+        },
+        "FLASHINFER": {
+            "description": "FlashInfer CUTLASS backend",
+        },
+        "FLASH_ATTN": {
+            "description": "FlashAttention varlen (FA2/FA3/FA4)",
+        },
+    }
+
+    for backend_name in priority_order:
+        if backend_name not in registry:
+            continue
+
+        class_path = registry[backend_name]
+        backend_info = parse_mla_prefill_backend_file(class_path)
+        if backend_info is None:
+            continue
+
+        metadata = backend_metadata.get(backend_name, {})
+        display_name = backend_info.get("name", backend_name)
+
+        # Add marker for default Blackwell backend
+        marker = ""
+        if backend_name == priority_order[0] and priorities.get("blackwell"):
+            marker = "‡"
+
+        notes = ""
+        if backend_info.get("requires_r1_dims"):
+            notes = "DeepSeek R1 dims only"
+        elif backend_name == "FLASH_ATTN":
+            notes = "FA4 on SM100+, FA3 on SM90, FA2 otherwise"
+
+        prefill_backends.append(
+            {
+                "name": display_name,
+                "marker": marker,
+                "description": metadata.get("description", ""),
+                "dtypes": backend_info.get("dtypes", "fp16, bf16"),
+                "compute_capability": backend_info.get("compute_capability", "Any"),
+                "notes": notes,
+            }
+        )
+
+    return prefill_backends
+
+
 # ---------------------------------------------------------------------------
 # Backend feature extraction from AST
 # ---------------------------------------------------------------------------
@@ -570,6 +810,9 @@ def analyze_backend(backend_name: str, class_path: str) -> dict[str, Any] | None
         "compute_capability": compute_cap,
         "is_mla": is_mla_backend or check_method_overrides(class_node, "is_mla"),
         "supports_sink": check_method_overrides(class_node, "supports_sink"),
+        "supports_non_causal": check_method_overrides(
+            class_node, "supports_non_causal"
+        ),
         "is_sparse": check_method_overrides(class_node, "is_sparse"),
         "supports_mm_prefix": check_method_overrides(class_node, "supports_mm_prefix"),
         "supports_dcp": supports_dcp,
@@ -788,6 +1031,11 @@ def parse_flashinfer_trtllm_features() -> dict[str, dict[str, Any]]:
     if not trtllm_compute_cap:
         return {}
 
+    # KV cache dtypes that only work with a dedicated kernel (e.g. nvfp4
+    # requires the SM100 NVFP4 MHA kernel) and should not appear in the
+    # generic attention-backend feature matrix.
+    kernel_only_kv_dtypes = ["nvfp4"]
+
     return {
         "native": {
             # Native FlashInfer: everything except SM100
@@ -798,89 +1046,10 @@ def parse_flashinfer_trtllm_features() -> dict[str, dict[str, Any]]:
             "compute_capability": trtllm_compute_cap,
             "supports_sink": True,
         },
+        "exclude_kv_dtypes": kernel_only_kv_dtypes,
     }
 
 
-def parse_mla_prefill_backends() -> list[dict[str, Any]]:
-    """Parse MLA prefill backend options from mla_attention.py.
-
-    MLA uses different backends for prefill vs decode. The decode backends are
-    registered in the registry, but prefill backends are selected at runtime
-    based on conditions in MLACommonImpl.__init__.
-
-    Returns a list of prefill backend info dicts with their requirements.
-    """
-    if not MLA_ATTENTION_FILE.exists():
-        return []
-
-    try:
-        tree = ast.parse(MLA_ATTENTION_FILE.read_text())
-    except Exception:
-        return []
-
-    # Find compute capability requirements by parsing use_* functions
-    trtllm_cc = _find_cc_in_function(tree, "use_trtllm_ragged_deepseek_prefill")
-    flashinfer_cc = _find_cc_in_function(tree, "use_flashinfer_prefill")
-    cudnn_cc = _find_cc_in_function(tree, "use_cudnn_prefill")
-
-    # Build prefill backend list based on what we found
-    # Order matches the priority in MLACommonImpl.__init__
-    prefill_backends: list[dict[str, Any]] = []
-
-    # TRT-LLM Ragged (highest priority if available)
-    if trtllm_cc:
-        prefill_backends.append(
-            {
-                "name": "TRT-LLM Ragged‡",
-                "description": "TensorRT-LLM ragged attention",
-                "compute_capability": trtllm_cc,
-                "enable": "Default on SM100",
-                "disable": "`-ac.use_trtllm_ragged_deepseek_prefill=0`",
-                "notes": "DeepSeek R1 dims only",
-            }
-        )
-
-    # FlashInfer prefill
-    if flashinfer_cc:
-        prefill_backends.append(
-            {
-                "name": "FlashInfer",
-                "description": "FlashInfer CUTLASS backend",
-                "compute_capability": flashinfer_cc,
-                "enable": "`-ac.disable_flashinfer_prefill=0`",
-                "disable": "`-ac.disable_flashinfer_prefill=1`",
-                "notes": "DeepSeek R1 dims only",
-            }
-        )
-
-    # cuDNN prefill
-    if cudnn_cc:
-        prefill_backends.append(
-            {
-                "name": "cuDNN",
-                "description": "cuDNN-based attention",
-                "compute_capability": cudnn_cc,
-                "enable": "`-ac.use_cudnn_prefill=1`",
-                "disable": "`-ac.use_cudnn_prefill=0`",
-                "notes": "",
-            }
-        )
-
-    # FlashAttention is always available as fallback
-    prefill_backends.append(
-        {
-            "name": "FlashAttention",
-            "description": "FlashAttention varlen (FA2/FA3)",
-            "compute_capability": "Any",
-            "enable": "Default fallback",
-            "disable": "Use other backends",
-            "notes": "FA3 on SM90, FA2 otherwise",
-        }
-    )
-
-    return prefill_backends
-
-
 # ---------------------------------------------------------------------------
 # Backend variant expansion (FA2/FA3/FA4, FlashInfer native/TRTLLM)
 # ---------------------------------------------------------------------------
@@ -963,6 +1132,15 @@ def _expand_flashinfer_variants(
         native["supports_sink"] = fi_features["native"]["supports_sink"]
         native["compute_capability"] = f"{min_cc}.x-9.x"
 
+        # Remove KV dtypes only supported by SM100 kernels (e.g. nvfp4)
+        exclude = fi_features.get("exclude_kv_dtypes", [])
+        if exclude:
+            native["kv_cache_dtypes"] = ", ".join(
+                d
+                for d in (d.strip() for d in native["kv_cache_dtypes"].split(","))
+                if d not in exclude
+            )
+
         # Create TRTLLM entry
         trtllm = backend.copy()
         trtllm["version"] = "TRTLLM†"
@@ -1136,6 +1314,10 @@ def _extract_priorities(body: list, priorities: dict[str, list[str]], prefix: st
 _COL_BLOCK_SIZES: TableColumn = ("Block Sizes", lambda b: b["block_sizes"])
 _COL_HEAD_SIZES: TableColumn = ("Head Sizes", lambda b: b["head_sizes"])
 _COL_SINK: TableColumn = ("Sink", lambda b: bool_to_emoji(b["supports_sink"]))
+_COL_NON_CAUSAL: TableColumn = (
+    "Non-Causal",
+    lambda b: bool_to_emoji(b["supports_non_causal"]),
+)
 _COL_SPARSE: TableColumn = ("Sparse", lambda b: bool_to_emoji(b["is_sparse"]))
 _COL_MM_PREFIX: TableColumn = (
     "MM Prefix",
@@ -1169,6 +1351,7 @@ def _build_columns(is_mla: bool, has_versions: bool) -> list[TableColumn]:
         cols.append(_COL_VERSION)
     cols.extend([_COL_DTYPES, _COL_KV_DTYPES, _COL_BLOCK_SIZES, _COL_HEAD_SIZES])
     cols.append(_COL_SINK)
+    cols.append(_COL_NON_CAUSAL)
     if is_mla:
         cols.append(_COL_SPARSE)
     cols.extend([_COL_MM_PREFIX, _COL_DCP, _COL_ATTN_TYPES, _COL_COMPUTE_CAP])
@@ -1379,6 +1562,7 @@ def generate_legend() -> str:
 | **Block Sizes** | Supported KV cache block sizes (%N means multiples of N) |
 | **Head Sizes** | Supported attention head sizes |
 | **Sink** | Attention sink support (for StreamingLLM) |
+| **Non-Causal** | Non-causal (bidirectional) attention support for decoder models |
 | **Sparse** | Sparse attention support (MLA only) |
 | **MM Prefix** | Multimodal prefix full attention support |
 | **DCP** | Decode Context Parallelism support (`--decode-context-parallel-size`) |
@@ -1400,20 +1584,22 @@ def generate_mla_section(
         "",
         "### Prefill Backends",
         "",
-        "The prefill backend is selected at runtime based on hardware and",
-        "configuration.",
+        "To explicitly select a prefill backend, use",
+        "`-ac.mla_prefill_backend=<BACKEND>` (e.g., `FLASH_ATTN`, `FLASHINFER`).",
+        "Otherwise, the prefill backend is selected automatically at runtime based on",
+        "hardware and configuration.",
         "",
-        "| Backend | Description | Compute Cap. | Enable | Disable | Notes |",
-        "| ------- | ----------- | ------------ | ------ | ------- | ----- |",
+        "| Backend | Description | Dtypes | Compute Cap. | Notes |",
+        "| ------- | ----------- | ------ | ------------ | ----- |",
     ]
 
     for backend in prefill_backends:
-        row = "| {} | {} | {} | {} | {} | {} |".format(
+        row = "| `{}`{} | {} | {} | {} | {} |".format(
             backend["name"],
+            backend.get("marker", ""),
             backend["description"],
+            backend.get("dtypes", "fp16, bf16"),
             backend["compute_capability"],
-            backend["enable"],
-            backend["disable"],
             backend.get("notes", ""),
         )
         lines.append(row.replace("  ", " "))
@@ -1426,6 +1612,9 @@ def generate_mla_section(
             "",
             "### Decode Backends",
             "",
+            "MLA decode backends are selected using the standard",
+            "`-ac.backend=<BACKEND>` argument (e.g., `FLASHMLA`, `TRITON_MLA`).",
+            "",
         ]
     )
 
diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py
index 0250fbfac709..45defc6926ba 100644
--- a/vllm/_aiter_ops.py
+++ b/vllm/_aiter_ops.py
@@ -2,9 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
 from collections.abc import Callable
+from contextlib import contextmanager
+from typing import Protocol
 
 import torch
 from torch._ops import OpOverload
+from torch.distributed import ProcessGroup
 
 import vllm.envs as envs
 from vllm.platforms import current_platform
@@ -39,6 +42,27 @@ def is_aiter_found() -> bool:
 IS_AITER_FOUND = is_aiter_found()
 
 
+class AiterCustomAllreduceProto(Protocol):
+    max_size: int
+    world_size: int
+    fully_connected: bool
+
+    @contextmanager
+    def capture(self): ...
+    def close(self) -> None: ...
+    def fused_ar_rms(
+        self,
+        inp: torch.Tensor,
+        res_inp: torch.Tensor,
+        *,
+        w: torch.Tensor,
+        eps: float,
+        registered: bool = False,
+        use_1stage: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]: ...
+    def should_custom_ar(self, inp: torch.Tensor) -> bool: ...
+
+
 def is_aiter_found_and_supported() -> bool:
     """Check if AITER library is available and platform supports it.
 
@@ -623,58 +647,6 @@ def _rocm_aiter_gemm_a8w8_blockscale_fake(
     return Y
 
 
-def _rocm_aiter_rms_norm_impl(
-    x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
-) -> torch.Tensor:
-    from aiter import rms_norm
-
-    if x.dim() > 2:
-        x_original_shape = x.shape
-        x = x.reshape(-1, x_original_shape[-1])
-        x = rms_norm(x, weight, variance_epsilon)
-        return x.reshape(x_original_shape)
-
-    return rms_norm(x, weight, variance_epsilon)
-
-
-def _rocm_aiter_rms_norm_fake(
-    x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
-) -> torch.Tensor:
-    return torch.empty_like(x)
-
-
-def _rocm_aiter_rmsnorm2d_fwd_with_add_impl(
-    x: torch.Tensor,
-    residual: torch.Tensor,
-    weight: torch.Tensor,
-    variance_epsilon: float,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    from aiter import rmsnorm2d_fwd_with_add
-
-    residual_out = torch.empty_like(residual)
-    out = torch.empty_like(x)
-    rmsnorm2d_fwd_with_add(
-        out,  # output
-        x,  # input
-        residual,  # residual input
-        residual_out,  # residual output
-        weight,
-        variance_epsilon,
-    )
-    return out, residual_out
-
-
-def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
-    x: torch.Tensor,
-    residual: torch.Tensor,
-    weight: torch.Tensor,
-    variance_epsilon: float,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    residual_out = torch.empty_like(residual)
-    out = torch.empty_like(x)
-    return out, residual_out
-
-
 def _rocm_aiter_rmsnorm_fused_add_dynamic_quant_impl(
     x: torch.Tensor,
     residual: torch.Tensor,
@@ -750,6 +722,55 @@ def _rocm_aiter_rmsnorm_fused_dynamic_quant_fake(
     return out, y_scale
 
 
+def _rocm_aiter_fused_allreduce_rmsnorm_impl(
+    input_: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    epsilon: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    aiter_ar = rocm_aiter_ops.get_aiter_allreduce()
+    assert aiter_ar is not None, "aiter allreduce must be initialized"
+
+    total_bytes = input_.numel() * input_.element_size()
+    hidden_dim = input_.shape[-1]
+    token_num = input_.shape[0]
+    hidden_ok = hidden_dim in (512, 1024, 2048, 4096, 7168)
+    token_ok = token_num <= 80
+    world_size = aiter_ar.world_size
+    full_nvlink = aiter_ar.fully_connected
+
+    if world_size == 2:
+        size_ok = True
+    elif full_nvlink and world_size <= 4:
+        size_ok = total_bytes < 256 * 1024
+    elif full_nvlink and world_size <= 8:
+        size_ok = total_bytes < 128 * 1024
+    else:
+        size_ok = False
+
+    use_1stage = hidden_ok and token_ok and size_ok
+
+    result = aiter_ar.fused_ar_rms(
+        input_,
+        residual,
+        w=weight,
+        eps=epsilon,
+        registered=torch.cuda.is_current_stream_capturing(),
+        use_1stage=use_1stage,
+    )
+    assert result is not None
+    return result[0], result[1]
+
+
+def _rocm_aiter_fused_allreduce_rmsnorm_fake(
+    input_: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    epsilon: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    return torch.empty_like(input_), torch.empty_like(residual)
+
+
 def _rocm_aiter_per_tensor_quant_impl(
     x: torch.Tensor,
     quant_dtype: torch.dtype,
@@ -778,7 +799,7 @@ def _rocm_aiter_per_token_quant_impl(
     assert quant_dtype in [torch.int8, FP8_DTYPE]
 
     out_shape = x.shape
-    out = torch.empty(x.shape, dtype=FP8_DTYPE, device=x.device)
+    out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
     if scale is None:
         scale = torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device)
     dynamic_per_token_scaled_quant(
@@ -798,7 +819,7 @@ def _rocm_aiter_per_token_quant_fake(
 ) -> tuple[torch.Tensor, torch.Tensor]:
     out_shape = x.shape
     return (
-        torch.empty(x.shape, dtype=FP8_DTYPE, device=x.device),
+        torch.empty(x.shape, dtype=quant_dtype, device=x.device),
         torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device),
     )
 
@@ -1156,10 +1177,9 @@ class rocm_aiter_ops:
 
         # Check if aiter is enabled before using operations
         if rocm_aiter_ops.is_enabled():
-            result = rocm_aiter_ops.rms_norm(x, weight, epsilon)
+            result = rocm_aiter_ops.per_token_quant(x, FP8_DTYPE)
 
     Operations:
-        - RMS normalization: rms_norm, rms_norm2d_with_add
         - GEMM operations: gemm_a8w8, gemm_a8w8_blockscale
         - Fused MoE: fused_moe, asm_moe_tkw1
         - Routing: topk_softmax, biased_grouped_topk, grouped_topk
@@ -1171,7 +1191,6 @@ class rocm_aiter_ops:
     # Check if the env variable is set
     _AITER_ENABLED = envs.VLLM_ROCM_USE_AITER
     _LINEAR_ENABLED = envs.VLLM_ROCM_USE_AITER_LINEAR
-    _RMSNORM_ENABLED = envs.VLLM_ROCM_USE_AITER_RMSNORM
     _FMOE_ENABLED = envs.VLLM_ROCM_USE_AITER_MOE
     _MLA_ENABLED = envs.VLLM_ROCM_USE_AITER_MLA
     _MHA_ENABLED = envs.VLLM_ROCM_USE_AITER_MHA
@@ -1188,6 +1207,9 @@ class rocm_aiter_ops:
     # TODO: Consolidate under _LINEAR_ENABLED
     _TRITON_UNQUANT_GEMM = envs.VLLM_ROCM_USE_AITER_TRITON_GEMM
 
+    _ALL_REDUCE_MAX_SIZE: int = 8192 * 1024 * 8 * 2
+    _CUSTOM_ALL_REDUCE: AiterCustomAllreduceProto | None = None
+
     @classmethod
     def refresh_env_variables(cls):
         """
@@ -1199,7 +1221,6 @@ def refresh_env_variables(cls):
         """
         cls._AITER_ENABLED = envs.VLLM_ROCM_USE_AITER
         cls._LINEAR_ENABLED = envs.VLLM_ROCM_USE_AITER_LINEAR
-        cls._RMSNORM_ENABLED = envs.VLLM_ROCM_USE_AITER_RMSNORM
         cls._FMOE_ENABLED = envs.VLLM_ROCM_USE_AITER_MOE
         cls._MLA_ENABLED = envs.VLLM_ROCM_USE_AITER_MLA
         cls._MHA_ENABLED = envs.VLLM_ROCM_USE_AITER_MHA
@@ -1291,11 +1312,6 @@ def is_linear_enabled(cls) -> bool:
     def is_linear_fp8_enabled(cls) -> bool:
         return cls.is_linear_enabled()
 
-    @classmethod
-    @if_aiter_supported
-    def is_rmsnorm_enabled(cls) -> bool:
-        return cls._AITER_ENABLED and cls._RMSNORM_ENABLED
-
     @classmethod
     @if_aiter_supported
     def is_fused_moe_enabled(cls) -> bool:
@@ -1355,6 +1371,42 @@ def is_triton_rotary_embed_enabled(cls) -> bool:
     def is_triton_gemm_enabled(cls) -> bool:
         return cls._AITER_ENABLED and cls._TRITON_UNQUANT_GEMM
 
+    @classmethod
+    @if_aiter_supported
+    def is_tgemm_enabled(cls) -> bool:
+        from vllm.platforms.rocm import on_gfx950
+
+        return cls.is_linear_enabled() and on_gfx950()
+
+    @classmethod
+    def initialize_aiter_allreduce(
+        cls, group: ProcessGroup, device: torch.device
+    ) -> None:
+        try:
+            from aiter.dist.device_communicators.custom_all_reduce import (
+                CustomAllreduce as AiterCustomAllreduce,
+            )
+
+            cls._CUSTOM_ALL_REDUCE = AiterCustomAllreduce(group, device)
+        except Exception:
+            cls._CUSTOM_ALL_REDUCE = None
+
+    @classmethod
+    def get_aiter_allreduce(cls) -> AiterCustomAllreduceProto | None:
+        return cls._CUSTOM_ALL_REDUCE
+
+    @classmethod
+    def destroy_aiter_allreduce(cls) -> None:
+        if cls._CUSTOM_ALL_REDUCE is not None:
+            cls._CUSTOM_ALL_REDUCE.close()
+            cls._CUSTOM_ALL_REDUCE = None
+
+    @classmethod
+    def get_aiter_allreduce_max_size(cls) -> int | None:
+        # effective max input size (based on upstream aiter version: v0.1.10.post3)
+        # https://github.com/ROCm/aiter/blob/6a0e7b26ccf33164785531212cc2ec2cde0b9243/aiter/dist/device_communicators/custom_all_reduce.py#L272-L273
+        return int(cls._ALL_REDUCE_MAX_SIZE / 2)
+
     @staticmethod
     @if_aiter_supported
     def register_ops_once() -> None:
@@ -1448,19 +1500,6 @@ def register_ops_once() -> None:
                 fake_impl=_rocm_aiter_gemm_a8w8_blockscale_fake,
             )
 
-            direct_register_custom_op(
-                op_name="rocm_aiter_rms_norm",
-                op_func=_rocm_aiter_rms_norm_impl,
-                fake_impl=_rocm_aiter_rms_norm_fake,
-            )
-
-            direct_register_custom_op(
-                op_name="rocm_aiter_rmsnorm2d_fwd_with_add",
-                op_func=_rocm_aiter_rmsnorm2d_fwd_with_add_impl,
-                fake_impl=_rocm_aiter_rmsnorm2d_fwd_with_add_fake,
-                dispatch_key=current_platform.dispatch_key,
-            )
-
             direct_register_custom_op(
                 op_name="rocm_aiter_rmsnorm_fused_dynamic_quant",
                 op_func=_rocm_aiter_rmsnorm_fused_dynamic_quant_impl,
@@ -1545,6 +1584,12 @@ def register_ops_once() -> None:
                 fake_impl=_triton_rotary_embedding_fake,
             )
 
+            direct_register_custom_op(
+                op_name="rocm_aiter_fused_allreduce_rmsnorm",
+                op_func=_rocm_aiter_fused_allreduce_rmsnorm_impl,
+                fake_impl=_rocm_aiter_fused_allreduce_rmsnorm_fake,
+            )
+
             direct_register_custom_op(
                 op_name="fused_mla_dual_rms_norm",
                 op_func=_fused_mla_dual_rms_norm_impl,
@@ -1554,14 +1599,6 @@ def register_ops_once() -> None:
 
             _OPS_REGISTERED = True
 
-    @staticmethod
-    def get_rmsnorm_fused_add_op() -> OpOverload:
-        return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add.default
-
-    @staticmethod
-    def get_rmsnorm_op() -> OpOverload:
-        return torch.ops.vllm.rocm_aiter_rms_norm.default
-
     @staticmethod
     def get_rmsnorm_fused_add_dynamic_quant_op() -> OpOverload:
         return torch.ops.vllm.rocm_aiter_rmsnorm_fused_add_dynamic_quant.default
@@ -1599,25 +1636,12 @@ def get_triton_rotary_embedding_op() -> OpOverload:
         return torch.ops.vllm.rocm_aiter_triton_rotary_embedding.default
 
     @staticmethod
-    def get_fused_mla_dual_rms_norm_op() -> OpOverload:
-        return torch.ops.vllm.fused_mla_dual_rms_norm.default
-
-    @staticmethod
-    def rms_norm(
-        x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
-    ) -> torch.Tensor:
-        return torch.ops.vllm.rocm_aiter_rms_norm(x, weight, variance_epsilon)
+    def get_fused_allreduce_rmsnorm_op() -> OpOverload:
+        return torch.ops.vllm.rocm_aiter_fused_allreduce_rmsnorm.default
 
     @staticmethod
-    def rms_norm2d_with_add(
-        x: torch.Tensor,
-        residual: torch.Tensor,
-        weight: torch.Tensor,
-        variance_epsilon: float,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add(
-            x, residual, weight, variance_epsilon
-        )
+    def get_fused_mla_dual_rms_norm_op() -> OpOverload:
+        return torch.ops.vllm.fused_mla_dual_rms_norm.default
 
     @staticmethod
     def w8a8_gemm(
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 3d5fbd18d6e8..430536ecca41 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -2449,23 +2449,6 @@ def moe_wna16_gemm(
     )
 
 
-def router_gemm_bf16_fp32(input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
-    """bf16 x bf16 -> fp32 GEMM via cuBLAS. weight shape: (N, K)."""
-    return torch.ops._moe_C.router_gemm_bf16_fp32(input, weight)
-
-
-if hasattr(torch.ops, "_moe_C") and hasattr(torch.ops._moe_C, "router_gemm_bf16_fp32"):
-
-    @register_fake("_moe_C::router_gemm_bf16_fp32")
-    def router_gemm_bf16_fp32_fake(
-        input: torch.Tensor,
-        weight: torch.Tensor,
-    ) -> torch.Tensor:
-        return torch.empty(
-            input.shape[0], weight.shape[0], dtype=torch.float32, device=input.device
-        )
-
-
 def dsv3_router_gemm(
     hidden_states: torch.Tensor,
     router_weight: torch.Tensor,
@@ -3236,6 +3219,39 @@ def int4_scaled_mm_cpu_fake(
 
 _supports_cpu_w4a8_int8 = bool(hasattr(torch.ops._C, "convert_weight_packed_scale_zp"))
 
+if hasattr(torch.ops._C, "fp8_scaled_mm_cpu"):
+
+    @register_fake("_C::fp8_scaled_mm_cpu")
+    def fp8_scaled_mm_cpu_fake(
+        mat1: torch.Tensor,
+        mat2: torch.Tensor,
+        scales2: torch.Tensor,
+        block_size: list[int],
+        bias: torch.Tensor | None,
+        out_dtype: torch.dtype,
+        is_vnni: bool,
+    ) -> torch.Tensor:
+        M = mat1.size(0)
+        N = mat2.size(0)
+        return torch.empty((M, N), dtype=out_dtype, device=mat1.device)
+
+
+_supports_cpu_fp8_w8a16 = bool(hasattr(torch.ops._C, "fp8_scaled_mm_cpu"))
+
+
+def fp8_scaled_mm_cpu(
+    mat1: torch.Tensor,
+    mat2: torch.Tensor,
+    scales2: torch.Tensor,
+    block_size: list[int],
+    bias: torch.Tensor | None,
+    out_dtype: torch.dtype,
+    is_vnni: bool,
+) -> torch.Tensor:
+    return torch.ops._C.fp8_scaled_mm_cpu(
+        mat1, mat2, scales2, block_size, bias, out_dtype, is_vnni
+    )
+
 
 class CPUDNNLGEMMHandler:
     def __init__(self) -> None:
@@ -3403,6 +3419,9 @@ def cpu_attn_reshape_and_cache(
     value_cache: torch.Tensor,
     slot_mapping: torch.Tensor,
     isa: str,
+    k_scale: float = 1.0,
+    v_scale: float = 1.0,
+    kv_cache_dtype: str = "auto",
 ) -> None:
     torch.ops._C.cpu_attn_reshape_and_cache(
         key,
@@ -3411,6 +3430,9 @@ def cpu_attn_reshape_and_cache(
         value_cache,
         slot_mapping,
         isa,
+        k_scale,
+        v_scale,
+        kv_cache_dtype,
     )
 
 
@@ -3429,6 +3451,9 @@ def cpu_attention_with_kv_cache(
     softcap: float,
     scheduler_metadata: torch.Tensor,
     s_aux: torch.Tensor | None,
+    k_scale: float = 1.0,
+    v_scale: float = 1.0,
+    kv_cache_dtype: str = "auto",
 ) -> None:
     torch.ops._C.cpu_attention_with_kv_cache(
         query,
@@ -3446,6 +3471,9 @@ def cpu_attention_with_kv_cache(
         softcap,
         scheduler_metadata,
         s_aux,
+        k_scale,
+        v_scale,
+        kv_cache_dtype,
     )
 
 
diff --git a/vllm/_oink_ops.py b/vllm/_oink_ops.py
deleted file mode 100644
index c7a055410b71..000000000000
--- a/vllm/_oink_ops.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Small helper wrappers for external Oink Blackwell custom ops.
-
-vLLM does not depend on the external Oink repository/package. When an external
-plugin registers torch.library.custom_op entrypoints under the `oink::`
-namespace (e.g. via vLLM's general_plugins mechanism) and
-`VLLM_USE_OINK_OPS=1` is set, vLLM can route eligible calls to those ops.
-
-This module provides:
-- A single place to probe Oink op availability at module init time
-  (outside torch.compile tracing), and
-- Thin wrappers around the torch.ops entrypoints for use in CUDA fast paths,
-  without introducing graph breaks.
-
-Important:
-  Do not call the availability helpers in a compiled region. They may call
-  functions decorated with `torch._dynamo.disable` to safely check
-  conditions that should not be traced.
-"""
-
-from __future__ import annotations
-
-from collections.abc import Callable
-
-import torch
-
-try:
-    from torch._dynamo import disable as _dynamo_disable  # type: ignore[attr-defined]
-except Exception:  # pragma: no cover
-
-    def _dynamo_disable(fn: Callable):  # type: ignore[misc]
-        return fn
-
-
-def _has_oink_op(op_name: str) -> bool:
-    """Check if a specific oink op is registered."""
-    return hasattr(torch.ops, "oink") and hasattr(torch.ops.oink, op_name)
-
-
-@_dynamo_disable
-def is_oink_available_for_device(device_index: int) -> bool:
-    """Return True if Oink ops are registered and device is SM100+.
-
-    This function is intended to be called during module initialization
-    (e.g., in RMSNorm.__init__), not in the forward path.
-
-    External plugins are expected to gate registration on SM100+ and
-    VLLM_USE_OINK_OPS=1, so if the ops are present they should be usable.
-    """
-    if not torch.cuda.is_available():
-        return False
-
-    try:
-        major, minor = torch.cuda.get_device_capability(device_index)
-        sm = 10 * major + minor
-        if sm < 100:
-            return False
-    except Exception:
-        return False
-
-    return _has_oink_op("rmsnorm")
-
-
-def has_fused_add_rms_norm() -> bool:
-    """Return True if the in-place fused op is registered."""
-    return _has_oink_op("fused_add_rms_norm")
-
-
-def rmsnorm(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
-    """Call `torch.ops.oink.rmsnorm`.
-
-    This wrapper is safe to call in torch.compile regions.
-    """
-    return torch.ops.oink.rmsnorm(x, weight, eps)
-
-
-def fused_add_rms_norm_(
-    x: torch.Tensor,
-    residual: torch.Tensor,
-    weight: torch.Tensor,
-    eps: float,
-) -> None:
-    """Call `torch.ops.oink.fused_add_rms_norm` (mutates x and residual)."""
-    torch.ops.oink.fused_add_rms_norm(x, residual, weight, eps)
-
-
-def fused_add_rms_norm(
-    x: torch.Tensor,
-    residual: torch.Tensor,
-    weight: torch.Tensor,
-    eps: float,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    """Convenience wrapper returning (x, residual) after in-place mutation."""
-    fused_add_rms_norm_(x, residual, weight, eps)
-    return x, residual
diff --git a/vllm/_xpu_ops.py b/vllm/_xpu_ops.py
index 0b39a4000126..09f700d0de70 100644
--- a/vllm/_xpu_ops.py
+++ b/vllm/_xpu_ops.py
@@ -185,6 +185,35 @@ def _xpu_ops_deepseek_scaling_rope_fake(
     return query, key
 
 
+def _topk_topp_sample_impl(
+    random_sampled: torch.Tensor,
+    logits_to_return: torch.Tensor | None,
+    logits: torch.Tensor,
+    k: torch.Tensor | None,
+    p: torch.Tensor | None,
+    logprobs_mode: str,
+    seeds: torch.Tensor | None,
+    lambda_: float = 1.0,
+) -> None:
+    torch.ops._xpu_C.topk_topp_sampler(
+        random_sampled, logits_to_return, logits, k, p, logprobs_mode, seeds, lambda_
+    )
+    return
+
+
+def _topk_topp_sample_fake(
+    random_sampled: torch.Tensor,
+    logits_to_return: torch.Tensor | None,
+    logits: torch.Tensor,
+    k: torch.Tensor | None,
+    p: torch.Tensor | None,
+    logprobs_mode: str,
+    seeds: torch.Tensor | None,
+    lambda_: float = 1.0,
+) -> None:
+    return
+
+
 def _xpu_mxfp8_quantize_impl(
     x: torch.Tensor, dtype: torch.dtype | None = None
 ) -> tuple[torch.Tensor, torch.Tensor]:
@@ -691,6 +720,12 @@ def register_ops_once() -> None:
                 fake_impl=_gdn_attention_core_xpu_fake,
             )
 
+            direct_register_custom_op(
+                op_name="xpu_topk_topp_sampler",
+                op_func=_topk_topp_sample_impl,
+                fake_impl=_topk_topp_sample_fake,
+            )
+
             _OPS_REGISTERED = True
 
 
diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py
index 745b5ab2ff8f..b032c0a0d613 100644
--- a/vllm/benchmarks/datasets/datasets.py
+++ b/vllm/benchmarks/datasets/datasets.py
@@ -1803,7 +1803,9 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
 
     if args.dataset_name == "custom":
         dataset = CustomDataset(
-            dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
+            dataset_path=args.dataset_path,
+            disable_shuffle=args.disable_shuffle,
+            random_seed=args.seed,
         )
         input_requests = dataset.sample(
             num_requests=args.num_prompts,
@@ -1816,7 +1818,9 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
 
     elif args.dataset_name == "custom_mm":
         dataset = CustomMMDataset(
-            dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
+            dataset_path=args.dataset_path,
+            disable_shuffle=args.disable_shuffle,
+            random_seed=args.seed,
         )
         input_requests = dataset.sample(
             num_requests=args.num_prompts,
@@ -2365,6 +2369,7 @@ def load_data(self) -> None:
             random.shuffle(self.data)
 
     def sample(
+        self,
         **kwargs,
     ) -> list[SampleRequest]:
         # leverage CustomDataset sample
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index 42a8132ffe6e..1c3b728fdbef 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -393,29 +393,48 @@ def get_requests(args, tokenizer):
     elif args.dataset_name == "hf":
         if args.output_len is not None:
             sample_kwargs["output_len"] = args.output_len
-        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
+        common_kwargs["hf_name"] = args.hf_name
+        if (
+            args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in VisionArenaDataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_cls = VisionArenaDataset
             common_kwargs["dataset_subset"] = None
             common_kwargs["dataset_split"] = "train"
             sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
+        elif (
+            args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_cls = InstructCoderDataset
             common_kwargs["dataset_split"] = "train"
-        elif args.dataset_path in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS:
+        elif (
+            args.dataset_path in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_cls = MultiModalConversationDataset
             common_kwargs["dataset_subset"] = args.hf_subset
             common_kwargs["dataset_split"] = args.hf_split
             sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
+        elif (
+            args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in ConversationDataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_cls = ConversationDataset
             common_kwargs["dataset_subset"] = args.hf_subset
             common_kwargs["dataset_split"] = args.hf_split
             sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
+        elif (
+            args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in AIMODataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_cls = AIMODataset
             common_kwargs["dataset_subset"] = None
             common_kwargs["dataset_split"] = "train"
-        elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
+        elif (
+            args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in ASRDataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_cls = ASRDataset
             common_kwargs["dataset_subset"] = args.hf_subset
             common_kwargs["dataset_split"] = args.hf_split
@@ -557,6 +576,10 @@ def validate_args(args):
             VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
             | MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
             | ConversationDataset.SUPPORTED_DATASET_PATHS
+        ) or args.hf_name in (
+            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
+            | MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
+            | ConversationDataset.SUPPORTED_DATASET_PATHS
         ):
             assert args.backend == "vllm-chat", (
                 f"{args.dataset_path} needs to use vllm-chat as the backend."
@@ -565,6 +588,10 @@ def validate_args(args):
             InstructCoderDataset.SUPPORTED_DATASET_PATHS
             | AIMODataset.SUPPORTED_DATASET_PATHS
             | ASRDataset.SUPPORTED_DATASET_PATHS
+        ) or args.hf_name in (
+            InstructCoderDataset.SUPPORTED_DATASET_PATHS
+            | AIMODataset.SUPPORTED_DATASET_PATHS
+            | ASRDataset.SUPPORTED_DATASET_PATHS
         ):
             assert args.backend == "vllm", (
                 f"{args.dataset_path} needs to use vllm as the backend."
@@ -795,7 +822,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "context in a request (default: 0).",
     )
 
-    # hf dtaset
+    # hf dataset
     parser.add_argument(
         "--hf-subset",
         type=str,
@@ -808,6 +835,17 @@ def add_cli_args(parser: argparse.ArgumentParser):
         default=None,
         help="Split of the HF dataset.",
     )
+    parser.add_argument(
+        "--hf-name",
+        type=str,
+        default=None,
+        help=(
+            "Name of the dataset on HuggingFace "
+            "(e.g., 'lmms-lab/LLaVA-OneVision-Data'). "
+            "Specify this when --dataset-path is a local filesystem path "
+            "so the benchmark can identify the correct dataset class."
+        ),
+    )
     parser.add_argument(
         "--profile",
         action="store_true",
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 569b0ac0801d..5a67415f1030 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -50,6 +50,7 @@
     should_split,
 )
 from .passes.inductor_pass import InductorPass, pass_context
+from .passes.ir.inplace_functionalization import VllmIRInplaceFunctionalizationPass
 from .passes.pass_manager import PostGradPassManager
 
 logger = init_logger(__name__)
@@ -279,6 +280,7 @@ def compile(
         compilation_counter.num_backend_compilations += 1
 
         compiled_graph = None
+        handle = None
 
         # try to load from the cache
         compiled_graph = self.load(graph, example_inputs, graph_index, compile_range)
@@ -357,7 +359,7 @@ def autograd_cache_key(*args, **kwargs):
                     )
                 except StopCompiling:
                     assert cache_key is not None
-                    return self.loaded_artifacts[cache_key]
+                    compiled_graph = self.loaded_artifacts[cache_key]
             if cache_key is not None and compiled_graph is not None:
                 self.loaded_artifacts[cache_key] = compiled_graph
 
@@ -925,6 +927,24 @@ def collect_standalone_compile_artifacts(
         return standalone_compile_artifacts, sym_shape_indices_map, returns_tuple_map
 
     def configure_post_pass(self) -> None:
+        # TODO proper PassManager?
+        pre_grad_pass_key = "pre_grad_custom_pass"
+        assert self.pass_key != pre_grad_pass_key
+        assert pre_grad_pass_key not in self.inductor_config
+        self.inductor_config[pre_grad_pass_key] = VllmIRInplaceFunctionalizationPass(
+            self.vllm_config
+        )
+
+        # Make sure pre_grad_custom_pass is not pickled
+        # as part of AOTAutograd built-in cache key
+        # TODO(luka) is there a cleaner way to do this
+        import torch._inductor.config as inductor_config
+
+        ignore = inductor_config._cache_config_ignore_prefix + [pre_grad_pass_key]
+        assert "_cache_config_ignore_prefix" not in self.inductor_config
+        self.inductor_config["_cache_config_ignore_prefix"] = ignore
+
+        # Configure the (nominally post-grad) pass manager
         self.pass_manager.configure(self.vllm_config)
 
         # Post-grad custom passes are run using the post_grad_custom_post_pass
@@ -1248,7 +1268,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
             original_split_gm if envs.VLLM_USE_MEGA_AOT_ARTIFACT else self.graph
         )
 
-        execution_code, submod_names = generate_execution_code(self.split_gm)
+        execution_code, submod_names, consts = generate_execution_code(self.split_gm)
         # Use getattr to get correct callables: __dict__ has PiecewiseBackend
         # instances (from PiecewiseCompileInterpreter), _modules has originals.
         # getattr checks __dict__ first, then falls back to _modules.
@@ -1257,7 +1277,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
             for name, _ in self.split_gm.named_children()
         }
         runtime_callable = compile_execution_fn(
-            execution_code, submod_callables, submod_names
+            execution_code, submod_callables, submod_names, consts
         )
 
         if (
@@ -1273,6 +1293,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
                 vllm_backend=self,
                 execution_code=execution_code,
                 submod_names=submod_names,
+                consts=consts,
             )
 
         # index of tensors that have symbolic shapes (batch size)
@@ -1306,4 +1327,5 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
             sym_tensor_indices=sym_tensor_indices,
             execution_code=execution_code,
             submod_names=submod_names,
+            consts=consts,
         )
diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py
index 81c3d7b28655..62da2d9de35b 100644
--- a/vllm/compilation/caching.py
+++ b/vllm/compilation/caching.py
@@ -187,6 +187,7 @@ def __init__(
         aot_autograd_config: dict[str, Any] | None = None,
         execution_code: str | None = None,
         submod_names: list[str] | None = None,
+        consts: list[Any] | None = None,
     ) -> None:
         self.graph_module = graph_module
         self.example_inputs = example_inputs
@@ -198,6 +199,7 @@ def __init__(
         self.sym_tensor_indices = sym_tensor_indices
         self.execution_code = execution_code
         self.submod_names = submod_names
+        self.consts = consts
         self._fake_mode: Any | None = None
 
         import torch._functorch.config as functorch_config
@@ -526,8 +528,9 @@ def reconstruct_serializable_fn_from_mega_artifact(
     execution_code = state.get("execution_code")
     submod_names = state.get("submod_names")
     if execution_code is not None and submod_names is not None:
+        consts = state.get("consts")
         runtime_callable = compile_execution_fn(
-            execution_code, submod_callables, submod_names
+            execution_code, submod_callables, submod_names, consts
         )
     else:
         logger.warning(
diff --git a/vllm/compilation/codegen.py b/vllm/compilation/codegen.py
index 1baad4357648..67e9ac843a64 100644
--- a/vllm/compilation/codegen.py
+++ b/vllm/compilation/codegen.py
@@ -22,11 +22,17 @@ def generate_execution_code_with_name(
     split_gm: torch.fx.GraphModule,
     fn_name: str,
     with_submod: bool,
-) -> tuple[str, list[str]]:
+    consts: list[Any] | None = None,
+    const_index: dict[int, int] | None = None,
+) -> tuple[str, list[str], list[Any]]:
     lines: list[str] = []
     param_names: list[str] = []
     submod_names: list[str] = []
     submod_index: dict[str, int] = {}
+    if consts is None:
+        consts = []
+    if const_index is None:
+        const_index = {}
 
     # Build node ordering for liveness analysis.
     nodes = list(split_gm.graph.nodes)
@@ -48,6 +54,9 @@ def generate_execution_code_with_name(
             continue
         del_after.setdefault(node_order[last_user], []).append(node.name)
 
+    def ref(arg: Any) -> str:
+        return _node_ref(arg, consts, const_index)
+
     for i, node in enumerate(nodes):
         if node.op == "placeholder":
             param_names.append(node.name)
@@ -62,16 +71,18 @@ def generate_execution_code_with_name(
                 submod_index[target] = len(submod_names)
                 submod_names.append(target)
             idx = submod_index[target]
-            args_str = ", ".join(_node_ref(a) for a in node.args)
-            kwargs_str = ", ".join(
-                f"{k}={_node_ref(v)}" for k, v in node.kwargs.items()
-            )
+            args_str = ", ".join(ref(a) for a in node.args)
+            kwargs_str = ", ".join(f"{k}={ref(v)}" for k, v in node.kwargs.items())
             all_args = ", ".join(filter(None, [args_str, kwargs_str]))
             submod = getattr(split_gm, target)
             if isinstance(submod, torch.fx.GraphModule):
                 callable_name = f"__vllm_inlined_submods__{idx}"
-                inlined_code, _ = generate_execution_code_with_name(
-                    submod, callable_name, with_submod=False
+                inlined_code, _, _ = generate_execution_code_with_name(
+                    submod,
+                    callable_name,
+                    with_submod=False,
+                    consts=consts,
+                    const_index=const_index,
                 )
                 inlined_submods.append(inlined_code)
             else:
@@ -80,15 +91,13 @@ def generate_execution_code_with_name(
 
         elif node.op == "call_function":
             if node.target is operator.getitem:
-                source = _node_ref(node.args[0])
+                source = ref(node.args[0])
                 index = node.args[1]
                 assert isinstance(index, int)
                 lines.append(f"    {node.name} = {source}[{index}]")
             else:
-                args_str = ", ".join(_node_ref(a) for a in node.args)
-                kwargs_str = ", ".join(
-                    f"{k}={_node_ref(v)}" for k, v in node.kwargs.items()
-                )
+                args_str = ", ".join(ref(a) for a in node.args)
+                kwargs_str = ", ".join(f"{k}={ref(v)}" for k, v in node.kwargs.items())
                 all_args = ", ".join(filter(None, [args_str, kwargs_str]))
                 lines.append(
                     f"    {node.name} = {_get_qualified_name(node.target)}({all_args})"
@@ -96,7 +105,7 @@ def generate_execution_code_with_name(
 
         elif node.op == "output":
             assert len(node.args) == 1
-            ret = _node_ref(node.args[0])
+            ret = ref(node.args[0])
             lines.append(f"    return {ret}")
 
         else:
@@ -109,22 +118,29 @@ def generate_execution_code_with_name(
 
     assert len(param_names) > 0
     params = ", ".join(param_names)
-    header = (
-        f"\ndef {fn_name}({params}{', *, __vllm_submods__' if with_submod else ''}):"
+    kw_params = ", *, __vllm_submods__" if with_submod else ""
+    header = f"\ndef {fn_name}({params}{kw_params}):"
+    return (
+        "".join(inlined_submods) + "\n".join([header] + lines) + "\n",
+        submod_names,
+        consts,
     )
-    return "".join(inlined_submods) + "\n".join([header] + lines) + "\n", submod_names
 
 
 @dynamo_timed("vllm.generate_execution_code")
 def generate_execution_code(
     split_gm: torch.fx.GraphModule,
-) -> tuple[str, list[str]]:
+) -> tuple[str, list[str], list[Any]]:
     """Generate Python source code from a split_gm's stitching graph.
 
     Walks split_gm.graph.nodes and produces a function that calls
     submodules via a __vllm_submods__ list, avoiding FX GraphModule overhead
     and dict lookup cost.
 
+    Non-primitive constant arguments (e.g. torch.device, DTensor placement
+    types) are collected into a constants list and referenced by index
+    in the generated code, avoiding reliance on repr() being eval-able.
+
     If a submodule is a plain torch.fx.GraphModule, it is inlined directly
     in the generated code and we do not need to serialize it in the artifact.
 
@@ -132,15 +148,17 @@ def generate_execution_code(
         split_gm: The split graph module produced by split_graph().
 
     Returns:
-        A tuple of (code, submod_names) where code is the Python source
-        and submod_names is the ordered list of submodule target names
-        corresponding to list indices used in the generated code.
+        A tuple of (code, submod_names, consts) where code is the Python
+        source, submod_names is the ordered list of submodule target names
+        corresponding to list indices used in the generated code, and
+        consts is a list of non-primitive constant objects referenced
+        by the generated code via __vllm_consts__. These objects are
+        kept alive for the lifetime of the compiled function.
     """
-
-    code, submod_names = generate_execution_code_with_name(
+    code, submod_names, consts = generate_execution_code_with_name(
         split_gm, "execution_fn", with_submod=True
     )
-    return "import torch\nimport operator\n" + code, submod_names
+    return "import torch\nimport operator\n" + code, submod_names, consts
 
 
 @dynamo_timed("vllm.compile_execution_fn")
@@ -148,6 +166,7 @@ def compile_execution_fn(
     code: str,
     submod_callables: dict[str, Callable[..., Any]],
     submod_names: list[str],
+    consts: list[Any] | None = None,
 ) -> Callable[..., Any]:
     """Compile execution code and bind submodule callables.
 
@@ -156,6 +175,9 @@ def compile_execution_fn(
         submod_callables: Mapping of submodule names to their callables.
         submod_names: Ordered list of submodule names matching the indices
             used in the generated code.
+        consts: List of non-primitive constant objects referenced by the
+            generated code via __vllm_consts__. None for legacy cached
+            code that predates this feature.
 
     Returns:
         A callable that executes the stitching logic.
@@ -169,6 +191,8 @@ def compile_execution_fn(
         payload_fn=lambda: code,
     )
     namespace: dict[str, Any] = {}
+    if consts is not None:
+        namespace["__vllm_consts__"] = consts
     exec(code, namespace)  # noqa: S102
     fn = namespace["execution_fn"]
     # Using .get() is intentional here because only piecewise backend will
@@ -180,19 +204,32 @@ def compile_execution_fn(
     return partial(fn, __vllm_submods__=submods_list)
 
 
-def _node_ref(arg: Any) -> str:
-    """Convert an FX node argument to a source code reference recursively."""
+def _node_ref(arg: Any, consts: list[Any], const_index: dict[int, int]) -> str:
+    """Convert an FX node argument to a source code reference."""
     if isinstance(arg, torch.fx.Node):
         return arg.name
     if isinstance(arg, list):
-        return f"[{', '.join(_node_ref(x) for x in arg)}]"
+        return f"[{', '.join(_node_ref(x, consts, const_index) for x in arg)}]"
     if isinstance(arg, tuple):
-        items = ", ".join(_node_ref(x) for x in arg)
+        items = ", ".join(_node_ref(x, consts, const_index) for x in arg)
         return f"({items},)" if len(arg) == 1 else f"({items})"
     if isinstance(arg, dict):
         return (
             "{"
-            + ", ".join(f"{_node_ref(k)}: {_node_ref(v)}" for k, v in arg.items())
+            + ", ".join(
+                f"{_node_ref(k, consts, const_index)}: "
+                f"{_node_ref(v, consts, const_index)}"
+                for k, v in arg.items()
+            )
             + "}"
         )
-    return repr(arg)
+    if isinstance(arg, (int, float, bool, str, bytes, type(None))):
+        return repr(arg)
+    # Dedup by identity, not equality: safe because FX graph args
+    # are live for the entire code-generation pass. Objects stored
+    # here must be picklable (for compile-artifact caching).
+    key = id(arg)
+    if key not in const_index:
+        const_index[key] = len(consts)
+        consts.append(arg)
+    return f"__vllm_consts__[{const_index[key]}]"
diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py
index 00bf4bbc71f1..b63d86199720 100644
--- a/vllm/compilation/cuda_graph.py
+++ b/vllm/compilation/cuda_graph.py
@@ -290,9 +290,14 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
                     # across layers will make the cudagraph capture very slow.
                     # therefore, we only run gc for the first graph,
                     # and disable gc for the rest of the graphs.
-                    stack.enter_context(patch("gc.collect", lambda: None))
                     stack.enter_context(
-                        patch("torch.accelerator.empty_cache", lambda: None)
+                        patch("gc.collect", lambda *args, **kwargs: None)
+                    )
+                    stack.enter_context(
+                        patch(
+                            "torch.accelerator.empty_cache",
+                            lambda *args, **kwargs: None,
+                        )
                     )
 
                 if self.graph_pool is not None:
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 90b5c0c44eda..ad710a6fe3fc 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -32,6 +32,9 @@
 
 from .monitor import monitor_profiling_run, monitor_torch_compile
 
+# shape_id parameter was added to mark_unbacked in PyTorch 2.11.0
+_SUPPORTS_SHAPE_ID = is_torch_equal_or_newer("2.11.0")
+
 if TYPE_CHECKING:
     # Only added on nightly/2.10 so wrap
     try:
@@ -89,7 +92,7 @@ def support_torch_compile(
 @overload
 def support_torch_compile(
     *,
-    dynamic_arg_dims: dict[str, int | list[int]] | None,
+    dynamic_arg_dims: dict[str, int | list[int] | dict[int, str]] | None,
 ) -> Callable[[type[_T]], type[_T]]: ...
 
 
@@ -103,7 +106,7 @@ def support_torch_compile(
 @overload
 def support_torch_compile(
     *,
-    dynamic_arg_dims: dict[str, int | list[int]] | None,
+    dynamic_arg_dims: dict[str, int | list[int] | dict[int, str]] | None,
     mark_unbacked_dims: dict[str, int | list[int]] | None,
 ) -> Callable[[type[_T]], type[_T]]: ...
 
@@ -115,11 +118,10 @@ def support_torch_compile(cls: type[_T]) -> type[_T]: ...
 def support_torch_compile(
     cls: type[_T] | None = None,
     *,
-    dynamic_arg_dims: dict[str, int | list[int]] | None = None,
+    dynamic_arg_dims: dict[str, int | list[int] | dict[int, str]] | None = None,
     mark_unbacked_dims: dict[str, int | list[int]] | None = None,
     enable_if: Callable[[VllmConfig], bool] | None = None,
     is_encoder: bool = False,
-    shape_invariants: Callable[..., None] = lambda *args, **kwargs: None,
 ) -> Callable[[type[_T]], type[_T]] | type[_T]:
     """
     A decorator to add support for compiling the forward method of a class.
@@ -141,8 +143,12 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ...
     ```
 
     `dynamic_arg_dims` is a dictionary that maps argument names to the dynamic
-    dimensions of the argument. The dynamic dimensions can be either a single
-    integer or a list of integers.
+    dimensions of the argument. The value can be:
+    - int: a single dimension index (e.g., 0)
+    - list[int]: multiple dimension indices (e.g., [0, 1])
+    - dict[int, str]: dimension to shape_id mapping for shape relations
+      (e.g., {0: "b"}). Dimensions with the same shape_id share the same
+      unbacked symbol.
 
     if `dynamic_arg_dims` is `None`, it is inferred from the type annotation
     of the `forward` method, based on the following default rules:
@@ -189,7 +195,7 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ...
             torch._check(input_ids.size()[0] == inputs_embeds.size()[0])
     This enforces constraints on the symbolic shapes without hardcoding
     specific values. It is needed for some models to avoid data dependent
-    errors.
+    errors and maximize perf when unbacked shapes are used.
     """
 
     def cls_decorator_helper(cls: type[_T]) -> type[_T]:
@@ -229,13 +235,13 @@ def cls_decorator_helper(cls: type[_T]) -> type[_T]:
                 raise ValueError(
                     f"Argument {k} not found in the forward method of {cls}"
                 )
+
         return _support_torch_compile(
             cls,
             inferred_dynamic_arg_dims,
             mark_unbacked_dims,
             enable_if,
             is_encoder,
-            shape_invariants,
         )
 
     if cls is not None:
@@ -324,15 +330,13 @@ def _try_load_aot_compiled_fn(
 
 def _support_torch_compile(
     cls: type[_T],
-    dynamic_arg_dims: dict[str, int | list[int]],
+    dynamic_arg_dims: dict[str, int | list[int] | dict[int, str]],
     mark_unbacked_dims: dict[str, int | list[int]] | None = None,
     enable_if: Callable[[VllmConfig], bool] | None = None,
     is_encoder: bool = False,
-    shape_invariants: Callable[..., None] = lambda *args, **kwargs: None,
 ) -> type[_T]:
-    """
-    A decorator to add support for compiling the forward method of a class.
-    """
+    """Internal implementation of support_torch_compile decorator."""
+
     if TorchCompileWithNoGuardsWrapper in cls.__bases__:
         # support decorating multiple times
         return cls
@@ -392,7 +396,8 @@ def __init__(
         if self.do_not_compile:
             return
 
-        self._check_shape_invariants = shape_invariants
+        self._dynamic_arg_dims = dynamic_arg_dims
+
         self.was_aot_compile_fn_loaded_from_disk = False
         compilation_counter.num_models_seen += 1
         self.compiled = False
@@ -409,48 +414,83 @@ def __init__(
     def _mark_dynamic_inputs(
         mod: type[_T], ds_type: DynamicShapesType, *args: Any, **kwargs: Any
     ) -> None:
-        def mark_dynamic(arg: torch.Tensor, dims: list[int]) -> None:
+        def mark_dynamic(
+            arg: torch.Tensor, dim_shape_pairs: list[tuple[int, str | None]]
+        ) -> None:
             if ds_type == DynamicShapesType.UNBACKED:
                 if is_torch_equal_or_newer("2.10.0"):
-                    for dim in dims:
-                        torch._dynamo.decorators.mark_unbacked(
-                            arg, dim, hint_override=arg.size()[dim]
-                        )
+                    for dim, shape_id in dim_shape_pairs:
+                        if shape_id is not None:
+                            if not _SUPPORTS_SHAPE_ID:
+                                raise RuntimeError(
+                                    f"shape_id='{shape_id}' requires PyTorch >= 2.11.0"
+                                )
+                            torch._dynamo.decorators.mark_unbacked(
+                                arg,
+                                dim,
+                                hint_override=arg.size()[dim],
+                                shape_id=shape_id,
+                            )
+                        else:
+                            torch._dynamo.decorators.mark_unbacked(
+                                arg,
+                                dim,
+                                hint_override=arg.size()[dim],
+                            )
                 else:
+                    # For older versions, we can't use hint_override or shape_id
+                    dims = [dim for dim, _ in dim_shape_pairs]
                     torch._dynamo.decorators.mark_unbacked(arg, dims)
             else:
+                dims = [dim for dim, _ in dim_shape_pairs]
                 torch._dynamo.mark_dynamic(arg, dims)
 
         sig = inspect.signature(mod.__class__.forward)  # type: ignore[attr-defined]
         bound_args = sig.bind(mod, *args, **kwargs)
         bound_args.apply_defaults()
-        for k, dims in dynamic_arg_dims.items():
+
+        # Normalize dynamic_arg_dims to dict[str, dict[int, str | None]]
+        normalized_dims: dict[str, dict[int, str | None]] = {}
+        for k, v in dynamic_arg_dims.items():
+            if isinstance(v, dict):
+                normalized_dims[k] = {dim: shape_id for dim, shape_id in v.items()}
+            elif isinstance(v, int):
+                normalized_dims[k] = {v: None}
+            else:
+                normalized_dims[k] = {d: None for d in v}
+
+        for k, dim_to_shape_id in normalized_dims.items():
             arg = bound_args.arguments.get(k)
 
             if arg is not None:
-                dims = [dims] if isinstance(dims, int) else dims
+                dims = list(dim_to_shape_id.keys())
+
                 if isinstance(arg, torch.Tensor):
-                    # In case dims is specified with negative indexing
-                    dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
-                    mark_dynamic(arg, dims)
+                    dim_shape_pairs = [
+                        (arg.ndim + d if d < 0 else d, dim_to_shape_id.get(d))
+                        for d in dims
+                    ]
+                    mark_dynamic(arg, dim_shape_pairs)
                 elif isinstance(arg, IntermediateTensors):
                     for tensor in arg.tensors.values():
-                        # In case dims is specified with negative indexing
-                        dims = [tensor.ndim + dim if dim < 0 else dim for dim in dims]
-                        mark_dynamic(tensor, dims)
+                        dim_shape_pairs = [
+                            (tensor.ndim + d if d < 0 else d, dim_to_shape_id.get(d))
+                            for d in dims
+                        ]
+                        mark_dynamic(tensor, dim_shape_pairs)
                 else:
                     raise ValueError(
-                        "Unsupported dynamic dimensions"
-                        f" {dims} for argument {k} with type {type(arg)}."
+                        f"Unsupported dynamic dimensions {dims} "
+                        f"for argument {k} with type {type(arg)}."
                     )
+
         if mark_unbacked_dims:
-            for k, dims in mark_unbacked_dims.items():
+            for k, dims_val in mark_unbacked_dims.items():
                 arg = bound_args.arguments.get(k)
                 if arg is not None:
-                    dims = [dims] if isinstance(dims, int) else dims
+                    dims = [dims_val] if isinstance(dims_val, int) else list(dims_val)
                     if isinstance(arg, torch.Tensor):
-                        # In case dims is specified with negative indexing
-                        dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
+                        dims = [arg.ndim + d if d < 0 else d for d in dims]
                         if is_torch_equal_or_newer("2.10.0"):
                             for dim in dims:
                                 torch._dynamo.decorators.mark_unbacked(
diff --git a/vllm/compilation/passes/fusion/act_quant_fusion.py b/vllm/compilation/passes/fusion/act_quant_fusion.py
index 73234ec7920d..e35fc5cd4084 100644
--- a/vllm/compilation/passes/fusion/act_quant_fusion.py
+++ b/vllm/compilation/passes/fusion/act_quant_fusion.py
@@ -190,6 +190,7 @@ def __init__(
         is_scale_transposed: bool = False,
         is_e8m0: bool = False,
         is_tma_aligned: bool = False,
+        match_aiter: bool = False,
     ) -> None:
         super().__init__(quant_key)
         self.quant_matcher = MatcherQuantFP8(
diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
index 09b9a557fe45..6cb0c8f49f3d 100644
--- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
+++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
@@ -12,12 +12,14 @@
 from torch._inductor.pattern_matcher import PatternMatcherPass
 
 import vllm.ir.ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.compilation.passes.fusion.rms_quant_fusion import (
     _rms_input_weight_dtype_match,
 )
 from vllm.config import VllmConfig
 from vllm.config.utils import Range
 from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce
+from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
 from vllm.distributed.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -32,8 +34,13 @@
 )
 
 from ..inductor_pass import enable_fake_mode
-from ..vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
-from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8
+from ..vllm_inductor_pass import (
+    VllmFusionPatternMatcherPass,
+    VllmInductorPass,
+    VllmPatternMatcherPass,
+    VllmPatternReplacement,
+)
+from .matcher_utils import MatcherQuantFP8
 
 FP8_DTYPE = current_platform.fp8_dtype()
 
@@ -349,10 +356,11 @@ def __init__(
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
-        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
 
     def get_inputs(self) -> list[torch.Tensor]:
-        input, residual, weight = self.rmsnorm_matcher.inputs()
+        input = self.empty(5, 16)
+        residual = self.empty(5, 16)
+        weight = self.empty(16)
 
         # input goes through allreduce first, always 16-bit
         return [residual, input.to(self.dtype), weight]
@@ -362,7 +370,9 @@ def pattern(
             residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor
         ) -> tuple[torch.Tensor, torch.Tensor]:
             allreduce_output = tensor_model_parallel_all_reduce(input)
-            rms, residual = self.rmsnorm_matcher(allreduce_output, weight, residual)
+            rms, residual = vllm.ir.ops.fused_add_rms_norm(
+                allreduce_output, residual, weight, self.epsilon
+            )
             return rms, residual
 
         def replacement(
@@ -496,11 +506,12 @@ def __init__(
         self.allreduce_params = allreduce_params
         self.quant_dtype = torch.float8_e4m3fn
 
-        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
         self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)
 
     def get_inputs(self) -> list[torch.Tensor]:
-        input, residual, weight = self.rmsnorm_matcher.inputs()
+        input = self.empty(5, 16)
+        residual = self.empty(5, 16)
+        weight = self.empty(16)
         _, scale = self.quant_matcher.inputs()
 
         # input goes through allreduce first, always 16-bit
@@ -514,7 +525,9 @@ def pattern(
             scale: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor]:
             allreduce_output = tensor_model_parallel_all_reduce(input)
-            rms, res = self.rmsnorm_matcher(allreduce_output, weight, residual)
+            rms, res = vllm.ir.ops.fused_add_rms_norm(
+                allreduce_output, residual, weight, self.epsilon
+            )
             quant, _ = self.quant_matcher(rms, scale)
 
             return quant, res
@@ -661,7 +674,6 @@ def __init__(
         super().__init__(dtype, device)
         self.epsilon = epsilon
         self.allreduce_params = allreduce_params
-        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
 
     def get_inputs(self) -> list[torch.Tensor]:
         input = torch.empty([16, 16], device=self.device, dtype=self.dtype)
@@ -693,7 +705,9 @@ def pattern(
             input_global_scale: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             allreduce_output = tensor_model_parallel_all_reduce(input)
-            rms, residual = self.rmsnorm_matcher(allreduce_output, weight, residual)
+            rms, residual = vllm.ir.ops.fused_add_rms_norm(
+                allreduce_output, residual, weight, self.epsilon
+            )
             quant_out_tuple = auto_functionalized(
                 STATIC_FP4_QUANT_OP,
                 input=rms,
@@ -889,3 +903,201 @@ def __del__(self) -> None:
             return
         with contextlib.suppress(Exception):
             destroy_fi_ar_workspace()
+
+
+# TODO: make BasePattern to inherit from VllmPatternReplacement
+class AiterAllreduceFusedRMSNormPattern(BasePattern, VllmPatternReplacement):
+    def __init__(
+        self,
+        epsilon: float,
+        dtype: torch.dtype,
+        device: str | None,
+        use_aiter_rmsnorm: bool = True,
+    ) -> None:
+        super().__init__(dtype, device)
+        self.dtype = dtype
+        self.epsilon = epsilon
+        self.FUSED_AR_RMSNORM_OP = rocm_aiter_ops.get_fused_allreduce_rmsnorm_op()
+
+    def get_inputs(self) -> list[torch.Tensor]:
+        return [self.empty(5, 16), self.empty(16)]
+
+    @property
+    def pattern(self):
+        def _pattern(
+            input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            allreduce_output = tensor_model_parallel_all_reduce(input)
+            rms = vllm.ir.ops.rms_norm(allreduce_output, weight, self.epsilon)
+
+            return rms, allreduce_output
+
+        return _pattern
+
+    @property
+    def replacement(self):
+        def _replacement(
+            input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            residual = torch.empty_like(input)
+            allreduce = self.FUSED_AR_RMSNORM_OP(
+                input_=input,
+                residual=residual,
+                weight=weight,
+                epsilon=self.epsilon,
+            )
+            return allreduce[0], allreduce[1]
+
+        return _replacement
+
+
+class AiterAllreduceFusedAddRMSNormPattern(BasePattern, VllmPatternReplacement):
+    def __init__(
+        self,
+        epsilon: float,
+        dtype: torch.dtype,
+        device: str | None,
+        use_aiter_rmsnorm: bool = True,
+    ) -> None:
+        super().__init__(dtype, device)
+        self.epsilon = epsilon
+        self.dtype = dtype
+        self.FUSED_AR_RMSNORM_OP = rocm_aiter_ops.get_fused_allreduce_rmsnorm_op()
+
+    def get_inputs(self) -> list[torch.Tensor]:
+        # input, residual, weight
+        return [self.empty(5, 16), self.empty(5, 16), self.empty(16)]
+
+    @property
+    def pattern(self):
+        def _pattern(
+            residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            allreduce_output = tensor_model_parallel_all_reduce(input)
+            rms, residual = vllm.ir.ops.fused_add_rms_norm(
+                allreduce_output, residual, weight, self.epsilon
+            )
+            return rms, residual
+
+        return _pattern
+
+    @property
+    def replacement(self):
+        def _replacement(
+            residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            allreduce = self.FUSED_AR_RMSNORM_OP(
+                input_=input,
+                residual=residual,
+                weight=weight,
+                epsilon=self.epsilon,
+            )
+            return allreduce[0], allreduce[1]
+
+        return _replacement
+
+
+class RocmAiterAllReduceFusionPass(VllmFusionPatternMatcherPass):
+    def __init__(self, config: VllmConfig) -> None:
+        super().__init__(config, "rocm_aiter_allreduce_fusion_pass")
+        self.disabled = True
+        self.tp_size = get_tensor_model_parallel_world_size()
+        if self.tp_size <= 1:
+            logger.warning_once("AllReduce fusion pass is disabled for tp_size <= 1.")
+            return
+
+        if config.model_config is None:
+            logger.warning_once(
+                "AllReduce fusion pass is disabled for missing model_config."
+            )
+            return
+
+        device_comm = get_tp_group().device_communicator
+        if device_comm is None:
+            logger.warning_once("Device communicator is required.")
+            return
+
+        ca_comm = getattr(device_comm, "ca_comm", None)
+        if ca_comm is None:
+            logger.warning_once("Custom Allreduce is required.")
+            return
+        self.ca_comm = ca_comm
+
+        assert isinstance(ca_comm, CustomAllreduce)
+
+        group = get_tp_group().cpu_group
+        rocm_aiter_ops.initialize_aiter_allreduce(group, self.device)
+        hidden_dim = config.model_config.get_hidden_size()
+        element_size = torch.tensor([], dtype=self.model_dtype).element_size()
+        max_size = rocm_aiter_ops.get_aiter_allreduce_max_size()
+        if max_size is None:
+            logger.warning("AITER allreduce fusion must be initialized")
+            return
+
+        # Aiter's fused_allreduce_rmsnorm kernel dispatches on hidden_dim.
+        # Before aiter v0.1.12 the launcher was template-specialized on HIDDEN_DIM
+        # and silently no-op'd for sizes outside {512, 1024, 2048, 4096}. From v0.1.12
+        # hidden_dim is a runtime argument. Detect the older API via the missing
+        # `_pool` attribute and skip fusion for unsupported sizes.
+        # Ref (old kernel): https://github.com/ROCm/aiter/blob/6a0e7b26ccf33164785531212cc2ec2cde0b9243/csrc/include/custom_all_reduce.cuh#L2590
+        aiter_ar = rocm_aiter_ops.get_aiter_allreduce()
+        _AITER_OLD_FUSED_AR_RMS_HIDDEN = (512, 1024, 2048, 4096)
+        if (
+            aiter_ar is not None
+            and not hasattr(aiter_ar, "_pool")
+            and hidden_dim not in _AITER_OLD_FUSED_AR_RMS_HIDDEN
+        ):
+            logger.warning_once(
+                "AITER allreduce-rmsnorm fusion disabled: aiter<0.1.12 "
+                "only supports hidden_dim in %s; got %d. Upgrade aiter to "
+                ">=0.1.12 to enable fusion for this model.",
+                _AITER_OLD_FUSED_AR_RMS_HIDDEN,
+                hidden_dim,
+            )
+            # Tear down aiter's custom-allreduce so its IPC handles don't
+            # race with vllm's ca_comm on the unfused fallback path.
+            with contextlib.suppress(Exception):
+                rocm_aiter_ops.destroy_aiter_allreduce()
+            return
+
+        max_token_num = max_size // (hidden_dim * element_size)
+        self.max_token_num = min(
+            max_token_num,
+            config.scheduler_config.max_num_batched_tokens,
+        )
+
+        for epsilon in [1e-5, 1e-6]:
+            self.register(
+                AiterAllreduceFusedRMSNormPattern(
+                    epsilon,
+                    self.model_dtype,
+                    self.device,
+                )
+            )
+            self.register(
+                AiterAllreduceFusedAddRMSNormPattern(
+                    epsilon,
+                    self.model_dtype,
+                    self.device,
+                )
+            )
+
+            # WARNING: This is a hack to clear the pattern matcher cache
+            # and allow multiple values of epsilon.
+            torch._inductor.pattern_matcher._seen_patterns.clear()
+
+        self.disabled = False
+
+        self.dump_patterns(config, self.pm_pass)
+
+    def is_applicable_for_range(self, compile_range: Range) -> bool:
+        if self.disabled:
+            logger.warning_once("AllReduce fusion pass is disabled.")
+            return False
+        return bool(compile_range.end <= self.max_token_num)
+
+    def __del__(self) -> None:
+        if getattr(self, "disabled", True):
+            return
+        with contextlib.suppress(Exception):
+            rocm_aiter_ops.destroy_aiter_allreduce()
diff --git a/vllm/compilation/passes/fusion/collective_fusion.py b/vllm/compilation/passes/fusion/collective_fusion.py
index 7c14931f497b..2b74eae8dd32 100644
--- a/vllm/compilation/passes/fusion/collective_fusion.py
+++ b/vllm/compilation/passes/fusion/collective_fusion.py
@@ -1,8 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Callable
+from contextlib import suppress
+
 import torch
 import torch._inductor.pattern_matcher as pm
+import torch.distributed.distributed_c10d as c10d
 import torch.fx as fx
 from torch._inductor.pattern_matcher import PatternMatcherPass
 from torch.distributed._symmetric_memory import enable_symm_mem_for_group
@@ -15,15 +19,197 @@
 )
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import direct_register_custom_op
 
 from ..inductor_pass import enable_fake_mode
-from ..vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
+from ..vllm_inductor_pass import (
+    VllmFusionPatternMatcherPass,
+    VllmInductorPass,
+    VllmPatternMatcherPass,
+    VllmPatternReplacement,
+)
 
 FP8_DTYPE = current_platform.fp8_dtype()
 
 logger = init_logger(__name__)
 
 
+def _flashinfer_scaled_mm_out(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    *,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out: torch.Tensor,
+    bias: torch.Tensor | None = None,
+    scale_result: torch.Tensor | None = None,
+    out_dtype: torch.dtype | None = None,
+    use_fast_accum: bool = False,
+) -> None:
+    # Import lazily to avoid a circular import during module initialization
+    # when docs or other tooling import the pass without FlashInfer.
+    from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm_out
+
+    assert bias is None, "FlashInfer symm_mem adapter does not support bias"
+    assert scale_result is None, (
+        "FlashInfer symm_mem adapter does not support result scaling"
+    )
+    assert not use_fast_accum, (
+        "FlashInfer symm_mem adapter does not support use_fast_accum"
+    )
+    assert A.ndim == 2 and B.ndim == 2 and out.ndim == 2, (
+        "FlashInfer symm_mem adapter expects 2D inputs and output"
+    )
+    assert scale_a.numel() == 1 and scale_b.numel() == 1, (
+        "FlashInfer symm_mem adapter only supports tensor-wise FP8 scales"
+    )
+
+    flashinfer_scaled_fp8_mm_out(
+        A,
+        B,
+        scale_a,
+        scale_b,
+        out=out,
+        out_dtype=out_dtype or out.dtype,
+    )
+
+
+def fused_flashinfer_scaled_matmul_reduce_scatter_fake(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    reduce_op: str,
+    orig_scatter_dim: int,
+    scatter_dim_after_maybe_reshape: int,
+    group_name: str,
+    output_shape: list[int],
+    out_dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    world_size = c10d._resolve_process_group(group_name).size()
+    result_shape = list(output_shape)
+    result_shape[orig_scatter_dim] //= world_size
+    return torch.empty(
+        result_shape,
+        dtype=out_dtype or torch.bfloat16,
+        device=A.device,
+    )
+
+
+def fused_flashinfer_scaled_matmul_reduce_scatter(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    reduce_op: str,
+    orig_scatter_dim: int,
+    scatter_dim_after_maybe_reshape: int,
+    group_name: str,
+    output_shape: list[int],
+    out_dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    assert orig_scatter_dim == 0 and scatter_dim_after_maybe_reshape == 0, (
+        "FlashInfer symm_mem adapter currently only supports scatter_dim=0"
+    )
+    world_size = c10d._resolve_process_group(group_name).size()
+    assert A.ndim == 2 and B.ndim == 2, "FlashInfer symm_mem adapter expects 2D inputs"
+    assert A.is_contiguous(), "FlashInfer symm_mem adapter expects contiguous A"
+    assert A_scale.numel() == 1 and B_scale.numel() == 1, (
+        "FlashInfer symm_mem adapter only supports tensor-wise FP8 scales"
+    )
+    assert A.shape[0] % world_size == 0, (
+        "FlashInfer symm_mem adapter expects M divisible by world size"
+    )
+
+    kwargs = {
+        "scale_b": B_scale,
+        "bias": None,
+        "scale_result": None,
+        "out_dtype": out_dtype,
+        "use_fast_accum": False,
+    }
+    return torch.distributed._symmetric_memory._fused_scaled_matmul_reduce_scatter_impl(
+        mm_out_op=_flashinfer_scaled_mm_out,
+        A=A,
+        B=B,
+        A_scale=A_scale,
+        kwargs=kwargs,
+        out_dtype=out_dtype,
+        reduce_op=reduce_op,
+        orig_scatter_dim=orig_scatter_dim,
+        scatter_dim_after_maybe_reshape=scatter_dim_after_maybe_reshape,
+        group_name=group_name,
+        output_shape=output_shape,
+    )
+
+
+def fused_all_gather_flashinfer_scaled_matmul_fake(
+    A_shard: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    gather_dim: int,
+    group_name: str,
+    out_dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    world_size = c10d._resolve_process_group(group_name).size()
+    output_shape = list(A_shard.shape)
+    output_shape[gather_dim] *= world_size
+    output_shape[-1] = B.shape[1]
+    return torch.empty(
+        output_shape,
+        dtype=out_dtype or torch.bfloat16,
+        device=A_shard.device,
+    )
+
+
+def fused_all_gather_flashinfer_scaled_matmul(
+    A_shard: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    gather_dim: int,
+    group_name: str,
+    out_dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    assert gather_dim == 0, (
+        "FlashInfer symm_mem adapter currently only supports gather_dim=0"
+    )
+    _, outputs = torch.distributed._symmetric_memory._fused_all_gather_matmul_impl(
+        mm_out_op=_flashinfer_scaled_mm_out,
+        A_shard=A_shard,
+        Bs=[B],
+        A_scale=A_scale,
+        kwargs_list=[
+            {
+                "scale_b": B_scale,
+                "bias": None,
+                "scale_result": None,
+                "out_dtype": out_dtype,
+                "use_fast_accum": False,
+            }
+        ],
+        out_dtypes=[out_dtype],
+        gather_dim=gather_dim,
+        group_name=group_name,
+        return_A=False,
+    )
+    return outputs[0]
+
+
+direct_register_custom_op(
+    op_name="fused_flashinfer_scaled_matmul_reduce_scatter",
+    op_func=fused_flashinfer_scaled_matmul_reduce_scatter,
+    fake_impl=fused_flashinfer_scaled_matmul_reduce_scatter_fake,
+)
+
+direct_register_custom_op(
+    op_name="fused_all_gather_flashinfer_scaled_matmul",
+    op_func=fused_all_gather_flashinfer_scaled_matmul,
+    fake_impl=fused_all_gather_flashinfer_scaled_matmul_fake,
+)
+
+
 class BasePattern:
     def __init__(self, dtype: torch.dtype, device: str | None) -> None:
         self.dtype = dtype
@@ -371,39 +557,169 @@ def replacement(
         )
 
 
-class AsyncTPPass(VllmPatternMatcherPass):
+class FlashInferBMMFP8ReduceScatterPattern(
+    BasePattern, VllmPatternReplacement[..., torch.Tensor]
+):
+    def get_inputs(self) -> list[torch.Tensor]:
+        a_2d = torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
+        b_2d = (
+            torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
+            .contiguous()
+            .transpose(0, 1)
+        )
+        a_scale = torch.empty([1], device=self.device, dtype=torch.float32)
+        b_scale = torch.empty([1], device=self.device, dtype=torch.float32)
+        return [a_2d, b_2d, a_scale, b_scale]
+
+    @property
+    def pattern(self) -> Callable[..., torch.Tensor]:
+        def _pattern(
+            a_2d: torch.Tensor,
+            b_2d: torch.Tensor,
+            a_scale: torch.Tensor,
+            b_scale: torch.Tensor,
+        ) -> torch.Tensor:
+            bmm = torch.ops.vllm.bmm_fp8.default(
+                torch.ops.aten.unsqueeze.default(a_2d, 0),
+                torch.ops.aten.unsqueeze.default(b_2d, 0),
+                a_scale,
+                b_scale,
+                self.dtype,
+                "auto",
+            )
+            output = torch.ops.aten.reshape.default(bmm, list(bmm.shape[1:]))
+            return torch.ops.vllm.reduce_scatter.default(
+                output,
+                dim=0,
+                world_size=self.tp_size,
+                group_name=self.tp.unique_name,
+            )
+
+        return _pattern
+
+    @property
+    def replacement(self) -> Callable[..., torch.Tensor]:
+        def _replacement(
+            a_2d: torch.Tensor,
+            b_2d: torch.Tensor,
+            a_scale: torch.Tensor,
+            b_scale: torch.Tensor,
+        ) -> torch.Tensor:
+            return torch.ops.vllm.fused_flashinfer_scaled_matmul_reduce_scatter.default(
+                a_2d,
+                b_2d,
+                a_scale,
+                b_scale,
+                "sum",
+                0,
+                0,
+                self.tp.device_group.group_name,
+                [a_2d.shape[0], b_2d.shape[1]],
+                self.dtype,
+            )
+
+        return _replacement
+
+
+class FlashInferAllGatherBMMFP8Pattern(
+    BasePattern, VllmPatternReplacement[..., torch.Tensor]
+):
+    def get_inputs(self) -> list[torch.Tensor]:
+        a_shard_2d = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE)
+        b_2d = (
+            torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
+            .contiguous()
+            .transpose(0, 1)
+        )
+        a_scale = torch.empty([1], device=self.device, dtype=torch.float32)
+        b_scale = torch.empty([1], device=self.device, dtype=torch.float32)
+        return [a_shard_2d, b_2d, a_scale, b_scale]
+
+    @property
+    def pattern(self) -> Callable[..., torch.Tensor]:
+        def _pattern(
+            a_shard_2d: torch.Tensor,
+            b_2d: torch.Tensor,
+            a_scale: torch.Tensor,
+            b_scale: torch.Tensor,
+        ) -> torch.Tensor:
+            all_gather = torch.ops.vllm.all_gather.default(
+                a_shard_2d,
+                dim=0,
+                world_size=self.tp_size,
+                group_name=self.tp.unique_name,
+            )
+            return torch.ops.vllm.bmm_fp8.default(
+                torch.ops.aten.unsqueeze.default(all_gather, 0),
+                torch.ops.aten.unsqueeze.default(b_2d, 0),
+                a_scale,
+                b_scale,
+                self.dtype,
+                "auto",
+            )
+
+        return _pattern
+
+    @property
+    def replacement(self) -> Callable[..., torch.Tensor]:
+        def _replacement(
+            a_shard_2d: torch.Tensor,
+            b_2d: torch.Tensor,
+            a_scale: torch.Tensor,
+            b_scale: torch.Tensor,
+        ) -> torch.Tensor:
+            fused = torch.ops.vllm.fused_all_gather_flashinfer_scaled_matmul.default(
+                a_shard_2d,
+                b_2d,
+                a_scale,
+                b_scale,
+                0,
+                self.tp.device_group.group_name,
+                self.dtype,
+            )
+            return torch.ops.aten.unsqueeze.default(fused, 0)
+
+        return _replacement
+
+
+class AsyncTPPass(VllmFusionPatternMatcherPass):
     @enable_fake_mode
     def __init__(self, config: VllmConfig) -> None:
-        super().__init__(config)
+        super().__init__(config, pass_name="async_tp_pass")
 
-        # Enable symmetric memory for the TP process group
         enable_symm_mem_for_group(get_tp_group().device_group.group_name)
-        self.patterns: PatternMatcherPass = PatternMatcherPass(
-            pass_name="async_tp_pass"
-        )
-        GEMMReduceScatterPattern(self.model_dtype, self.device).register(self.patterns)
+        GEMMReduceScatterPattern(self.model_dtype, self.device).register(self.pm_pass)
 
-        AllGatherGEMMPattern(self.model_dtype, self.device).register(self.patterns)
+        AllGatherGEMMPattern(self.model_dtype, self.device).register(self.pm_pass)
 
         # These fusions are enabled only for bfloat16 models because
         # `scaled_mm` or `cutlass_scaled_mm` with per-token (row-wise) scaling
         # only supports bfloat16 as the output dtype.
         if self.model_dtype == torch.bfloat16:
             ScaledMMReduceScatterPattern(self.model_dtype, self.device).register(
-                self.patterns
+                self.pm_pass
             )
             AllGatherScaledMMPattern(self.model_dtype, self.device).register(
-                self.patterns
+                self.pm_pass
             )
 
             CutlassScaledMMReduceScatterPattern(self.model_dtype, self.device).register(
-                self.patterns
+                self.pm_pass
             )
             AllGatherCutlassScaledMMPattern(self.model_dtype, self.device).register(
-                self.patterns
+                self.pm_pass
             )
-
-        self.dump_patterns(config, self.patterns)
+            with suppress(ImportError):
+                import vllm.utils.flashinfer  # noqa: F401
+            if hasattr(torch.ops.vllm, "bmm_fp8"):
+                self.register(
+                    FlashInferAllGatherBMMFP8Pattern(self.model_dtype, self.device)
+                )
+                self.register(
+                    FlashInferBMMFP8ReduceScatterPattern(self.model_dtype, self.device)
+                )
+
+        self.dump_patterns(config, self.pm_pass)
 
     def is_applicable_for_range(self, compile_range: Range) -> bool:
         # This pass is applied on top of the sequence parallelism pass,
@@ -416,5 +732,6 @@ def is_applicable_for_range(self, compile_range: Range) -> bool:
 
     @VllmInductorPass.time_and_log
     def __call__(self, graph: fx.Graph) -> None:
-        self.matched_count = self.patterns.apply(graph)
+        self.matched_count = self.pm_pass.apply(graph)
+        VllmPatternMatcherPass.match_table[self.pass_name] += self.matched_count
         logger.debug("Replaced %s patterns", self.matched_count)
diff --git a/vllm/compilation/passes/fusion/matcher_utils.py b/vllm/compilation/passes/fusion/matcher_utils.py
index c2490d8a21f5..e5130c19c392 100644
--- a/vllm/compilation/passes/fusion/matcher_utils.py
+++ b/vllm/compilation/passes/fusion/matcher_utils.py
@@ -10,7 +10,6 @@
 from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import get_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
@@ -159,72 +158,6 @@ def forward_native(
         return result
 
 
-class MatcherFusedAddRMSNorm(MatcherCustomOp):
-    def __init__(
-        self,
-        epsilon: float,
-        enabled: bool | None = None,
-        match_rocm_aiter: bool = False,
-    ) -> None:
-        if enabled is None:
-            enabled = RMSNorm.enabled()
-
-        super().__init__(enabled)
-        self.epsilon = epsilon
-        self.match_rocm_aiter = match_rocm_aiter
-
-        self._rmsnorm_op = RMS_ADD_OP
-
-        if match_rocm_aiter:
-            self._rmsnorm_op = rocm_aiter_ops.get_rmsnorm_fused_add_op()
-
-    def inputs(self) -> list[torch.Tensor]:
-        input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16)
-        weight = self.empty(16)
-        residual = self.empty(5, 16)
-        return [input, weight, residual]
-
-    def forward_rocm_aiter(
-        self,
-        input: torch.Tensor,
-        weight: torch.Tensor,
-        residual: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        return self._rmsnorm_op(  # type: ignore[no-any-return]
-            x=input, residual=residual, weight=weight, variance_epsilon=self.epsilon
-        )
-
-    def forward_custom(
-        self,
-        input: torch.Tensor,
-        weight: torch.Tensor,
-        residual: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        if self.match_rocm_aiter:
-            return self.forward_rocm_aiter(input, weight, residual)
-
-        _, result, residual = auto_functionalized(
-            self._rmsnorm_op,
-            input=input,
-            residual=residual,
-            weight=weight,
-            epsilon=self.epsilon,
-        )
-
-        return result, residual
-
-    def forward_native(
-        self,
-        input: torch.Tensor,
-        weight: torch.Tensor,
-        residual: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        result: tuple[torch.Tensor, torch.Tensor] = RMSNorm.forward_static(
-            input, self.epsilon, input.size(-1), self.model_dtype, weight, residual
-        )
-        return result
-
-
 class MatcherQuantFP8(MatcherCustomOp):
     def __init__(
         self,
diff --git a/vllm/compilation/passes/fusion/rms_quant_fusion.py b/vllm/compilation/passes/fusion/rms_quant_fusion.py
index 850e434a3e73..cc986595d436 100644
--- a/vllm/compilation/passes/fusion/rms_quant_fusion.py
+++ b/vllm/compilation/passes/fusion/rms_quant_fusion.py
@@ -29,7 +29,6 @@
 from ..inductor_pass import enable_fake_mode
 from ..vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 from .matcher_utils import (
-    MatcherFusedAddRMSNorm,
     MatcherQuantFP8,
 )
 
@@ -146,9 +145,6 @@ def __init__(
         assert key in FUSED_OPS, f"unsupported fused rmsnorm+quant op for {key}"
         self.FUSED_OP = FUSED_OPS[key]
 
-        if key.fused_add:
-            self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
-
         self.quant_matcher = MatcherQuantFP8(
             key.quant,
             has_col_major_scales=has_col_major_scales,
@@ -231,7 +227,9 @@ def pattern(
             residual: torch.Tensor,
             scale: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor]:
-            result_rms, residual = self.rmsnorm_matcher(input, weight, residual)
+            result_rms, residual = vllm.ir.ops.fused_add_rms_norm(
+                input, residual, weight, self.epsilon
+            )
             result, _ = self.quant_matcher(result_rms, scale)
 
             return result, residual
@@ -261,8 +259,9 @@ def replacement(
             return at[1], at[2]
 
         inputs = [
-            # input, weight, residual
-            *self.rmsnorm_matcher.inputs(),
+            empty_bf16(5, 16),  # input
+            empty_bf16(16),  # weight
+            empty_bf16(5, 16),  # residual
             self.quant_matcher.inputs()[1],  # scale
         ]
 
@@ -311,7 +310,9 @@ def pattern(
             residual: torch.Tensor,
             scale: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-            result_rms, residual = self.rmsnorm_matcher(input, weight, residual)
+            result_rms, residual = vllm.ir.ops.fused_add_rms_norm(
+                input, residual, weight, self.epsilon
+            )
             result = torch.empty(
                 result_rms.shape,
                 device=result_rms.device,
@@ -366,12 +367,17 @@ def replacement(
             # result, residual, scale
             return at[1], at[3], at[2]
 
-        scale = self.quant_matcher.empty_f32(1, 1)
+        inputs = [
+            empty_bf16(5, 16),  # input
+            empty_bf16(16),  # weight
+            empty_bf16(5, 16),  # residual
+            self.quant_matcher.empty_f32(1, 1),  # scale
+        ]
 
         pm.register_replacement(
             pattern,
             replacement,
-            self.rmsnorm_matcher.inputs() + [scale],
+            inputs,
             pm.fwd_only,
             pm_pass,
             extra_check=_rms_input_weight_dtype_match,
@@ -552,7 +558,9 @@ def register(self, pm_pass: PatternMatcherPass) -> None:
         def pattern(
             input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor
         ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-            result_rms, residual = self.rmsnorm_matcher(input, weight, residual)
+            result_rms, residual = vllm.ir.ops.fused_add_rms_norm(
+                input, residual, weight, self.epsilon
+            )
             result, scale = self.quant_matcher(result_rms)
 
             return result, residual, scale
@@ -580,10 +588,16 @@ def replacement(
             # result, residual, scale
             return at[1], at[3], at[2]
 
+        inputs = [
+            empty_bf16(5, 16),  # input
+            empty_bf16(16),  # weight
+            empty_bf16(5, 16),  # residual
+        ]
+
         pm.register_replacement(
             pattern,
             replacement,
-            self.rmsnorm_matcher.inputs(),
+            inputs,
             pm.fwd_only,
             pm_pass,
             extra_check=_rms_input_weight_dtype_match,
diff --git a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py
index cdd0e23773d6..28159dbe0872 100644
--- a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py
+++ b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py
@@ -29,7 +29,6 @@
     VllmPatternReplacement,
 )
 from .matcher_utils import (
-    MatcherFusedAddRMSNorm,
     MatcherQuantFP8,
     MatcherSiluAndMul,
 )
@@ -49,10 +48,6 @@ def __init__(
         self.quant_dtype = key.quant.dtype
         self.device = torch.device("cuda")
 
-        if key.fused_add:
-            self.rmsnorm_matcher = MatcherFusedAddRMSNorm(
-                epsilon, match_rocm_aiter=True
-            )
         self.quant_matcher = MatcherQuantFP8(
             key.quant,
             match_rocm_aiter=match_aiter_quant,
@@ -145,7 +140,9 @@ def pattern(
             weight: torch.Tensor,
             residual: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-            result_rms, residual_out = self.rmsnorm_matcher(input, weight, residual)
+            result_rms, residual_out = torch.ops.vllm_ir.fused_add_rms_norm(
+                input, residual, weight, self.epsilon
+            )
             result, scale = self.quant_matcher(result_rms)
 
             return result, residual_out, scale
@@ -163,10 +160,16 @@ def replacement(
 
             return result[0], result[1], result[2]
 
+        inputs = [
+            self.empty(5, 16),  # input
+            self.empty(16),  # weight
+            self.empty(5, 16),  # residual
+        ]
+
         pm.register_replacement(
             pattern,
             replacement,
-            self.rmsnorm_matcher.inputs(),
+            inputs,
             pm.fwd_only,
             pm_pass,
         )
@@ -258,7 +261,9 @@ def pattern(
             weight: torch.Tensor,
             residual: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-            result_rms, residual_out = self.rmsnorm_matcher(input, weight, residual)
+            result_rms, residual_out = torch.ops.vllm_ir.fused_add_rms_norm(
+                input, residual, weight, self.epsilon
+            )
             result, scale = self.quant_matcher(result_rms)
 
             return result, residual_out, scale
@@ -279,9 +284,13 @@ def replacement(
             # result, scale, residual
             return at[0], at[1], at[2]
 
-        pm.register_replacement(
-            pattern, replacement, self.rmsnorm_matcher.inputs(), pm.fwd_only, pm_pass
-        )
+        inputs = [
+            self.empty(5, 16),  # input
+            self.empty(16),  # weight
+            self.empty(5, 16),  # residual
+        ]
+
+        pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass)
 
 
 class RocmAiterRMSNormQuantFusionPass(VllmPatternMatcherPass):
@@ -420,12 +429,15 @@ def __init__(
         self.epsilon = epsilon
         self.hidden_size = hidden_size
         self.x_pad_to_multiple = x_pad_to_multiple
-        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon, match_rocm_aiter=True)
 
     def get_inputs(self) -> list[torch.Tensor]:
-        input, weight, residual = self.rmsnorm_matcher.inputs()
-        router_weight = torch.empty([8, 16], dtype=weight.dtype, device=weight.device)
-        router_bias = torch.empty([8], dtype=weight.dtype, device=weight.device)
+        device = torch.device("cuda")
+        dtype = torch.bfloat16
+        input = torch.empty(5, 16, dtype=dtype, device=device)
+        weight = torch.empty(16, dtype=dtype, device=device)
+        residual = torch.empty(5, 16, dtype=dtype, device=device)
+        router_weight = torch.empty([8, 16], dtype=dtype, device=device)
+        router_bias = torch.empty([8], dtype=dtype, device=device)
         return [input, weight, residual, router_weight, router_bias]
 
     def register(self, pm_pass: PatternMatcherPass) -> None:
@@ -439,7 +451,9 @@ def pattern(
             pad_size = self.x_pad_to_multiple - (
                 self.hidden_size % self.x_pad_to_multiple
             )
-            result_rms, residual_out = self.rmsnorm_matcher(input, weight, residual)
+            result_rms, residual_out = torch.ops.vllm_ir.fused_add_rms_norm(
+                input, residual, weight, self.epsilon
+            )
             router_logits = torch.ops.vllm.rocm_unquantized_gemm(
                 result_rms, router_weight, router_bias
             )
diff --git a/vllm/compilation/passes/fusion/sequence_parallelism.py b/vllm/compilation/passes/fusion/sequence_parallelism.py
index 1eae92ecb6a1..2c7a1390bdb8 100644
--- a/vllm/compilation/passes/fusion/sequence_parallelism.py
+++ b/vllm/compilation/passes/fusion/sequence_parallelism.py
@@ -23,7 +23,7 @@
 from ..inductor_pass import enable_fake_mode
 from ..utility.noop_elimination import NoOpEliminationPass
 from ..vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
-from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8
+from .matcher_utils import MatcherQuantFP8
 
 logger = init_logger(__name__)
 
@@ -31,6 +31,7 @@
 # Only apply sequence parallelism for models with hidden_size >= threshold
 SP_MIN_HIDDEN_SIZE: dict[int, int] = {
     90: 8192,  # H100: only for models with hidden_size >= 8192
+    100: 8192,  # Blackwell family: only for models with hidden_size >= 8192
 }
 
 # Min size per GPU per device capability for sequence parallelism
@@ -38,6 +39,8 @@
 # This ensures the threshold scales appropriately with tensor parallelism
 SP_MIN_PER_GPU_SIZE_MB: dict[int, float] = {
     90: 8,  # 8MB per GPU for H100
+    # Use a more conservative threshold on Blackwell so TP8 starts later.
+    100: 32,
 }
 
 
@@ -67,7 +70,12 @@ def get_sequence_parallelism_threshold(
     capability = current_platform.get_device_capability()
     if capability is None:
         return None
-    device_capability = capability.to_int()
+
+    # Collapse Blackwell variants (sm100/sm103/...) into one policy bucket.
+    if current_platform.is_device_capability_family(100):
+        device_capability = 100
+    else:
+        device_capability = capability.to_int()
 
     # Check if device has configured thresholds
     min_hidden_size = SP_MIN_HIDDEN_SIZE.get(device_capability)
@@ -166,7 +174,6 @@ def replacement(
 class MiddleAllReduceRMSNormPattern(_SequenceParallelPatternHelper):
     def __init__(self, epsilon: float, dtype: torch.dtype, device: str | None) -> None:
         super().__init__(epsilon, dtype, device)
-        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
 
     def get_inputs(self) -> list[torch.Tensor]:
         mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype)
@@ -187,7 +194,9 @@ def pattern(
             rms_norm_weights: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor]:
             all_reduce = self._all_reduce(mm_1)
-            rmsnorm = self.rmsnorm_matcher(all_reduce, rms_norm_weights, residual)
+            rmsnorm = vllm.ir.ops.fused_add_rms_norm(
+                all_reduce, residual, rms_norm_weights, self.epsilon
+            )
             return rmsnorm[0], rmsnorm[1]
 
         def replacement(
@@ -200,7 +209,9 @@ def replacement(
             # once the seqpar pattern with the previous rmsnorm is replaced
             reduce_scatter = self._reduce_scatter(mm_1)
             residual = residual[0 : reduce_scatter.size(0), ...]
-            rmsnorm = self.rmsnorm_matcher(reduce_scatter, rms_norm_weights, residual)
+            rmsnorm = vllm.ir.ops.fused_add_rms_norm(
+                reduce_scatter, residual, rms_norm_weights, self.epsilon
+            )
             all_gather = self._all_gather(rmsnorm[0])
             # shape of residual changes but that's fine,
             # next node is already slicing it, now becomes a noop
@@ -263,7 +274,6 @@ def replacement(
 class MiddleAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper):
     def __init__(self, epsilon: float, dtype: torch.dtype, device: str | None) -> None:
         super().__init__(epsilon, dtype, device)
-        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
         self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)
 
     def get_inputs(self) -> list[torch.Tensor]:
@@ -282,8 +292,8 @@ def pattern(
             scale: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor]:
             all_reduce = self._all_reduce(mm_1)
-            rms, residual_out = self.rmsnorm_matcher(
-                all_reduce, rms_norm_weights, residual
+            rms, residual_out = vllm.ir.ops.fused_add_rms_norm(
+                all_reduce, residual, rms_norm_weights, self.epsilon
             )
             quant, _ = self.quant_matcher(rms, scale)
             return quant, residual_out
@@ -300,8 +310,8 @@ def replacement(
             # once the seqpar pattern with the previous rmsnorm is replaced
             reduce_scatter = self._reduce_scatter(mm_1)
             residual = residual[0 : reduce_scatter.size(0), ...]
-            rms, residual_out = self.rmsnorm_matcher(
-                reduce_scatter, rms_norm_weights, residual
+            rms, residual_out = vllm.ir.ops.fused_add_rms_norm(
+                reduce_scatter, residual, rms_norm_weights, self.epsilon
             )
             quant, _ = self.quant_matcher(rms, scale)
             all_gather = self._all_gather(quant)
diff --git a/vllm/compilation/passes/inductor_pass.py b/vllm/compilation/passes/inductor_pass.py
index b54c7bfa14d0..8a0d5326dd92 100644
--- a/vllm/compilation/passes/inductor_pass.py
+++ b/vllm/compilation/passes/inductor_pass.py
@@ -30,6 +30,9 @@ class PassContext:
     def __init__(self, compile_range: Range):
         self.compile_range: Range = compile_range
 
+        # set of arg indices
+        self.donated_input_ids: set[int] = set()
+
 
 def get_pass_context() -> PassContext:
     """Get the current pass context."""
diff --git a/vllm/compilation/passes/ir/clone_elimination.py b/vllm/compilation/passes/ir/clone_elimination.py
new file mode 100644
index 000000000000..61ba750a6c4e
--- /dev/null
+++ b/vllm/compilation/passes/ir/clone_elimination.py
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from torch import fx
+from torch._higher_order_ops.auto_functionalize import auto_functionalized
+from torch._higher_order_ops.triton_kernel_wrap import TritonKernelWrapperFunctional
+from torch._ops import HigherOrderOperator, OpOverload
+
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+
+from ..fx_utils import is_func
+from ..inductor_pass import get_pass_context
+from ..vllm_inductor_pass import VllmInductorPass
+
+logger = init_logger(__name__)
+
+
+def user_writes_to_node(user: fx.Node, node: fx.Node) -> bool:
+    if user.op == "output":
+        return False
+
+    if is_func(user, auto_functionalized):
+        # While autofunc writes to the node,
+        # this is a follow-up use we're not interested in.
+        # It is also guaranteed to be the final use,
+        # as auto_functionalized returns the tensor back for follow-up use.
+        return False
+    elif user.op == "call_function" and isinstance(user.target, HigherOrderOperator):
+        # By default, be conservative, assume this could be a write
+        # (except functional HOPs)
+        return not isinstance(user.target, TritonKernelWrapperFunctional)
+
+    assert isinstance(user.target, OpOverload), (
+        f"{node=} {user=} {user.op=} {user.target=}"
+    )
+    schema = user.target._schema
+    assert len(user.args) <= len(schema.arguments)
+    for i, arg in enumerate(user.args):
+        # Only interested in writes to node
+        if arg is not node:
+            continue
+
+        # If not a write, next arg could be
+        if schema.arguments[i].is_write:
+            return True
+
+    # No writes found
+    return False
+
+
+class UnsafeCloneEliminationPass(VllmInductorPass):
+    """
+    This pass removes clone nodes that are no longer needed after vLLM IR lowering.
+    It uses donated_input_ids to eliminate clones of donated graph inputs, preserving
+    contents of non-donated graph inputs.
+
+    It is "unsafe" because it does not (yet) take aliasing into account. Solving
+    aliasing is an open problem, so this pass intends to support known vLLM cases
+    and not guarantee soundness on general graphs. In the future, this pass will likely
+    support basic forms of aliasing to handle simple views (e.g. qkv -> q,k,v).
+    """
+
+    def __init__(self, vllm_config: VllmConfig) -> None:
+        super().__init__(vllm_config)
+
+    @VllmInductorPass.time_and_log
+    def __call__(self, graph: fx.Graph) -> None:
+        count = 0
+        node_to_idx = {node: i for i, node in enumerate(graph.nodes)}
+        pass_context = get_pass_context()
+        donated_input_ids = pass_context.donated_input_ids
+        logger.debug("Donated input ids: %s", donated_input_ids)
+
+        for node in graph.nodes:
+            if not is_func(node, torch.ops.aten.clone.default):
+                continue
+
+            original_node = node.args[0]
+            assert isinstance(original_node, fx.Node)
+
+            # Clone needs to be preserved if node is getting written to and
+            # the old value is used again.
+            # This could only happen if an inplace implementation was lowered.
+            # Then node (the clone) will have one write.
+            # TODO(luka) hopefully this can be removed once we lower functional graphs.
+            write_idxs = [
+                node_to_idx[u] for u in node.users if user_writes_to_node(u, node)
+            ]
+            assert len(write_idxs) in (0, 1)
+            if write_idxs:
+                # Check if a user of original_node occurs after a write
+                write_idx = write_idxs[0]
+                if any(
+                    node_to_idx[orig_user] > write_idx
+                    for orig_user in original_node.users
+                ):
+                    logger.debug(
+                        "Clone removal not possible, "
+                        "original_node=%s used after mutation on node=%s",
+                        original_node,
+                        node,
+                    )
+                    continue
+
+                # Check if a node is a (non-donated) graph input
+                if (
+                    original_node.op == "placeholder"
+                    and node_to_idx[original_node] not in donated_input_ids
+                ):
+                    logger.debug(
+                        "Graph input %s not donated, cannot eliminate its clone",
+                        original_node,
+                    )
+                    continue
+
+            logger.debug(
+                "Node %s is a redundant clone node of %s, removing it",
+                node,
+                original_node,
+            )
+            node.replace_all_uses_with(original_node)
+            graph.erase_node(node)
+            count += 1
+
+        logger.debug("CloneCleanupPass removed %d clone nodes", count)
diff --git a/vllm/compilation/passes/ir/inplace_functionalization.py b/vllm/compilation/passes/ir/inplace_functionalization.py
new file mode 100644
index 000000000000..e69351075bca
--- /dev/null
+++ b/vllm/compilation/passes/ir/inplace_functionalization.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections import defaultdict
+
+from torch import fx
+from torch._inductor.pattern_matcher import (
+    PatternMatcherPass,
+)
+
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+
+from ..inductor_pass import get_pass_context
+from ..vllm_inductor_pass import VllmInductorPass
+from .lowering_pass import get_ir_op
+from .utils import overload_or_default
+
+logger = init_logger(__name__)
+
+
+class VllmIRInplaceFunctionalizationPass(VllmInductorPass):
+    """
+    This pass functionalizes maybe_inplace vLLM IR ops to the default overload.
+    The maybe_inplace overloads have the same signature as the default overload
+    so the pass simply replaces the called overload.
+    That makes the graph properly functional.
+    The pass also validates that activations passed to maybe_inplace have no later
+    uses in the graph: they are donated to the maybe_inplace op call,
+    and their contents are not defined afterward.
+
+    This pass operates pre-AOTAutograd,
+    so it must handle non-normalized and non-functional IR.
+    """
+
+    def __init__(self, vllm_config: VllmConfig) -> None:
+        super().__init__(vllm_config)
+        self.patterns = PatternMatcherPass(self.pass_name)
+        self.functionalized_ops: dict[str, int] = defaultdict(lambda: 0)
+
+    @VllmInductorPass.time_and_log
+    def __call__(self, graph: fx.Graph) -> None:
+        # clear at the beginning instead of end, so that tests can inspect
+        self.functionalized_ops.clear()
+        assert graph.owning_module is not None
+        node_to_idx = {node: i for i, node in enumerate(graph.nodes)}
+
+        # Pass donated input via vLLM's pass context
+        pass_context = get_pass_context()
+        pass_context.donated_input_ids = set[int]()
+
+        for node in graph.nodes:
+            if (ir_op := get_ir_op(node)) is None:
+                continue
+
+            op_overload = overload_or_default(node.target)
+            overload_name = op_overload._overloadname
+            if overload_name != "maybe_inplace":
+                assert overload_name == "default", (
+                    f"Found overload {overload_name} for op {ir_op.name}, "
+                    f"expected maybe_inplace or default"
+                )
+                continue
+
+            # must have maybe_inplace overload and allow_inplace
+            assert ir_op.allow_inplace and hasattr(ir_op, "maybe_inplace")
+
+            # Check that activation inputs are not used after this op
+            for arg_idx in ir_op.activation_indices:
+                arg = node.args[arg_idx]
+                assert isinstance(arg, fx.Node), "Activation inputs must be fx.Node"
+                for user in arg.users:
+                    if node_to_idx[user] > node_to_idx[node]:
+                        raise ValueError(
+                            f"Input {arg} to maybe_inplace node {node} "
+                            f"is used again after the node. "
+                            f"This is not allowed; activation inputs to maybe_inplace "
+                            f"ops are donated to the op, meaning their memory may be "
+                            f"recycled for outputs.\n\n"
+                            f"To preserve the inputs, use the default overload or "
+                            f"clone them manually beforehand."
+                        )
+
+                if arg.op == "placeholder":
+                    # Graph input that maybe_inplace might modify.
+                    # Mark it so downstream passes know it's donated.
+                    # TODO(luka) store in placeholder node meta once supported
+                    pass_context.donated_input_ids.add(node_to_idx[arg])
+
+            # Same signature, just replace the overload that's called.
+            node.target = ir_op.torch_op
+            self.functionalized_ops[ir_op.name] += 1
+
+        count = sum(self.functionalized_ops.values())
+        ops = ",".join(self.functionalized_ops.keys())
+        logger.debug("Donated input IDs: %s", pass_context.donated_input_ids)
+        logger.debug(
+            "%s functionalized %d vLLM IR nodes for op(s) %s",
+            self.pass_name,
+            count,
+            ops,
+        )
diff --git a/vllm/compilation/passes/ir/lowering_pass.py b/vllm/compilation/passes/ir/lowering_pass.py
index 02acdd1a298b..f34f1c64b76e 100644
--- a/vllm/compilation/passes/ir/lowering_pass.py
+++ b/vllm/compilation/passes/ir/lowering_pass.py
@@ -10,7 +10,6 @@
     PatternMatcherPass,
     register_graph_pattern,
 )
-from torch._ops import OpOverload, OpOverloadPacket
 
 from vllm.config import VllmConfig
 from vllm.ir.op import IrOp
@@ -18,41 +17,11 @@
 from vllm.logging_utils import lazy
 
 from ..vllm_inductor_pass import VllmInductorPass
+from .utils import get_ir_op
 
 logger = init_logger(__name__)
 
 
-def get_default_overload(op: OpOverload | OpOverloadPacket) -> OpOverload:
-    if isinstance(op, OpOverloadPacket):
-        return op.default
-    assert isinstance(op, OpOverload), "Expected an OpOverload or OpOverloadPacket"
-    return op
-
-
-def get_ir_op(node: fx.Node) -> IrOp | None:
-    if node.op != "call_function":
-        return None
-
-    if not isinstance(node.target, (OpOverload, OpOverloadPacket)):
-        return None
-
-    op_overload = get_default_overload(node.target)
-    if op_overload.namespace != "vllm_ir":
-        return None
-
-    op_name = op_overload._opname
-    if op_name not in IrOp.registry:
-        logger.warning(
-            "Unknown vLLM IR op %s, there's likely an issue with torch registration, "
-            "or a torch custom op was registered in the vllm_ir namespace by mistake.",
-            op_name,
-        )
-        return None
-
-    ir_op = IrOp.registry[op_name]
-    return ir_op
-
-
 class VllmIRLoweringPass(VllmInductorPass):
     """
     This pass lowers vLLM IR ops to their implementations the priority list.
@@ -76,7 +45,7 @@ def lower_matched_op(self, match: Match, *args, **kwargs):
 
         assert len(match.nodes) == 1, "Expected single node match"
         node = match.nodes[0]
-        ir_op = get_ir_op(node)
+        ir_op = get_ir_op(node)  # TODO is node.target always an overload?
         assert ir_op is not None, "Expected vLLM IR op"
         assert not node.kwargs  # I think there should never be kwargs here
 
@@ -86,13 +55,18 @@ def lower_matched_op(self, match: Match, *args, **kwargs):
         self.selected_impls[ir_op.name][node.name] = ir_op_impl.provider
 
         # replace_by_example wants node args, not the fake tensors
+        # use func_impl_fn to properly handle in-place implementations
         # TODO(luka): Use aot_export_module to get functionalized graph
         # TODO(luka): Cache the fx_replacement to avoid re-tracing the same impl
 
         # Defaults not present on node.args but required for replacement tracing
         bound_args = ir_op._py_signature.bind(*node.args)
         bound_args.apply_defaults()
-        match.replace_by_example(ir_op_impl.impl_fn, bound_args.args)
+        # It is not safe to run functional passes (like DCE) on the replacements
+        # as they might not be functional.
+        match.replace_by_example(
+            ir_op_impl.func_impl_fn, bound_args.args, run_functional_passes=False
+        )
 
     @VllmInductorPass.time_and_log
     def __call__(self, graph: fx.Graph) -> None:
@@ -136,7 +110,7 @@ def print_count(counts: dict[str, int]) -> str:
 
         if failed_nodes or failed_ops:
             logger.warning("Failed to lower vLLM IR ops: %s", ",".join(failed_ops))
-            logger.warning("Full node list: %s", failed_nodes)
+            logger.warning("Full node list: %s", ",".join(str(n) for n in failed_nodes))
 
     def uuid(self) -> str:
         """
diff --git a/vllm/compilation/passes/ir/utils.py b/vllm/compilation/passes/ir/utils.py
new file mode 100644
index 000000000000..50b4773ce523
--- /dev/null
+++ b/vllm/compilation/passes/ir/utils.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from torch import fx
+from torch._ops import OpOverload, OpOverloadPacket
+
+from vllm.ir.op import IrOp
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def overload_or_default(op: OpOverload | OpOverloadPacket) -> OpOverload:
+    if isinstance(op, OpOverloadPacket):
+        return op.default
+    assert isinstance(op, OpOverload), "Expected an OpOverload or OpOverloadPacket"
+    return op
+
+
+def get_ir_op(node: fx.Node) -> IrOp | None:
+    if node.op != "call_function":
+        return None
+
+    if not isinstance(node.target, (OpOverload, OpOverloadPacket)):
+        return None
+
+    op_overload = overload_or_default(node.target)
+    if op_overload.namespace != "vllm_ir":
+        return None
+
+    op_name = op_overload._opname
+    if op_name not in IrOp.registry:
+        logger.warning(
+            "Unknown vLLM IR op %s, there's likely an issue with torch registration, "
+            "or a torch custom op was registered in the vllm_ir namespace by mistake.",
+            op_name,
+        )
+        return None
+
+    ir_op = IrOp.registry[op_name]
+    return ir_op
diff --git a/vllm/compilation/passes/pass_manager.py b/vllm/compilation/passes/pass_manager.py
index b7c0d525c91d..5d4355a5b2b4 100644
--- a/vllm/compilation/passes/pass_manager.py
+++ b/vllm/compilation/passes/pass_manager.py
@@ -14,10 +14,14 @@
 from vllm.platforms import current_platform
 from vllm.utils.system_utils import set_env_var
 
+from .ir.clone_elimination import UnsafeCloneEliminationPass
 from .ir.lowering_pass import VllmIRLoweringPass
 from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 
 if rocm_aiter_ops.is_enabled():
+    from .fusion.allreduce_rms_fusion import (
+        RocmAiterAllReduceFusionPass,
+    )
     from .fusion.rocm_aiter_fusion import (
         MLADualRMSNormFusionPass,
         RocmAiterRMSNormQuantFusionPass,
@@ -112,6 +116,8 @@ def __call__(self, graph: fx.Graph) -> None:
         # DCE handles mutating ops correctly as well.
         self.ir_lowering(graph)
         VllmInductorPass.dump_prefix += 1
+        self.clone_elimination(graph)
+        VllmInductorPass.dump_prefix += 1
 
         # clean up after lowering again
         self.post_cleanup(graph)
@@ -137,17 +143,21 @@ def configure(self, config: VllmConfig) -> None:
                     self.passes += [AsyncTPPass(config)]
 
             if self.pass_config.fuse_allreduce_rms:
-                self.passes += [AllReduceFusionPass(config)]
+                if rocm_aiter_ops.is_enabled():
+                    self.passes += [RocmAiterAllReduceFusionPass(config)]
+                else:
+                    self.passes += [AllReduceFusionPass(config)]
 
             if self.pass_config.fuse_minimax_qk_norm:
                 self.passes += [MiniMaxQKNormPass(config)]
 
             if self.pass_config.fuse_norm_quant:
-                self.passes += [RMSNormQuantFusionPass(config)]
                 if rocm_aiter_ops.is_enabled():
                     self.passes += [
                         RocmAiterRMSNormQuantFusionPass(config),
                     ]
+                self.passes += [RMSNormQuantFusionPass(config)]
+
             if self.pass_config.fuse_act_quant:
                 self.passes += [ActivationQuantFusionPass(config)]
                 if rocm_aiter_ops.is_enabled():
@@ -173,6 +183,7 @@ def configure(self, config: VllmConfig) -> None:
                 self.passes += [QKNormRoPEFusionPass(config)]
 
             self.ir_lowering = VllmIRLoweringPass(config)
+            self.clone_elimination = UnsafeCloneEliminationPass(config)
             self.post_cleanup = PostCleanupPass(config)
             self.fix_functionalization = FixFunctionalizationPass(config)
 
@@ -194,6 +205,7 @@ def uuid(self) -> str:
 
         passes.append(self.post_cleanup.uuid())
         passes.append(self.ir_lowering.uuid())
+        passes.append(self.clone_elimination.uuid())
         passes.append(self.post_cleanup.uuid())
         passes.append(self.fix_functionalization.uuid())
 
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index dc48528e9fcf..5635fe03ae2f 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -53,12 +53,6 @@ class TorchCompileWithNoGuardsWrapper:
     since we drop all guards.
     """
 
-    def check_invariants_and_forward(self, *args: Any, **kwargs: Any) -> Any:
-        assert hasattr(self, "_check_shape_invariants")
-        self._check_shape_invariants(*args, **kwargs)
-
-        return self.forward(*args, **kwargs)
-
     def _call_with_optional_nvtx_range(
         self, callable_fn: Callable[P, R], *args: P.args, **kwargs: P.kwargs
     ) -> Any:
@@ -115,6 +109,9 @@ def __init__(
                     "compilation_config.dynamic_shapes_config.evaluate_guards "
                     "requires VLLM_USE_BYTECODE_HOOK=0. "
                 )
+                assert ds_type != DynamicShapesType.UNBACKED, (
+                    "UNBACKED dynamic shapes do not add guards"
+                )
 
                 options["guard_filter_fn"] = lambda x: [
                     entry.guard_type == "SHAPE_ENV" for entry in x
@@ -130,19 +127,6 @@ def __init__(
         compiled_ptr: Any = self.forward
         # Validate that unbacked dynamic shapes require VLLM_USE_BYTECODE_HOOK=False
 
-        if ds_type == DynamicShapesType.UNBACKED:
-            # reason is that bytecode does torch._dynamo.eval_frame.
-            # remove_from_cache(self.original_code_object()) to force a new
-            # re-compilation. And if we use
-            # compiled_ptr = self.check_invariants_and_forward
-            # it will reset all entries.
-            assert not envs.VLLM_USE_BYTECODE_HOOK, (
-                "UNBACKED dynamic shapes requires VLLM_USE_BYTECODE_HOOK=0. "
-            )
-            assert not self.evaluate_guards, "UNBACKED dynamic shapes do not add guards"
-
-            compiled_ptr = self.check_invariants_and_forward
-
         # Apply the constrain_to_fx_strides patch before first compilation.
         # This covers STOCK_TORCH_COMPILE and DYNAMO_ONCE paths. The VLLM
         # compile paths call this from their own compile() methods too.
diff --git a/vllm/config/attention.py b/vllm/config/attention.py
index 18973f5d66d8..b5dc7a5bf602 100644
--- a/vllm/config/attention.py
+++ b/vllm/config/attention.py
@@ -6,8 +6,12 @@
 from pydantic import field_validator
 
 from vllm.config.utils import config
+from vllm.logger import init_logger
+from vllm.v1.attention.backends.mla.prefill.registry import MLAPrefillBackendEnum
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
+logger = init_logger(__name__)
+
 
 @config
 class AttentionConfig:
@@ -33,7 +37,7 @@ class AttentionConfig:
     and buffers can be pre-allocated to avoid inflating the memory estimate."""
 
     use_cudnn_prefill: bool = False
-    """Whether to use cudnn prefill."""
+    """Deprecated: cuDNN prefill backend has been removed."""
 
     use_trtllm_ragged_deepseek_prefill: bool = False
     """Whether to use TRTLLM ragged deepseek prefill."""
@@ -42,12 +46,18 @@ class AttentionConfig:
     """If set to True/False, use or don't use the TRTLLM attention backend
     in flashinfer. If None, auto-detect the attention backend in flashinfer."""
 
-    disable_flashinfer_prefill: bool = True
+    disable_flashinfer_prefill: bool | None = None
     """Whether to disable flashinfer prefill."""
 
     disable_flashinfer_q_quantization: bool = False
     """If set, when using fp8 kv, do not quantize Q to fp8."""
 
+    mla_prefill_backend: MLAPrefillBackendEnum | None = None
+    """MLA prefill backend to use. If None, will be selected automatically.
+    Valid options: FLASH_ATTN (FA3/FA4), FLASHINFER, TRTLLM_RAGGED.
+    This option supersedes use_trtllm_ragged_deepseek_prefill
+    and disable_flashinfer_prefill which are deprecated."""
+
     use_prefill_query_quantization: bool = False
     """If set, quantize query for attention in prefill."""
 
@@ -84,3 +94,48 @@ def validate_backend_before(cls, value: Any) -> Any:
                 return None
             return AttentionBackendEnum[value.upper()]
         return value
+
+    @field_validator("mla_prefill_backend", mode="before")
+    @classmethod
+    def validate_mla_prefill_backend_before(cls, value: Any) -> Any:
+        """Enable parsing of the `mla_prefill_backend` enum type from string."""
+        if isinstance(value, str):
+            return MLAPrefillBackendEnum[value.upper()]
+        return value
+
+    def __post_init__(self) -> None:
+        self._migrate_deprecated_mla_prefill_flags()
+
+    def _migrate_deprecated_mla_prefill_flags(self) -> None:
+        """Migrate deprecated MLA prefill flags to mla_prefill_backend."""
+        # If the new option is already set, it takes precedence
+        if self.mla_prefill_backend is not None:
+            return
+
+        # Check for deprecated flags and migrate them.
+        # Only the first flag encountered sets the backend.
+        if self.use_cudnn_prefill:
+            raise ValueError(
+                "The cuDNN MLA prefill backend has been removed. "
+                "Use --attention-config.mla_prefill_backend=FLASH_ATTN or "
+                "FLASHINFER or TRTLLM_RAGGED instead."
+            )
+
+        if self.use_trtllm_ragged_deepseek_prefill:
+            if self.mla_prefill_backend is None:
+                self.mla_prefill_backend = MLAPrefillBackendEnum.TRTLLM_RAGGED
+            logger.warning_once(
+                "use_trtllm_ragged_deepseek_prefill is deprecated and "
+                "will be removed in v0.22. Use "
+                "--attention-config.mla_prefill_backend=TRTLLM_RAGGED "
+                "instead."
+            )
+
+        if self.disable_flashinfer_prefill:
+            if self.mla_prefill_backend is None:
+                self.mla_prefill_backend = MLAPrefillBackendEnum.FLASH_ATTN
+            logger.warning_once(
+                "disable_flashinfer_prefill is deprecated and will be removed "
+                "in v0.22. Use --attention-config.mla_prefill_backend="
+                "FLASH_ATTN instead."
+            )
diff --git a/vllm/config/kernel.py b/vllm/config/kernel.py
index 93fb4c54b7f1..da1b1f9f1b11 100644
--- a/vllm/config/kernel.py
+++ b/vllm/config/kernel.py
@@ -31,6 +31,9 @@ class IrOpPriorityConfig:
     rms_norm: list[str] = Field(default_factory=list)
     """Priority list for vllm.ir.ops.rms_norm"""
 
+    fused_add_rms_norm: list[str] = Field(default_factory=list)
+    """Priority list for vllm.ir.ops.fused_add_rms_norm"""
+
     def compute_hash(self) -> str:
         """
         Produces a hash unique to the pass configuration.
@@ -115,6 +118,8 @@ def with_default(
     "flashinfer_cutlass",
     "flashinfer_cutedsl",
     "marlin",
+    "humming",
+    "triton_unfused",
     "aiter",
     "emulation",
 ]
@@ -145,6 +150,8 @@ class KernelConfig:
     - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
     - "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
     - "marlin": Use Marlin kernels (weight-only quantization)
+    - "humming": Use Humming Mixed Precision kernels
+    - "triton_unfused": Use Triton unfused MoE kernels
     - "aiter": Use AMD AITer kernels (ROCm only)
     - "emulation": use BF16/FP16 GEMM, dequantizing weights and
                    running QDQ on activations.
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index afd0d1dd501a..95fd8787afe5 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -135,8 +135,10 @@ class ParallelConfig:
     data_parallel_external_lb: bool = False
     """Whether to use "external" DP LB mode. Applies only to online serving
     and when data_parallel_size > 0. This is useful for a "one-pod-per-rank"
-    wide-EP setup in Kubernetes. Set implicitly when --data-parallel-rank
-    is provided explicitly to vllm serve."""
+    wide-EP setup in Kubernetes. Supported only for MoE deployments; non-MoE
+    models should use independent vLLM instances without --data-parallel-*
+    arguments. Set implicitly when --data-parallel-rank is provided explicitly
+    to vllm serve."""
     data_parallel_hybrid_lb: bool = False
     """Whether to use "hybrid" DP LB mode. Applies only to online serving
     and when data_parallel_size > 0. Enables running an AsyncLLM
@@ -663,6 +665,33 @@ def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool:
         aggregated_has_unfinished = bool(tensor.item())
         return aggregated_has_unfinished
 
+    @staticmethod
+    def sync_dp_state(
+        dp_group: ProcessGroup, has_unfinished: bool, pending_pause: bool
+    ) -> tuple[bool, bool]:
+        """Combined all-reduce for DP state synchronization.
+
+        Uses a single SUM all-reduce on a 2-element tensor:
+          [0] = 1 if this rank has unfinished work, else 0.
+                SUM > 0 ≡ logical OR across ranks → any rank has work.
+          [1] = 1 if this rank has a pending pause request, else 0.
+                SUM == dp_size ≡ all ranks reached pause consensus.
+
+        has_unfinished_global is true if any rank has unfinished work,
+        or if some ranks are waiting for a pause consensus.
+
+        Returns:
+            (has_unfinished_global, pause_consensus)
+        """
+        tensor = torch.tensor(
+            [int(has_unfinished), int(pending_pause)], dtype=torch.int32, device="cpu"
+        )
+        torch.distributed.all_reduce(tensor, op=ReduceOp.SUM, group=dp_group)
+        dp_size = dp_group.size()
+        pause_count = tensor[1].item()
+        has_unfinished_global = tensor[0].item() > 0 or pause_count % dp_size != 0
+        return has_unfinished_global, pause_count == dp_size
+
     @staticmethod
     def sync_kv_cache_memory_size(dp_group: ProcessGroup, kv_cache_memory: int) -> int:
         if kv_cache_memory == -1:
@@ -713,6 +742,14 @@ def compute_hash(self):
             "worker_extension_cls",
             "_api_process_count",
             "_api_process_rank",
+            # NUMA binding is per-rank host-side memory locality; it does
+            # not affect collective-communication semantics. When numa_bind
+            # is enabled with auto-detection, each DP rank stores its own
+            # NUMA node in numa_bind_nodes (see vllm/utils/numa_utils.py
+            # `_get_numa_node`), which would otherwise diverge the DP hash.
+            "numa_bind",
+            "numa_bind_nodes",
+            "numa_bind_cpus",
         }
 
         from vllm.config.utils import get_hash_factors, hash_factors
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index 8763e6a0525a..e2904de9aa72 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -50,6 +50,7 @@
     "pangu_ultra_moe_mtp",
     "step3p5_mtp",
     "hy_v3_mtp",
+    "gemma4_mtp",
 ]
 NgramGPUTypes = Literal["ngram_gpu"]
 DFlashModelTypes = Literal["dflash"]
@@ -491,6 +492,17 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
                 {"n_predict": n_predict, "architectures": ["HYV3MTPModel"]}
             )
 
+        if hf_config.model_type == "gemma4_assistant":
+            hf_config.model_type = "gemma4_mtp"
+            text_config = getattr(hf_config, "text_config", hf_config)
+            # The assistant runs all decoder layers in a single forward
+            # call to produce one draft token, so n_predict=1.
+            # num_kv_shared_layers must be 0: cross-model KV sharing is
+            # set up by the proposer after model construction.
+            if hasattr(text_config, "num_kv_shared_layers"):
+                text_config.num_kv_shared_layers = 0
+            hf_config.update({"n_predict": 1, "architectures": ["Gemma4MTPModel"]})
+
         return hf_config
 
     def __post_init__(self):
@@ -626,6 +638,7 @@ def __post_init__(self):
                     revision=self.revision,
                     code_revision=self.code_revision,
                     tokenizer_revision=self.target_model_config.tokenizer_revision,
+                    max_model_len=self.max_model_len,  # type: ignore[arg-type]
                     spec_target_max_model_len=self.target_model_config.max_model_len,
                     quantization=self.quantization,
                     enforce_eager=self.target_model_config.enforce_eager,
@@ -837,10 +850,17 @@ def _maybe_override_draft_max_model_len(
 
             return speculative_max_model_len
 
-        return min(
+        result = min(
             draft_max_model_len,
             target_max_model_len,
         )
+        if result != draft_max_model_len:
+            logger.info(
+                "Overriding draft model max model len from %d to %d",
+                draft_max_model_len,
+                result,
+            )
+        return result
 
     @staticmethod
     def _verify_and_get_draft_tp(
@@ -1032,6 +1052,14 @@ def max_num_new_slots_for_drafting(self) -> int:
             slots_per_req += 1
         return slots_per_req
 
+    def use_gemma4_mtp(self) -> bool:
+        return (
+            self.method == "mtp"
+            and self.draft_model_config is not None
+            and getattr(self.draft_model_config.hf_config, "model_type", None)
+            == "gemma4_mtp"
+        )
+
     def use_eagle(self) -> bool:
         return self.method in ("eagle", "eagle3", "mtp", "dflash")
 
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index f591605d08c7..52c04509b2fa 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -121,6 +121,13 @@ def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool:
     from vllm.platforms import current_platform
     from vllm.utils.flashinfer import has_flashinfer
 
+    if current_platform.is_rocm():
+        from vllm._aiter_ops import rocm_aiter_ops
+
+        return (
+            rocm_aiter_ops.is_enabled() and cfg.parallel_config.tensor_parallel_size > 1
+        )
+
     return (
         cfg.parallel_config.tensor_parallel_size > 1
         and current_platform.is_cuda()
@@ -129,12 +136,6 @@ def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool:
             current_platform.is_device_capability_family(100)
             or current_platform.is_device_capability(90)
         )
-        # tp-dp combination broken:
-        # https://github.com/vllm-project/vllm/issues/34458
-        and cfg.parallel_config.data_parallel_size == 1
-        # tp-pp combination broken:
-        # https://github.com/vllm-project/vllm/issues/35426
-        and cfg.parallel_config.pipeline_parallel_size == 1
     )
 
 
@@ -156,10 +157,9 @@ def enable_rope_kvcache_fusion(cfg: "VllmConfig") -> bool:
 
 def enable_norm_pad_fusion(cfg: "VllmConfig") -> bool:
     """Enable if using AITER RMSNorm and hidden size is 2880 i.e. gpt-oss."""
-    from vllm._aiter_ops import rocm_aiter_ops
 
     return (
-        rocm_aiter_ops.is_rmsnorm_enabled()
+        cfg.kernel_config.ir_op_priority.fused_add_rms_norm[0] == "aiter"
         and cfg.model_config is not None
         and cfg.model_config.get_hidden_size() == 2880
     )
@@ -209,7 +209,9 @@ def enable_mla_dual_rms_norm_fusion(cfg: "VllmConfig") -> bool:
         "use_inductor_graph_partition": False,
     },
     "kernel_config": {
-        "enable_flashinfer_autotune": True,
+        # Disabled for now due to correctness issues:
+        # https://github.com/flashinfer-ai/flashinfer/issues/3197
+        "enable_flashinfer_autotune": False,
     },
 }
 OPTIMIZATION_LEVEL_02 = {
@@ -229,7 +231,9 @@ def enable_mla_dual_rms_norm_fusion(cfg: "VllmConfig") -> bool:
         "use_inductor_graph_partition": False,
     },
     "kernel_config": {
-        "enable_flashinfer_autotune": True,
+        # Disabled for now due to correctness issues:
+        # https://github.com/flashinfer-ai/flashinfer/issues/3197
+        "enable_flashinfer_autotune": False,
     },
 }
 OPTIMIZATION_LEVEL_03 = {
@@ -1432,6 +1436,10 @@ def _set_cudagraph_sizes(self):
         cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
             range(256, max_graph_size + 1, 16))
 
+        `max_num_batched_tokens` is also appended to the list if it fits
+        within `max_cudagraph_capture_size`, so the max batch size is captured
+        even when off-stride.
+
         In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
         will be the final sizes to capture cudagraph (in ascending order).
 
@@ -1520,6 +1528,12 @@ def _set_cudagraph_sizes(self):
                     cudagraph_capture_sizes += list(
                         range(256, max_cudagraph_capture_size + 1, 16)
                     )
+                # ensure max_num_tokens is captured if within max capture size
+                if (
+                    max_num_tokens <= max_cudagraph_capture_size
+                    and max_num_tokens not in cudagraph_capture_sizes
+                ):
+                    cudagraph_capture_sizes.append(max_num_tokens)
                 # de-duplicate and sort the sizes
                 cudagraph_capture_sizes = sorted(set(cudagraph_capture_sizes))
 
@@ -1594,11 +1608,16 @@ def _set_compile_ranges(self):
         if compile_range_end is not None:
             computed_compile_ranges_endpoints.append(compile_range_end)
 
-        # Add the compile ranges for flashinfer
+        # Add the compile ranges for flashinfer/aiter.
         if compilation_config.pass_config.fuse_allreduce_rms:
             tp_size = self.parallel_config.tensor_parallel_size
-            max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
-            if max_size is not None:
+            from vllm._aiter_ops import rocm_aiter_ops
+
+            if rocm_aiter_ops.is_enabled():
+                max_size = rocm_aiter_ops.get_aiter_allreduce_max_size()
+            else:
+                max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
+            if max_size is not None and self.model_config is not None:
                 assert isinstance(self.model_config.dtype, torch.dtype)
                 max_token_num = max_size // (
                     self.model_config.get_hidden_size()
@@ -1874,6 +1893,18 @@ def validate_block_size(self) -> None:
                 "in the middle of a mm input"
             )
 
+    @model_validator(mode="after")
+    def validate_nvfp4_kv_cache_with_mla(self) -> "VllmConfig":
+        if self.model_config is None:
+            return self
+        if self.cache_config.cache_dtype == "nvfp4" and self.model_config.use_mla:
+            raise ValueError(
+                "nvfp4 KV cache is not supported with MLA (Multi-head Latent "
+                "Attention) backends. Please use a different --kv-cache-dtype "
+                "(e.g., 'fp8' or 'auto') for MLA models such as DeepSeek."
+            )
+        return self
+
     @model_validator(mode="after")
     def validate_mamba_block_size(self) -> "VllmConfig":
         if self.model_config is None:
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index 6a15d3f6168a..57ef6e9cf148 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -10,7 +10,6 @@
 from vllm.distributed import get_dp_group, get_ep_group
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.utils.flashinfer import (
     has_flashinfer_nvlink_one_sided,
     has_flashinfer_nvlink_two_sided,
@@ -225,11 +224,8 @@ def _make_all2all_kwargs(self) -> dict[Any, Any]:
             num_rdma_bytes=num_rdma_bytes,
             low_latency_mode=False,
             num_qps_per_rank=num_qps_per_rank,
+            explicitly_destroy=True,
         )
-        if not current_platform.is_rocm():
-            kwargs.update(
-                explicitly_destroy=True,
-            )
         return kwargs
 
     def get_handle(self, kwargs):
@@ -303,13 +299,10 @@ def _make_all2all_kwargs(
             num_rdma_bytes=num_rdma_bytes,
             low_latency_mode=True,
             num_qps_per_rank=num_qps_per_rank,
+            allow_nvlink_for_low_latency_mode=True,
+            allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL,
+            explicitly_destroy=True,
         )
-        if not current_platform.is_rocm():
-            kwargs.update(
-                allow_nvlink_for_low_latency_mode=True,
-                allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL,
-                explicitly_destroy=True,
-            )
         return kwargs
 
     def get_handle(self, kwargs):
@@ -584,6 +577,8 @@ def initialize(
         top_k: int,
         num_experts: int,
         hidden_size: int,
+        dispatch_dtype_bytes_per_elem: int = 0,
+        dispatch_scale_bytes_per_token: int = 0,
     ):
         """Initialize the MoeAlltoAll workspace."""
         if self.initialized:
@@ -614,9 +609,13 @@ def initialize(
         ep_config = MnnvlConfig(
             comm_backend=CustomCommunicator(self.cpu_group),
         )
+        if dispatch_dtype_bytes_per_elem == 0:
+            hidden_bytes = hidden_size // 2
+        else:
+            hidden_bytes = hidden_size * dispatch_dtype_bytes_per_elem
         total_dispatch_payload_size_per_token = (
-            hidden_size // 2  # nvfp4 hidden states
-            + hidden_size // 16  # fp8 scaling factors
+            hidden_bytes
+            + dispatch_scale_bytes_per_token
             + top_k * 4  # int32 topks ids
             + top_k * 4  # float32 topk weights
         )
diff --git a/vllm/distributed/eplb/eplb_communicator.py b/vllm/distributed/eplb/eplb_communicator.py
index 6ff41272fce9..ffd0406f76f5 100644
--- a/vllm/distributed/eplb/eplb_communicator.py
+++ b/vllm/distributed/eplb/eplb_communicator.py
@@ -11,6 +11,7 @@
 from collections.abc import Sequence
 from datetime import timedelta
 
+import numpy as np
 import torch
 from torch.distributed import (
     P2POp,
@@ -47,15 +48,25 @@ class EplbCommunicator(ABC):
     """Abstract EPLB communicator for expert weight transfers."""
 
     @abstractmethod
-    def add_send(self, tensor: torch.Tensor, dst_rank: int) -> None:
+    def add_send(
+        self,
+        tensors: list[torch.Tensor],
+        dst_rank: int,
+        expert_id: int,
+    ) -> None:
         pass
 
     @abstractmethod
-    def add_recv(self, tensor: torch.Tensor, src_rank: int) -> None:
+    def add_recv(
+        self,
+        tensors: list[torch.Tensor],
+        src_rank: int,
+        expert_id: int,
+    ) -> None:
         pass
 
     @abstractmethod
-    def execute(self) -> None:
+    def execute(self, old_indices: np.ndarray | None = None) -> None:
         pass
 
     @property
@@ -85,27 +96,39 @@ def __init__(
         self._p2p_ops: list[P2POp] = []
         self._log_initialized()
 
-    def add_send(self, tensor: torch.Tensor, dst_rank: int) -> None:
-        self._p2p_ops.append(
-            P2POp(
-                torch.distributed.isend,
-                tensor,
-                dst_rank,
-                self._ep_group,
+    def add_send(
+        self,
+        tensors: list[torch.Tensor],
+        dst_rank: int,
+        expert_id: int,  # unused by this backend
+    ) -> None:
+        for tensor in tensors:
+            self._p2p_ops.append(
+                P2POp(
+                    torch.distributed.isend,
+                    tensor,
+                    dst_rank,
+                    self._ep_group,
+                )
             )
-        )
 
-    def add_recv(self, tensor: torch.Tensor, src_rank: int) -> None:
-        self._p2p_ops.append(
-            P2POp(
-                torch.distributed.irecv,
-                tensor,
-                src_rank,
-                self._ep_group,
+    def add_recv(
+        self,
+        tensors: list[torch.Tensor],
+        src_rank: int,
+        expert_id: int,  # unused by this backend
+    ) -> None:
+        for tensor in tensors:
+            self._p2p_ops.append(
+                P2POp(
+                    torch.distributed.irecv,
+                    tensor,
+                    src_rank,
+                    self._ep_group,
+                )
             )
-        )
 
-    def execute(self) -> None:
+    def execute(self, old_indices: np.ndarray | None = None) -> None:
         if not self._p2p_ops:
             return
         try:
@@ -130,13 +153,25 @@ def __init__(
         self._ops: list[tuple[str, torch.Tensor, int]] = []
         self._log_initialized()
 
-    def add_send(self, tensor: torch.Tensor, dst_rank: int) -> None:
-        self._ops.append(("send", tensor, dst_rank))
+    def add_send(
+        self,
+        tensors: list[torch.Tensor],
+        dst_rank: int,
+        expert_id: int,  # unused by this backend
+    ) -> None:
+        for tensor in tensors:
+            self._ops.append(("send", tensor, dst_rank))
 
-    def add_recv(self, tensor: torch.Tensor, src_rank: int) -> None:
-        self._ops.append(("recv", tensor, src_rank))
+    def add_recv(
+        self,
+        tensors: list[torch.Tensor],
+        src_rank: int,
+        expert_id: int,  # unused by this backend
+    ) -> None:
+        for tensor in tensors:
+            self._ops.append(("recv", tensor, src_rank))
 
-    def execute(self) -> None:
+    def execute(self, old_indices: np.ndarray | None = None) -> None:
         if not self._ops:
             return
 
@@ -207,17 +242,17 @@ def __init__(
         self._cuda_stream = cuda_stream
         self._world_size = cpu_group.size()
         self._rank = cpu_group.rank()
-        self._send_tensors: dict[torch.dtype, list[list[torch.Tensor]]] = {}
-        self._recv_tensors: dict[torch.dtype, list[list[torch.Tensor]]] = {}
-        self._dtypes: list[torch.dtype] = []
+        # expert_id -> weight tensors to pack into the send buffer.
+        self._expert_send_map: dict[int, list[torch.Tensor]] = {}
+        # src_rank -> expert_id -> weight tensors to unpack after transfer.
+        self._recv_map: dict[int, dict[int, list[torch.Tensor]]] = {}
+        self._num_local_experts: int = expert_weights[0].shape[0]
         self._device = expert_weights[0].device
         for tensor in expert_weights:
             assert tensor.device == self._device, (
                 "All local EPLB tensors are expected to be on the same device: "
                 f"expected={self._device}, got={tensor.device}"
             )
-            if tensor.dtype not in self._dtypes:
-                self._dtypes.append(tensor.dtype)
 
         config = (
             nixl_agent_config(capture_telemetry=False)
@@ -228,13 +263,12 @@ def __init__(
         self._nixl_memory_type = "VRAM"
         self._registered_desc: object | None = None
         self._remote_agents: dict[int, str] = {}
-        self._remote_send_meta: dict[int, tuple[int, int, int]] = {}
+        self._remote_send_meta: dict[int, tuple[int, int]] = {}
         self._send_buffer: torch.Tensor = torch.empty(0)
         self._recv_buffer: torch.Tensor = torch.empty(0)
-        self._peer_partition_bytes: int = 0
-        self._dtype_max_bytes: dict[torch.dtype, int] = {}
+        self._expert_bytes: int = 0
+
         self._cuda_device_id = int(self._device.index or 0)
-        self._xfer_cache: dict[tuple[int, int, int], tuple[int, int, int]] = {}
         self._init_step("buffers", self._init_registered_buffers, expert_weights)
         self._init_step("agents", self._init_remote_agents)
         self._init_step("send meta", self._exchange_remote_send_meta)
@@ -258,34 +292,33 @@ def _make_agent_name(self) -> str:
         uid = uuid.uuid4().hex[:8]
         return f"eplb-{self._rank}{pp_suffix}-{uid}"
 
-    def _get_peer_buckets(
+    def add_send(
         self,
-        bucket_map: dict[torch.dtype, list[list[torch.Tensor]]],
-        dtype: torch.dtype,
-    ) -> list[list[torch.Tensor]]:
-        peer_buckets = bucket_map.get(dtype)
-        if peer_buckets is None:
-            peer_buckets = [[] for _ in range(self._world_size)]
-            bucket_map[dtype] = peer_buckets
-        return peer_buckets
-
-    def add_send(self, tensor: torch.Tensor, dst_rank: int) -> None:
+        tensors: list[torch.Tensor],
+        dst_rank: int,
+        expert_id: int,
+    ) -> None:
         assert dst_rank != self._rank, (
             "EPLB communicator should not enqueue same-rank sends: "
             f"rank={self._rank}, dst_rank={dst_rank}"
         )
-        self._get_peer_buckets(self._send_tensors, tensor.dtype)[dst_rank].append(
-            tensor
-        )
+        # An expert sent to multiple peers is packed only once; skip duplicates.
+        if expert_id not in self._expert_send_map:
+            self._expert_send_map[expert_id] = tensors
 
-    def add_recv(self, tensor: torch.Tensor, src_rank: int) -> None:
+    def add_recv(
+        self,
+        tensors: list[torch.Tensor],
+        src_rank: int,
+        expert_id: int,
+    ) -> None:
         assert src_rank != self._rank, (
             "EPLB communicator should not enqueue same-rank recvs: "
             f"rank={self._rank}, src_rank={src_rank}"
         )
-        self._get_peer_buckets(self._recv_tensors, tensor.dtype)[src_rank].append(
-            tensor
-        )
+        recv_experts = self._recv_map.setdefault(src_rank, {})
+        if expert_id not in recv_experts:
+            recv_experts[expert_id] = tensors
 
     def _init_remote_agents(self) -> None:
         local_metadata = self._nixl_wrapper.get_agent_metadata()
@@ -303,30 +336,18 @@ def _init_remote_agents(self) -> None:
             )
 
     def _init_registered_buffers(self, expert_weights: Sequence[torch.Tensor]) -> None:
-        total_max_bytes = 0
-        for dtype in self._dtypes:
-            max_numel = max(
-                sum(t.numel() for t in expert_weights if t.dtype == dtype), 1
-            )
-            max_bytes = max_numel * dtype.itemsize
-            self._dtype_max_bytes[dtype] = max_bytes
-            total_max_bytes += max_bytes
-
-        self._peer_partition_bytes = total_max_bytes
-
-        # The send buffer needs world_size partitions because remote peers
-        # READ from fixed offsets (rank * partition_bytes).
-        # This allocates world_size * partition_bytes
-        # which can cause OOM on large models.
-        # TODO(ilmarkov): shrink to const * partition_bytes and execute
-        # communication in multiple steps dealing with the worst case.
-        send_total_bytes = self._peer_partition_bytes * self._world_size
+        total_bytes = max(sum(t.nbytes for t in expert_weights), 1)
+        assert total_bytes % self._num_local_experts == 0, (
+            f"Number of bytes in moe layer {total_bytes} is not divisible "
+            f"by number of local experts {self._num_local_experts}"
+        )
+        self._expert_bytes = total_bytes // self._num_local_experts
 
         self._send_buffer = torch.empty(
-            send_total_bytes, device=self._device, dtype=torch.uint8
+            total_bytes, device=self._device, dtype=torch.uint8
         )
         self._recv_buffer = torch.empty(
-            self._peer_partition_bytes, device=self._device, dtype=torch.uint8
+            total_bytes, device=self._device, dtype=torch.uint8
         )
 
         descs = self._nixl_wrapper.get_reg_descs([self._send_buffer, self._recv_buffer])
@@ -336,12 +357,11 @@ def _init_registered_buffers(self, expert_weights: Sequence[torch.Tensor]) -> No
     def _exchange_remote_send_meta(self) -> None:
         """Exchange send-buffer metadata so each rank can build dynamic
         descriptors at execute time."""
-        local_meta: tuple[int, int, int] = (
+        local_meta: tuple[int, int] = (
             self._send_buffer.data_ptr(),
-            self._peer_partition_bytes,
             self._cuda_device_id,
         )
-        gathered_meta: list[tuple[int, int, int] | None] = [None] * self._world_size
+        gathered_meta: list[tuple[int, int] | None] = [None] * self._world_size
         torch.distributed.all_gather_object(
             gathered_meta, local_meta, group=self._cpu_group
         )
@@ -353,14 +373,11 @@ def _exchange_remote_send_meta(self) -> None:
 
     @staticmethod
     def _pack_send_buffer(
-        peer_tensors: list[torch.Tensor],
+        in_tensors: list[torch.Tensor],
         send_buffer: torch.Tensor,
         byte_offset: int,
-    ) -> int:
-        """
-        Returns the byte offset after the last written byte.
-        """
-        for tensor in peer_tensors:
+    ) -> None:
+        for tensor in in_tensors:
             raw = tensor.reshape(-1).view(torch.uint8)
             if raw.numel() == 0:
                 continue
@@ -368,18 +385,14 @@ def _pack_send_buffer(
                 raw, non_blocking=True
             )
             byte_offset += raw.numel()
-        return byte_offset
 
     @staticmethod
     def _unpack_recv_buffer(
         recv_buffer: torch.Tensor,
-        peer_tensors: list[torch.Tensor],
+        out_tensors: list[torch.Tensor],
         byte_offset: int,
-    ) -> int:
-        """
-        Returns the byte offset after the last read byte.
-        """
-        for tensor in peer_tensors:
+    ) -> None:
+        for tensor in out_tensors:
             num_bytes = tensor.numel() * tensor.element_size()
             if num_bytes == 0:
                 continue
@@ -388,19 +401,6 @@ def _unpack_recv_buffer(
                 non_blocking=True,
             )
             byte_offset += num_bytes
-        return byte_offset
-
-    def _release_all_cached_handles(self) -> None:
-        """Best-effort release of every cached dlist and xfer handle."""
-        for local_dlist, remote_dlist, xfer in self._xfer_cache.values():
-            for release_fn, handle in (
-                (self._nixl_wrapper.release_xfer_handle, xfer),
-                (self._nixl_wrapper.release_dlist_handle, local_dlist),
-                (self._nixl_wrapper.release_dlist_handle, remote_dlist),
-            ):
-                with contextlib.suppress(Exception):
-                    release_fn(handle)
-        self._xfer_cache.clear()
 
     def _wait_for_all_transfers(self, handles: list[int]) -> None:
         pending = set(handles)
@@ -418,82 +418,68 @@ def _wait_for_all_transfers(self, handles: list[int]) -> None:
             if pending:
                 time.sleep(0.0005)
 
-    def _get_or_create_xfer(self, src: int, total_bytes: int, recv_offset: int) -> int:
-        """Return a cached xfer handle or create and cache a new one."""
-        key = (src, total_bytes, recv_offset)
-        cached = self._xfer_cache.get(key)
-        if cached is not None:
-            return cached[2]
+    def _create_peer_xfer(
+        self,
+        src: int,
+        local_descs: list[tuple[int, int, int]],
+        remote_descs: list[tuple[int, int, int]],
+    ) -> tuple[int, int, int]:
+        """Create a batched xfer for multiple descriptors from one peer.
+
+        Each element in *local_descs* / *remote_descs* is an
+        ``(address, size, device_id)`` tuple.
 
-        recv_base = self._recv_buffer.data_ptr()
+        Returns ``(local_dlist, remote_dlist, xfer_handle)``.
+        """
         local_desc = self._nixl_wrapper.get_xfer_descs(
-            [
-                (
-                    recv_base + recv_offset,
-                    total_bytes,
-                    self._cuda_device_id,
-                )
-            ],
-            self._nixl_memory_type,
+            local_descs, self._nixl_memory_type
         )
         local_handle = self._nixl_wrapper.prep_xfer_dlist(
             "NIXL_INIT_AGENT",
             local_desc,
         )
 
-        remote_base, remote_part_bytes, remote_dev = self._remote_send_meta[src]
-        agent_name = self._remote_agents[src]
         remote_desc = self._nixl_wrapper.get_xfer_descs(
-            [
-                (
-                    remote_base + self._rank * remote_part_bytes,
-                    total_bytes,
-                    remote_dev,
-                )
-            ],
-            self._nixl_memory_type,
+            remote_descs, self._nixl_memory_type
         )
         remote_handle = self._nixl_wrapper.prep_xfer_dlist(
-            agent_name,
+            self._remote_agents[src],
             remote_desc,
         )
 
+        indices = list(range(len(local_descs)))
         xfer_handle = self._nixl_wrapper.make_prepped_xfer(
             "READ",
             local_handle,
-            [0],
+            indices,
             remote_handle,
-            [0],
+            indices,
+        )
+        return (local_handle, remote_handle, xfer_handle)
+
+    def execute(self, old_indices: np.ndarray | None = None) -> None:
+        assert old_indices is not None, (
+            "NixlEplbCommunicator.execute requires old_indices"
         )
-        self._xfer_cache[key] = (local_handle, remote_handle, xfer_handle)
-        return xfer_handle
 
-    def execute(self) -> None:
-        xfer_handles: list[int] = []
+        xfer_entries: list[tuple[int, int, int]] = []
         try:
-            # Phase 1: pack send buffers.
+            n = self._num_local_experts
+            rank_experts = old_indices[: self._world_size * n].reshape(
+                self._world_size, n
+            )
+            # Build expert_id -> send slot mapping per rank.
+            expert_to_send_slot: list[dict[int, int]] = [
+                {int(eid): i for i, eid in enumerate(row) if eid != -1}
+                for row in rank_experts
+            ]
+
+            # Phase 1: pack each expert at its slot offset in the send buffer.
             with torch.cuda.stream(self._cuda_stream):
-                for dst in range(self._world_size):
-                    byte_offset = dst * self._peer_partition_bytes
-                    for dtype in self._dtypes:
-                        peer_tensors = self._send_tensors.get(
-                            dtype, [[] for _ in range(self._world_size)]
-                        )[dst]
-                        actual_bytes = sum(
-                            t.numel() * t.element_size() for t in peer_tensors
-                        )
-                        if actual_bytes > self._dtype_max_bytes[dtype]:
-                            raise RuntimeError(
-                                "NIXL EPLB send overflow for dtype "
-                                f"{dtype}: peer={dst}, "
-                                f"required={actual_bytes}, "
-                                f"capacity={self._dtype_max_bytes[dtype]}"
-                            )
-                        byte_offset = self._pack_send_buffer(
-                            peer_tensors,
-                            self._send_buffer,
-                            byte_offset,
-                        )
+                for expert_id, tensors in self._expert_send_map.items():
+                    slot = expert_to_send_slot[self._rank][expert_id]
+                    byte_offset = slot * self._expert_bytes
+                    self._pack_send_buffer(tensors, self._send_buffer, byte_offset)
 
             # Ensure all packed data is visible in device memory before pulls.
             if self._cuda_stream is not None:
@@ -508,58 +494,65 @@ def execute(self) -> None:
                 timeout=timedelta(minutes=5),
             )
 
-            # Phase 2: look up or create descriptors and issue all READs.
-            # Data from all peers is packed sequentially into the single
-            # partition-sized recv buffer at running offsets.
-            recv_offsets: dict[int, int] = {}
+            # Phase 2: issue one batched READ per peer.
+            recv_offsets: dict[tuple[int, int], int] = {}
             recv_offset = 0
+            recv_base = self._recv_buffer.data_ptr()
             for src in range(self._world_size):
                 if src == self._rank:
                     continue
-                actual_total_bytes = 0
-                for dtype in self._dtypes:
-                    peer_tensors = self._recv_tensors.get(
-                        dtype, [[] for _ in range(self._world_size)]
-                    )[src]
-                    actual_total_bytes += sum(
-                        t.numel() * t.element_size() for t in peer_tensors
-                    )
-                if actual_total_bytes == 0:
+                recv_experts = self._recv_map.get(src)
+                if not recv_experts:
                     continue
-
-                recv_offsets[src] = recv_offset
-                xfer_handle = self._get_or_create_xfer(
-                    src, actual_total_bytes, recv_offset
+                expert_ids = list(recv_experts.keys())
+                remote_base, remote_dev = self._remote_send_meta[src]
+                local_descs: list[tuple[int, int, int]] = []
+                remote_descs: list[tuple[int, int, int]] = []
+                for expert_id in expert_ids:
+                    slot = expert_to_send_slot[src][expert_id]
+                    remote_off = slot * self._expert_bytes
+                    recv_offsets[(src, expert_id)] = recv_offset
+                    local_descs.append(
+                        (
+                            recv_base + recv_offset,
+                            self._expert_bytes,
+                            self._cuda_device_id,
+                        )
+                    )
+                    remote_descs.append(
+                        (remote_base + remote_off, self._expert_bytes, remote_dev)
+                    )
+                    recv_offset += self._expert_bytes
+                    assert recv_offset <= self._recv_buffer.nbytes
+                local_h, remote_h, xfer_h = self._create_peer_xfer(
+                    src, local_descs, remote_descs
                 )
-                self._nixl_wrapper.transfer(xfer_handle)
-                xfer_handles.append(xfer_handle)
-                recv_offset += actual_total_bytes
+                self._nixl_wrapper.transfer(xfer_h)
+                xfer_entries.append((local_h, remote_h, xfer_h))
 
-            # Phase 3: single wait for all in-flight transfers, then unpack.
-            self._wait_for_all_transfers(xfer_handles)
+            # Phase 3: wait for all in-flight transfers, then unpack.
+            self._wait_for_all_transfers([x[2] for x in xfer_entries])
 
             with torch.cuda.stream(self._cuda_stream):
-                for src, offset in recv_offsets.items():
-                    byte_offset = offset
-                    for dtype in self._dtypes:
-                        peer_tensors = self._recv_tensors.get(
-                            dtype, [[] for _ in range(self._world_size)]
-                        )[src]
-                        byte_offset = self._unpack_recv_buffer(
-                            self._recv_buffer,
-                            peer_tensors,
-                            byte_offset,
-                        )
-        except Exception:
-            self._release_all_cached_handles()
-            raise
+                for (src, expert_id), offset in recv_offsets.items():
+                    self._unpack_recv_buffer(
+                        self._recv_buffer,
+                        self._recv_map[src][expert_id],
+                        offset,
+                    )
         finally:
-            self._send_tensors.clear()
-            self._recv_tensors.clear()
+            for local_h, remote_h, xfer_h in xfer_entries:
+                with contextlib.suppress(Exception):
+                    self._nixl_wrapper.release_xfer_handle(xfer_h)
+                with contextlib.suppress(Exception):
+                    self._nixl_wrapper.release_dlist_handle(local_h)
+                with contextlib.suppress(Exception):
+                    self._nixl_wrapper.release_dlist_handle(remote_h)
+            self._expert_send_map.clear()
+            self._recv_map.clear()
 
     def __del__(self) -> None:
         try:
-            self._release_all_cached_handles()
             if self._registered_desc is not None:
                 self._nixl_wrapper.deregister_memory(self._registered_desc)
                 self._registered_desc = None
@@ -588,15 +581,27 @@ def _ensure_group_started(self) -> None:
             self._pynccl_comm.group_start()
             self._group_started = True
 
-    def add_send(self, tensor: torch.Tensor, dst_rank: int) -> None:
+    def add_send(
+        self,
+        tensors: list[torch.Tensor],
+        dst_rank: int,
+        expert_id: int,  # unused by this backend
+    ) -> None:
         self._ensure_group_started()
-        self._pynccl_comm.send(tensor, dst_rank, stream=self._cuda_stream)
+        for tensor in tensors:
+            self._pynccl_comm.send(tensor, dst_rank, stream=self._cuda_stream)
 
-    def add_recv(self, tensor: torch.Tensor, src_rank: int) -> None:
+    def add_recv(
+        self,
+        tensors: list[torch.Tensor],
+        src_rank: int,
+        expert_id: int,  # unused by this backend
+    ) -> None:
         self._ensure_group_started()
-        self._pynccl_comm.recv(tensor, src_rank, stream=self._cuda_stream)
+        for tensor in tensors:
+            self._pynccl_comm.recv(tensor, src_rank, stream=self._cuda_stream)
 
-    def execute(self) -> None:
+    def execute(self, old_indices: np.ndarray | None = None) -> None:
         if self._group_started:
             self._pynccl_comm.group_end()
             self._group_started = False
diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py
index f348521c00eb..cf8db9264890 100644
--- a/vllm/distributed/eplb/rebalance_execute.py
+++ b/vllm/distributed/eplb/rebalance_execute.py
@@ -294,9 +294,9 @@ def move_to_buffer(
             recver_pos = remainder_start + sender_pos
             if recver_pos < len(ranks_to_recv):
                 recv_ranks.append(ranks_to_recv[recver_pos])
+            expert_tensors = [w[src] for w in expert_weights]
             for dst in recv_ranks:
-                for w in expert_weights:
-                    communicator.add_send(w[src], dst)
+                communicator.add_send(expert_tensors, dst, expert_id=int(expert))
 
     # 3. Post recvs
     if recv_count > 0:
@@ -325,11 +325,14 @@ def move_to_buffer(
                 src = ranks_to_send[recver_pos // num_dst_per_sender]
             else:
                 src = ranks_to_send[recver_pos - remainder_start]
-            for b in expert_weights_buffers:
-                communicator.add_recv(b[dst], src)
+            communicator.add_recv(
+                [b[dst] for b in expert_weights_buffers],
+                src,
+                expert_id=int(expert),
+            )
 
     # 4. Execute the P2P operations. The real communication happens here.
-    communicator.execute()
+    communicator.execute(old_indices=old_indices)
     # wait for the communication to finish
     return TransferMetadata(
         is_unchanged=is_unchanged,
diff --git a/vllm/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md
index 39377aabcce3..64be074cf9f8 100644
--- a/vllm/distributed/kv_transfer/README.md
+++ b/vllm/distributed/kv_transfer/README.md
@@ -22,7 +22,7 @@ NOTE: If you want to not only transfer KV caches, but adjust the model execution
 
 ## Disaggregated prefilling
 
-The example usage is in [this file](../../../examples/online_serving/disaggregated_prefill.sh).
+The example usage is in [this file](../../../examples/disaggregated/disaggregated_prefill.sh).
 
 Here is the diagram of how we run disaggregated prefilling.
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index 63b56eddfaed..b85416ab3071 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -386,38 +386,6 @@ class EngineTransferInfo:
     """Physical blocks per logical block."""
 
 
-@dataclass(frozen=True)
-class MambaEngineTransferInfo(EngineTransferInfo):
-    """Extends ``EngineTransferInfo`` with Mamba-hybrid transfer geometry.
-
-    For hybrid SSM+Attention models, FA and Mamba layers may require
-    different numbers of reads from different remote ranks.  This
-    dataclass captures that per-engine transfer plan.
-    """
-
-    remote_fa_source_ranks: tuple[int, ...]
-    """Remote ranks carrying unique FA heads for this local rank."""
-
-    remote_all_source_ranks: tuple[int, ...]
-    """All remote ranks this local rank reads from (FA + Mamba)."""
-
-    remote_num_fa_reads: int
-    """Number of distinct remote ranks needed for FA data."""
-
-    remote_num_mamba_reads: int
-    """Number of distinct remote ranks needed for Mamba data."""
-
-    remote_fa_descriptor_bytes: int
-    """Byte size of one FA K (or V) descriptor entry."""
-
-    is_remote_replicated: bool
-    """Whether the remote engine has replicated KV heads
-    (remote_tp_size > total_num_kv_heads)."""
-
-    remote_physical_heads: int
-    """Physical KV heads stored per remote rank."""
-
-
 # ---- Transfer topology ----
 
 
@@ -439,8 +407,6 @@ def __post_init__(self):
         self.local_physical_heads = max(1, self.total_num_kv_heads // self.tp_size)
 
         self._engines: dict[EngineId, EngineTransferInfo] = {}
-        self._fa_source_sets: dict[EngineId, frozenset[int]] = {}
-        self._fa_source_indices: dict[EngineId, dict[int, int]] = {}
 
         # Figure out whether the first dimension of the cache is K/V
         # or num_blocks.
@@ -487,24 +453,12 @@ def __post_init__(self):
     def register_remote_engine(
         self,
         remote_engine_id: EngineId,
-        remote_tp_size: int,
-        remote_block_size: int,
-        remote_block_len: int,
-        remote_physical_blocks_per_logical: int,
-        *,
-        local_block_len: int = 0,
+        info: EngineTransferInfo,
     ) -> EngineTransferInfo:
         """Register a remote engine, unifying worker dicts state.
 
-        Only remote engines should be registered here — the local engine's
-        identity (tp_size, block_size, etc.) is set via ``__init__`` params.
-
-        For Mamba models, also computes the Mamba transfer plan and
-        builds the FA source lookup caches.
-
-        Args:
-            local_block_len: Local representative block_len (bytes).
-                Required for Mamba models to compute ``fa_descriptor_bytes``.
+        The caller (worker) is responsible for computing the info via
+        the transfer policy.  This method only stores and deduplicates.
         """
         assert remote_engine_id != self.engine_id, (
             f"Cannot register local engine {self.engine_id} as remote. "
@@ -512,29 +466,6 @@ def register_remote_engine(
         )
         if remote_engine_id in self._engines:
             return self._engines[remote_engine_id]
-        info: EngineTransferInfo
-        if self.is_mamba:
-            info = self._build_mamba_info(
-                remote_tp_size=remote_tp_size,
-                remote_block_size=remote_block_size,
-                remote_block_len=remote_block_len,
-                remote_physical_blocks_per_logical=(remote_physical_blocks_per_logical),
-                local_block_len=local_block_len,
-            )
-            assert isinstance(info, MambaEngineTransferInfo)
-            self._fa_source_sets[remote_engine_id] = frozenset(
-                info.remote_fa_source_ranks
-            )
-            self._fa_source_indices[remote_engine_id] = {
-                r: i for i, r in enumerate(info.remote_fa_source_ranks)
-            }
-        else:
-            info = EngineTransferInfo(
-                remote_tp_size=remote_tp_size,
-                remote_block_len=remote_block_len,
-                remote_block_size=remote_block_size,
-                remote_physical_blocks_per_logical=(remote_physical_blocks_per_logical),
-            )
         self._engines[remote_engine_id] = info
         return info
 
@@ -622,14 +553,8 @@ def target_remote_ranks(self, remote_engine_id: EngineId) -> list[int]:
         """Get the remote TP rank(s) that the current local TP rank will
         read from.  When remote tp_size > local tp_size, reads from
         multiple remote ranks.
-
-        For Mamba models, returns the precomputed ``all_source_ranks``
-        (FA + Mamba union).
         """
         info = self._engines[remote_engine_id]
-        if isinstance(info, MambaEngineTransferInfo):
-            return list(info.remote_all_source_ranks)
-
         tp_ratio = self.tp_ratio(info.remote_tp_size)
         if tp_ratio > 0:
             return [self.tp_rank // tp_ratio]
@@ -662,309 +587,15 @@ def get_transfer_cache_regions(
         # Regular case: backends like FA register K/V in separate regions
         return cache if self.split_k_and_v else [cache]
 
-    # ============================================================
-    # Mamba-specific methods
-    # ============================================================
-
-    def should_skip_fa(self, remote_engine_id: EngineId, remote_rank: int) -> bool:
-        """Whether to skip FA groups for this remote rank (mamba-only)."""
-        return remote_rank not in self._fa_source_sets[remote_engine_id]
-
-    def fa_head_slot(self, remote_engine_id: EngineId, remote_rank: int) -> int:
-        """Index into local FA block for this remote rank's head data.
-
-        For remote ranks in ``fa_source_ranks``, returns 0, 1, …, reads-1.
-        For ranks NOT in ``fa_source_ranks`` (replicated duplicates),
-        returns the slot of the matching source rank with the same head.
-        """
-        fa_index = self._fa_source_indices[remote_engine_id]
-        if remote_rank in fa_index:
-            return fa_index[remote_rank]
-        mamba_info = self._engines[remote_engine_id]
-        assert isinstance(mamba_info, MambaEngineTransferInfo)
-        K = self.total_num_kv_heads
-        remote_tp = mamba_info.remote_tp_size
-        r_head = self._physical_head_range(remote_tp, K, remote_rank)
-        for target in mamba_info.remote_fa_source_ranks:
-            t_head = self._physical_head_range(remote_tp, K, target)
-            if self._range_overlap(r_head, t_head):
-                return fa_index[target]
-        return 0
-
-    def fa_rank_offset(
-        self, remote_engine_id: EngineId, remote_kv_block_len: int
-    ) -> int:
-        """Byte offset into remote FA block for this local rank.
-
-        When local TP is replicated (local_tp > K), multiple local ranks
-        share a head.  Computes offset *relative to the target remote
-        rank's first head* so it works regardless of how many heads the
-        remote has.  Returns 0 when local does not index into remote.
-        """
-        mamba_info = self._engines[remote_engine_id]
-        assert isinstance(mamba_info, MambaEngineTransferInfo)
-        tp_ratio = self.tp_ratio(mamba_info.remote_tp_size)
-        if self.is_mla or tp_ratio <= 0:
-            return 0
-        K = self.total_num_kv_heads
-        is_local_replicated = self.tp_size > K
-        if is_local_replicated:
-            local_head = self.tp_rank * K // self.tp_size
-            p_rank = mamba_info.remote_fa_source_ranks[0]
-            p_start = p_rank * K // mamba_info.remote_tp_size
-            return (local_head - p_start) * remote_kv_block_len
-        return self.tp_rank % tp_ratio * remote_kv_block_len
-
-    def needs_split_handles(self, remote_engine_id: EngineId) -> bool:
-        """Whether per-remote-rank split handles are needed.
-
-        True when FA and mamba have different read counts, requiring
-        different splitting factors in the local handle.
-        """
-        mamba_info = self._engines[remote_engine_id]
-        assert isinstance(mamba_info, MambaEngineTransferInfo)
-        tp_ratio = self.tp_ratio(mamba_info.remote_tp_size)
-        return (
-            tp_ratio < 0
-            and not self.is_mla
-            and len(mamba_info.remote_all_source_ranks) > 1
-        )
-
-    def compute_split_handle_data(
-        self,
-        remote_engine_id: EngineId,
-        src_blocks_data: list[tuple[int, int, int]],
-        num_fa_descs: int,
-        abs_tp: int,
-    ) -> list[list[tuple[int, int, int]]]:
-        """Per-remote-rank (addr, len, dev) triples for Mamba-HMA split
-        handles.
-
-        FA descriptors (indices < num_fa_descs) are sliced by
-        ``remote_num_fa_reads``; mamba descriptors are sliced uniformly
-        by ``abs_tp``.
-        """
-        mamba_info = self._engines[remote_engine_id]
-        assert isinstance(mamba_info, MambaEngineTransferInfo)
-        all_handle_data: list[list[tuple[int, int, int]]] = []
-        for p_idx, p_rank in enumerate(mamba_info.remote_all_source_ranks):
-            handle_data: list[tuple[int, int, int]] = []
-            skip_fa = self.should_skip_fa(remote_engine_id, p_rank)
-            fa_slot = self.fa_head_slot(remote_engine_id, p_rank) if not skip_fa else 0
-            for j, (addr, local_len, dev) in enumerate(src_blocks_data):
-                if j < num_fa_descs:
-                    assert mamba_info.remote_num_fa_reads >= 1
-                    fa_chunk = local_len // mamba_info.remote_num_fa_reads
-                    handle_data.append((addr + fa_slot * fa_chunk, fa_chunk, dev))
-                else:
-                    mamba_chunk = local_len // abs_tp
-                    handle_data.append((addr + p_idx * mamba_chunk, mamba_chunk, dev))
-            all_handle_data.append(handle_data)
-        return all_handle_data
-
-    def filter_block_ids_for_rank(
-        self,
-        remote_engine_id: EngineId,
-        remote_rank: int,
-        local_ids: BlockIds,
-        remote_ids: BlockIds,
-        is_mamba_group: list[bool],
-    ) -> tuple[BlockIds, BlockIds]:
-        """Zero out FA groups for remote ranks outside ``fa_source_ranks``.
-
-        Returns (filtered_local_ids, filtered_remote_ids).  When the
-        remote rank carries FA data for this local rank, returns the
-        inputs unchanged.
-        """
-        if not self.should_skip_fa(remote_engine_id, remote_rank):
-            return local_ids, remote_ids
-        num_groups = len(local_ids)
-        filtered_local: list[list[int]] = [
-            [] if not is_mamba_group[g] else local_ids[g] for g in range(num_groups)
-        ]
-        filtered_remote: list[list[int]] = [
-            [] if not is_mamba_group[g] else remote_ids[g] for g in range(num_groups)
-        ]
-        return filtered_local, filtered_remote
-
     def describe(self, remote_engine_id: EngineId) -> str:
         """One-line summary of transfer config for logging."""
         info = self._engines[remote_engine_id]
-        base = (
+        return (
+            f"TransferTopology("
             f"tp_ratio={self.tp_ratio(info.remote_tp_size)}, "
             f"K={self.total_num_kv_heads}, "
             f"local_tp={self.tp_size}, "
             f"remote_tp={info.remote_tp_size}, "
             f"local_rank={self.tp_rank}, "
-            f"remote_block_len={info.remote_block_len}"
-        )
-        if isinstance(info, MambaEngineTransferInfo):
-            return (
-                f"TransferTopology.mamba({base}, "
-                f"fa_reads={info.remote_num_fa_reads}, "
-                f"mamba_reads={info.remote_num_mamba_reads}, "
-                f"fa_sources={list(info.remote_fa_source_ranks)}, "
-                f"all_sources={list(info.remote_all_source_ranks)}, "
-                f"fa_desc_bytes={info.remote_fa_descriptor_bytes})"
-            )
-        return f"TransferTopology({base})"
-
-    # ============================================================
-    # Private helpers
-    # ============================================================
-    # Mamba-HMA hetero-TP transfer config:
-    # With hetero-TP (P_TP > D_TP), FA KV cache may be replicated across
-    # P ranks (when P_TP > num_kv_heads), but Mamba conv/SSM state is
-    # almost always uniquely sharded per P rank.  So the number of P
-    # ranks D must read from can differ between FA and Mamba, and they
-    # must be handled separately.
-
-    @staticmethod
-    def _physical_head_range(tp_size: int, num_heads: int, rank: int) -> range:
-        """Physical KV head range stored in a rank's KV cache tensor.
-
-        When ``tp_size <= num_heads``: sharded, K/TP contiguous heads per rank.
-        When ``tp_size > num_heads``: 1 physical head per rank.  Heads are
-        distributed **contiguously** (matching vLLM's GQA weight partitioning):
-        consecutive ranks share a head before moving to the next one.
-        """
-        if tp_size <= num_heads:
-            assert num_heads % tp_size == 0
-            per_rank = num_heads // tp_size
-            return range(rank * per_rank, (rank + 1) * per_rank)
-        else:
-            h = rank * num_heads // tp_size
-            return range(h, h + 1)
-
-    @staticmethod
-    def _range_overlap(a: range, b: range) -> range:
-        start = max(a.start, b.start)
-        stop = min(a.stop, b.stop)
-        return range(start, max(start, stop))
-
-    # ============================================================
-    # Private: build Mamba transfer info
-    # ============================================================
-
-    def _build_mamba_info(
-        self,
-        remote_tp_size: int,
-        remote_block_size: int,
-        remote_block_len: int,
-        remote_physical_blocks_per_logical: int,
-        local_block_len: int,
-    ) -> MambaEngineTransferInfo:
-        """Compute Mamba transfer plan."""
-        K = self.total_num_kv_heads
-        local_tp = self.tp_size
-        local_rank = self.tp_rank
-
-        is_remote_replicated = remote_tp_size > K
-        remote_physical_heads = max(1, K // remote_tp_size)
-
-        if local_tp >= remote_tp_size:
-            assert local_tp % remote_tp_size == 0
-            tp_ratio = local_tp // remote_tp_size
-        else:
-            assert remote_tp_size % local_tp == 0
-            tp_ratio = -(remote_tp_size // local_tp)
-
-        abs_tp = -tp_ratio if tp_ratio < 0 else 1
-
-        mamba_range: range | None = None
-        if tp_ratio < 0:
-            mamba_range = range(local_rank * abs_tp, (local_rank + 1) * abs_tp)
-
-        # ---- FA read targets ----
-        if self.is_mla or tp_ratio >= 0:
-            num_fa_reads = 1
-            fa_source_ranks: list[int] = (
-                [0]
-                if self.is_mla
-                else [local_rank // tp_ratio if tp_ratio > 0 else local_rank]
-            )
-        else:
-            local_needs = self._physical_head_range(local_tp, K, local_rank)
-            search_range = (
-                mamba_range if mamba_range is not None else range(remote_tp_size)
-            )
-            seen: set[tuple[int, int]] = set()
-            fa_source_ranks = []
-            for p in search_range:
-                p_has = self._physical_head_range(remote_tp_size, K, p)
-                ov = self._range_overlap(local_needs, p_has)
-                if len(ov) > 0:
-                    key = (ov.start, ov.stop)
-                    if key not in seen:
-                        seen.add(key)
-                        fa_source_ranks.append(p)
-            if not fa_source_ranks:
-                for p in range(remote_tp_size):
-                    p_has = self._physical_head_range(remote_tp_size, K, p)
-                    ov = self._range_overlap(local_needs, p_has)
-                    if len(ov) > 0:
-                        key = (ov.start, ov.stop)
-                        if key not in seen:
-                            seen.add(key)
-                            fa_source_ranks.append(p)
-            num_fa_reads = len(fa_source_ranks)
-
-        # ---- All source ranks (mamba + FA) ----
-        if mamba_range is not None and abs_tp > num_fa_reads:
-            num_mamba_reads = abs_tp
-            all_source_ranks = list(mamba_range)
-        else:
-            num_mamba_reads = num_fa_reads
-            all_source_ranks = list(fa_source_ranks)
-
-        # ---- FA descriptor bytes ----
-        effective_block_len = min(local_block_len, remote_block_len)
-        if self.is_kv_layout_blocks_first:
-            fa_descriptor_bytes = effective_block_len // 2
-        else:
-            fa_descriptor_bytes = effective_block_len
-
-        # ---- Validation ----
-        is_local_replicated = local_tp > K
-        if is_local_replicated and is_remote_replicated and tp_ratio > 0:
-            logger.info(
-                "Both-replicated hetero-TP: local_tp=%d > remote_tp=%d > K=%d.",
-                local_tp,
-                remote_tp_size,
-                K,
-            )
-        tt_set = set(all_source_ranks)
-        for t in fa_source_ranks:
-            if t not in tt_set:
-                logger.error(
-                    "FA source rank %d NOT in all_source_ranks %s.",
-                    t,
-                    all_source_ranks,
-                )
-        if self.is_kv_layout_blocks_first and tp_ratio < 0 and num_fa_reads > 0:
-            local_k_half = local_block_len // 2
-            remote_k_half = remote_block_len // 2
-            expected = local_k_half // num_fa_reads
-            if expected != remote_k_half:
-                logger.warning(
-                    "FA size mismatch: local_k_half=%d / reads=%d = %d, "
-                    "but remote_k_half=%d.",
-                    local_k_half,
-                    num_fa_reads,
-                    expected,
-                    remote_k_half,
-                )
-
-        return MambaEngineTransferInfo(
-            remote_tp_size=remote_tp_size,
-            remote_block_len=remote_block_len,
-            remote_block_size=remote_block_size,
-            remote_physical_blocks_per_logical=(remote_physical_blocks_per_logical),
-            remote_fa_source_ranks=tuple(fa_source_ranks),
-            remote_all_source_ranks=tuple(all_source_ranks),
-            remote_num_fa_reads=num_fa_reads,
-            remote_num_mamba_reads=num_mamba_reads,
-            remote_fa_descriptor_bytes=fa_descriptor_bytes,
-            is_remote_replicated=is_remote_replicated,
-            remote_physical_heads=remote_physical_heads,
+            f"remote_block_len={info.remote_block_len})"
         )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py
index 715fcbde16c9..608fd8784778 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py
@@ -31,10 +31,14 @@
     KVConnectorRole,
     SupportsHMA,
 )
+from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
 from vllm.distributed.kv_transfer.kv_connector.v1.mooncake.mooncake_utils import (
     MooncakeBootstrapServer,
     RegisterWorkerPayload,
 )
+from vllm.distributed.kv_transfer.kv_connector.v1.mooncake.stats import (
+    MooncakeKVConnectorStats,
+)
 from vllm.distributed.parallel_state import (
     get_pp_group,
     get_tensor_model_parallel_rank,
@@ -457,6 +461,25 @@ def save_kv_layer(
     def wait_for_save(self):
         pass
 
+    def get_kv_connector_stats(self) -> KVConnectorStats | None:
+        """Return worker-local transfer stats since the last call.
+
+        Note the P/D asymmetry: because Mooncake is P-push (P calls
+        batch_transfer_sync_write), P records successful transfer latency,
+        bytes, and descriptor counts, while D only records failures
+        (recv/ZMQ errors). Aggregated NIXL-style dashboards will find
+        successful-transfer metrics on the P worker, not D.
+        """
+        if self.connector_worker is None:
+            return None
+        return self.connector_worker.get_kv_connector_stats()
+
+    @classmethod
+    def build_kv_connector_stats(
+        cls, data: dict[str, Any] | None = None
+    ) -> KVConnectorStats | None:
+        return MooncakeKVConnectorStats(data=data or {})
+
 
 class MooncakeConnectorScheduler:
     """Implementation of Scheduler side methods"""
@@ -816,6 +839,8 @@ def __init__(
         self.finished_sending_reqs: set[ReqId] = set()
         self.finished_recving_reqs: set[ReqId] = set()
 
+        self.xfer_stats = MooncakeKVConnectorStats()
+
         self.block_size = vllm_config.cache_config.block_size
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
@@ -1340,11 +1365,23 @@ def _send_blocks(
         ret_value = self.engine.batch_transfer_sync_write(
             remote_session, src_ptrs, dst_ptrs, lengths
         )
+        duration = time.perf_counter() - start_time
         if ret_value == 0:
-            logger.debug(
-                "Sending to %s done, took %s",
+            self.xfer_stats.record_transfer(
+                duration_s=duration,
+                total_bytes=sum(lengths),
+                num_descs=len(src_ptrs),
+            )
+            logger.debug("Sending to %s done, took %s", remote_session, duration)
+        else:
+            self.xfer_stats.record_failed_transfer()
+            logger.warning(
+                "Sending to %s failed (ret=%s) after %s (%d descriptors, %d bytes)",
                 remote_session,
-                time.perf_counter() - start_time,
+                ret_value,
+                duration,
+                len(src_ptrs),
+                sum(lengths),
             )
         return ret_value
 
@@ -1445,6 +1482,7 @@ async def fetch_finished_sending_reqs(self) -> set[ReqId]:
                     send_meta.p_req_id,
                     envs.VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT,
                 )
+                self.xfer_stats.record_kv_expired_req()
                 finished_sending_reqs.add(send_meta.p_req_id)
                 expired_transfer_id.append(transfer_id)
 
@@ -1485,6 +1523,13 @@ def get_finished(self) -> tuple[set[str] | None, set[str] | None]:
 
         return finished_sending_reqs or None, finished_recving_reqs or None
 
+    def get_kv_connector_stats(self) -> KVConnectorStats | None:
+        """Return transfer stats collected since the last call, or None
+        if nothing has been recorded in this interval."""
+        if self.xfer_stats.is_empty():
+            return None
+        return self.xfer_stats.clone_and_reset()
+
     async def receive_kv_from_single_worker(
         self,
         worker_addr: str,
@@ -1531,6 +1576,7 @@ async def receive_kv_from_single_worker(
                             req_ids,
                             response.err_msg,
                         )
+                        self.xfer_stats.record_failed_recv()
                         return
                     self.process_pulling_result(response, pull_metas)
                     if response.status == MooncakeXferResponseStatus.FINISH:
@@ -1539,6 +1585,7 @@ async def receive_kv_from_single_worker(
             logger.debug("ZMQ context terminated, exiting Mooncake receiver thread.")
         except Exception as e:
             logger.error("MooncakeXferMetadata transfer failed for %s: %s", req_ids, e)
+            self.xfer_stats.record_failed_recv()
             return
 
     def process_pulling_result(
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/stats.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/stats.py
new file mode 100644
index 000000000000..d177f55cc720
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/stats.py
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Stats container for the Mooncake connector."""
+
+import threading
+from dataclasses import dataclass
+from typing import Any
+
+import numpy as np
+
+from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
+    KVConnectorStats,
+)
+
+# TODO(mooncake-stats): add MooncakePromMetrics (mirror NixlPromMetrics)
+# and wire it via MooncakeConnector.build_prom_metrics in a follow-up PR.
+
+
+@dataclass
+class MooncakeKVConnectorStats(KVConnectorStats):
+    """Container for Mooncake KV transfer performance metrics.
+
+    `_lock` serializes record_* against clone_and_reset so each row's
+    appends are atomic and column lengths stay aligned. Writers run on
+    the sender pool / receiver loop / sender loop; reader runs on the
+    main worker thread.
+    """
+
+    def __post_init__(self):
+        self._lock = threading.Lock()
+        if not self.data:
+            self.reset()
+
+    # threading.Lock is not picklable; strip it from the wire form and
+    # rebuild a fresh per-process lock on the receiver side.
+    def __getstate__(self) -> dict[str, Any]:
+        state = self.__dict__.copy()
+        state.pop("_lock", None)
+        return state
+
+    def __setstate__(self, state: dict[str, Any]) -> None:
+        self.__dict__.update(state)
+        self._lock = threading.Lock()
+
+    def reset(self):
+        self.data: dict[str, list[float | int]] = {
+            "transfer_duration": [],
+            "bytes_transferred": [],
+            "num_descriptors": [],
+            "num_failed_transfers": [],
+            "num_failed_recvs": [],
+            "num_kv_expired_reqs": [],
+        }
+
+    def record_transfer(self, duration_s: float, total_bytes: int, num_descs: int):
+        with self._lock:
+            self.data["transfer_duration"].append(duration_s)
+            self.data["bytes_transferred"].append(total_bytes)
+            self.data["num_descriptors"].append(num_descs)
+
+    # Failure counters store a list of 1s so a future Prom counter can iterate
+    # with .inc(list_item), mirroring NIXL's NixlPromMetrics.observe.
+    def record_failed_transfer(self):
+        with self._lock:
+            self.data["num_failed_transfers"].append(1)
+
+    def record_failed_recv(self):
+        with self._lock:
+            self.data["num_failed_recvs"].append(1)
+
+    def record_kv_expired_req(self):
+        with self._lock:
+            self.data["num_kv_expired_reqs"].append(1)
+
+    def clone_and_reset(self) -> "MooncakeKVConnectorStats":
+        # Copy lists under the lock for length alignment; return a fresh
+        # instance so the snapshot has its own _lock.
+        with self._lock:
+            snapshot_data: dict[str, list[float | int]] = {
+                k: list(v) for k, v in self.data.items()
+            }
+            self.reset()
+        return MooncakeKVConnectorStats(data=snapshot_data)
+
+    def is_empty(self) -> bool:
+        return (
+            self.num_successful_transfers == 0
+            and len(self.data["num_failed_transfers"]) == 0
+            and len(self.data["num_failed_recvs"]) == 0
+            and len(self.data["num_kv_expired_reqs"]) == 0
+        )
+
+    def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
+        if not other.is_empty():
+            for k, v in other.data.items():
+                accumulator = self.data[k]
+                assert isinstance(accumulator, list)
+                accumulator.extend(v)
+        return self
+
+    def reduce(self) -> dict[str, int | float]:
+        num_failed_transfers = len(self.data["num_failed_transfers"])
+        num_failed_recvs = len(self.data["num_failed_recvs"])
+        num_kv_expired_reqs = len(self.data["num_kv_expired_reqs"])
+
+        if self.num_successful_transfers == 0:
+            return {
+                "Num successful transfers": 0,
+                "Avg xfer time (ms)": 0,
+                "P90 xfer time (ms)": 0,
+                "Avg MB per transfer": 0,
+                "Throughput (MB/s)": 0,
+                "Avg number of descriptors": 0,
+                "Num failed transfers": num_failed_transfers,
+                "Num failed recvs": num_failed_recvs,
+                "Num KV expired reqs": num_kv_expired_reqs,
+            }
+
+        xfer_time = np.asarray(self.data["transfer_duration"])
+        mb = np.asarray(self.data["bytes_transferred"]) / 2**20
+        descs = np.asarray(self.data["num_descriptors"], dtype=np.uint32)
+        n = len(descs)
+        assert n == self.num_successful_transfers
+
+        total_mb = mb.sum()
+        avg_mb = total_mb / n
+        total_time_seconds = xfer_time.sum()
+        throughput_mb_s = (
+            total_mb / total_time_seconds if total_time_seconds > 0 else 0.0
+        )
+
+        return {
+            "Num successful transfers": n,
+            "Avg xfer time (ms)": round(xfer_time.mean() * 1e3, 3),
+            "P90 xfer time (ms)": round(np.percentile(xfer_time, 90).item() * 1e3, 3),
+            "Avg MB per transfer": round(avg_mb, 3),
+            "Throughput (MB/s)": round(throughput_mb_s, 3),
+            "Avg number of descriptors": round(descs.mean(), 1),
+            "Num failed transfers": num_failed_transfers,
+            "Num failed recvs": num_failed_recvs,
+            "Num KV expired reqs": num_kv_expired_reqs,
+        }
+
+    @property
+    def num_successful_transfers(self) -> int:
+        return len(self.data["transfer_duration"])
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index 4ef8f0ac9c90..a340f313e0a2 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 
 import torch
 
@@ -18,6 +18,8 @@
     KVConnectorMetadata,
     KVConnectorRole,
     KVConnectorWorkerMetadata,
+    SupportsHMA,
+    supports_hma,
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
     KVConnectorPromMetrics,
@@ -123,7 +125,7 @@ def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
             self._prom_metrics[connector_id].observe(stats_data["data"], engine_idx)
 
 
-class MultiConnector(KVConnectorBase_V1):
+class MultiConnector(KVConnectorBase_V1, SupportsHMA):
     """
     A wrapper for using multiple KVConnectors at the same time.
 
@@ -166,6 +168,12 @@ def __init__(
             self._connectors.append(connector_cls(temp_config, role, kv_cache_config))
             self._ktc_kv_transfer_config.append(temp_config.kv_transfer_config)
 
+        self._all_support_hma = all(supports_hma(c) for c in self._connectors)
+        assert (
+            vllm_config.scheduler_config.disable_hybrid_kv_cache_manager
+            or self._all_support_hma
+        ), "HMA should not be enabled unless all sub-connectors support it"
+
         # A mapping from request id to the index of the connector chosen to
         # load the request from (if any).
         self._requests_to_connector: dict[str, int] = {}
@@ -436,15 +444,17 @@ def set_xfer_handshake_metadata(
         for c in self._connectors:
             c.set_xfer_handshake_metadata(metadata)
 
-    def request_finished(
+    def _aggregate_request_finished(
         self,
         request: "Request",
-        blocks: list[int],
+        per_connector_fn: Callable[
+            [KVConnectorBase_V1], tuple[bool, dict[str, Any] | None]
+        ],
     ) -> tuple[bool, dict[str, Any] | None]:
         async_saves = 0
         kv_txfer_params = None
         for c in self._connectors:
-            async_save, txfer_params = c.request_finished(request, blocks)
+            async_save, txfer_params = per_connector_fn(c)
             if async_save:
                 async_saves += 1
             if txfer_params is not None:
@@ -458,11 +468,39 @@ def request_finished(
         if async_saves > 1:
             self._extra_async_saves[request.request_id] = async_saves - 1
 
-        # Clean up other state for this request.
         self._requests_to_connector.pop(request.request_id, None)
 
         return async_saves > 0, kv_txfer_params
 
+    def request_finished(
+        self,
+        request: "Request",
+        blocks: list[int],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        return self._aggregate_request_finished(
+            request,
+            lambda c: c.request_finished(request, blocks),
+        )
+
+    def request_finished_all_groups(
+        self,
+        request: "Request",
+        block_ids: tuple[list[int], ...],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        if not self._all_support_hma:
+            assert len(block_ids) == 1, (
+                "HMA with multiple kv_cache_groups requires all "
+                "sub-connectors to support HMA"
+            )
+            return self.request_finished(request, block_ids[0])
+
+        return self._aggregate_request_finished(
+            request,
+            lambda c: cast(SupportsHMA, c).request_finished_all_groups(
+                request, block_ids
+            ),
+        )
+
     def take_events(self) -> Iterable["KVCacheEvent"]:
         for c in self._connectors:
             yield from c.take_events()
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/metadata.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/metadata.py
index 71ebbf1174fb..c56e373ba99d 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/metadata.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/metadata.py
@@ -32,8 +32,9 @@
 # Version History:
 #   1: Initial version with compatibility checking
 #   2: Add remote_request_id to kv_transfer_params
+#   3: Add physical_blocks_per_logical_kv_block to NixlAgentMetadata
 #
-NIXL_CONNECTOR_VERSION: int = 2
+NIXL_CONNECTOR_VERSION: int = 3
 
 
 @dataclass
@@ -48,6 +49,7 @@ class NixlAgentMetadata:
     block_size: int
     ssm_sizes: tuple[int, int]
     attn_backend_name: str
+    physical_blocks_per_logical_kv_block: int
 
 
 @dataclass
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/scheduler.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/scheduler.py
index 9f67d0fc525d..02c418ebd8d7 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/scheduler.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/scheduler.py
@@ -119,6 +119,30 @@ def __init__(
             for n_tokens, block_size in sw_sizes_tokens
         ]
 
+        # Threshold to decide whether to compute kv cache locally
+        # or pull from a remote node: minimum number of remote
+        # tokens to amortize the xfer latencies
+        self.kv_recompute_threshold: int = int(
+            vllm_config.kv_transfer_config.get_from_extra_config(
+                "kv_recompute_threshold", 64
+            )
+        )
+
+        # Bi-directional KV transfer feature supports KV block
+        # transfers from D node to P node
+        self.is_bidirectional_kv_xfer_enabled = (
+            vllm_config.kv_transfer_config.get_from_extra_config(
+                "bidirectional_kv_xfer", False
+            )
+        )
+
+        if self.is_bidirectional_kv_xfer_enabled and self.kv_recompute_threshold > 0:
+            logger.info(
+                "Bidirectional KV transfer is enabled and the kv "
+                "recompute threshold is set to %d tokens",
+                self.kv_recompute_threshold,
+            )
+
     def shutdown(self):
         self._stop_event.set()
         if self._nixl_handshake_listener_t is not None:
@@ -298,6 +322,44 @@ def get_num_new_matched_tokens(
         if params is not None and params.get("do_remote_decode") and self._has_mamba:
             self._truncate_mamba_request_for_prefill(request)
 
+        if (
+            params is not None
+            and params.get("do_remote_decode")
+            and params.get("remote_block_ids")
+            and all(
+                p in params
+                for p in (
+                    "remote_engine_id",
+                    "remote_request_id",
+                    "remote_host",
+                    "remote_port",
+                )
+            )
+        ):
+            # Decode node has kv blocks for part of prefill request, so, provide them
+            # as an external token count to scheduler.
+            # The tokens will be loaded if not already present
+            # in the prefill node local cache
+            remote_num_tokens = params.get("remote_num_tokens") or 0
+            count = (
+                min(remote_num_tokens, request.num_prompt_tokens) - num_computed_tokens
+            )
+            if count > 0:
+                # Check kv_recompute_threshold: skip pull if
+                # remote tokens are below the threshold.
+                if (
+                    self.kv_recompute_threshold > 0
+                    and count < self.kv_recompute_threshold
+                ):
+                    logger.debug(
+                        "Skipping remote pull for %s: %d remote tokens < threshold %d",
+                        request.request_id,
+                        count,
+                        self.kv_recompute_threshold,
+                    )
+                    return 0, False
+                return count, True
+
         # No remote prefill for this request.
         return 0, False
 
@@ -315,13 +377,19 @@ def update_state_after_alloc(
         if not params:
             return
 
-        if params.get("do_remote_decode"):
+        if params.get("do_remote_decode") or (
+            params.get("do_remote_prefill") and self.is_bidirectional_kv_xfer_enabled
+        ):
             self._reqs_in_batch.add(request.request_id)
         if self.use_host_buffer and params.get("do_remote_decode"):
             # NOTE: when accelerator is not directly supported by Nixl,
             # prefilled blocks need to be saved to host memory before transfer.
             self._reqs_need_save[request.request_id] = request
-        elif params.get("do_remote_prefill"):
+        elif params.get("do_remote_prefill") or (
+            params.get("do_remote_decode")
+            and self.is_bidirectional_kv_xfer_enabled
+            and not params.get("_remote_blocks_processed")
+        ):
             if params.get("remote_block_ids"):
                 if all(
                     p in params
@@ -333,8 +401,8 @@ def update_state_after_alloc(
                     )
                 ):
                     # If remote_blocks and num_external_tokens = 0, we have
-                    # a full prefix cache hit on the D worker. We need to call
-                    # send_notif in _read_blocks to free the memory on the P.
+                    # a full prefix cache hit on the local node. We need to call
+                    # send_notif in _read_blocks to free the memory on the remote node.
 
                     unhashed_local_block_ids: BlockIds = (
                         blocks.get_unhashed_block_ids_all_groups()
@@ -362,6 +430,7 @@ def update_state_after_alloc(
                 assert num_external_tokens == 0
             # Only trigger 1 KV transfer per request.
             params["do_remote_prefill"] = False
+            params["_remote_blocks_processed"] = True
 
     def _build_save_meta(
         self,
@@ -450,6 +519,9 @@ def request_finished(
         if not params:
             return False, None
 
+        is_p_node = bool(params.get("do_remote_decode"))
+        is_d_node = not is_p_node
+
         if params.get("do_remote_prefill"):
             # If do_remote_prefill is still True when the request is finished,
             # update_state_after_alloc must not have been called (the request
@@ -461,9 +533,13 @@ def request_finished(
             params["do_remote_prefill"] = False
             return False, None
 
-        if not params.get("do_remote_decode"):
+        if is_d_node and not self.is_bidirectional_kv_xfer_enabled:
             return False, None
-        if request.status != RequestStatus.FINISHED_LENGTH_CAPPED:
+
+        if request.status not in (
+            RequestStatus.FINISHED_LENGTH_CAPPED,
+            RequestStatus.FINISHED_STOPPED,
+        ):
             # Also include the case of a P/D Prefill request with immediate
             # block free (eg abort). Stop tracking this request.
             self._reqs_not_processed.add(request.request_id)
@@ -474,7 +550,7 @@ def request_finished(
         # TODO: check whether block_ids actually ever be 0. If not we could
         # remove the conditional below
         delay_free_blocks = any(len(group) > 0 for group in block_ids)
-
+        remote_num_tokens = 0
         if delay_free_blocks:
             # Prefill request on remote. It will be read from D upon completion
             logger.debug(
@@ -492,13 +568,16 @@ def request_finished(
             # Here we "unpad" blocks to send the actual remote blocks to be read.
             block_ids = self.get_sw_clipped_blocks(block_ids)
 
+            remote_num_tokens = request.num_computed_tokens
+
         return delay_free_blocks, dict(
-            do_remote_prefill=True,
-            do_remote_decode=False,
+            do_remote_prefill=is_p_node,
+            do_remote_decode=is_d_node,
             remote_block_ids=block_ids,
             remote_engine_id=self.engine_id,
             remote_request_id=request.request_id,
             remote_host=self.side_channel_host,
             remote_port=self.side_channel_port,
             tp_size=self.vllm_config.parallel_config.tensor_parallel_size,
+            remote_num_tokens=remote_num_tokens,
         )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/tp_mapping.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/tp_mapping.py
new file mode 100644
index 000000000000..7115b8bed543
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/tp_mapping.py
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""TP mapping computation for NIXL KV cache transfers."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+
+from vllm.distributed.kv_transfer.kv_connector.utils import (
+    BlockIds,
+)
+from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheSpec, MambaSpec
+
+# ======================================================================
+# Data structures
+# ======================================================================
+
+
+@dataclass(frozen=True)
+class ReadSpec:
+    """Specification for a single remote block read operation."""
+
+    remote_rank: int
+    local_block_ids: BlockIds
+    remote_block_ids: BlockIds
+
+
+def _is_attention_spec(spec_type: type[KVCacheSpec]) -> bool:
+    return issubclass(spec_type, AttentionSpec)
+
+
+def _is_ssm_spec(spec_type: type[KVCacheSpec]) -> bool:
+    return issubclass(spec_type, MambaSpec)
+
+
+@dataclass(frozen=True)
+class TPMapping:
+    """Complete local-to-remote TP mapping for one remote engine.
+
+    Generated once per remote engine during handshake.
+    """
+
+    # Remote TP ranks that this local rank reads from, per group.
+    # Position = local piece index.
+    source_ranks_per_group: tuple[tuple[int, ...], ...]
+
+    # Superset of all source ranks (union of all groups).
+    all_source_ranks: tuple[int, ...]
+
+    # Maps each source rank to its FA head slot index.
+    rank_to_attention_slot: dict[int, int]
+
+    # FA head offset factor for hetero-TP (D_TP > P_TP).
+    rank_offset_factor: int
+
+
+# ======================================================================
+# TP mapping computation
+# ======================================================================
+
+
+def compute_tp_mapping(
+    tp_rank: int,
+    tp_size: int,
+    remote_tp_size: int,
+    is_mla: bool,
+    total_num_kv_heads: int,
+    group_spec_types: tuple[type[KVCacheSpec], ...],
+) -> TPMapping:
+    """Build the complete local-to-remote TP mapping.
+
+    Computes source ranks, head slot assignments, and the rank offset
+    factor in a single pass.
+    """
+    # --- Attention source ranks ---
+    if is_mla:
+        # All heads replicated across all ranks.
+        attn_ranks = [0]
+    elif tp_size >= remote_tp_size:
+        # D (local TP) > P (remote TP): multiple local ranks read different chunks from
+        # *one* remote rank, corresponding to different kv heads.
+        attn_ranks = [tp_rank * remote_tp_size // tp_size]
+    else:
+        # P (remote TP) > D (local TP): one local rank
+        # reads from multiple remote ranks.
+        # GQA dedup: when K < remote_tp_size, several remote ranks
+        # hold the same KV head.  np.unique keeps only the first
+        # rank per unique head so we don't issue redundant reads.
+        abs_tp = remote_tp_size // tp_size
+        start = tp_rank * abs_tp
+        heads = np.arange(start, start + abs_tp) * total_num_kv_heads // remote_tp_size
+        _, unique_idx = np.unique(heads, return_index=True)
+        attn_ranks = (start + np.sort(unique_idx)).tolist()
+
+    # --- SSM source ranks ---
+    has_ssm = any(_is_ssm_spec(t) for t in group_spec_types)
+    if has_ssm:
+        if tp_size < remote_tp_size:
+            abs_tp = remote_tp_size // tp_size
+            ssm_ranks = list(range(tp_rank * abs_tp, (tp_rank + 1) * abs_tp))
+        else:
+            ssm_ranks = list(attn_ranks)
+    else:
+        ssm_ranks = []
+
+    all_ranks = sorted(set(attn_ranks) | set(ssm_ranks))
+
+    # --- Per-group ordered source ranks ---
+    source_ranks_per_group = tuple(
+        tuple(ssm_ranks) if _is_ssm_spec(t) else tuple(attn_ranks)
+        for t in group_spec_types
+    )
+
+    # --- Attention head slots ---
+    head_to_slot: dict[int, int] = {}
+    for i, r in enumerate(attn_ranks):
+        head_to_slot[r * total_num_kv_heads // remote_tp_size] = i
+    rank_to_attention_slot = {
+        r: head_to_slot.get(r * total_num_kv_heads // remote_tp_size, 0)
+        for r in all_ranks
+    }
+
+    # --- Rank offset factor ---
+    if is_mla or tp_size <= remote_tp_size:
+        # We don't index into remote for reading, no offset needed.
+        rank_offset_factor = 0
+    elif tp_size > total_num_kv_heads:
+        local_head = tp_rank * total_num_kv_heads // tp_size
+        p_start = attn_ranks[0] * total_num_kv_heads // remote_tp_size
+        rank_offset_factor = local_head - p_start
+    else:
+        # D TP > P TP: we index into remote to read different heads depending on rank.
+        rank_offset_factor = tp_rank % (tp_size // remote_tp_size)
+
+    return TPMapping(
+        source_ranks_per_group=source_ranks_per_group,
+        all_source_ranks=tuple(all_ranks),
+        rank_to_attention_slot=rank_to_attention_slot,
+        rank_offset_factor=rank_offset_factor,
+    )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py
index bd7ef5973f62..caa5f432c5e0 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py
@@ -9,6 +9,7 @@
 import time
 import uuid
 from collections import defaultdict
+from collections.abc import Iterator
 from concurrent.futures import Future, ThreadPoolExecutor
 from typing import TYPE_CHECKING, Any, cast
 
@@ -21,7 +22,7 @@
 from vllm.distributed.kv_transfer.kv_connector.utils import (
     BlockIds,
     EngineId,
-    MambaEngineTransferInfo,
+    EngineTransferInfo,
     TransferTopology,
     get_current_attn_backends,
     kv_postprocess_blksize_and_layout_on_receive,
@@ -43,13 +44,19 @@
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl.stats import (
     NixlKVConnectorStats,
 )
+from vllm.distributed.kv_transfer.kv_connector.v1.nixl.tp_mapping import (
+    ReadSpec,
+    TPMapping,
+    _is_attention_spec,
+    _is_ssm_spec,
+    compute_tp_mapping,
+)
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl.utils import (
     _NIXL_SUPPORTED_DEVICE,
     zmq_ctx,
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.ssm_conv_transfer_utils import (
     MambaConvSplitInfo,
-    compute_physical_blocks_per_logical,
     derive_mamba_conv_split,
 )
 from vllm.distributed.nixl_utils import NixlWrapper, nixl_agent_config
@@ -58,7 +65,6 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.logger import init_logger
-from vllm.model_executor.layers.mamba.mamba_utils import is_conv_state_dim_first
 from vllm.platforms import current_platform
 from vllm.utils.network_utils import make_zmq_path
 from vllm.v1.attention.backends.utils import get_kv_cache_layout
@@ -80,6 +86,111 @@
 class NixlConnectorWorker:
     """Implementation of Worker side methods"""
 
+    def _compute_desc_ids(
+        self,
+        block_ids: BlockIds,
+        dst_num_blocks: int,
+        block_size_ratio: float | None,
+        physical_blocks_per_logical: int,
+    ) -> np.ndarray:
+        """Compute NIXL descriptor IDs for given block IDs."""
+        num_fa_regions = self.num_regions
+        num_ssm_regions = len(self.block_len_per_layer) * 4 if self._has_mamba else 0
+
+        num_blocks = dst_num_blocks
+        if block_size_ratio is not None:
+            num_blocks = int(num_blocks * block_size_ratio)
+        ratio = physical_blocks_per_logical
+        logical_blocks = num_blocks // ratio
+
+        num_fa_descs = num_fa_regions * num_blocks
+
+        # All-attention fast path: single vectorized broadcast.
+        if num_ssm_regions == 0:
+            block_arr = np.concatenate(block_ids)[None, :]
+            region_ids = np.arange(num_fa_regions)[:, None]
+            return (region_ids * num_blocks + block_arr).flatten()
+
+        # NOTE (NickLucche) With HMA, every kv group has the same number
+        # of layers and layers from different groups share the same kv
+        # tensor.  Therefore we compute desc IDs per group using the
+        # right stride:
+        # FA descs have num_blocks entries per region (kernel granularity),
+        # SSM descs have logical_blocks entries per region (no kernel
+        # splitting).
+        all_descs: list[np.ndarray] = []
+        for i, group in enumerate(block_ids):
+            group_arr = np.asarray(group)
+            if _is_attention_spec(self._group_spec_types[i]):
+                fa_region_ids = np.arange(num_fa_regions)[:, None]
+                all_descs.append(
+                    (fa_region_ids * num_blocks + group_arr[None, :]).flatten()
+                )
+            elif _is_ssm_spec(self._group_spec_types[i]):
+                # NOTE (NickLucche) SSM and Attention block regions can
+                # be exchanged arbitrarily by manager.  Therefore, descs
+                # are laid out as:
+                #   [descs_fa (all regions) | descs_ssm (all regions)].
+                # num_fa_descs offset must be computed per-engine since
+                # P and D can have different num_blocks (and thus
+                # different FA desc counts).
+                ssm_region_ids = np.arange(num_ssm_regions)[:, None]
+                all_descs.append(
+                    (
+                        ssm_region_ids * logical_blocks
+                        + group_arr[None, :]
+                        + num_fa_descs
+                    ).flatten()
+                )
+            else:
+                raise ValueError(
+                    f"Unknown spec type {self._group_spec_types[i]} at index {i}"
+                )
+
+        return np.concatenate(all_descs)
+
+    def _build_local_splits_from_plan(
+        self,
+        plan: TPMapping,
+        src_blocks_data: list[tuple[int, int, int]],
+        num_fa_descs: int,
+    ) -> Iterator[list[tuple[int, int, int]]]:
+        """Build split handle data for P_TP > D_TP scenario.
+
+        num_fa_descs is the boundary between FA and SSM descriptors.
+        Split counts are derived from source_ranks_per_group lengths.
+        FA uses rank_to_attention_slot for the slot offset;
+        SSM uses the rank's positional index.
+        """
+        fa_idx = next(
+            i for i, t in enumerate(self._group_spec_types) if _is_attention_spec(t)
+        )
+        fa_num_splits = len(plan.source_ranks_per_group[fa_idx])
+
+        has_ssm_descs = num_fa_descs < len(src_blocks_data)
+        ssm_idx = next(
+            (i for i, t in enumerate(self._group_spec_types) if _is_ssm_spec(t)),
+            None,
+        )
+        ssm_num_splits = (
+            len(plan.source_ranks_per_group[ssm_idx])
+            if has_ssm_descs and ssm_idx is not None
+            else 0
+        )
+
+        for p_idx, p_rank in enumerate(plan.all_source_ranks):
+            fa_slot = plan.rank_to_attention_slot.get(p_rank, 0)
+
+            handle: list[tuple[int, int, int]] = []
+            for j, (addr, local_len, dev) in enumerate(src_blocks_data):
+                if j < num_fa_descs:
+                    chunk = local_len // fa_num_splits
+                    handle.append((addr + fa_slot * chunk, chunk, dev))
+                else:
+                    chunk = local_len // ssm_num_splits
+                    handle.append((addr + p_idx * chunk, chunk, dev))
+            yield handle
+
     def __init__(
         self,
         vllm_config: "VllmConfig",
@@ -119,44 +230,37 @@ def __init__(
         }
         self.hma_group_size = len(kv_cache_config.kv_cache_tensors)
 
-        # ---- Mamba model state (derived from model config) ----
-        self._is_mamba_group = [
-            isinstance(group.kv_cache_spec, MambaSpec)
-            for group in kv_cache_config.kv_cache_groups
-        ]
+        # ---- Model state (derived from model config) ----
         mamba_ssm_size = (0, 0)
-        self._has_mamba = any(self._is_mamba_group)
-        if self._has_mamba:
-            assert self._is_hma_required
-            mamba_spec = next(
-                spec
-                for spec in self._layer_specs.values()
-                if isinstance(spec, MambaSpec)
-            )
-            conv_nbytes, ssm_nbytes = (
-                torch.tensor([], dtype=mamba_spec.dtypes[0]).element_size(),  # type: ignore[misc]
-                torch.tensor([], dtype=mamba_spec.dtypes[1]).element_size(),  # type: ignore[misc]
-            )
-            conv_shape, ssm_shape = (
-                torch.Size(mamba_spec.shapes[0]),
-                torch.Size(mamba_spec.shapes[1]),
-            )
-            mamba_ssm_size = (
-                conv_shape.numel() * conv_nbytes,
-                ssm_shape.numel() * ssm_nbytes,
-            )
-        self._mamba_ssm_size = mamba_ssm_size
         # Conv state sub-projection decomposition (None when no Mamba).
         # The 3-read transfer requires DS (dim, state_len) conv layout so
         # that x/B/C sub-projections are contiguous in memory.
         self._conv_decomp: MambaConvSplitInfo | None = None
+        self._has_mamba = any(
+            isinstance(g.kv_cache_spec, MambaSpec)
+            for g in kv_cache_config.kv_cache_groups
+        )
         if self._has_mamba:
+            assert self._is_hma_required
+            from vllm.model_executor.layers.mamba.mamba_utils import (
+                is_conv_state_dim_first,
+            )
+
             assert is_conv_state_dim_first(), (
                 "3-read Mamba conv transfer requires DS conv state layout. "
                 "Set VLLM_SSM_CONV_STATE_LAYOUT=DS"
             )
-            local_tp = vllm_config.parallel_config.tensor_parallel_size
-            self._conv_decomp = derive_mamba_conv_split(mamba_spec, local_tp)
+            mamba_spec = next(
+                spec
+                for spec in self._layer_specs.values()
+                if isinstance(spec, MambaSpec)
+            )
+            self._conv_decomp = derive_mamba_conv_split(
+                mamba_spec,
+                vllm_config.parallel_config.tensor_parallel_size,
+            )
+            mamba_ssm_size = self._conv_decomp.ssm_sizes
+        self._mamba_ssm_size = mamba_ssm_size
 
         # Agent.
         non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
@@ -268,14 +372,6 @@ def __init__(
         self.dst_num_blocks: dict[EngineId, int] = {}
         self._registered_descs: list[Any] = []
 
-        # ---- Mamba-HMA per-engine state (only used when self._has_mamba) ----
-        # NOTE (ZhanqiuHu): _physical_blocks_per_logical MUST be per-engine.
-        # physical_blocks_per_logical = ceil((conv_bytes + ssm_bytes) / block_len)
-        # where conv/ssm bytes are per-TP-rank (dimension-sharded).  With
-        # heterogeneous TP the per-rank sizes differ, so the ratio differs:
-        #   e.g. Nemotron 30B: P(TP=4) → 131, D(TP=1) → 261.
-        self._physical_blocks_per_logical: dict[EngineId, int] = {}
-
         # In progress transfers.
         # [req_id -> list[handle]]
         self._recving_metadata: dict[ReqId, ReqMeta] = {}
@@ -330,6 +426,13 @@ def __init__(
         self._physical_blocks_per_logical_kv_block = 1
         self._sync_block_size_with_kernel()
 
+        self._group_spec_types = tuple(
+            type(g.kv_cache_spec) for g in self.kv_cache_config.kv_cache_groups
+        )
+
+        # Per-engine TP mappings. Generated during handshake.
+        self.tp_mappings: dict[EngineId, TPMapping] = {}
+
         self.enforce_compat_hash = self.kv_transfer_config.get_from_extra_config(
             "enforce_handshake_compat", True
         )
@@ -812,9 +915,6 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         self.dst_num_blocks[self.engine_id] = self.num_blocks
 
         if self._has_mamba:
-            self._physical_blocks_per_logical[self.engine_id] = (
-                self._physical_blocks_per_logical_kv_block
-            )
             logger.info(
                 "Hybrid SSM registration: num_blocks=%s, "
                 "logical_num_blocks=%s, ratio=%s, num_regions=%s, "
@@ -847,6 +947,9 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
             block_size=self.block_size,
             ssm_sizes=self._mamba_ssm_size,
             attn_backend_name=self.backend_name,
+            physical_blocks_per_logical_kv_block=(
+                self._physical_blocks_per_logical_kv_block
+            ),
         )
         # Wrap metadata in payload with hash for defensive decoding
         assert self.compat_hash is not None
@@ -875,6 +978,8 @@ def _build_mamba_local(
 
         result: list[tuple[int, int, int]] = []
         for i, base_addr in enumerate(base_addresses):
+            # Jump one page_size, but ssm page_size may be bigger when kernel
+            # locks block size to a specific value (physical_per_logical scale).
             page_stride = (
                 self.block_len_per_layer[i] // block_size_ratio * physical_per_logical
             )
@@ -894,67 +999,11 @@ def _build_mamba_local(
                 )
         return result
 
-    def _build_fa_remote_for_mamba(
-        self,
-        nixl_agent_meta: NixlAgentMetadata,
-        block_size_ratio: int,
-        transfer_topo: TransferTopology,
-        remote_engine_id: EngineId,
-    ) -> list[tuple[int, int, int]]:
-        """Build remote FA descriptors for mamba models.
-
-        Uses TransferTopology for GQA-aware FA divisor and head-based rank
-        offset instead of the standard uniform tp_ratio split.
-        """
-        assert block_size_ratio == 1, (
-            "Mamba 3-read transfer with block_size_ratio != 1 is not tested. "
-            f"Got block_size_ratio={block_size_ratio}."
-        )
-        # TODO (ZhanqiuHu): unify with register_remote_blocks when Mamba-HMA
-        # hetero-TP logic stabilizes.
-        mamba_info = transfer_topo.get_engine_info(remote_engine_id)
-        assert isinstance(mamba_info, MambaEngineTransferInfo)
-        tp_ratio = transfer_topo.tp_ratio(mamba_info.remote_tp_size)
-        result: list[tuple[int, int, int]] = []
-        for i, base_addr in enumerate(nixl_agent_meta.kv_caches_base_addr):
-            local_block_len = self.get_backend_aware_kv_block_len(
-                layer_idx=i, first_split=True, mamba_view=False
-            )
-            remote_kv_block_len = local_block_len // block_size_ratio
-            if block_size_ratio > 1:
-                local_block_len = remote_kv_block_len
-
-            if tp_ratio < 0 and not self.use_mla:
-                local_block_len = local_block_len // mamba_info.remote_num_fa_reads
-
-            rank_offset = transfer_topo.fa_rank_offset(
-                remote_engine_id, remote_kv_block_len
-            )
-
-            num_blocks = nixl_agent_meta.num_blocks
-            page_size = nixl_agent_meta.block_lens[i]
-            for block_id in range(num_blocks):
-                block_offset = block_id * page_size
-                addr = base_addr + block_offset + rank_offset
-                result.append((addr, local_block_len, nixl_agent_meta.device_id))
-
-            if transfer_topo.is_kv_layout_blocks_first:
-                second_split = self.get_backend_aware_kv_block_len(
-                    layer_idx=i, first_split=False, mamba_view=False
-                )
-                if tp_ratio < 0 and not self.use_mla:
-                    second_split = second_split // mamba_info.remote_num_fa_reads
-                for block_id in range(num_blocks):
-                    block_offset = block_id * page_size
-                    addr = base_addr + block_offset + rank_offset
-                    v_addr = addr + nixl_agent_meta.block_lens[i] // 2
-                    result.append((v_addr, second_split, nixl_agent_meta.device_id))
-        return result
-
     def _build_mamba_remote(
         self,
         nixl_agent_meta: NixlAgentMetadata,
         tp_ratio: int,
+        transfer_info: EngineTransferInfo,
     ) -> list[tuple[int, int, int]]:
         """Build 4 remote desc regions (x, B, C, ssm) per layer for
         the 3-read transfer.  For hetero-TP, each D rank reads only its
@@ -974,17 +1023,15 @@ def _build_mamba_remote(
             ssm_read_size = self._mamba_ssm_size[1]
         else:
             # NOTE (ZhanqiuHu): tp_ratio < 0 means P_TP > D_TP, so P pages
-            # are smaller than D's.  self._conv_decomp has D-sized dimensions,
-            # but we need P-sized offsets.  Scale down by |tp_ratio|.
+            # are smaller than D's. self._conv_decomp has D-sized dimensions,
+            # but we need P-sized offsets. Scale down by |tp_ratio|.
             abs_ratio = -tp_ratio
             xb_p = self._conv_decomp.x_bytes // abs_ratio
             bb_p = self._conv_decomp.b_bytes // abs_ratio
             conv_offsets = [(0, xb_p), (xb_p, bb_p), (xb_p + bb_p, bb_p)]
             ssm_read_size = nixl_agent_meta.ssm_sizes[1]
 
-        remote_physical_per_logical = self._physical_blocks_per_logical[
-            nixl_agent_meta.engine_id
-        ]
+        remote_physical_per_logical = transfer_info.remote_physical_blocks_per_logical
         num_blocks = nixl_agent_meta.num_blocks // remote_physical_per_logical
         device_id = nixl_agent_meta.device_id
 
@@ -1007,6 +1054,91 @@ def _build_mamba_remote(
                 result.append((ssm_addr, ssm_read_size, device_id))
         return result
 
+    def _build_fa_local(
+        self,
+        base_addresses: list[int],
+        block_size_ratio: int,
+    ) -> list[tuple[int, int, int]]:
+        """Build local FA descriptors for all layers."""
+        assert self.transfer_topo is not None
+        num_blocks = self.num_blocks * block_size_ratio
+        result: list[tuple[int, int, int]] = []
+        for i, base_addr in enumerate(base_addresses):
+            kv_block_len = (
+                self.get_backend_aware_kv_block_len(
+                    layer_idx=i, first_split=True, mamba_view=False
+                )
+                // block_size_ratio
+            )
+            page_stride = self.block_len_per_layer[i] // block_size_ratio
+            for block_id in range(num_blocks):
+                block_offset = block_id * page_stride
+                addr = base_addr + block_offset
+                result.append((addr, kv_block_len, self.device_id))
+
+            if self.transfer_topo.is_kv_layout_blocks_first:
+                # Separate and interleave K/V regions to maintain the same
+                # descs ordering. This is needed for selecting contiguous heads
+                # when split across TP ranks.
+                second_split = self.get_backend_aware_kv_block_len(
+                    layer_idx=i, first_split=False, mamba_view=False
+                )
+                for block_id in range(num_blocks):
+                    block_offset = block_id * page_stride
+                    addr = base_addr + block_offset
+                    v_addr = addr + kv_block_len
+                    result.append((v_addr, second_split, self.device_id))
+        return result
+
+    def _build_fa_remote(
+        self,
+        plan: TPMapping,
+        nixl_agent_meta: NixlAgentMetadata,
+        block_size_ratio: int,
+    ) -> list[tuple[int, int, int]]:
+        """Build remote FA descriptors for all layers."""
+        assert self.transfer_topo is not None
+        fa_group_idx = next(
+            i for i, t in enumerate(self._group_spec_types) if _is_attention_spec(t)
+        )
+        num_attn_reads = len(plan.source_ranks_per_group[fa_group_idx])
+        num_blocks = nixl_agent_meta.num_blocks
+        result: list[tuple[int, int, int]] = []
+        for i, base_addr in enumerate(nixl_agent_meta.kv_caches_base_addr):
+            # Read our whole local region size from remote..
+            local_block_len = self.get_backend_aware_kv_block_len(
+                layer_idx=i, first_split=True, mamba_view=False
+            )
+            remote_kv_block_len = local_block_len // block_size_ratio
+            if block_size_ratio > 1:
+                # ..using remote kv_block_len as transfer unit
+                local_block_len = remote_kv_block_len
+
+            local_block_len = local_block_len // num_attn_reads
+            rank_offset = plan.rank_offset_factor * remote_kv_block_len
+
+            page_size = nixl_agent_meta.block_lens[i]
+            for block_id in range(num_blocks):
+                block_offset = block_id * page_size
+                # For each block, grab the kv heads chunk belonging to current local
+                # tp rank of size local_block_len.
+                addr = base_addr + block_offset + rank_offset
+                result.append((addr, local_block_len, nixl_agent_meta.device_id))
+
+            if self.transfer_topo.is_kv_layout_blocks_first:
+                # With FlashInfer index V separately to allow head splitting.
+                second_split = self.get_backend_aware_kv_block_len(
+                    layer_idx=i, first_split=False, mamba_view=False
+                )
+                second_split = second_split // num_attn_reads
+                for block_id in range(num_blocks):
+                    block_offset = block_id * page_size
+                    addr = base_addr + block_offset + rank_offset
+                    # Hop over the first split of remote page, K, to read V.
+                    v_addr = addr + nixl_agent_meta.block_lens[i] // 2
+                    result.append((v_addr, second_split, nixl_agent_meta.device_id))
+        return result
+
     def register_local_xfer_handler(
         self,
         block_size: int,
@@ -1023,68 +1155,23 @@ def register_local_xfer_handler(
         data copy correctness.
         """
         assert self.transfer_topo is not None
-        transfer_topo = self.transfer_topo
-
         block_size_ratio = self.block_size // block_size
-        blocks_data: list[tuple[int, int, int]] = []
         local_base_addresses = self.kv_caches_base_addr[self.engine_id][self.tp_rank]
 
-        def register_blocks(blocks_data: list[tuple[int, int, int]], mamba: bool):
-            for i, base_addr in enumerate(local_base_addresses):
-                # The new block_len is using prefill block_len;
-                # and num_blocks is multiple with N
-                kv_block_len = (
-                    self.get_backend_aware_kv_block_len(
-                        layer_idx=i, first_split=True, mamba_view=mamba
-                    )
-                    // block_size_ratio
-                )
-                # Jump one page_size, but ssm page_size may be bigger when kernel
-                # locks block size to a specific value.
-                block_len_per_layer = (
-                    self.block_len_per_layer[i]
-                    // block_size_ratio
-                    * (1 if not mamba else self._physical_blocks_per_logical_kv_block)
-                )
-                num_blocks = self._logical_num_blocks if mamba else self.num_blocks
-                num_blocks = num_blocks * block_size_ratio
-                for block_id in range(num_blocks):
-                    block_offset = block_id * block_len_per_layer
-                    addr = base_addr + block_offset
-                    # (addr, len, device id)
-                    blocks_data.append((addr, kv_block_len, self.device_id))
-
-                if transfer_topo.is_kv_layout_blocks_first:
-                    second_split = self.get_backend_aware_kv_block_len(
-                        layer_idx=i, first_split=False, mamba_view=mamba
-                    )
-                    # Separate and interleave K/V regions to maintain the same
-                    # descs ordering. This is needed for selecting contiguous heads
-                    # when split across TP ranks.
-                    for block_id in range(num_blocks):
-                        block_offset = block_id * block_len_per_layer
-                        addr = base_addr + block_offset
-                        # Register addresses for V cache (K registered first).
-                        v_addr = addr + kv_block_len
-                        blocks_data.append((v_addr, second_split, self.device_id))
-            logger.debug(
-                "Created %s blocks for src engine %s and rank %s on device id %s",
-                len(blocks_data),
-                self.engine_id,
-                self.tp_rank,
-                self.device_id,
-            )
-
-        # NOTE (ZhanqiuHu): mamba=True path in register_blocks is not used
-        # right now — we use _build_mamba_local instead for the 3-read
-        # approach. However, we might still need this as a fallback for homogeneous TP.
-        register_blocks(blocks_data, mamba=False)
+        blocks_data = self._build_fa_local(local_base_addresses, block_size_ratio)
+        logger.debug(
+            "Created %s blocks for src engine %s and rank %s on device id %s",
+            len(blocks_data),
+            self.engine_id,
+            self.tp_rank,
+            self.device_id,
+        )
         if self._has_mamba:
             assert self.num_descs == len(blocks_data)
-            # TODO (ZhanqiuHu): For homogeneous TP (tp_ratio == 1), the 3-read split is
-            # unnecessary — a single conv desc per block suffices.  Consider
+            # TODO (ZhanqiuHu): For homogeneous TP (tp_ratio == 1), the 3-descs split
+            # is unnecessary — a single conv desc per block suffices.  Consider
             # adding a fast path that falls back to the standard 2-region
-            # registration (register_blocks mamba=True) when no hetero-TP
+            # registration (_build_fa_local mamba=True) when no hetero-TP
             # remote has been seen.  Currently we always register 4 regions
             # because local descs are created before knowing the remote TP.
             logger.debug("Registering local Mamba descriptors (4 regions/layer)")
@@ -1160,26 +1247,26 @@ def add_remote_agent(
         assert self.transfer_topo is not None
         transfer_topo = self.transfer_topo
         physical_blocks_per_logical = (
-            compute_physical_blocks_per_logical(
-                nixl_agent_meta.ssm_sizes,
-                nixl_agent_meta.block_lens[0],
-            )
-            if self._has_mamba
-            else 1
+            nixl_agent_meta.physical_blocks_per_logical_kv_block
         )
-        transfer_topo.register_remote_engine(
-            remote_engine_id=engine_id,
+        transfer_info = EngineTransferInfo(
             remote_tp_size=remote_tp_size,
             remote_block_size=nixl_agent_meta.block_size,
             remote_block_len=nixl_agent_meta.block_lens[0],
             remote_physical_blocks_per_logical=physical_blocks_per_logical,
-            local_block_len=self.block_len_per_layer[0],
         )
-        if self._has_mamba and engine_id not in self._physical_blocks_per_logical:
-            self._physical_blocks_per_logical[engine_id] = physical_blocks_per_logical
-
+        transfer_topo.register_remote_engine(engine_id, transfer_info)
         logger.info("Transfer plan: %s", transfer_topo.describe(engine_id))
 
+        self.tp_mappings[engine_id] = compute_tp_mapping(
+            transfer_topo.tp_rank,
+            transfer_topo.tp_size,
+            transfer_info.remote_tp_size,
+            transfer_topo.is_mla,
+            transfer_topo.total_num_kv_heads,
+            self._group_spec_types,
+        )
+
         remote_agent_name = self.nixl_wrapper.add_remote_agent(
             nixl_agent_meta.agent_metadata
         )
@@ -1206,11 +1293,6 @@ def add_remote_agent(
         # this is the ratio between the two sizes.
         tp_ratio = transfer_topo.tp_ratio(remote_tp_size)
 
-        # Handle tp_size>num_kv_heads: replicate KV cache.
-        indexes_into_remote = (
-            not transfer_topo.replicates_kv_cache(engine_id) and tp_ratio > 0
-        )
-
         logger.debug(
             "Registering remote agent (%s, rank %s) memory regions with tp_ratio %s",
             engine_id,
@@ -1218,6 +1300,8 @@ def add_remote_agent(
             tp_ratio,
         )
 
+        plan = self.tp_mappings[engine_id]
+
         ### (Optional) Register local agent memory regions. MLA is not split.
         if (
             tp_ratio < 0
@@ -1227,149 +1311,51 @@ def add_remote_agent(
             # Remote tp_size > local tp_size: read from multiple remote ranks.
             # Logically "split" own regions into |tp_ratio| chunks. Mind that
             # we only do this once per remote tp_size (replica-friendly).
-            abs_tp = -tp_ratio
             self.src_xfer_handles_by_tp_ratio[tp_ratio] = []
 
-            if self._has_mamba:
-                if transfer_topo.needs_split_handles(engine_id):
-                    # Mamba-HMA: FA and Mamba use different split factors.
-                    for handle_data in transfer_topo.compute_split_handle_data(
-                        engine_id, self.src_blocks_data, self.num_descs, abs_tp
-                    ):
-                        descs = self.nixl_wrapper.get_xfer_descs(
-                            handle_data, self.nixl_memory_type
-                        )
-                        handle = self.nixl_wrapper.prep_xfer_dlist(
-                            "NIXL_INIT_AGENT", descs
-                        )
-                        self.src_xfer_handles_by_tp_ratio[tp_ratio].append(handle)
-
-                    logger.info(
-                        "Mamba-HMA split handles: %s, num_descs=%s",
-                        transfer_topo.describe(engine_id),
-                        self.num_descs,
-                    )
-            else:
-                # Original path: uniform divide by abs_tp (non-Mamba-HMA).
-                for i in range(abs_tp):
-                    blocks_data = []
-                    for memory_region in self.src_blocks_data:
-                        addr, local_block_len, own_tp_rank = memory_region
-                        remote_block_len = local_block_len // abs_tp
-                        addr = addr + i * remote_block_len
-                        blocks_data.append((addr, remote_block_len, own_tp_rank))
-                    descs = self.nixl_wrapper.get_xfer_descs(
-                        blocks_data, self.nixl_memory_type
-                    )
-                    handle = self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs)
-                    self.src_xfer_handles_by_tp_ratio[tp_ratio].append(handle)
+            for handle_data in self._build_local_splits_from_plan(
+                plan,
+                self.src_blocks_data,
+                self.num_descs,
+            ):
+                descs = self.nixl_wrapper.get_xfer_descs(
+                    handle_data, self.nixl_memory_type
+                )
+                handle = self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs)
+                self.src_xfer_handles_by_tp_ratio[tp_ratio].append(handle)
 
         ### Register remote agent memory regions
-        blocks_data = []
-        # With homogeneous TP, D pulls the whole kv cache from corresponding
-        # rank. With heterogeneous TP, prepare the descriptors by splitting the
-        # P KV cache along kv_head dim, of D worker's kv_head size (D>P).
+        # With homogeneous TP, D pulls the whole kv cache from corresponding rank. With
+        # heterogeneous TP, prepare the descriptors by splitting the P KV cache along
+        # kv_head dim, of D worker's kv_head size (D>P).
         # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..].
 
         # Register all remote blocks, but only the corresponding kv heads.
-        def register_remote_blocks(
-            blocks_data: list[tuple[int, int, int]], mamba: bool
-        ):
-            for i, base_addr in enumerate(nixl_agent_meta.kv_caches_base_addr):
-                # Read our whole local region size from remote.
-                local_block_len = self.get_backend_aware_kv_block_len(
-                    layer_idx=i, first_split=True, mamba_view=mamba
-                )
-                remote_kv_block_len = local_block_len // block_size_ratio
-                if block_size_ratio > 1:
-                    # using remote kv_block_len as transfer unit
-                    local_block_len = remote_kv_block_len
-
-                if tp_ratio < 0 and not self.use_mla:
-                    # Remote tp is bigger: read a chunk of local region from remote
-                    local_block_len = local_block_len // (-tp_ratio)
-                rank_offset = (
-                    self.tp_rank % tp_ratio * remote_kv_block_len
-                    if indexes_into_remote
-                    else 0
-                )
-
-                # Assume same num_blocks for mamba and fa
-                num_blocks = (
-                    nixl_agent_meta.num_blocks
-                    if not mamba
-                    else nixl_agent_meta.num_blocks
-                    // self._physical_blocks_per_logical_kv_block
-                )
-                page_size = nixl_agent_meta.block_lens[i] * (
-                    1 if not mamba else self._physical_blocks_per_logical_kv_block
-                )
-                for block_id in range(num_blocks):
-                    block_offset = block_id * page_size
-                    # For each block, grab the heads chunk belonging to rank_i
-                    # of size remote_nheads // tp_ratio, which correspond to
-                    # self.block_len == remote_block_len//tp_ratio bytes.
-                    addr = base_addr + block_offset + rank_offset
-                    # (addr, len, device id)
-                    blocks_data.append(
-                        (addr, local_block_len, nixl_agent_meta.device_id)
-                    )
-
-                if transfer_topo.is_kv_layout_blocks_first:
-                    # With FlashInfer index V separately to allow head splitting.
-                    second_split = self.get_backend_aware_kv_block_len(
-                        layer_idx=i, first_split=False, mamba_view=mamba
-                    )
-                    # Apply the same scaling as local_block_len above for when we read
-                    # a chunk of local V from `tp_ratio` separate remote workers.
-                    if tp_ratio < 0 and not self.use_mla:
-                        second_split = second_split // (-tp_ratio)
-                    for block_id in range(num_blocks):
-                        block_offset = block_id * page_size
-                        addr = base_addr + block_offset + rank_offset
-                        # Hop over the first split of remote page: either K or Conv.
-                        if mamba:
-                            v_addr = addr + nixl_agent_meta.ssm_sizes[0]
-                        else:
-                            v_addr = addr + nixl_agent_meta.block_lens[i] // 2
-                        blocks_data.append(
-                            (v_addr, second_split, nixl_agent_meta.device_id)
-                        )
-
-            logger.debug(
-                "Created %s blocks for dst engine %s"
-                " with remote rank %s and local rank %s",
-                len(blocks_data),
-                engine_id,
-                remote_tp_rank,
-                self.tp_rank,
-            )
-
+        blocks_data = self._build_fa_remote(
+            plan,
+            nixl_agent_meta,
+            block_size_ratio,
+        )
+        logger.debug(
+            "Created %s blocks for dst engine %s with remote rank %s and local rank %s",
+            len(blocks_data),
+            engine_id,
+            remote_tp_rank,
+            self.tp_rank,
+        )
         if self._has_mamba:
-            # Mamba-HMA: separate FA registration with GQA-aware sizing,
-            # plus mamba 3-read registration for the Mamba "view" of the
-            # same KV cache tensors.
             logger.debug(
                 "Registering remote Mamba blocks for engine %s rank %s",
                 engine_id,
                 remote_tp_rank,
             )
-            blocks_data.extend(
-                self._build_fa_remote_for_mamba(
-                    nixl_agent_meta,
-                    block_size_ratio,
-                    transfer_topo,
-                    engine_id,
-                )
-            )
             blocks_data.extend(
                 self._build_mamba_remote(
                     nixl_agent_meta,
                     tp_ratio,
+                    transfer_info,
                 )
             )
-        else:
-            register_remote_blocks(blocks_data, mamba=False)
 
         # Register with NIXL.
         descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type)
@@ -1897,33 +1883,49 @@ def start_load_kv(self, metadata: NixlConnectorMetadata):
     def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
         assert meta.remote is not None and self.transfer_topo is not None
         engine_id = meta.remote.engine_id
-        remote_ranks = self.transfer_topo.target_remote_ranks(engine_id)
+        plan = self.tp_mappings[engine_id]
         remote_info = self.transfer_topo.get_engine_info(engine_id)
         tp_ratio = self.transfer_topo.tp_ratio(remote_info.remote_tp_size)
 
-        if self._has_mamba:
-            # Expand remote logical → kernel block IDs.
-            meta.remote.block_ids = self._logical_to_remote_kernel_block_ids(
-                meta.remote.block_ids,
-                self._physical_blocks_per_logical[meta.remote.engine_id],
-            )
-        else:
-            meta.remote.block_ids = self._logical_to_kernel_block_ids(
-                meta.remote.block_ids
+        meta.remote.block_ids = self._logical_to_remote_kernel_block_ids(
+            meta.remote.block_ids,
+            remote_info.remote_physical_blocks_per_logical,
+        )
+        remote_block_ids = meta.remote.block_ids
+        local_block_ids = meta.local_physical_block_ids
+        num_groups = len(local_block_ids)
+        read_specs = [
+            ReadSpec(
+                remote_rank=rank,
+                local_block_ids=[
+                    list(local_block_ids[g])
+                    if rank in plan.source_ranks_per_group[g]
+                    else []
+                    for g in range(num_groups)
+                ],
+                remote_block_ids=[
+                    list(remote_block_ids[g])
+                    if rank in plan.source_ranks_per_group[g]
+                    else []
+                    for g in range(num_groups)
+                ],
             )
+            for rank in plan.all_source_ranks
+        ]
+
         # D may have to perform multiple reads from different remote ranks.
-        for i, remote_rank in enumerate(remote_ranks):
-            if self.use_mla and tp_ratio < 0 and i > 0:
-                # MLA opt: when P TP > D TP, only a single read is executed for
-                # the first remote rank (cache is duplicated)..
-                break
+        # MLA opt: when P TP > D TP, only a single read is executed for
+        # the first remote rank (cache is duplicated).
+        if self.use_mla and tp_ratio < 0:
+            read_specs = read_specs[:1]
 
+        for i, spec in enumerate(read_specs):
             remote_block_size = remote_info.remote_block_size
             logger.debug(
                 "Remote agent %s available, calling _read_blocks"
                 " on remote rank %s with remote block size %s for req %s",
                 meta.remote.engine_id,
-                remote_rank,
+                spec.remote_rank,
                 remote_block_size,
                 req_id,
             )
@@ -1942,49 +1944,34 @@ def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
 
             # Destination handle: remote_engine_id -> remote_rank -> handle.
             remote_xfer_side_handle = self.dst_xfer_side_handles[meta.remote.engine_id][
-                remote_rank
+                spec.remote_rank
             ]
 
-            local_ids: BlockIds = meta.local_physical_block_ids
-            remote_ids: BlockIds = meta.remote.block_ids
-            if self._has_mamba:
-                # Mamba-HMA: zero out FA groups for P ranks outside fa_read_targets.
-                local_ids, remote_ids = self.transfer_topo.filter_block_ids_for_rank(
-                    engine_id,
-                    remote_rank,
-                    local_ids,
-                    remote_ids,
-                    self._is_mamba_group,
-                )
-
             self._read_blocks(
+                read_spec=spec,
                 request_id=req_id,
                 dst_engine_id=meta.remote.engine_id,
                 remote_request_id=meta.remote.request_id,
-                local_block_ids=local_ids,
-                remote_block_ids=remote_ids,
-                remote_rank=remote_rank,
                 local_xfer_side_handle=local_xfer_side_handle,
                 remote_xfer_side_handle=remote_xfer_side_handle,
             )
 
-            if self.use_mla and tp_ratio < 0:
-                # ..but we still need to notify the other remote ranks that we
-                # have the blocks we need so they can update the request state.
-                notif_id = f"{req_id}:{self.world_size}".encode()
-                remote_agents = self._remote_agents[meta.remote.engine_id]
-                for rank_to_notify, agent in remote_agents.items():
-                    if rank_to_notify != remote_rank:
-                        self.nixl_wrapper.send_notif(agent, notif_msg=notif_id)
+        if self.use_mla and tp_ratio < 0 and read_specs:
+            # ..but we still need to notify the other remote ranks that we
+            # have the blocks we need so they can update the request state.
+            notif_id = f"{req_id}:{self.world_size}".encode()
+            remote_agents = self._remote_agents[meta.remote.engine_id]
+            read_ranks = {s.remote_rank for s in read_specs}
+            for rank_to_notify, agent in remote_agents.items():
+                if rank_to_notify not in read_ranks:
+                    self.nixl_wrapper.send_notif(agent, notif_msg=notif_id)
 
     def _read_blocks(
         self,
-        local_block_ids: BlockIds,
-        remote_block_ids: BlockIds,
+        read_spec: ReadSpec,
         dst_engine_id: str,
         request_id: str,
         remote_request_id: str,
-        remote_rank: int,
         local_xfer_side_handle: int,
         remote_xfer_side_handle: int,
     ):
@@ -1993,6 +1980,10 @@ def _read_blocks(
         a single remote worker.
         """
         assert self.transfer_topo is not None
+        remote_rank = read_spec.remote_rank
+        local_block_ids = read_spec.local_block_ids
+        remote_block_ids = read_spec.remote_block_ids
+
         remote_info = self.transfer_topo.get_engine_info(dst_engine_id)
         block_size_ratio = self.transfer_topo.block_size_ratio(
             remote_info.remote_block_size
@@ -2061,16 +2052,14 @@ def _read_blocks(
             == len(local_block_ids)
             == len(self.kv_cache_config.kv_cache_groups)
         )
+        # Partial prefix cache hit: just read uncomputed blocks.
+        # Skip mamba groups — their blocks represent full state (conv+ssm),
+        # not per-token data, so trimming would corrupt the transfer.
         remote_block_ids = list(remote_block_ids)
         for i, remote_group in enumerate(remote_block_ids):
-            num_remote_blocks = len(remote_group)
             num_local_blocks = len(local_block_ids[i])
-            if not self._is_mamba_group[i]:
-                assert num_local_blocks <= num_remote_blocks
-            # Partial prefix cache hit: just read uncomputed blocks.
-            # Skip mamba groups — their blocks represent full state (conv+ssm),
-            # not per-token data, so trimming would corrupt the transfer.
-            if num_local_blocks < num_remote_blocks and not self._is_mamba_group[i]:
+            assert num_local_blocks <= len(remote_group)
+            if num_local_blocks < len(remote_group):
                 remote_block_ids[i] = remote_group[-num_local_blocks:]
 
         # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from
@@ -2078,14 +2067,17 @@ def _read_blocks(
         # workers will issue xfers to parts of the P worker remote kv caches.
 
         # Get descs ids.
-        remote_block_descs_ids = self._get_block_descs_ids(
-            dst_engine_id,
-            remote_block_ids,
+        remote_block_descs_ids = self._compute_desc_ids(
+            block_ids=remote_block_ids,
+            dst_num_blocks=self.dst_num_blocks[dst_engine_id],
+            block_size_ratio=None,
+            physical_blocks_per_logical=remote_info.remote_physical_blocks_per_logical,
         )
-        local_block_descs_ids = self._get_block_descs_ids(
-            self.engine_id,
-            local_block_ids,
+        local_block_descs_ids = self._compute_desc_ids(
+            block_ids=local_block_ids,
+            dst_num_blocks=self.dst_num_blocks[self.engine_id],
             block_size_ratio=block_size_ratio,
+            physical_blocks_per_logical=self._physical_blocks_per_logical_kv_block,
         )
 
         assert len(local_block_descs_ids) == len(remote_block_descs_ids)
@@ -2147,63 +2139,6 @@ def get_mapped_blocks(
 
         return mapped_2d.flatten().astype(np.int64)
 
-    def _get_block_descs_ids(
-        self,
-        engine_id: str,
-        block_ids: BlockIds,
-        block_size_ratio: float | None = None,
-    ) -> np.ndarray:
-        """
-        Get the descs ids for a set of block ids.
-        When HMA is enabled number of descriptors across kv cache groups might differ.
-        A single flattened array is returned for all groups anyway.
-        """
-        region_ids = np.arange(self.num_regions)
-
-        # NOTE (NickLucche) With HMA, every kv group has the same number of layers and
-        # layers from different groups share the same kv tensor.
-        # eg block_ids=[[1, 2], [3]]->blocks [1, 2] need to be read across all regions,
-        # same for [3], but group0-group1 blocks will always differ (different areas).
-        # Therefore we can just flatten the block_ids and compute the descs ids for all
-        # groups at once.
-        num_blocks = self.dst_num_blocks[engine_id]
-        if block_size_ratio is not None:
-            num_blocks = int(num_blocks * block_size_ratio)
-
-        # Compute desc ids per group using the right stride: FA descs have
-        # num_blocks entries per region (kernel granularity), SSM descs have
-        # logical_blocks entries per region (no kernel splitting).
-        region_ids = region_ids[:, None]
-        if not self._has_mamba:
-            block_ids = np.concatenate(block_ids)[None, :]
-            descs_ids = region_ids * num_blocks + block_ids
-            return descs_ids.flatten()
-        else:
-            # NOTE (NickLucche) SSM and Attention blocks regions can be exchanged
-            # arbitrarily by manager. Therefore, descs are duplicated for SSM and
-            # Attention like so:
-            # desc_handle->[descs_fa (all regions) | descs_ssm (all regions)].
-            # This is like having two "low-level views" of the same storage.
-            # `num_fa_descs` offset must be computed per-engine since P and D can
-            # have different num_blocks (and thus different FA descs counts).
-            physical_per_logical = self._physical_blocks_per_logical[engine_id]
-            logical_blocks = num_blocks // physical_per_logical
-            num_fa_descs = self.num_regions * num_blocks
-            # 3-read mamba: 4 regions per unique cache tensor (x, B, C, ssm).
-            mamba_region_ids = np.arange(len(self.block_len_per_layer) * 4)[:, None]
-            all_descs = []
-            for i, group in enumerate(block_ids):
-                group_arr = np.asarray(group)[None, :]
-                if self._is_mamba_group[i]:
-                    all_descs.append(
-                        (
-                            mamba_region_ids * logical_blocks + group_arr + num_fa_descs
-                        ).flatten()
-                    )
-                else:
-                    all_descs.append((region_ids * num_blocks + group_arr).flatten())
-            return np.concatenate(all_descs)
-
     def _logical_to_kernel_block_ids(self, block_ids: BlockIds) -> BlockIds:
         """
         Convert logical block ids to kernel physical block ids.
@@ -2295,10 +2230,7 @@ def get_backend_aware_kv_block_len(
         """
         assert self.transfer_topo is not None
         if self.transfer_topo.is_kv_layout_blocks_first:
-            # For indexing only half (either just the K or V part).
             if mamba_view:
-                # NOTE (NickLucche) Mamba Opt: this is already skipping the padding so
-                # we're only transferring the minimum required bytes.
                 block_len = self._mamba_ssm_size[not first_split]
             else:
                 block_len = self.block_len_per_layer[layer_idx] // 2
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading/common.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading/common.py
index 06a727a27b55..c5a251a2a515 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading/common.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading/common.py
@@ -1,15 +1,60 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
-from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorMetadata,
+    KVConnectorWorkerMetadata,
+)
 from vllm.v1.kv_offload.worker.worker import TransferSpec
 
 ReqId = str
 
 
+@dataclass
+class TransferJob:
+    """A transfer job bundling request context with transfer spec.
+
+    Used for both loads and stores, keyed by scheduler-assigned job ID.
+    The worker reports the job ID back when the transfer finishes,
+    and the scheduler processes the completion.
+    """
+
+    req_id: ReqId
+    transfer_spec: TransferSpec
+
+
 @dataclass
 class OffloadingConnectorMetadata(KVConnectorMetadata):
-    reqs_to_load: dict[ReqId, TransferSpec]
-    reqs_to_store: dict[ReqId, TransferSpec]
-    reqs_to_flush: set[str] | None = None
+    # Keyed by scheduler-assigned job IDs.
+    load_jobs: dict[int, TransferJob]
+    store_jobs: dict[int, TransferJob]
+    jobs_to_flush: set[int] | None = None
+
+
+@dataclass
+class OffloadingWorkerMetadata(KVConnectorWorkerMetadata):
+    """Worker -> Scheduler metadata for completed transfer jobs.
+
+    Each worker reports {job_id: 1} for newly completed transfer jobs
+    (load or store). aggregate() sums counts across workers within a step.
+    The scheduler accumulates across steps and processes
+    a transfer completion only when count reaches num_workers.
+    """
+
+    completed_jobs: dict[int, int] = field(default_factory=dict)
+
+    def mark_completed(self, job_id: int) -> None:
+        """Record a transfer job completion from this worker."""
+        self.completed_jobs[job_id] = 1
+
+    def aggregate(
+        self, other: "KVConnectorWorkerMetadata"
+    ) -> "KVConnectorWorkerMetadata":
+        assert isinstance(other, OffloadingWorkerMetadata)
+
+        merged = dict(self.completed_jobs)
+        for job_id, v in other.completed_jobs.items():
+            merged[job_id] = merged.get(job_id, 0) + v
+
+        return OffloadingWorkerMetadata(completed_jobs=merged)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
index 1ef99eaa4461..773fe8f056ac 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections import defaultdict
 from collections.abc import Iterable, Sequence
 from dataclasses import dataclass, field
 from itertools import islice
@@ -11,42 +10,87 @@
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.distributed.kv_transfer.kv_connector.v1.offloading.common import (
     OffloadingConnectorMetadata,
+    OffloadingWorkerMetadata,
     ReqId,
+    TransferJob,
 )
 from vllm.logger import init_logger
 from vllm.utils.math_utils import cdiv
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.kv_offload.abstract import (
+from vllm.v1.kv_cache_interface import (
+    FullAttentionSpec,
+    KVCacheSpec,
+    MambaSpec,
+    SlidingWindowSpec,
+)
+from vllm.v1.kv_offload.base import (
+    GPULoadStoreSpec,
     OffloadingManager,
+    OffloadingSpec,
     OffloadKey,
     ReqContext,
     get_offload_block_hash,
     make_offload_key,
 )
-from vllm.v1.kv_offload.mediums import GPULoadStoreSpec
-from vllm.v1.kv_offload.spec import OffloadingSpec
-from vllm.v1.kv_offload.worker.worker import TransferSpec
 from vllm.v1.outputs import KVConnectorOutput
 from vllm.v1.request import Request
 
 logger = init_logger(__name__)
 
 
+@dataclass(slots=True)
+class TransferJobStatus:
+    """Tracks scheduler-side state for a single transfer job."""
+
+    req_id: ReqId
+    # Number of workers still pending. Starts at num_workers,
+    # decremented as each worker reports completion. Job is done at 0.
+    pending_count: int
+    # Offload keys this job covers; passed to manager.complete_*().
+    keys: set[OffloadKey]
+    is_store: bool
+    # Store src block IDs whose ref_cnt protects them while the request
+    # runs. Only registered in _block_id_to_pending_jobs on request_finished.
+    non_sliding_window_block_ids: list[int] | None = None
+    # Store src block IDs that may be freed before the request finishes.
+    # Registered in _block_id_to_pending_jobs at store creation time.
+    sliding_window_block_ids: list[int] | None = None
+
+
 class GroupOffloadConfig(NamedTuple):
     group_idx: int
     gpu_block_size: int
     offloaded_block_size: int
     hash_block_size_factor: int
+    # None below means full attention
+    sliding_window_size_in_blocks: int | None
+
+
+def get_sliding_window_size_in_blocks(
+    kv_cache_spec: KVCacheSpec, offloaded_block_size: int
+) -> int | None:
+    if isinstance(kv_cache_spec, SlidingWindowSpec):
+        assert kv_cache_spec.sliding_window > 0
+        return cdiv(kv_cache_spec.sliding_window, offloaded_block_size)
+
+    if isinstance(kv_cache_spec, MambaSpec):
+        # Mamba depends on a single state
+        return 1
+
+    assert isinstance(kv_cache_spec, FullAttentionSpec)
+    return None
 
 
 class SchedulerOffloadConfig(NamedTuple):
     kv_group_configs: tuple[GroupOffloadConfig, ...]
     block_size_factor: int
+    num_workers: int
 
     @classmethod
     def from_spec(cls, spec: OffloadingSpec) -> "SchedulerOffloadConfig":
         return cls(
+            num_workers=spec.vllm_config.parallel_config.world_size,
             kv_group_configs=tuple(
                 GroupOffloadConfig(
                     group_idx=idx,
@@ -56,6 +100,10 @@ def from_spec(cls, spec: OffloadingSpec) -> "SchedulerOffloadConfig":
                         (gpu_block_size * spec.block_size_factor)
                         // spec.hash_block_size
                     ),
+                    sliding_window_size_in_blocks=get_sliding_window_size_in_blocks(
+                        spec.kv_cache_config.kv_cache_groups[idx].kv_cache_spec,
+                        gpu_block_size * spec.block_size_factor,
+                    ),
                 )
                 for idx, gpu_block_size in enumerate(spec.gpu_block_size)
             ),
@@ -69,6 +117,9 @@ class RequestGroupState:
     block_ids: list[int] = field(default_factory=list)
     # index of next block (of size offloaded_block_size) to offload
     next_stored_block_idx: int = 0
+    # number of offloaded blocks hit (including GPU prefix cache)
+    # when the request first started
+    num_hit_blocks: int = 0
 
 
 @dataclass(slots=True)
@@ -79,6 +130,9 @@ class RequestOffloadState:
     req_context: ReqContext = field(init=False)
     # number of hits in the GPU cache
     num_locally_computed_tokens: int = 0
+    # In-flight job IDs. Per the connector's invariant, at any given time
+    # this contains either a single load job, or one or more store jobs.
+    transfer_jobs: set[int] = field(default_factory=set)
 
     def __post_init__(self) -> None:
         self.group_states = tuple(
@@ -119,6 +173,14 @@ def advance_stored_idx(self, num_offloadable_tokens: int) -> None:
             num_blocks = num_offloadable_tokens // group_config.offloaded_block_size
             group_state.next_stored_block_idx = num_blocks
 
+    def update_num_hit_blocks(self, num_cached_tokens: int) -> None:
+        for group_config, group_state in zip(
+            self.config.kv_group_configs, self.group_states
+        ):
+            group_state.num_hit_blocks = (
+                num_cached_tokens // group_config.offloaded_block_size
+            )
+
 
 class OffloadingConnectorScheduler:
     """Implementation of Scheduler side methods"""
@@ -127,30 +189,63 @@ def __init__(self, spec: OffloadingSpec):
         self.config = SchedulerOffloadConfig.from_spec(spec)
         self.manager: OffloadingManager = spec.get_manager()
 
-        attention_groups: list[int] = []
-        for idx, _ in enumerate(spec.kv_cache_config.kv_cache_groups):
-            # currently treat all groups as full attention
-            attention_groups.append(idx)
+        full_attention_groups: list[int] = []
+        sliding_window_groups: list[int] = []
+        for group_config in self.config.kv_group_configs:
+            if group_config.sliding_window_size_in_blocks is None:
+                full_attention_groups.append(group_config.group_idx)
+            else:
+                sliding_window_groups.append(group_config.group_idx)
 
-        self.lookup_groups = attention_groups
+        # sort sliding window groups by window size in decreasing order
+        def _sliding_window_sort_key(i: int) -> int:
+            val = self.config.kv_group_configs[i].sliding_window_size_in_blocks
+            assert val is not None
+            return val
+
+        sliding_window_groups.sort(key=_sliding_window_sort_key, reverse=True)
+
+        # used by _lookup
+        self._sliding_window_groups: tuple[int, ...] = tuple(sliding_window_groups)
+        self._lookup_groups = tuple(full_attention_groups) + self._sliding_window_groups
 
         self._req_status: dict[ReqId, RequestOffloadState] = {}
-        # requests to load for the current scheduler step
-        self._reqs_to_load: dict[ReqId, TransferSpec] = {}
+        self._current_batch_load_jobs: dict[int, TransferJob] = {}
+        self._current_batch_jobs_to_flush: set[int] = set()
         # if GPU prefix caching is enabled,
         # track loaded blocks to avoid redundant loads
         self._blocks_being_loaded: set[OffloadKey] | None = (
             set() if spec.vllm_config.cache_config.enable_prefix_caching else None
         )
 
-        # request ID -> set(offload keys being stored/loaded)
-        self._reqs_being_stored = defaultdict[ReqId, set[OffloadKey]](set)
-        self._reqs_being_loaded = defaultdict[ReqId, set[OffloadKey]](set)
+        # Job ID counter shared by loads and stores.
+        self._job_counter: int = 0
+        self._jobs: dict[int, TransferJobStatus] = {}
+
+        # block_id -> pending store job_ids. Used to track jobs that needs
+        # flushing in case a block is re-allocated by the KV cache manager.
+        # Populated only for finished requests (running-request blocks are
+        # protected by their ref_cnt) and for sliding window blocks (which can
+        # be freed before a request finishes).
+        self._block_id_to_pending_jobs: dict[int, set[int]] = {}
+
+    def _generate_job_id(self) -> int:
+        job_id = self._job_counter
+        self._job_counter += 1
+        return job_id
+
+    def _remove_pending_job(self, job_id: int, block_ids: list[int] | None) -> None:
+        for bid in block_ids or ():
+            pending = self._block_id_to_pending_jobs[bid]
+            pending.remove(job_id)
+            if not pending:
+                del self._block_id_to_pending_jobs[bid]
 
     def _maximal_prefix_lookup(
         self, keys: Iterable[OffloadKey], req_context: ReqContext
     ) -> int | None:
-        """Find the length of the maximal prefix of offloaded blocks."""
+        """Return the number of consecutive offloaded blocks from the start,
+        or None if the backend deferred a lookup."""
         hit_count = 0
         defer_lookup = False
         for key in keys:
@@ -171,8 +266,9 @@ def _sliding_window_lookup(
         sliding_window_size: int,
         req_context: ReqContext,
     ) -> int | None:
-        """Find the maximal ending position of consecutive offloaded blocks
-        within a sliding window."""
+        """Return the end index (in `keys`) of the last run of
+        `sliding_window_size` consecutive hits, scanning from the end.
+        Returns 0 on miss, None if the backend deferred a lookup."""
         defer_lookup = False
         consecutive_hits = 0
         for idx in range(len(keys) - 1, -1, -1):
@@ -190,6 +286,160 @@ def _sliding_window_lookup(
                     return idx + sliding_window_size if not defer_lookup else None
         return consecutive_hits if not defer_lookup else None
 
+    def _touch(self, req_status: RequestOffloadState):
+        for group_config, group_state in zip(
+            self.config.kv_group_configs, req_status.group_states
+        ):
+            if group_config.sliding_window_size_in_blocks is None:
+                self.manager.touch(group_state.offload_keys)
+            else:
+                # we aim to keep just blocks that are necessary to hit
+                # the original request (+ decoded blocks)
+                blocks_to_skip = max(
+                    0,
+                    group_state.num_hit_blocks
+                    - group_config.sliding_window_size_in_blocks,
+                )
+                self.manager.touch(group_state.offload_keys[blocks_to_skip:])
+
+    def _lookup(self, req_status: RequestOffloadState) -> int | None:
+        """
+        Find how many tokens beyond num_locally_computed_tokens can be loaded.
+
+        Iterates full-attention groups first (prefix lookup), then sliding-window
+        groups (suffix lookup). Each group may tighten max_hit_size_tokens, which
+        can invalidate an earlier group's result, so the loop re-runs when that
+        happens until num_hit_tokens converges.
+        """
+        num_computed_tokens = req_status.num_locally_computed_tokens
+        max_hit_size_tokens: int = req_status.req.num_tokens
+        if self._sliding_window_groups:
+            # the last prompt token has to be recomputed to get the logprobs
+            # for sliding window attention, we must reduce by 1 to make sure
+            # we still have a hit after reduction
+            max_hit_size_tokens -= 1
+        num_hit_tokens: int = 0
+        defer_lookup = False
+        lookup_groups = self._lookup_groups
+        while lookup_groups:
+            looked_up_sliding_window: bool = False
+            groups_iter = iter(lookup_groups)
+            lookup_groups = ()
+            for group_idx in groups_iter:
+                group_config: GroupOffloadConfig = self.config.kv_group_configs[
+                    group_idx
+                ]
+                group_state: RequestGroupState = req_status.group_states[group_idx]
+                offloaded_block_size = group_config.offloaded_block_size
+                offload_keys = group_state.offload_keys
+
+                assert (
+                    len(offload_keys)
+                    >= req_status.req.num_tokens // offloaded_block_size
+                )
+
+                # Constrain to block-aligned boundary for this group
+                max_hit_size_tokens = min(
+                    max_hit_size_tokens, len(offload_keys) * offloaded_block_size
+                )
+                if max_hit_size_tokens - num_computed_tokens < offloaded_block_size:
+                    # we can only load less than a block, better skip
+                    return 0
+
+                num_blocks = min(
+                    cdiv(max_hit_size_tokens, offloaded_block_size), len(offload_keys)
+                )
+                start_block_idx = num_computed_tokens // offloaded_block_size
+                offload_keys = offload_keys[start_block_idx:num_blocks]
+                sliding_window_size_in_blocks = (
+                    group_config.sliding_window_size_in_blocks
+                )
+
+                # end index (in the sliced offload_keys) up to which we
+                # have backend-confirmed hits
+                num_hit_blocks: int | None
+                if sliding_window_size_in_blocks is None:
+                    num_hit_blocks = self._maximal_prefix_lookup(
+                        offload_keys, req_status.req_context
+                    )
+                else:
+                    num_hit_blocks = self._sliding_window_lookup(
+                        offload_keys,
+                        sliding_window_size_in_blocks,
+                        req_status.req_context,
+                    )
+                if num_hit_blocks == 0:
+                    return 0
+
+                if num_hit_blocks is None:
+                    defer_lookup = True
+                else:
+                    max_hit_size_tokens = min(
+                        max_hit_size_tokens,
+                        offloaded_block_size * (start_block_idx + num_hit_blocks),
+                    )
+
+                new_num_hit_tokens = max_hit_size_tokens - num_computed_tokens
+                if new_num_hit_tokens < offloaded_block_size:
+                    # we can only load less than a block, better skip
+                    return 0
+
+                if new_num_hit_tokens < num_hit_tokens:
+                    if defer_lookup:
+                        # make another iteration on all groups to check
+                        # if we still need to defer lookup
+                        defer_lookup = False
+                        lookup_groups = self._lookup_groups
+                    elif looked_up_sliding_window and not lookup_groups:
+                        # we need another iteration to confirm previously looked up
+                        # sliding window works with the new_num_hit_tokens
+                        lookup_groups = self._sliding_window_groups
+
+                looked_up_sliding_window |= sliding_window_size_in_blocks is not None
+                num_hit_tokens = new_num_hit_tokens
+
+        if defer_lookup:
+            logger.debug(
+                "Offloading manager delayed request %s as backend requested",
+                req_status.req.request_id,
+            )
+            return None
+
+        # possibly delay request if any of the hit blocks is already being loaded
+        if self._blocks_being_loaded:
+            for group_config, group_state in zip(
+                self.config.kv_group_configs, req_status.group_states
+            ):
+                offloaded_block_size = group_config.offloaded_block_size
+                sliding_window_size_in_blocks = (
+                    group_config.sliding_window_size_in_blocks
+                )
+                offload_keys = group_state.offload_keys
+                num_blocks = cdiv(
+                    num_computed_tokens + num_hit_tokens, offloaded_block_size
+                )
+                start_block_idx = num_computed_tokens // offloaded_block_size
+                offload_keys = offload_keys[start_block_idx:num_blocks]
+                if sliding_window_size_in_blocks is not None:
+                    offload_keys = offload_keys[-sliding_window_size_in_blocks:]
+                if any(key in self._blocks_being_loaded for key in offload_keys):
+                    # hit blocks are being loaded, delay request
+                    logger.debug(
+                        "Delaying request %s since some of its"
+                        " blocks are already being loaded",
+                        req_status.req.request_id,
+                    )
+                    return None
+
+        logger.debug(
+            "Request %s hit %s offloaded tokens after %s GPU hit tokens",
+            req_status.req.request_id,
+            num_hit_tokens,
+            num_computed_tokens,
+        )
+
+        return num_hit_tokens
+
     def get_num_new_matched_tokens(
         self, request: Request, num_computed_tokens: int
     ) -> tuple[int | None, bool]:
@@ -212,96 +462,28 @@ def get_num_new_matched_tokens(
                 - `True` if tokens will be loaded asynchronously
                   (between scheduler steps).
         """
+        is_new_request = False
         if req_status := self._req_status.get(request.request_id):
             # make sure block IDs are cleared
             for group_state in req_status.group_states:
                 group_state.block_ids.clear()
         else:
+            is_new_request = True
             req_status = RequestOffloadState(config=self.config, req=request)
             self._req_status[request.request_id] = req_status
 
         req_status.update_offload_keys()
         req_status.num_locally_computed_tokens = num_computed_tokens
 
-        for gs in req_status.group_states:
-            self.manager.touch(gs.offload_keys)
-
-        # Start with the full request size as the maximum loadable
-        max_hit_size_tokens: int = req_status.req.num_tokens
-        num_hit_tokens: int = 0
-        defer_lookup = False
-        delay_request = False
-        for group_idx in self.lookup_groups:
-            group_config: GroupOffloadConfig = self.config.kv_group_configs[group_idx]
-            offloaded_block_size = group_config.offloaded_block_size
-            offload_keys = req_status.group_states[group_idx].offload_keys
-
-            num_blocks = max_hit_size_tokens // offloaded_block_size
-            assert len(offload_keys) >= num_blocks
-
-            # Constrain to block-aligned boundary for this group
-            max_hit_size_tokens = num_blocks * offloaded_block_size
-            num_hit_tokens = max_hit_size_tokens - num_computed_tokens
-            if num_hit_tokens < offloaded_block_size:
-                # we can only load less than a block, better skip
-                return 0, False
-
-            start_block_idx = num_computed_tokens // offloaded_block_size
-            offload_keys = offload_keys[start_block_idx:num_blocks]
-            # Full attention relies on all previous KV cache blocks.
-            # Thus, we search for a maximal prefix of KV cache which are all cached.
-            block_hits = self._maximal_prefix_lookup(
-                offload_keys, req_status.req_context
+        num_hit_tokens = self._lookup(req_status)
+        if is_new_request:
+            req_status.update_num_hit_blocks(
+                num_computed_tokens + (num_hit_tokens or 0)
             )
-            if block_hits == 0:
-                return 0, False
 
-            if block_hits is None:
-                defer_lookup = True
-            else:
-                # Further constrain based on what's actually available by backend
-                max_hit_size_tokens = offloaded_block_size * (
-                    start_block_idx + block_hits
-                )
+        self._touch(req_status)
 
-            num_hit_tokens = max_hit_size_tokens - num_computed_tokens
-            if num_hit_tokens < offloaded_block_size:
-                # we can only load less than a block, better skip
-                return 0, False
-
-            if (
-                block_hits
-                and self._blocks_being_loaded
-                and any(
-                    key in self._blocks_being_loaded
-                    for key in offload_keys[:block_hits]
-                )
-            ):
-                # hit blocks are being loaded, delay request
-                delay_request = True
-
-        if defer_lookup:
-            logger.debug(
-                "Offloading manager delayed request %s as backend requested",
-                req_status.req.request_id,
-            )
-            return None, False
-
-        if delay_request:
-            logger.debug(
-                "Delaying request %s since some of its blocks are already being loaded",
-                req_status.req.request_id,
-            )
-            return None, False
-
-        logger.debug(
-            "Request %s hit %s offloaded tokens after %s GPU hit tokens",
-            request.request_id,
-            num_hit_tokens,
-            num_computed_tokens,
-        )
-
-        return num_hit_tokens, True
+        return num_hit_tokens, bool(num_hit_tokens)
 
     def update_state_after_alloc(
         self, request: Request, blocks: KVCacheBlocks, num_external_tokens: int
@@ -346,6 +528,13 @@ def update_state_after_alloc(
             )
             num_pending_gpu_blocks = num_gpu_blocks - num_locally_computed_gpu_blocks
 
+            if group_config.sliding_window_size_in_blocks is not None:
+                assert (
+                    num_pending_gpu_blocks
+                    <= group_config.sliding_window_size_in_blocks
+                    * self.config.block_size_factor
+                )
+
             num_blocks = cdiv(num_cached_tokens, offloaded_block_size)
             assert len(offload_keys) >= num_blocks
             if num_pending_gpu_blocks:
@@ -369,23 +558,46 @@ def update_state_after_alloc(
                 # entire KV cache so a remote decode node can consume it.
                 group_state.next_stored_block_idx = num_blocks
 
+        # Fence dst blocks against finished-request pending stores.
+        if (
+            self._block_id_to_pending_jobs
+            and not self._block_id_to_pending_jobs.keys().isdisjoint(dst_block_ids)
+        ):
+            self._current_batch_jobs_to_flush.update(
+                jid
+                for bid in dst_block_ids
+                for jid in self._block_id_to_pending_jobs.get(bid, ())
+            )
+
         src_spec = self.manager.prepare_load(keys_to_load, req_status.req_context)
         dst_spec = GPULoadStoreSpec(
             dst_block_ids, group_sizes=group_sizes, block_indices=block_indices
         )
 
-        self._reqs_to_load[request.request_id] = (src_spec, dst_spec)
-        req_blocks_being_loaded = self._reqs_being_loaded[request.request_id]
-        req_blocks_being_loaded.update(keys_to_load)
+        load_job_id = self._generate_job_id()
+        self._current_batch_load_jobs[load_job_id] = TransferJob(
+            req_id=request.request_id,
+            transfer_spec=(src_spec, dst_spec),
+        )
+        # a load can only be issued when no other jobs are pending.
+        assert not req_status.transfer_jobs
+        req_status.transfer_jobs.add(load_job_id)
+        self._jobs[load_job_id] = TransferJobStatus(
+            req_id=request.request_id,
+            pending_count=self.config.num_workers,
+            keys=set(keys_to_load),
+            is_store=False,
+        )
 
         if self._blocks_being_loaded is not None:
-            self._blocks_being_loaded.update(req_blocks_being_loaded)
+            self._blocks_being_loaded.update(keys_to_load)
 
-    def _get_reqs_to_store(
-        self, scheduler_output: SchedulerOutput
-    ) -> dict[ReqId, TransferSpec]:
+    def _build_store_jobs(
+        self,
+        scheduler_output: SchedulerOutput,
+    ) -> dict[int, TransferJob]:
         block_size_factor = self.config.block_size_factor
-        reqs_to_store: dict[ReqId, TransferSpec] = {}
+        store_jobs: dict[int, TransferJob] = {}
         # iterate over both new and cached requests
         for req_id, new_block_id_groups, preempted in yield_req_data(scheduler_output):
             req_status = self._req_status[req_id]
@@ -398,6 +610,19 @@ def _get_reqs_to_store(
 
             if new_block_id_groups:
                 req_status.update_block_id_groups(new_block_id_groups)
+                # Fence new blocks against in-flight stores.
+                if self._block_id_to_pending_jobs:
+                    new_blocks_flat = [
+                        bid for new_blocks in new_block_id_groups for bid in new_blocks
+                    ]
+                    if not self._block_id_to_pending_jobs.keys().isdisjoint(
+                        new_blocks_flat
+                    ):
+                        self._current_batch_jobs_to_flush.update(
+                            jid
+                            for bid in new_blocks_flat
+                            for jid in self._block_id_to_pending_jobs.get(bid, ())
+                        )
 
             num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
             num_tokens_after_batch = req.num_computed_tokens + num_scheduled_tokens
@@ -448,17 +673,21 @@ def _get_reqs_to_store(
                 req_status.advance_stored_idx(num_offloadable_tokens)
                 continue
 
-            for group_state in req_status.group_states:
-                self.manager.touch(group_state.offload_keys)
+            self._touch(req_status)
 
             keys_to_store = set(store_output.keys_to_store)
 
             group_sizes: list[int] = []
             block_indices: list[int] = []
             src_block_ids: list[int] = []
+            sliding_window_block_ids: list[int] = []
+            non_sliding_window_block_ids: list[int] = []
             for group_config, group_state in zip(
                 self.config.kv_group_configs, req_status.group_states
             ):
+                is_sliding_window = (
+                    group_config.sliding_window_size_in_blocks is not None
+                )
                 num_blocks = num_offloadable_tokens // group_config.offloaded_block_size
                 start_block_idx = group_state.next_stored_block_idx
                 block_ids = group_state.block_ids
@@ -482,6 +711,11 @@ def _get_reqs_to_store(
                         elif start_gpu_block_idx is None:
                             start_gpu_block_idx = gpu_block_idx + i
                         src_block_ids.append(block_id)
+                        if is_sliding_window:
+                            sliding_window_block_ids.append(block_id)
+                        else:
+                            non_sliding_window_block_ids.append(block_id)
+
                 group_sizes.append(num_group_blocks)
                 block_indices.append(start_gpu_block_idx or 0)
                 group_state.next_stored_block_idx = num_blocks
@@ -491,36 +725,61 @@ def _get_reqs_to_store(
             )
             dst_spec = store_output.store_spec
 
-            reqs_to_store[req_id] = (src_spec, dst_spec)
-            self._reqs_being_stored[req_id] |= keys_to_store
+            job_id = self._generate_job_id()
+            # a store can only be issued when no load is pending.
+            if req_status.transfer_jobs:
+                any_jid = next(iter(req_status.transfer_jobs))
+                assert self._jobs[any_jid].is_store
+            req_status.transfer_jobs.add(job_id)
+
+            # Watch sliding window blocks as they may get evicted
+            # before the request finishes
+            for bid in sliding_window_block_ids or ():
+                self._block_id_to_pending_jobs.setdefault(bid, set()).add(job_id)
+
+            # the non-sliding window blocks will be watched only
+            # when the request finishes
+            self._jobs[job_id] = TransferJobStatus(
+                req_id=req_id,
+                pending_count=self.config.num_workers,
+                keys=set(keys_to_store),
+                is_store=True,
+                non_sliding_window_block_ids=non_sliding_window_block_ids,
+                sliding_window_block_ids=sliding_window_block_ids or None,
+            )
+
+            store_jobs[job_id] = TransferJob(
+                req_id=req_id, transfer_spec=(src_spec, dst_spec)
+            )
 
             logger.debug(
-                "Request %s offloading %s blocks upto %d tokens",
+                "Request %s offloading %s blocks upto %d tokens (job %d)",
                 req_id,
                 len(keys_to_store),
                 num_offloadable_tokens,
+                job_id,
             )
 
-        return reqs_to_store
+        return store_jobs
 
     def build_connector_meta(
         self, scheduler_output: SchedulerOutput
     ) -> KVConnectorMetadata:
-        meta = OffloadingConnectorMetadata(
-            reqs_to_load=self._reqs_to_load,
-            reqs_to_store=self._get_reqs_to_store(scheduler_output),
-            reqs_to_flush=scheduler_output.preempted_req_ids,
-        )
-        self._reqs_to_load = {}
-
-        # NOTE (orozery): we should move this logic to update_connector_output
-        # once KVConnectorOutput allows us to report completed transfers
         for req_id in scheduler_output.preempted_req_ids or ():
-            keys = self._reqs_being_stored.get(req_id)
-            if keys:
-                self.manager.complete_store(keys)
-                keys.clear()
+            req_status = self._req_status.get(req_id)
+            if req_status is None or not req_status.transfer_jobs:
+                continue
+            any_jid = next(iter(req_status.transfer_jobs))
+            assert self._jobs[any_jid].is_store
+            self._current_batch_jobs_to_flush.update(req_status.transfer_jobs)
 
+        meta = OffloadingConnectorMetadata(
+            load_jobs=self._current_batch_load_jobs,
+            store_jobs=self._build_store_jobs(scheduler_output),
+            jobs_to_flush=self._current_batch_jobs_to_flush,
+        )
+        self._current_batch_load_jobs = {}
+        self._current_batch_jobs_to_flush = set()
         return meta
 
     def update_connector_output(self, connector_output: KVConnectorOutput):
@@ -531,22 +790,45 @@ def update_connector_output(self, connector_output: KVConnectorOutput):
             connector_output (KVConnectorOutput): the worker-side
                 connectors output.
         """
-        for req_id in connector_output.finished_sending or []:
-            keys = self._reqs_being_stored.pop(req_id, None)
-            if keys:
-                self.manager.complete_store(keys)
-
-        for req_id in connector_output.finished_recving or []:
-            keys = self._reqs_being_loaded.pop(req_id, None)
-            if keys:
+        meta = connector_output.kv_connector_worker_meta
+        if not isinstance(meta, OffloadingWorkerMetadata):
+            assert meta is None
+            meta = OffloadingWorkerMetadata()
+        for job_id, count in meta.completed_jobs.items():
+            assert count > 0
+            job_status = self._jobs[job_id]
+            job_status.pending_count -= count
+            if job_status.pending_count > 0:
+                continue
+            assert job_status.pending_count == 0
+
+            if job_status.is_store:
+                self.manager.complete_store(job_status.keys)
+            else:
+                self.manager.complete_load(job_status.keys)
                 if self._blocks_being_loaded:
-                    self._blocks_being_loaded.difference_update(keys)
-                self.manager.complete_load(keys)
+                    self._blocks_being_loaded.difference_update(job_status.keys)
+
+            req_status = self._req_status[job_status.req_id]
+            if self._block_id_to_pending_jobs:
+                # Sliding window blocks are tracked from store creation
+                # and must be cleaned up unconditionally.
+                self._remove_pending_job(job_id, job_status.sliding_window_block_ids)
+                # Non-sliding-window blocks are only tracked after
+                # request_finished, so only clean up for finished requests.
+                if req_status.req.is_finished():
+                    self._remove_pending_job(
+                        job_id, job_status.non_sliding_window_block_ids
+                    )
+
+            del self._jobs[job_id]
+            req_status.transfer_jobs.remove(job_id)
+            if not req_status.transfer_jobs and req_status.req.is_finished():
+                del self._req_status[job_status.req_id]
 
     def request_finished(
         self,
         request: Request,
-        block_ids: list[int],
     ) -> tuple[bool, dict[str, Any] | None]:
         """
         Called when a request has finished, before its blocks are freed.
@@ -558,14 +840,21 @@ def request_finished(
             Optional KVTransferParams to be included in the request outputs
             returned by the engine.
         """
-        req_id = request.request_id
-
         # TODO(orozery): possibly kickoff offload for last block
         # which may have been deferred due to async scheduling
-        self._req_status.pop(req_id, None)
-
-        request_being_stored = req_id in self._reqs_being_stored
-        return request_being_stored, None
+        req_status = self._req_status.get(request.request_id)
+        if req_status is None:
+            return False, None
+        if not req_status.transfer_jobs:
+            del self._req_status[request.request_id]
+            return False, None
+        # Pending stores will outlive the request's block ownership.
+        # Register them so future block reuse triggers a flush.
+        for job_id in req_status.transfer_jobs:
+            job_status = self._jobs[job_id]
+            for bid in job_status.non_sliding_window_block_ids or ():
+                self._block_id_to_pending_jobs.setdefault(bid, set()).add(job_id)
+        return False, None
 
     def take_events(self) -> Iterable[KVCacheEvent]:
         """Take the KV cache events from the connector.
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading/worker.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading/worker.py
index cc6d8262c7e6..9acfd9c66ce4 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading/worker.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading/worker.py
@@ -11,6 +11,7 @@
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.offloading.common import (
     OffloadingConnectorMetadata,
+    OffloadingWorkerMetadata,
     ReqId,
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.offloading.metrics import (
@@ -24,7 +25,7 @@
     MambaSpec,
     UniformTypeKVCacheSpecs,
 )
-from vllm.v1.kv_offload.spec import (
+from vllm.v1.kv_offload.base import (
     CanonicalKVCacheRef,
     CanonicalKVCaches,
     CanonicalKVCacheTensor,
@@ -45,24 +46,11 @@ def __init__(self, spec: OffloadingSpec):
         self.spec = spec
         self.worker = OffloadingWorker()
 
-        self._job_counter = 0
-
         self.kv_connector_stats = OffloadingConnectorStats()
-        # req_id -> (job_id, store)
-        self._jobs: dict[int, tuple[ReqId, bool]] = {}
-        # req_id -> active job IDs
-        self._load_job: dict[ReqId, int] = {}
-        # req_id -> set(active job IDs)
-        self._store_jobs = defaultdict[ReqId, set[int]](set)
-        # list of store jobs pending submission (job_id, transfer_spec)
+        # job_id -> req_id for in-flight loads.
+        self._load_jobs: dict[int, ReqId] = {}
         self._unsubmitted_store_jobs: list[tuple[int, TransferSpec]] = []
-
-        self._finished_reqs_waiting_for_store: set[ReqId] = set()
-
-    def _generate_job_id(self) -> int:
-        job_id = self._job_counter
-        self._job_counter = job_id + 1
-        return job_id
+        self._connector_worker_meta = OffloadingWorkerMetadata()
 
     def _register_handlers(self, kv_caches: CanonicalKVCaches):
         for src_cls, dst_cls, handler in self.spec.get_handlers(kv_caches):
@@ -301,10 +289,8 @@ def handle_preemptions(self, kv_connector_metadata: OffloadingConnectorMetadata)
             assert success
         self._unsubmitted_store_jobs.clear()
 
-        for req_id in kv_connector_metadata.reqs_to_flush or ():
-            job_ids = self._store_jobs.get(req_id)
-            if job_ids:
-                self.worker.wait(job_ids)
+        if kv_connector_metadata.jobs_to_flush:
+            self.worker.wait(kv_connector_metadata.jobs_to_flush)
 
     def start_kv_transfers(self, metadata: OffloadingConnectorMetadata):
         for job_id, transfer_spec in self._unsubmitted_store_jobs:
@@ -312,41 +298,33 @@ def start_kv_transfers(self, metadata: OffloadingConnectorMetadata):
             assert success
         self._unsubmitted_store_jobs.clear()
 
-        for req_id, transfer_spec in metadata.reqs_to_load.items():
-            job_id = self._generate_job_id()
-            self._jobs[job_id] = (req_id, False)
-            assert req_id not in self._load_job
-            self._load_job[req_id] = job_id
-            success = self.worker.transfer_async(job_id, transfer_spec)
+        for job_id, entry in metadata.load_jobs.items():
+            self._load_jobs[job_id] = entry.req_id
+            success = self.worker.transfer_async(job_id, entry.transfer_spec)
             assert success
 
     def prepare_store_kv(self, metadata: OffloadingConnectorMetadata):
-        for req_id, transfer_spec in metadata.reqs_to_store.items():
-            job_id = self._generate_job_id()
-            self._jobs[job_id] = (req_id, True)
-            self._store_jobs[req_id].add(job_id)
-            # NOTE(orozery): defer the store to the beginning of the next engine step,
-            # so that offloading starts AFTER transfers related to token sampling,
-            # thereby avoiding delays to token generation due to offloading.
-            self._unsubmitted_store_jobs.append((job_id, transfer_spec))
+        for job_id, entry in metadata.store_jobs.items():
+            # NOTE(orozery): defer the store to the beginning of the next
+            # engine step, so that offloading starts AFTER transfers related
+            # to token sampling, thereby avoiding delays to token generation.
+            self._unsubmitted_store_jobs.append((job_id, entry.transfer_spec))
 
     def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
         """
-        Notifies worker-side connector ids of requests that have
-        finished generating tokens.
-        Returns a list of request IDs that finished loading or storing.
-
         Returns:
-            ids of requests that have finished asynchronous transfer
-            tuple of (sending/saving ids, recving/loading ids).
+            tuple of (finished_sending, finished_recving). Stores never
+            emit finished_sending — the scheduler tracks store completion
+            via kv_connector_worker_meta.completed_jobs and fences any
+            block reuse via jobs_to_flush. Loads still emit
+            finished_recving so the base scheduler can resume requests
+            blocked on remote KV (and free aborted-during-load reqs).
         """
-        finished_sending = set()
-        finished_recving = set()
+        finished_recving: set[str] = set()
         for transfer_result in self.worker.get_finished():
             # we currently do not support job failures
             job_id = transfer_result.job_id
             assert transfer_result.success
-            req_id, store = self._jobs.pop(job_id)
             if (
                 transfer_result.transfer_time
                 and transfer_result.transfer_size is not None
@@ -357,31 +335,21 @@ def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
                     time=transfer_result.transfer_time,
                     transfer_type=transfer_result.transfer_type,
                 )
-            if store:
-                req_jobs = self._store_jobs[req_id]
-                req_jobs.remove(job_id)
-                if req_jobs:
-                    continue
-
-                if req_id in self._finished_reqs_waiting_for_store:
-                    self._finished_reqs_waiting_for_store.remove(req_id)
-                    finished_sending.add(req_id)
-                    del self._store_jobs[req_id]
-            else:
-                req_job = self._load_job[req_id]
-                assert job_id == req_job
-                del self._load_job[req_id]
+
+            self._connector_worker_meta.mark_completed(job_id)
+            req_id = self._load_jobs.pop(job_id, None)
+            if req_id is not None:
                 finished_recving.add(req_id)
 
-        for req_id in finished_req_ids:
-            pending_req_jobs = self._store_jobs.get(req_id)
-            if pending_req_jobs:
-                self._finished_reqs_waiting_for_store.add(req_id)
-            elif pending_req_jobs is not None:
-                finished_sending.add(req_id)
-                del self._store_jobs[req_id]
+        return set(), finished_recving
 
-        return finished_sending, finished_recving
+    def build_connector_worker_meta(self) -> OffloadingWorkerMetadata | None:
+        """Return completed transfer job IDs since the last call."""
+        if not self._connector_worker_meta.completed_jobs:
+            return None
+        meta = self._connector_worker_meta
+        self._connector_worker_meta = OffloadingWorkerMetadata()
+        return meta
 
     def get_kv_connector_stats(self) -> KVConnectorStats | None:
         """
@@ -396,11 +364,7 @@ def get_kv_connector_stats(self) -> KVConnectorStats | None:
         return kv_connector_stats
 
     def shutdown(self) -> None:
-        # Drop deferred store jobs: there is no point in submitting
-        # them during shutdown.
         self._unsubmitted_store_jobs.clear()
-        self._jobs.clear()
-        self._load_job.clear()
-        self._store_jobs.clear()
-        self._finished_reqs_waiting_for_store.clear()
+        self._load_jobs.clear()
+        self._connector_worker_meta = OffloadingWorkerMetadata()
         self.worker.shutdown()
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
index f11281dcf14e..8b264dd726e4 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -10,6 +10,7 @@
 from vllm.distributed.kv_transfer.kv_connector.v1 import (
     KVConnectorBase_V1,
     KVConnectorRole,
+    SupportsHMA,
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
@@ -20,6 +21,7 @@
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.offloading.common import (
     OffloadingConnectorMetadata,
+    OffloadingWorkerMetadata,
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.offloading.metrics import (
     OffloadingConnectorStats,
@@ -41,7 +43,7 @@
 from vllm.v1.request import Request
 
 
-class OffloadingConnector(KVConnectorBase_V1):
+class OffloadingConnector(KVConnectorBase_V1, SupportsHMA):
     @property
     def prefer_cross_layer_blocks(self) -> bool:
         return True
@@ -111,6 +113,11 @@ def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
         assert self.connector_worker is not None
         return self.connector_worker.get_finished(finished_req_ids)
 
+    def build_connector_worker_meta(self) -> OffloadingWorkerMetadata | None:
+        if self.connector_worker is not None:
+            return self.connector_worker.build_connector_worker_meta()
+        return None
+
     def get_num_new_matched_tokens(
         self, request: "Request", num_computed_tokens: int
     ) -> tuple[int | None, bool]:
@@ -143,7 +150,15 @@ def request_finished(
         block_ids: list[int],
     ) -> tuple[bool, dict[str, Any] | None]:
         assert self.connector_scheduler is not None
-        return self.connector_scheduler.request_finished(request, block_ids)
+        return self.connector_scheduler.request_finished(request)
+
+    def request_finished_all_groups(
+        self,
+        request: "Request",
+        block_ids: tuple[list[int], ...],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.request_finished(request)
 
     def take_events(self) -> Iterable[KVCacheEvent]:
         assert self.connector_scheduler is not None
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/ssm_conv_transfer_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/ssm_conv_transfer_utils.py
index 309426814c68..00b8e2bb7275 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/ssm_conv_transfer_utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/ssm_conv_transfer_utils.py
@@ -31,6 +31,7 @@ class MambaConvSplitInfo:
     x_local: int  # intermediate_size / TP  (columns for x)
     b_local: int  # groups_ss / TP  (columns for B; C is same size)
     conv_dtype_size: int  # bytes per element (e.g. 2 for float16)
+    ssm_sizes: tuple[int, int]  # (conv_state_bytes, ssm_state_bytes)
 
     @property
     def conv_dim_local(self) -> int:
@@ -99,8 +100,8 @@ def derive_mamba_conv_split(
         local_tp: this engine's tensor-parallel size.
 
     Returns:
-        MambaConvSplitInfo with per-rank x_local, b_local, conv_rows, and
-        conv_dtype_size.
+        MambaConvSplitInfo with per-rank x_local, b_local, conv_rows,
+        conv_dtype_size, and ssm_sizes (conv_state_bytes, ssm_state_bytes).
     """
     if mamba_spec.mamba_type != "mamba2":
         raise NotImplementedError(
@@ -142,12 +143,20 @@ def derive_mamba_conv_split(
         dtype=mamba_spec.dtypes[0],  # type: ignore[misc]
     ).element_size()
 
+    ssm_dtype_size = torch.tensor(
+        [],
+        dtype=mamba_spec.dtypes[1],  # type: ignore[misc]
+    ).element_size()
+    conv_state_bytes = torch.Size(mamba_spec.shapes[0]).numel() * conv_dtype_size
+    ssm_state_bytes = torch.Size(mamba_spec.shapes[1]).numel() * ssm_dtype_size
+
     # Divide by TP to get per-rank column counts.
     return MambaConvSplitInfo(
         conv_rows=conv_rows,
         x_local=intermediate_size // local_tp,
         b_local=groups_ss // local_tp,
         conv_dtype_size=conv_dtype_size,
+        ssm_sizes=(conv_state_bytes, ssm_state_bytes),
     )
 
 
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 473acb908b28..58c49c09dc54 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -472,6 +472,7 @@ def graph_capture(self, graph_capture_context: GraphCaptureContext | None = None
         # only cuda uses this function,
         # so we don't abstract it into the base class
         maybe_ca_context = nullcontext()
+        maybe_aiter_context = nullcontext()
         from vllm.distributed.device_communicators.cuda_communicator import (
             CudaCommunicator,
         )
@@ -482,13 +483,20 @@ def graph_capture(self, graph_capture_context: GraphCaptureContext | None = None
             if ca_comm is not None:
                 maybe_ca_context = ca_comm.capture()  # type: ignore
 
+            from vllm._aiter_ops import rocm_aiter_ops
+
+            if rocm_aiter_ops.is_enabled():
+                aiter_ar = rocm_aiter_ops.get_aiter_allreduce()
+                if aiter_ar is not None:
+                    maybe_aiter_context = aiter_ar.capture()  # type: ignore
+
         # ensure all initialization operations complete before attempting to
         # capture the graph on another stream
         curr_stream = torch.cuda.current_stream()
         if curr_stream != stream:
             stream.wait_stream(curr_stream)
 
-        with torch.cuda.stream(stream), maybe_ca_context:
+        with torch.cuda.stream(stream), maybe_ca_context, maybe_aiter_context:
             yield graph_capture_context
 
     def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index cd9551003339..1b3803139217 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -962,7 +962,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "-dpn",
             type=int,
             help="Data parallel rank of this instance. "
-            "When set, enables external load balancer mode.",
+            "When set, enables external load balancer mode for MoE "
+            "data-parallel deployments. Unsupported for non-MoE models; "
+            "launch independent vLLM instances instead.",
         )
         parallel_group.add_argument(
             "--data-parallel-start-rank",
@@ -1697,29 +1699,15 @@ def create_engine_config(
             kv_offloading_backend=self.kv_offloading_backend,
         )
 
-        # TurboQuant: auto-skip first/last 2 layers (boundary protection).
-        # These layers are most sensitive to quantization error.
-        # Users can add extra layers via --kv-cache-dtype-skip-layers.
         if resolved_cache_dtype.startswith("turboquant_"):
-            if model_config.is_hybrid:
-                raise NotImplementedError(
-                    "TurboQuant KV cache is not supported for hybrid "
-                    "(attention + Mamba) models. Boundary layer protection "
-                    "requires uniform attention layers."
-                )
             from vllm.model_executor.layers.quantization.turboquant.config import (
                 TurboQuantConfig,
             )
 
-            num_layers = model_config.hf_text_config.num_hidden_layers
-            boundary = TurboQuantConfig.get_boundary_skip_layers(num_layers)
+            boundary = TurboQuantConfig.get_boundary_skip_layers(model_config)
             existing = set(cache_config.kv_cache_dtype_skip_layers)
-            merged = sorted(existing | set(boundary), key=lambda x: int(x))
-            cache_config.kv_cache_dtype_skip_layers = merged
-            logger.info(
-                "TQ: skipping layers %s for boundary protection (num_layers=%d)",
-                merged,
-                num_layers,
+            cache_config.kv_cache_dtype_skip_layers = sorted(
+                existing | set(boundary), key=int
             )
 
         ray_runtime_env = None
@@ -1793,6 +1781,16 @@ def create_engine_config(
         data_parallel_external_lb = (
             self.data_parallel_external_lb or self.data_parallel_rank is not None
         )
+        if (
+            self.data_parallel_size > 1
+            and data_parallel_external_lb
+            and not model_config.is_moe
+        ):
+            raise ValueError(
+                "Non-MoE models do not support external data parallel mode. "
+                "For external load balancing, launch independent vLLM "
+                "instances without --data-parallel-* arguments."
+            )
         # Local DP rank = 1, use pure-external LB.
         if data_parallel_external_lb:
             assert self.data_parallel_rank is not None, (
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 50013a060a8f..6058d8ed86b7 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -78,6 +78,7 @@ def generate(
         priority: int = 0,
         data_parallel_rank: int | None = None,
         reasoning_ended: bool | None = None,
+        reasoning_parser_kwargs: dict[str, Any] | None = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request."""
         ...
diff --git a/vllm/entrypoints/anthropic/protocol.py b/vllm/entrypoints/anthropic/protocol.py
index 52eb77b51167..f3c4dd7f3e32 100644
--- a/vllm/entrypoints/anthropic/protocol.py
+++ b/vllm/entrypoints/anthropic/protocol.py
@@ -39,6 +39,7 @@ class AnthropicContentBlock(BaseModel):
         "image",
         "tool_use",
         "tool_result",
+        "tool_reference",
         "thinking",
         "redacted_thinking",
     ]
@@ -52,6 +53,8 @@ class AnthropicContentBlock(BaseModel):
     input: dict[str, Any] | None = None
     content: str | list[dict[str, Any]] | None = None
     is_error: bool | None = None
+    # For tool_reference content
+    tool_name: str | None = None
     # For thinking content
     thinking: str | None = None
     signature: str | None = None
@@ -72,6 +75,7 @@ class AnthropicTool(BaseModel):
     name: str
     description: str | None = None
     input_schema: dict[str, Any]
+    defer_loading: bool | None = None
 
     @field_validator("input_schema")
     @classmethod
diff --git a/vllm/entrypoints/anthropic/serving.py b/vllm/entrypoints/anthropic/serving.py
index 939f5a7ed4c5..867ee73948ff 100644
--- a/vllm/entrypoints/anthropic/serving.py
+++ b/vllm/entrypoints/anthropic/serving.py
@@ -237,6 +237,10 @@ def _convert_block(
             cls._convert_tool_use_block(block, tool_calls)
         elif block.type == "tool_result":
             cls._convert_tool_result_block(block, role, openai_messages, content_parts)
+        elif block.type == "tool_reference":
+            # Tool references are expanded during tool_result processing
+            # when they appear inside tool_result content.
+            pass
 
     @classmethod
     def _convert_tool_use_block(cls, block, tool_calls: list[dict[str, Any]]) -> None:
@@ -275,6 +279,7 @@ def _convert_user_tool_result(
         """Convert user tool_result with text and image support"""
         tool_text = ""
         tool_image_urls: list[str] = []
+        tool_reference: list[dict[str, Any]] = []
 
         if isinstance(block.content, str):
             tool_text = block.content
@@ -291,6 +296,12 @@ def _convert_user_tool_result(
                     url = cls._convert_image_source_to_url(source)
                     if url:
                         tool_image_urls.append(url)
+                elif item_type == "tool_reference":
+                    ref_name = item.get("tool_name") or item.get("name")
+                    if ref_name:
+                        tool_reference.append(
+                            {"type": "tool_reference", "name": ref_name}
+                        )
             tool_text = "\n".join(text_parts)
 
         openai_messages.append(
@@ -312,6 +323,15 @@ def _convert_user_tool_result(
                 }
             )
 
+        if tool_reference:
+            openai_messages.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": block.tool_use_id or "",
+                    "content": tool_reference,  # type: ignore[dict-item]
+                }
+            )
+
     @classmethod
     def _build_base_request(
         cls,
@@ -400,6 +420,7 @@ def _convert_tools(
                             "name": tool.name,
                             "description": tool.description,
                             "parameters": tool.input_schema,
+                            "defer_loading": tool.defer_loading,
                         },
                     }
                 )
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index bd4f29a00410..cfe0857b679e 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -11,7 +11,7 @@
 from functools import cached_property, lru_cache, partial
 from itertools import accumulate
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypeVar, cast
+from typing import TYPE_CHECKING, Any, Final, Generic, Literal, TypeAlias, TypeVar, cast
 
 from openai.types.chat import (
     ChatCompletionAssistantMessageParam,
@@ -36,10 +36,11 @@
 from pydantic import BaseModel, ConfigDict, TypeAdapter
 
 # pydantic needs the TypedDict from typing_extensions
-from typing_extensions import Required, TypedDict
+from typing_extensions import Required, TypedDict, override
 
 from vllm import envs
 from vllm.config import ModelConfig
+from vllm.exceptions import VLLMValidationError
 from vllm.inputs import MultiModalDataDict, MultiModalUUIDDict
 from vllm.logger import init_logger
 from vllm.model_executor.models import SupportsMultiModal
@@ -54,6 +55,10 @@
 )
 from vllm.multimodal.media import MEDIA_CONNECTOR_REGISTRY, MediaConnector
 from vllm.multimodal.processing import BaseMultiModalProcessor
+from vllm.renderers.embed_utils import (
+    safe_load_prompt_embeds,
+    safe_load_prompt_embeds_async,
+)
 from vllm.utils import random_uuid
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.import_utils import LazyLoader
@@ -97,9 +102,40 @@ class ChatTemplateResolutionError(ValueError):
     "image": "<##IMAGE##>",
     "audio": "<##AUDIO##>",
     "video": "<##VIDEO##>",
+    "prompt_embeds": "<##PROMPT_EMBEDS##>",
 }
 
 
+PROMPT_EMBEDS_PLACEHOLDER_TOKEN: Final[str] = "<prompt_embeds>"
+"""The special token used as a placeholder for each embedding
+position during chat template rendering.
+
+Registered as an additional special token when `--enable-prompt-embeds` is set.
+See `_ensure_prompt_embeds_placeholder_token` in `vllm/renderers/hf.py`.
+"""
+
+
+_REQUIRE_MM_PROCESSOR_ERROR: Final[str] = (
+    "Resolving modality {modality!r} requires a multimodal processor "
+    "but none is available."
+)
+
+_ENABLE_PROMPT_EMBEDS_ERROR: Final[str] = (
+    "You must set `--enable-prompt-embeds` to input `prompt_embeds`"
+)
+
+_PROMPT_EMBEDS_MISSING_DATA_ERROR: Final[str] = (
+    "prompt_embeds content part requires a non-empty `data` field "
+    "with base64-encoded tensor bytes."
+)
+
+_RESERVED_PLACEHOLDER_IN_TEXT_ERROR: Final[str] = (
+    "Text content may not contain the reserved placeholder {token!r}. "
+    "This placeholder is used internally to mark `prompt_embeds` splice "
+    "positions in the tokenized prompt."
+)
+
+
 class AudioURL(TypedDict, total=False):
     url: Required[str]
     """
@@ -146,6 +182,17 @@ class ChatCompletionContentPartAudioEmbedsParam(TypedDict, total=False):
     """
 
 
+class ChatCompletionContentPartPromptEmbedsParam(TypedDict, total=False):
+    data: Required[str]
+    """
+    Base64-encoded bytes of a serialized `torch.Tensor` of shape
+    `(num_tokens, hidden_size)`. The tensor's `dtype` and `hidden_size` must
+    match the model's input embedding layer.
+    """
+    type: Required[Literal["prompt_embeds"]]
+    """The type of the content part."""
+
+
 class VideoURL(TypedDict, total=False):
     url: Required[str]
     """
@@ -254,6 +301,23 @@ class CustomThinkCompletionContentParam(TypedDict, total=False):
     """The thinking type."""
 
 
+class CustomChatCompletionContentToolReferenceParam(TypedDict, total=False):
+    """A tool reference content param that only accepts a plain tool name.
+
+    Example:
+    {
+        "name": "get_weather",
+        "type": "tool_reference"
+    }
+    """
+
+    name: str
+    """The name of the tool being referenced."""
+
+    type: Literal["tool_reference"]
+    """The content type."""
+
+
 ChatCompletionContentPartParam: TypeAlias = (
     OpenAIChatCompletionContentPartParam
     | ChatCompletionContentPartAudioParam
@@ -264,8 +328,10 @@ class CustomThinkCompletionContentParam(TypedDict, total=False):
     | CustomChatCompletionContentSimpleImageParam
     | ChatCompletionContentPartImageEmbedsParam
     | ChatCompletionContentPartAudioEmbedsParam
+    | ChatCompletionContentPartPromptEmbedsParam
     | CustomChatCompletionContentSimpleAudioParam
     | CustomChatCompletionContentSimpleVideoParam
+    | CustomChatCompletionContentToolReferenceParam
     | str
     | CustomThinkCompletionContentParam
 )
@@ -348,7 +414,13 @@ class ConversationMessage(TypedDict, total=False):
 
 
 ModalityStr = Literal[
-    "image", "audio", "video", "image_embeds", "audio_embeds", "vision_chunk"
+    "image",
+    "audio",
+    "video",
+    "image_embeds",
+    "audio_embeds",
+    "vision_chunk",
+    "prompt_embeds",
 ]
 _T = TypeVar("_T")
 
@@ -530,7 +602,17 @@ def add(self, modality: ModalityStr, item: _T) -> str | None:
 
         An optional uuid can be added which serves as a unique identifier of the
         media.
+
+        Note:
+            `prompt_embeds` bypass MM-processor validation because they are
+            pre-computed embeddings that do not go through any HF processor, encoder,
+            or model-specific placeholder logic. The corresponding placeholder string is
+            managed by the parser via `_add_placeholder`, so we return None here.
         """
+        if modality == "prompt_embeds":
+            self._items_by_modality["prompt_embeds"].append(item)
+            return None
+
         input_modality = modality.replace("_embeds", "")
         original_modality = modality
         use_vision_chunk = (
@@ -641,17 +723,32 @@ def _resolve_vision_chunk_items(
 
 def _resolve_items(
     items_by_modality: dict[str, list[tuple[object, str | None]]],
-    mm_processor: BaseMultiModalProcessor,
+    mm_processor: BaseMultiModalProcessor | None,
     modality_order: dict[str, list[str]],
 ) -> tuple[MultiModalDataDict, MultiModalUUIDDict]:
+    """
+    Materialize the tracker's per-modality items into `mm_data` / `mm_uuids`.
+
+    Note:
+        `mm_processor` is `None` for text-only models (no registered HF
+        processor) whose only modality is `prompt_embeds`. Every other
+        modality requires a processor, enforced by the guard below.
+    """
     if "image" in items_by_modality and "image_embeds" in items_by_modality:
         raise ValueError("Mixing raw image and embedding inputs is not allowed")
     if "audio" in items_by_modality and "audio_embeds" in items_by_modality:
         raise ValueError("Mixing raw audio and embedding inputs is not allowed")
+    # `prompt_embeds` bypasses HF MM processors. Every other modality requires one.
+    processor_modalities = items_by_modality.keys() - {"prompt_embeds"}
+    if processor_modalities and mm_processor is None:
+        raise RuntimeError(
+            _REQUIRE_MM_PROCESSOR_ERROR.format(modality=processor_modalities)
+        )
 
     mm_data = {}
     mm_uuids = {}
     if "image_embeds" in items_by_modality:
+        assert mm_processor is not None
         mm_data["image"] = _get_embeds_data(
             "image",
             [data for data, uuid in items_by_modality["image_embeds"]],
@@ -662,6 +759,7 @@ def _resolve_items(
         mm_data["image"] = [data for data, uuid in items_by_modality["image"]]
         mm_uuids["image"] = [uuid for data, uuid in items_by_modality["image"]]
     if "audio_embeds" in items_by_modality:
+        assert mm_processor is not None
         mm_data["audio"] = _get_embeds_data(
             "audio",
             [data for data, uuid in items_by_modality["audio_embeds"]],
@@ -675,6 +773,7 @@ def _resolve_items(
         mm_data["video"] = [data for data, uuid in items_by_modality["video"]]
         mm_uuids["video"] = [uuid for data, uuid in items_by_modality["video"]]
     if "vision_chunk" in items_by_modality:
+        assert mm_processor is not None
         # Process vision_chunk items - extract from (data, modality) tuples
         # and convert to VisionChunk types with proper UUID handling
         processed_chunks, vision_chunk_uuids = _resolve_vision_chunk_items(
@@ -684,6 +783,10 @@ def _resolve_items(
         )
         mm_data["vision_chunk"] = processed_chunks
         mm_uuids["vision_chunk"] = vision_chunk_uuids
+    if "prompt_embeds" in items_by_modality:
+        mm_data["prompt_embeds"] = [
+            data for data, _uuid in items_by_modality["prompt_embeds"]
+        ]
 
     return mm_data, mm_uuids
 
@@ -695,8 +798,16 @@ def resolve_items(
         if not self._items_by_modality:
             return None, None
 
+        # Text-only models (`is_multimodal_model=False`) with inputs of
+        # modality `prompt_embeds` have no MM processor since `prompt_embeds` are
+        # pre-computed and require no processing, so we pass `None`.
+        mm_processor = (
+            self.mm_processor if self._model_config.is_multimodal_model else None
+        )
         return _resolve_items(
-            dict(self._items_by_modality), self.mm_processor, self._modality_order
+            dict(self._items_by_modality),
+            mm_processor,
+            self._modality_order,
         )
 
     def create_parser(
@@ -719,8 +830,13 @@ async def resolve_items(
             for modality, coros in self._items_by_modality.items()
         }
 
+        mm_processor = (
+            self.mm_processor if self._model_config.is_multimodal_model else None
+        )
         return _resolve_items(
-            resolved_items_by_modality, self.mm_processor, self._modality_order
+            resolved_items_by_modality,
+            mm_processor,
+            self._modality_order,
         )
 
     def create_parser(
@@ -739,10 +855,16 @@ def __init__(self) -> None:
         # general MM placeholder:
         # {
         #   "<##IMAGE##>": ["<image>", "<image>", "<image>"],
-        #   "<##AUDIO##>": ["<audio>", "<audio>"]
+        #   "<##AUDIO##>": ["<audio>", "<audio>"],
+        #   "<##PROMPT_EMBEDS##>": ["<prompt_embeds>", "<prompt_embeds>"]
         # }
         self._placeholder_storage: dict[str, list] = defaultdict(list)
 
+    @property
+    @abstractmethod
+    def model_config(self) -> ModelConfig:
+        raise NotImplementedError
+
     def _add_placeholder(self, modality: ModalityStr, placeholder: str | None):
         mod_placeholder = MODALITY_PLACEHOLDERS_MAP[modality]
         if placeholder:
@@ -787,6 +909,10 @@ def parse_audio_embeds(
     ) -> None:
         raise NotImplementedError
 
+    @abstractmethod
+    def parse_prompt_embeds(self, data: str) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:
         raise NotImplementedError
@@ -815,6 +941,21 @@ def __init__(
     def model_config(self) -> ModelConfig:
         return self._tracker.model_config
 
+    @override
+    def parse_prompt_embeds(self, data: str) -> None:
+        """Decode a base64 prompt embeds tensor and store it in the tracker.
+
+        Emits a single `PROMPT_EMBEDS_PLACEHOLDER_TOKEN` sentinel per
+        content part. The renderer later expands each sentinel to a span of
+        `tensor.shape[0]` placeholder tokens after tokenization.
+        """
+        if not self.model_config.enable_prompt_embeds:
+            raise ValueError(_ENABLE_PROMPT_EMBEDS_ERROR)
+
+        tensor = safe_load_prompt_embeds(self.model_config, data.encode())
+        self._tracker.add("prompt_embeds", (tensor, None))
+        self._add_placeholder("prompt_embeds", PROMPT_EMBEDS_PLACEHOLDER_TOKEN)
+
     def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
         image = self._connector.fetch_image(image_url) if image_url else None
 
@@ -939,6 +1080,29 @@ def __init__(
     def model_config(self) -> ModelConfig:
         return self._tracker.model_config
 
+    @override
+    def parse_prompt_embeds(self, data: str) -> None:
+        """Schedule async prompt embeds decode and store the coroutine in the tracker.
+
+        Like the sync variant, emits a single sentinel `PROMPT_EMBEDS_PLACEHOLDER_TOKEN`
+        per content part. Unlike the sync variant, the tensor decode is deferred to a
+        thread-pool executor via `safe_load_prompt_embeds_async`.
+        """
+        if not self.model_config.enable_prompt_embeds:
+            raise ValueError(_ENABLE_PROMPT_EMBEDS_ERROR)
+
+        coro = self._load_prompt_embeds_async(data.encode())
+        self._tracker.add("prompt_embeds", coro)
+        self._add_placeholder("prompt_embeds", PROMPT_EMBEDS_PLACEHOLDER_TOKEN)
+
+    async def _load_prompt_embeds_async(
+        self, data_bytes: bytes
+    ) -> tuple[torch.Tensor, None]:
+        # Second tuple slot fills the tracker's generic `(item, uuid | None)`
+        # contract. prompt_embeds has no UUID concept, so it's always `None`.
+        tensor = await safe_load_prompt_embeds_async(self.model_config, data_bytes)
+        return tensor, None
+
     async def _image_with_uuid_async(self, image_url: str | None, uuid: str | None):
         image = (
             await self._connector.fetch_image_async(image_url) if image_url else None
@@ -1250,6 +1414,7 @@ def _get_full_multimodal_text_prompt(
 _TextParser = partial(cast, ChatCompletionContentPartTextParam)
 _ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
 _AudioEmbedsParser = partial(cast, ChatCompletionContentPartAudioEmbedsParam)
+_PromptEmbedsParser = partial(cast, ChatCompletionContentPartPromptEmbedsParam)
 _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
 _PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)
@@ -1275,11 +1440,15 @@ def _get_full_multimodal_text_prompt(
     "image_url": lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds": lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
     "audio_embeds": lambda part: _AudioEmbedsParser(part).get("audio_embeds", None),
+    "prompt_embeds": lambda part: _PromptEmbedsParser(part).get("data", None),
     "image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
     "audio_url": lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio": lambda part: _InputAudioParser(part).get("input_audio", None),
     "refusal": lambda part: _RefusalParser(part).get("refusal", None),
     "video_url": lambda part: _VideoParser(part).get("video_url", {}).get("url", None),
+    "tool_reference": lambda part: cast(
+        CustomChatCompletionContentToolReferenceParam, part
+    ).get("name", None),
 }
 
 
@@ -1350,6 +1519,11 @@ def _parse_chat_message_content_mm_part(
             )
             audio_embeds = audio_params.get("audio_embeds", None)
             return "audio_embeds", audio_embeds
+        if "prompt_embeds" in part:
+            prompt_embeds_params = cast(  # type: ignore[assignment]
+                ChatCompletionContentPartPromptEmbedsParam, part
+            )
+            return "prompt_embeds", prompt_embeds_params.get("data", None)
         if "audio_url" in part:
             audio_params = cast(  # type: ignore[assignment]
                 CustomChatCompletionContentSimpleAudioParam, part
@@ -1371,6 +1545,12 @@ def _parse_chat_message_content_mm_part(
                 # with url as a dict of {"url": url}
                 video_url = video_url.get("url", None)
             return "video_url", video_url
+        if "tool_reference" in part:
+            tool_reference_params = cast(
+                CustomChatCompletionContentToolReferenceParam, part
+            )
+            tool_reference = tool_reference_params.get("name", None)
+            return "tool_reference", tool_reference
         # Raise an error if no 'type' or direct URL is found.
         raise ValueError("Missing 'type' field in multimodal part.")
 
@@ -1427,6 +1607,24 @@ def _parse_chat_message_content_parts(
     return [ConversationMessage(role=role, content=text_prompt)]
 
 
+def _reject_reserved_placeholder_in_text(text: str, model_config: ModelConfig) -> None:
+    """Reject user-supplied text parts that contains the reserved `prompt_embeds`
+    placeholder sentinel.
+
+    When the server accepts `prompt_embeds`, the placeholder token is
+    registered as a single unsplittable special token on the tokenizer. Any
+    user text that happens to contain the literal sequence would tokenize to
+    the same ID and be mistaken for a splice point by the renderer, letting a
+    caller move or inject splice positions via plain text content.
+    """
+    if model_config.enable_prompt_embeds and PROMPT_EMBEDS_PLACEHOLDER_TOKEN in text:
+        raise ValueError(
+            _RESERVED_PLACEHOLDER_IN_TEXT_ERROR.format(
+                token=PROMPT_EMBEDS_PLACEHOLDER_TOKEN
+            )
+        )
+
+
 def _parse_chat_message_content_part(
     part: ChatCompletionContentPartParam,
     mm_parser: BaseMultiModalContentParser,
@@ -1442,6 +1640,7 @@ def _parse_chat_message_content_part(
     with multimodal placeholders.
     """
     if isinstance(part, str):  # Handle plain text parts
+        _reject_reserved_placeholder_in_text(part, mm_parser.model_config)
         if wrap_dicts:
             return {"type": "text", "text": part}
         return part
@@ -1460,6 +1659,7 @@ def _parse_chat_message_content_part(
 
     if part_type in ("text", "input_text", "output_text", "refusal", "thinking"):
         str_content = cast(str, content)
+        _reject_reserved_placeholder_in_text(str_content, mm_parser.model_config)
         if wrap_dicts:
             return {"type": "text", "text": str_content}
         else:
@@ -1488,6 +1688,11 @@ def _parse_chat_message_content_part(
         content = cast(str | dict[str, str], content) if content is not None else None
         mm_parser.parse_audio_embeds(content, uuid)
         modality = "audio"
+    elif part_type == "prompt_embeds":
+        if not content:
+            raise ValueError(_PROMPT_EMBEDS_MISSING_DATA_ERROR)
+        mm_parser.parse_prompt_embeds(cast(str, content))
+        modality = "prompt_embeds"
     elif part_type == "audio_url":
         str_content = cast(str, content)
         mm_parser.parse_audio(str_content, uuid)
@@ -1500,11 +1705,34 @@ def _parse_chat_message_content_part(
         str_content = cast(str, content)
         mm_parser.parse_video(str_content, uuid)
         modality = "video"
+    elif part_type == "tool_reference":
+        # Tool references are not multimodal data — they reference deferred
+        # tools and are passed through as-is for the chat template to expand.
+        if wrap_dicts:
+            return {"type": "tool_reference", "name": cast(str, content)}
+        return cast(str, content)
     else:
-        raise NotImplementedError(f"Unknown part type: {part_type}")
+        supported = sorted(MM_PARSER_MAP.keys() | set(PART_TYPES_TO_SKIP_NONE_CONTENT))
+        raise VLLMValidationError(
+            f"Unsupported chat content part type: {part_type!r}. "
+            f"Supported types: {', '.join(supported)}.",
+            parameter="type",
+            value=part_type,
+        )
 
     if wrap_dicts:
+        if modality == "prompt_embeds":
+            # Chat templates don't know about the "prompt_embeds" modality,
+            # emit the single sentinel token as text so the template renders
+            # it inline. The renderer later expands it to N tokens post-tokenize.
+            return {"type": "text", "text": PROMPT_EMBEDS_PLACEHOLDER_TOKEN}
         return {"type": modality}
+    if modality == "prompt_embeds":
+        # Emit the renderer token inline regardless of `interleave_strings`,
+        # prompt_embeds are spliced at the token offset so position matters.
+        # Falling back to front-padding via `missing_placeholders` would
+        # reorder them relative to surrounding text.
+        return PROMPT_EMBEDS_PLACEHOLDER_TOKEN
     return MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None
 
 
@@ -1560,14 +1788,24 @@ def _parse_chat_message_content(
             # string. Clients like Claude Code / Cursor send tool results as
             # [{"type": "text", "text": "..."}], but most chat templates only
             # handle string content for tool messages.
+            # However, tool_reference items must be preserved as structured
+            # dicts for the chat template to expand them.
             msg_content = result_msg.get("content")
             if isinstance(msg_content, list):
-                texts = [
-                    item.get("text", "")
+                has_non_text = any(
+                    isinstance(item, dict) and item.get("type") != "text"
                     for item in msg_content
-                    if isinstance(item, dict) and item.get("type") == "text"
-                ]
-                result_msg["content"] = "\n".join(texts) if texts else ""
+                )
+                if has_non_text:
+                    # Keep structured content (e.g., tool_reference)
+                    result_msg["content"] = msg_content
+                else:
+                    texts = [
+                        item.get("text", "")
+                        for item in msg_content
+                        if isinstance(item, dict) and item.get("type") == "text"
+                    ]
+                    result_msg["content"] = "\n".join(texts) if texts else ""
 
         if "name" in message and isinstance(message["name"], str):
             result_msg["name"] = message["name"]
@@ -1618,7 +1856,10 @@ def parse_chat_messages(
     MultiModalUUIDDict | None,
 ]:
     conversation: list[ConversationMessage] = []
-    mm_tracker = MultiModalItemTracker(model_config, media_io_kwargs=media_io_kwargs)
+    mm_tracker = MultiModalItemTracker(
+        model_config,
+        media_io_kwargs=media_io_kwargs,
+    )
 
     for msg in messages:
         sub_messages = _parse_chat_message_content(
@@ -1655,7 +1896,8 @@ async def parse_chat_messages_async(
 ]:
     conversation: list[ConversationMessage] = []
     mm_tracker = AsyncMultiModalItemTracker(
-        model_config, media_io_kwargs=media_io_kwargs
+        model_config,
+        media_io_kwargs=media_io_kwargs,
     )
 
     for msg in messages:
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 29cc2b47e7be..c34a5bae309d 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -334,7 +334,7 @@ def _make_config(value: Any, cls: type[_R]) -> _R:
                 f"LLM(data_parallel_size={_dp_size}) is not supported for single-"
                 "process usage and may hang. Please use "
                 "the explicit multi-process data-parallel example at "
-                "'examples/offline_inference/data_parallel.py'."
+                "'examples/features/data_parallel/data_parallel_offline.py'."
             )
 
         engine_args = EngineArgs(
@@ -926,7 +926,10 @@ def _preprocess_chat(
                     add_generation_prompt=add_generation_prompt,
                     continue_final_message=continue_final_message,
                     tools=tools,
-                    tokenize=is_mistral_tokenizer(renderer.tokenizer),
+                    tokenize=(
+                        is_mistral_tokenizer(renderer.tokenizer)
+                        or self.model_config.enable_prompt_embeds
+                    ),
                 ),
             ),
             mm_processor_kwargs=mm_processor_kwargs,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 9aac19e2fda5..da2ec10284c5 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -321,6 +321,21 @@ async def init_app_state(
     supported_tasks: tuple["SupportedTask", ...] | None = None,
 ) -> None:
     vllm_config = engine_client.vllm_config
+
+    # Propagate enable_in_reasoning to the API-server process. The engine core
+    # runs in a separate process, so the contextvar that backs
+    # `get_current_vllm_config_or_none()` is None on this stack. Tool parsers
+    # call `get_enable_structured_outputs_in_reasoning()` during request
+    # handling and need to see the real flag, otherwise they silently fall
+    # back to False and mismatch the engine-side bitmask gating.
+    from vllm.tool_parsers.structural_tag_registry import (
+        set_enable_structured_outputs_in_reasoning,
+    )
+
+    set_enable_structured_outputs_in_reasoning(
+        vllm_config.structured_outputs_config.enable_in_reasoning
+    )
+
     if supported_tasks is None:
         warnings.warn(
             "The 'supported_tasks' parameter was not provided to "
diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
index 01d2df88d69b..c92cc13da01f 100644
--- a/vllm/entrypoints/openai/chat_completion/protocol.py
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
@@ -11,7 +11,7 @@
     ChatCompletionAudio as OpenAIChatCompletionAudio,
 )
 from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
-from pydantic import Field, PrivateAttr, model_validator
+from pydantic import Field, PrivateAttr, model_serializer, model_validator
 
 from vllm.config import ModelConfig
 from vllm.config.utils import replace
@@ -139,6 +139,20 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
 class ChatCompletionToolsParam(OpenAIBaseModel):
     type: Literal["function"] = "function"
     function: FunctionDefinition
+    defer_loading: bool | None = None
+
+    @model_validator(mode="after")
+    def _propagate_defer_loading(self) -> "ChatCompletionToolsParam":
+        if self.defer_loading is not None and self.function.defer_loading is None:
+            self.function.defer_loading = self.defer_loading
+        return self
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.defer_loading is None:
+            data.pop("defer_loading", None)
+        return data
 
 
 class ChatCompletionNamedFunction(OpenAIBaseModel):
@@ -182,7 +196,19 @@ class ChatCompletionRequest(OpenAIBaseModel):
         | ChatCompletionNamedToolChoiceParam
         | None
     ) = "none"
-    reasoning_effort: Literal["none", "low", "medium", "high"] | None = None
+    reasoning_effort: (
+        Literal["none", "minimal", "low", "medium", "high", "xhigh", "max"] | None
+    ) = Field(
+        default=None,
+        description=(
+            "Constrains effort on reasoning for reasoning models. "
+            "Currently supported values are none, minimal, low, medium, "
+            "high, xhigh, and max. Reducing reasoning effort can result in "
+            "faster responses and fewer tokens used on reasoning in a response. "
+            "Note that 'max' is specific to the DeepSeek V4 series and is not "
+            "part of the standard OpenAI API specification."
+        ),
+    )
     thinking_token_budget: int | None = None
     include_reasoning: bool = True
     parallel_tool_calls: bool | None = True
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
index f001aac15a68..f42cc8afeeb1 100644
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -70,7 +70,10 @@
 from vllm.renderers import ChatParams
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.tokenizers import TokenizerLike
-from vllm.tool_parsers.streaming import extract_required_tool_call_streaming
+from vllm.tool_parsers.streaming import (
+    extract_named_tool_call_streaming,
+    extract_required_tool_call_streaming,
+)
 from vllm.utils.collection_utils import as_list
 from vllm.utils.mistral import is_mistral_tokenizer, is_mistral_tool_parser
 
@@ -286,6 +289,7 @@ async def create_chat_completion(
                 self._extract_prompt_len(engine_input),
                 self.default_sampling_params,
                 self.override_max_tokens,
+                truncate_prompt_tokens=request.truncate_prompt_tokens,
             )
 
             sampling_params: SamplingParams | BeamSearchParams
@@ -344,6 +348,11 @@ async def create_chat_completion(
                     priority=request.priority,
                     data_parallel_rank=data_parallel_rank,
                     reasoning_ended=reasoning_ended,
+                    reasoning_parser_kwargs={
+                        "chat_template_kwargs": chat_template_kwargs,
+                    }
+                    if reasoning_parser
+                    else None,
                 )
 
             generators.append(generator)
@@ -773,43 +782,24 @@ async def chat_completion_stream_generator(
                                 delta_text = previous_text + delta_text
                                 current_text = ""
 
-                            if function_name_returned[i]:
-                                delta_tool_call = DeltaToolCall(
-                                    function=DeltaFunctionCall(arguments=delta_text),
-                                    index=i,
-                                )
-                            else:
-                                # Generate ID based on tokenizer type
-                                if is_mistral_tokenizer(tokenizer):
-                                    from vllm.tool_parsers.mistral_tool_parser import (
-                                        MistralToolCall,
-                                    )
-
-                                    tool_call_id = MistralToolCall.generate_random_id()
-                                else:
-                                    tool_call_id = make_tool_call_id(
-                                        id_type=self.tool_call_id_type,
-                                        func_name=tool_choice_function_name,
-                                        idx=history_tool_call_cnt,
-                                    )
-                                delta_tool_call = DeltaToolCall(
-                                    id=tool_call_id,
-                                    type="function",
-                                    function=DeltaFunctionCall(
-                                        name=tool_choice_function_name,
-                                        arguments=delta_text,
-                                    ),
-                                    index=i,
+                            delta_message, function_name_returned[i] = (
+                                extract_named_tool_call_streaming(
+                                    delta_text=delta_text,
+                                    function_name=tool_choice_function_name,
+                                    function_name_returned=function_name_returned[i],
+                                    tool_call_idx=history_tool_call_cnt,
+                                    tool_call_id_type=self.tool_call_id_type,
+                                    tokenizer=tokenizer,
+                                    tool_call_array_index=i,
                                 )
-                                function_name_returned[i] = True
-                                history_tool_call_cnt += 1
-
-                            delta_message = DeltaMessage(
-                                tool_calls=[
-                                    delta_tool_call,
-                                ]
                             )
-                            tools_streamed[i] = True
+                            if (
+                                delta_message
+                                and delta_message.tool_calls
+                                and delta_message.tool_calls[0].id is not None
+                            ):
+                                history_tool_call_cnt += 1
+                                tools_streamed[i] = True
 
                     # Skip when tool_choice_uses_parser so it falls through
                     # to the auto tool_parser branches below.
@@ -1318,8 +1308,8 @@ async def chat_completion_full_generator(
                 request.tool_choice
                 and type(request.tool_choice) is ChatCompletionNamedToolChoiceParam
             ):
-                assert tool_calls is not None and len(tool_calls) > 0
                 tool_call_class_items = []
+                tool_calls = tool_calls or []
                 for idx, tc in enumerate(tool_calls):
                     # Use native ID if available (e.g., Kimi K2),
                     # otherwise generate ID with correct id_type
diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
index 454b170a5fa5..816c62163992 100644
--- a/vllm/entrypoints/openai/completion/serving.py
+++ b/vllm/entrypoints/openai/completion/serving.py
@@ -151,6 +151,7 @@ async def create_completion(
                 self._extract_prompt_len(engine_input),
                 self.default_sampling_params,
                 self.override_max_tokens,
+                truncate_prompt_tokens=request.truncate_prompt_tokens,
             )
 
             sampling_params: SamplingParams | BeamSearchParams
diff --git a/vllm/entrypoints/openai/engine/protocol.py b/vllm/entrypoints/openai/engine/protocol.py
index 8f6cdb3e6241..890af0300efc 100644
--- a/vllm/entrypoints/openai/engine/protocol.py
+++ b/vllm/entrypoints/openai/engine/protocol.py
@@ -12,6 +12,7 @@
     BaseModel,
     ConfigDict,
     Field,
+    model_serializer,
     model_validator,
 )
 
@@ -166,6 +167,14 @@ class FunctionDefinition(OpenAIBaseModel):
     name: str
     description: str | None = None
     parameters: dict[str, Any] | None = None
+    defer_loading: bool | None = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.defer_loading is None:
+            data.pop("defer_loading", None)
+        return data
 
 
 # extra="forbid" is a workaround to have kwargs as a field,
diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py
index 77cce6bec5b2..f0f84a82204c 100644
--- a/vllm/entrypoints/openai/engine/serving.py
+++ b/vllm/entrypoints/openai/engine/serving.py
@@ -638,8 +638,9 @@ def _parse_tool_calls_from_content(
             and request.tool_choice
             and isinstance(request.tool_choice, ToolChoiceFunction)
         ):
-            assert content is not None
             # Forced Function Call (Responses API)
+            if content is None:
+                return [], None
             function_calls.append(
                 FunctionCall(name=request.tool_choice.name, arguments=content)
             )
@@ -651,7 +652,8 @@ def _parse_tool_calls_from_content(
             and (tool_parser_cls is None or tool_parser_cls.supports_required_and_named)
         ):
             # Named function with standard JSON-based parsing
-            assert content is not None
+            if content is None:
+                return [], None
             function_calls.append(
                 FunctionCall(name=request.tool_choice.function.name, arguments=content)
             )
@@ -755,6 +757,8 @@ def _get_decoded_token(
     def _is_model_supported(self, model_name: str | None) -> bool:
         if not model_name:
             return True
+        if envs.VLLM_SKIP_MODEL_NAME_VALIDATION:
+            return True
         return self.models.is_base_model(model_name)
 
 
diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py
index 7550a02867a6..33e68b4c40e8 100644
--- a/vllm/entrypoints/openai/parser/harmony_utils.py
+++ b/vllm/entrypoints/openai/parser/harmony_utils.py
@@ -3,7 +3,6 @@
 
 import datetime
 from collections.abc import Iterable, Sequence
-from typing import Literal
 
 from openai.types.responses.tool import Tool
 from openai_harmony import (
@@ -66,7 +65,7 @@ def get_encoding():
 
 def get_system_message(
     model_identity: str | None = None,
-    reasoning_effort: Literal["high", "medium", "low"] | None = None,
+    reasoning_effort: str | None = None,
     start_date: str | None = None,
     browser_description: str | None = None,
     python_description: str | None = None,
@@ -84,6 +83,12 @@ def get_system_message(
         )
         sys_msg_content = sys_msg_content.with_model_identity(new_identity)
     if reasoning_effort is not None:
+        if reasoning_effort not in REASONING_EFFORT:
+            supported_values = ", ".join(REASONING_EFFORT)
+            raise ValueError(
+                f"reasoning_effort={reasoning_effort!r} is not supported by "
+                f"Harmony. Supported values are: {supported_values}."
+            )
         sys_msg_content = sys_msg_content.with_reasoning_effort(
             REASONING_EFFORT[reasoning_effort]
         )
diff --git a/vllm/entrypoints/openai/responses/protocol.py b/vllm/entrypoints/openai/responses/protocol.py
index 96876e3f00f8..b5d69ea1cccc 100644
--- a/vllm/entrypoints/openai/responses/protocol.py
+++ b/vllm/entrypoints/openai/responses/protocol.py
@@ -23,7 +23,9 @@
     ResponseOutputItem,
     ResponseOutputItemAddedEvent,
     ResponseOutputItemDoneEvent,
+    ResponseOutputMessage,
     ResponsePrompt,
+    ResponseReasoningItem,
     ResponseReasoningTextDeltaEvent,
     ResponseReasoningTextDoneEvent,
     ResponseStatus,
@@ -451,18 +453,21 @@ def check_cache_salt_support(cls, data):
 
     @model_validator(mode="before")
     @classmethod
-    def function_call_parsing(cls, data):
-        """Parse function_call dictionaries into ResponseFunctionToolCall objects.
-        This ensures Pydantic can properly resolve union types in the input field.
-        Function calls provided as dicts are converted to ResponseFunctionToolCall
-        objects before validation, while invalid structures are left for Pydantic
-        to reject with appropriate error messages.
-        """
+    def input_item_parsing(cls, data):
+        """Parse input items that are missing required fields or that Pydantic
+        cannot disambiguate in a Union of TypedDict / BaseModel types.
+
+        Specifically handles:
+        - function_call -> ResponseFunctionToolCall
+        - reasoning     -> ResponseReasoningItem (auto-generates id)
+        - message(role=assistant) -> ResponseOutputMessage (auto-generates
+          id/status and annotations)
 
+        Invalid structures are left for Pydantic to reject.
+        """
         input_data = data.get("input")
 
         # Early return for None, strings, or bytes
-        # (strings are iterable but shouldn't be processed)
         if input_data is None or isinstance(input_data, (str, bytes)):
             return data
 
@@ -476,16 +481,61 @@ def function_call_parsing(cls, data):
 
         processed_input = []
         for item in input_data:
-            if isinstance(item, dict) and item.get("type") == "function_call":
+            if not isinstance(item, dict):
+                processed_input.append(item)
+                continue
+
+            item_type = item.get("type")
+
+            if item_type == "function_call":
                 try:
                     processed_input.append(ResponseFunctionToolCall(**item))
                 except ValidationError:
-                    # Let Pydantic handle validation for malformed function calls
                     logger.debug(
                         "Failed to parse function_call to ResponseFunctionToolCall, "
                         "leaving for Pydantic validation"
                     )
                     processed_input.append(item)
+
+            elif item_type == "reasoning":
+                if "id" not in item:
+                    item = {**item, "id": f"rs_{random_uuid()}"}
+                try:
+                    processed_input.append(ResponseReasoningItem(**item))
+                except ValidationError:
+                    logger.debug(
+                        "Failed to parse reasoning to ResponseReasoningItem, "
+                        "leaving for Pydantic validation"
+                    )
+                    processed_input.append(item)
+
+            elif item_type == "message" and item.get("role") == "assistant":
+                item = dict(item)
+                if "id" not in item:
+                    item["id"] = f"msg_{random_uuid()}"
+                if "status" not in item:
+                    item["status"] = "completed"
+                # ResponseOutputText requires annotations
+                if isinstance(item.get("content"), list):
+                    new_content = []
+                    for c in item["content"]:
+                        if (
+                            isinstance(c, dict)
+                            and c.get("type") == "output_text"
+                            and "annotations" not in c
+                        ):
+                            c = {**c, "annotations": []}
+                        new_content.append(c)
+                    item["content"] = new_content
+                try:
+                    processed_input.append(ResponseOutputMessage(**item))
+                except ValidationError:
+                    logger.debug(
+                        "Failed to parse assistant message to ResponseOutputMessage, "
+                        "leaving for Pydantic validation"
+                    )
+                    processed_input.append(item)
+
             else:
                 processed_input.append(item)
 
diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py
index ed9204ccaf52..9c4dc48589ff 100644
--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -416,6 +416,9 @@ async def create_responses(
                 self._extract_prompt_len(engine_input),
                 self.default_sampling_params,
                 self.override_max_tokens,
+                truncate_prompt_tokens=(
+                    -1 if request.truncation != "disabled" else None
+                ),
             )
 
             sampling_params = request.to_sampling_params(
@@ -456,9 +459,13 @@ async def create_responses(
                     context = SimpleContext()
 
             if self.parser and self.parser.reasoning_parser_cls is not None:
+                chat_template_kwargs = self._effective_chat_template_kwargs(request)
+                reasoning_parser_kwargs = {
+                    "chat_template_kwargs": chat_template_kwargs,
+                }
                 reasoning_parser = self.parser.reasoning_parser_cls(
                     tokenizer,
-                    chat_template_kwargs=self._effective_chat_template_kwargs(request),
+                    chat_template_kwargs=chat_template_kwargs,
                 )
                 if (
                     isinstance(
@@ -481,6 +488,9 @@ async def create_responses(
                 lora_request=lora_request,
                 priority=request.priority,
                 trace_headers=trace_headers,
+                reasoning_parser_kwargs=reasoning_parser_kwargs
+                if self.parser and self.parser.reasoning_parser_cls is not None
+                else None,
             )
             generators.append(generator)
 
@@ -627,6 +637,7 @@ async def _generate_with_builtin_tools(
         lora_request: LoRARequest | None = None,
         priority: int = 0,
         trace_headers: Mapping[str, str] | None = None,
+        reasoning_parser_kwargs: dict[str, Any] | None = None,
     ):
         max_model_len = self.model_config.max_model_len
 
@@ -650,6 +661,7 @@ async def _generate_with_builtin_tools(
                 lora_request=lora_request,
                 trace_headers=trace_headers,
                 priority=priority,
+                reasoning_parser_kwargs=reasoning_parser_kwargs,
             )
 
             async for res in generator:
@@ -691,6 +703,9 @@ async def _generate_with_builtin_tools(
                     self._extract_prompt_len(engine_input),
                     self.default_sampling_params,  # type: ignore
                     self.override_max_tokens,  # type: ignore
+                    truncate_prompt_tokens=(
+                        -1 if context.request.truncation != "disabled" else None
+                    ),
                 )
 
             # OPTIMIZATION
diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py
index 32c3351cef80..782b2eaea24b 100644
--- a/vllm/entrypoints/serve/render/serving.py
+++ b/vllm/entrypoints/serve/render/serving.py
@@ -164,6 +164,7 @@ async def render_chat_request(
             input_length,
             self.default_sampling_params,
             self.override_max_tokens,
+            truncate_prompt_tokens=request.truncate_prompt_tokens,
         )
         params = request.to_sampling_params(max_tokens, self.default_sampling_params)
 
@@ -298,6 +299,7 @@ async def render_completion_request(
                 input_length,
                 self.default_sampling_params,
                 self.override_max_tokens,
+                truncate_prompt_tokens=request.truncate_prompt_tokens,
             )
             params = request.to_sampling_params(
                 max_tokens, self.default_sampling_params
@@ -541,7 +543,10 @@ async def preprocess_chat(
             default_template_kwargs,
             dict(
                 tools=tool_dicts,
-                tokenize=is_mistral_tokenizer(renderer.tokenizer),
+                tokenize=(
+                    is_mistral_tokenizer(renderer.tokenizer)
+                    or self.model_config.enable_prompt_embeds
+                ),
             ),
         )
 
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index e3682280ec50..cd1010457d98 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -177,7 +177,14 @@ def get_max_tokens(
     input_length: int,
     default_sampling_params: dict,
     override_max_tokens: int | None = None,
+    truncate_prompt_tokens: int | None = None,
 ) -> int:
+    if truncate_prompt_tokens is not None:
+        limit = truncate_prompt_tokens
+        input_length = min(
+            input_length,
+            max_model_len if limit == -1 else limit,
+        )
     if max_model_len < input_length:
         raise ValueError(
             f"Input length ({input_length}) exceeds model's maximum "
diff --git a/vllm/env_override.py b/vllm/env_override.py
index f0dc91e11b4c..bbd76fe4ab71 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -634,3 +634,120 @@ def _patch_fxgraphcache_pickle_if_needed():
 
 
 _patch_fxgraphcache_pickle_if_needed()
+
+# ===================================================
+# torch 2.11 Inductor cpp codegen indirect_assert scalar-mask fix
+# ===================================================
+# CppVecKernel.indirect_assert wraps a scalar mask with
+# `VecMask<...>(scalar)`, which is not a valid constructor and triggers a
+# C++ compile error during torch.compile of any model that does indirect
+# indexing inside a tail-vectorized loop (e.g. Qwen3-VL).
+# Failure looks like:
+#   no matching function for call to 'VecMask<int64_t,2>::VecMask(int&)'
+# Upstream fix in PyTorch mainline replaces the call with
+# `VecMask<...>::from(scalar)`, see pytorch/pytorch#178148 (lands in 2.12).
+# This is a thin backport for torch >= 2.11 and < 2.12; remove once the
+# minimum supported torch is 2.12.
+
+
+def _apply_cpp_indirect_assert_patch():
+    """Replace CppVecKernel.indirect_assert with a fixed copy that uses
+    `VecMask<...>::from(scalar)` for scalar masks.
+
+    Idempotent: marks the class with `_vllm_indirect_assert_patched` after
+    the first apply.
+    """
+    from torch._inductor.codegen.cpp import CppVecKernel
+
+    if getattr(CppVecKernel, "_vllm_indirect_assert_patched", False):
+        return
+
+    from torch._inductor.codegen.cpp import CppCSEVariable, cexpr_index
+
+    def patched_indirect_assert(self, var, lower, upper, mask=None):
+        assert isinstance(var, CppCSEVariable)
+        assert var.dtype is not None
+        if not var.is_vec:
+            if isinstance(mask, CppCSEVariable) and mask.is_vec:
+                mask = f"({mask}).all_masked()"
+            return super(CppVecKernel, self).indirect_assert(var, lower, upper, mask)
+        lower_scalar = lower
+        upper_scalar = upper
+        if lower:
+            lower = f"{self._get_vec_type(var.dtype)}({lower})"
+        if upper:
+            upper = f"{self._get_vec_type(var.dtype)}({upper})"
+        if lower and upper:
+            cond = f"({lower} <= {var}) & ({var} < {upper})"
+            cond_print = f"{lower_scalar} <= {var} < {upper_scalar}"
+        elif lower:
+            cond = f"{lower} <= {var}"
+            cond_print = f"{lower_scalar} <= {var}"
+        else:
+            assert upper
+            cond = f"{var} < {upper}"
+            cond_print = f"{var} < {upper_scalar}"
+        cond = f"{self._get_mask_type(var.dtype)}({cond})"
+        if mask:
+            if not mask.is_vec:
+                # Backport of pytorch/pytorch#178148 -- use ::from for
+                # scalar masks so g++ picks the correct overload.
+                mask = f"{self._get_mask_type(var.dtype)}::from({mask})"
+            cond = f"({cond}) | ~({mask})"
+        if self.tail_size:
+            cond = (
+                f"{self._get_mask_type(var.dtype)}::set("
+                f"{self._get_mask_type(var.dtype)}::from(1)"
+                f", ({cond}), {cexpr_index(self.tail_size)})"
+            )
+        cond = f"({cond}).all_masked()"
+        return f'{self.assert_function}({cond}, "index out of bounds: {cond_print}")'
+
+    CppVecKernel.indirect_assert = patched_indirect_assert
+    CppVecKernel._vllm_indirect_assert_patched = True  # type: ignore[attr-defined]
+
+
+def _patch_cpp_indirect_assert_if_needed():
+    """Apply cpp codegen indirect_assert backport when on torch 2.11.x.
+
+    Defers application until torch._inductor.codegen.cpp is naturally
+    imported by Inductor. Importing it eagerly during vllm.__init__ pulls
+    in torch._inductor.scheduler, whose top-level
+    `import torch._inductor.async_compile` can fail with
+    `ModuleNotFoundError: import of torch._inductor.async_compile halted;
+    None in sys.modules` depending on the import order on the runner
+    (observed in vLLM CPU CI).
+    """
+    if not is_torch_equal_or_newer("2.11.0") or is_torch_equal_or_newer("2.12.0.dev"):
+        return
+
+    import sys
+
+    target_name = "torch._inductor.codegen.cpp"
+    if target_name in sys.modules:
+        _apply_cpp_indirect_assert_patch()
+        return
+
+    import importlib.abc
+
+    class _CppCodegenPatchFinder(importlib.abc.MetaPathFinder):
+        def find_spec(self, fullname, path, target=None):
+            if fullname != target_name:
+                return None
+            sys.meta_path.remove(self)
+            spec = importlib.util.find_spec(fullname)
+            if spec is None or spec.loader is None:
+                return None
+            original_exec = spec.loader.exec_module
+
+            def _exec_then_patch(module):
+                original_exec(module)
+                _apply_cpp_indirect_assert_patch()
+
+            spec.loader.exec_module = _exec_then_patch  # type: ignore[method-assign]
+            return spec
+
+    sys.meta_path.insert(0, _CppCodegenPatchFinder())
+
+
+_patch_cpp_indirect_assert_if_needed()
diff --git a/vllm/envs.py b/vllm/envs.py
index 806aed2a0414..73e4b147f88b 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -45,7 +45,7 @@
     NO_COLOR: bool = False
     VLLM_LOG_STATS_INTERVAL: float = 10.0
     VLLM_TRACE_FUNCTION: int = 0
-    VLLM_USE_FLASHINFER_SAMPLER: bool | None = None
+    VLLM_USE_FLASHINFER_SAMPLER: bool = True
     VLLM_PP_LAYER_PARTITION: str | None = None
     VLLM_CPU_KVCACHE_SPACE: int | None = 0
     VLLM_CPU_OMP_THREADS_BIND: str = "auto"
@@ -126,6 +126,7 @@
     VLLM_ROCM_FP8_PADDING: bool = True
     VLLM_ROCM_MOE_PADDING: bool = True
     VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT: bool = False
+    VLLM_ROCM_USE_V4_TRITON_FALLBACK: bool = True
     VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
     VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
     VLLM_DISABLE_COMPILE_CACHE: bool = False
@@ -143,7 +144,7 @@
     VLLM_DP_RANK_LOCAL: int = -1
     VLLM_DP_SIZE: int = 1
     VLLM_USE_STANDALONE_COMPILE: bool = True
-    VLLM_ENABLE_PREGRAD_PASSES: bool = False
+    VLLM_ENABLE_PREGRAD_PASSES: bool = True
     VLLM_DP_MASTER_IP: str = ""
     VLLM_DP_MASTER_PORT: int = 0
     VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
@@ -226,6 +227,7 @@
     VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
     VLLM_SYSTEM_START_DATE: str | None = None
     VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: bool = False
+    VLLM_ENFORCE_STRICT_TOOL_CALLING: bool = False
     VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False
     VLLM_NVTX_SCOPES_FOR_PROFILING: bool = False
     VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True
@@ -245,6 +247,7 @@
     VLLM_DEBUG_WORKSPACE: bool = False
     VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False
     VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
+    VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD: int = 1024
     VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
     VLLM_USE_V2_MODEL_RUNNER: bool = False
     VLLM_LOG_MODEL_INSPECTION: bool = False
@@ -255,11 +258,16 @@
     VLLM_LORA_DISABLE_PDL: bool = False
     VLLM_ENABLE_CUDA_COMPATIBILITY: bool = False
     VLLM_CUDA_COMPATIBILITY_PATH: str | None = None
+    VLLM_SKIP_MODEL_NAME_VALIDATION: bool = False
+    """If set, vLLM will skip model name validation in API requests.
+    This allows any model name to be accepted in the 'model' field of requests,
+    making the server model-name agnostic. Useful for proxy/gateway scenarios."""
     VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: bool = False
     VLLM_ELASTIC_EP_DRAIN_REQUESTS: bool = False
     VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: bool = True
     VLLM_NIXL_EP_MAX_NUM_RANKS: int = 32
     VLLM_XPU_ENABLE_XPU_GRAPH: bool = False
+    VLLM_XPU_USE_SAMPLER_KERNEL: bool = True
     VLLM_LORA_ENABLE_DUAL_STREAM: bool = False
 
 
@@ -615,9 +623,10 @@ def _get_or_set_default() -> str:
     # The pre-grad passes get run even on cache-hit and negatively impact
     # vllm cold compile times by O(1s)
     # Can remove this after the following issue gets fixed
+    # TODO(luka): maybe_inplace requires this
     # https://github.com/pytorch/pytorch/issues/174502
     "VLLM_ENABLE_PREGRAD_PASSES": lambda: (
-        os.environ.get("VLLM_ENABLE_PREGRAD_PASSES", "0") == "1"
+        os.environ.get("VLLM_ENABLE_PREGRAD_PASSES", "1") == "1"
     ),
     # Debug pattern matching inside custom passes.
     # Should be set to the fx.Node name (e.g. 'getitem_34' or 'scaled_mm_3').
@@ -712,11 +721,13 @@ def _get_or_set_default() -> str:
     # If set to 1, vllm will trace function calls
     # Useful for debugging
     "VLLM_TRACE_FUNCTION": lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
-    # If set, vllm will use flashinfer sampler
+    # Whether to use the FlashInfer top-k / top-p sampler on CUDA. Enabled
+    # by default when the hardware supports it — set to 0 to opt out
+    # explicitly, which forces the PyTorch-native (Triton for bs>=8) path.
     "VLLM_USE_FLASHINFER_SAMPLER": lambda: (
         bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]))
         if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ
-        else None
+        else True
     ),
     # Pipeline stage partition strategy
     "VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
@@ -773,9 +784,8 @@ def _get_or_set_default() -> str:
     ),
     # When True and distributed_executor_backend="ray", use RayExecutorV2
     # (MQ-based) instead of RayDistributedExecutor (compiled-graph backend).
-    # TODO (jeffreywang): Enabled by default in vLLM 0.20.0.
     "VLLM_USE_RAY_V2_EXECUTOR_BACKEND": lambda: bool(
-        int(os.getenv("VLLM_USE_RAY_V2_EXECUTOR_BACKEND", "0"))
+        int(os.getenv("VLLM_USE_RAY_V2_EXECUTOR_BACKEND", "1"))
     ),
     # Use dedicated multiprocess context for workers.
     # Both spawn and fork work
@@ -997,6 +1007,7 @@ def _get_or_set_default() -> str:
     # use aiter linear op if aiter ops are enabled
     # The following list of related ops
     # - scaled_mm (per-tensor / rowwise)
+    # - use aiter tuned gemms for unquantized gemms
     "VLLM_ROCM_USE_AITER_LINEAR": lambda: (
         os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True").lower() in ("true", "1")
     ),
@@ -1067,6 +1078,32 @@ def _get_or_set_default() -> str:
     "VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT": lambda: (
         os.getenv("VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT", "False").lower() in ("true", "1")
     ),
+    # Master switch for the pre-rebase ROCm-native code paths used by
+    # DeepSeek-V4 (DSv4-Flash-FP8). When True (default on ROCm) the model
+    # selects the validated pre-rebase implementations at four call sites:
+    #
+    #   1. SWA K-cache writer: torch reference
+    #      (``_deepseek_v4_qnorm_rope_kv_insert_reference``) instead of
+    #      upstream's HIPified ``fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert``
+    #      C++ kernel, whose FP8 dtype is selected at compile time
+    #      (``HIP_FP8_TYPE_OCP``) and silently corrupts every K byte on
+    #      MI300X (FNUZ-only). This is the regression fix; the other three
+    #      below are kept for defense in depth and bisection.
+    #   2. MLA decode: ``flash_mla_with_kvcache_rocm`` Triton kernel
+    #      (95% GSM8K validated) instead of upstream's
+    #      ``rocm_forward_decode_fallback``.
+    #   3. MLA sparse prefill: ``flash_mla_sparse_fwd_rocm`` Triton kernel
+    #      instead of upstream's ``rocm_sparse_attn_prefill``.
+    #   4. Sparse indexer: recovered ``rocm_sparse_attn_indexer_no_insert``
+    #      orchestration instead of upstream's
+    #      ``rocm_aiter_sparse_attn_indexer_native``.
+    #
+    # Set to "0" to opt back into the upstream paths for bisection / perf
+    # comparison (note: requires the SWA writer fix below to also be in place
+    # — flipping this alone reproduces the deterministic-garbage regression).
+    "VLLM_ROCM_USE_V4_TRITON_FALLBACK": lambda: (
+        os.getenv("VLLM_ROCM_USE_V4_TRITON_FALLBACK", "True").lower() in ("true", "1")
+    ),
     # Custom quick allreduce kernel for MI3* cards
     # Choice of quantization level: FP, INT8, INT6, INT4 or NONE
     # Recommended for large models to get allreduce
@@ -1222,8 +1259,8 @@ def _get_or_set_default() -> str:
     # if 1, force use indexed gemm
     # if 0, force use grouped gemm
     # if None, choose better gemm type automatically
-    "VLLM_HUMMING_MOE_GEMM_TYPE": lambda: maybe_convert_bool(
-        os.environ.get("VLLM_HUMMING_MOE_GEMM_TYPE", None)
+    "VLLM_HUMMING_MOE_GEMM_TYPE": lambda: os.environ.get(
+        "VLLM_HUMMING_MOE_GEMM_TYPE", None
     ),
     # Whether to use DeepEPLL kernels for NVFP4 quantization and dispatch method
     # only supported on Blackwell GPUs and with
@@ -1584,6 +1621,12 @@ def _get_or_set_default() -> str:
     "VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY": lambda: bool(
         int(os.getenv("VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY", "0"))
     ),
+    # When 1,the model structural tags will be used to enforce the model
+    # output conforming to the model's tool-calling format and schema.
+    # Default 0 (off).
+    "VLLM_ENFORCE_STRICT_TOOL_CALLING": lambda: bool(
+        int(os.getenv("VLLM_ENFORCE_STRICT_TOOL_CALLING", "0"))
+    ),
     # Add optional custom scopes for profiling, disable to avoid overheads
     "VLLM_CUSTOM_SCOPES_FOR_PROFILING": lambda: bool(
         int(os.getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))
@@ -1620,9 +1663,18 @@ def _get_or_set_default() -> str:
     "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL": lambda: bool(
         int(os.getenv("VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL", "0"))
     ),
-    # The number of SMs to allocate for communication kernels when running DBO
-    # the rest of the SMs on the device will be allocated to compute
-    "VLLM_DBO_COMM_SMS": lambda: int(os.getenv("VLLM_DBO_COMM_SMS", "20")),
+    # The number of SMs/CUs to allocate for communication kernels when
+    # running DBO; the rest will be allocated to compute.
+    # Default: 20 on CUDA (SMs), 64 on ROCm (CUs).
+    "VLLM_DBO_COMM_SMS": lambda: int(
+        os.getenv(
+            "VLLM_DBO_COMM_SMS",
+            "64"
+            if hasattr(__import__("torch").version, "hip")
+            and __import__("torch").version.hip is not None
+            else "20",
+        )
+    ),
     # Enable max_autotune & coordinate_descent_tuning in inductor_config
     # to compile static shapes passed from compile_sizes in compilation_config
     # If set to 1, enable max_autotune; By default, this is enabled (1)
@@ -1662,6 +1714,17 @@ def _get_or_set_default() -> str:
     "VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD": lambda: int(
         int(os.getenv("VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD", 256))
     ),
+    # Token-count cutoff for multi-stream overlap of the attention input
+    # GEMM with auxiliary GEMMs (e.g. fused_wqa_wkv overlapped with indexer
+    # weights / kv-score projections in DeepSeek-V4). At or below this many
+    # tokens the FP8 main GEMM has idle SMs to share with the bf16 aux GEMMs
+    # and overlap is a 5-45% win; above it the FP8 GEMM saturates the device
+    # and the cross-stream sync becomes pure overhead. Set to 0 to disable
+    # the multi-stream path entirely. See #PR 41526 for the empirical result
+    # for the default value of 1024 tokens.
+    "VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD": lambda: int(
+        os.getenv("VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD", "1024")
+    ),
     # Format for saving torch.compile cache artifacts
     # - "binary": saves as binary file
     #     Safe for multiple vllm serve processes accessing the same torch compile cache.
@@ -1708,6 +1771,14 @@ def _get_or_set_default() -> str:
     "VLLM_CUDA_COMPATIBILITY_PATH": lambda: os.environ.get(
         "VLLM_CUDA_COMPATIBILITY_PATH", None
     ),
+    # Skip model name validation in OpenAI API requests.
+    # When set to 1, any model name will be accepted in the 'model' field
+    # of API requests. This is useful for proxy/gateway scenarios where
+    # the actual model is served but different names may be used in requests.
+    "VLLM_SKIP_MODEL_NAME_VALIDATION": lambda: (
+        os.getenv("VLLM_SKIP_MODEL_NAME_VALIDATION", "0").strip().lower()
+        in ("1", "true")
+    ),
     # Whether it is a scale up launch engine for elastic EP,
     # Should only be set by EngineCoreClient.
     "VLLM_ELASTIC_EP_SCALE_UP_LAUNCH": lambda: bool(
@@ -1732,6 +1803,10 @@ def _get_or_set_default() -> str:
     "VLLM_XPU_ENABLE_XPU_GRAPH": lambda: bool(
         int(os.getenv("VLLM_XPU_ENABLE_XPU_GRAPH", "0"))
     ),
+    # whether use xpu specific sample kernel
+    "VLLM_XPU_USE_SAMPLER_KERNEL": lambda: bool(
+        int(os.getenv("VLLM_XPU_USE_SAMPLER_KERNEL", "1"))
+    ),
     # Enable simple KV offload.
     "VLLM_USE_SIMPLE_KV_OFFLOAD": lambda: bool(
         int(os.getenv("VLLM_USE_SIMPLE_KV_OFFLOAD", "0"))
@@ -1883,6 +1958,7 @@ def compile_factors() -> dict[str, object]:
         "VLLM_TEST_FORCE_LOAD_FORMAT",
         "VLLM_ENABLE_CUDA_COMPATIBILITY",
         "VLLM_CUDA_COMPATIBILITY_PATH",
+        "VLLM_SKIP_MODEL_NAME_VALIDATION",
         "LOCAL_RANK",
         "CUDA_VISIBLE_DEVICES",
         "NO_COLOR",
diff --git a/vllm/inputs/engine.py b/vllm/inputs/engine.py
index 2b426eba8f00..1c12fbc2c552 100644
--- a/vllm/inputs/engine.py
+++ b/vllm/inputs/engine.py
@@ -71,12 +71,27 @@ class EmbedsInput(_InputOptions):
     prompt: NotRequired[str]
     """The prompt text corresponding to the token IDs, if available."""
 
+    prompt_token_ids: NotRequired[list[int]]
+    """Token IDs of the rendered prompt. Only set for mixed-mode inputs
+    (chat completion with `prompt_embeds` content parts). When present,
+    `is_token_ids` MUST also be present and have the same length. 
+    For pure-embeds inputs this field is absent."""
+
+    is_token_ids: NotRequired[list[bool]]
+    """Per-position mask for mixed-mode inputs. `True` means the position
+    is a real token ID (use the model's embedding layer); `False` means
+    the position uses a pre-computed embedding row from `prompt_embeds`.
+    Length MUST equal `len(prompt_token_ids)`.
+    For pure-embeds inputs this field is absent."""
+
 
 def embeds_input(
     prompt_embeds: "torch.Tensor",
     *,
     prompt: str | None = None,
     cache_salt: str | None = None,
+    prompt_token_ids: list[int] | None = None,
+    is_token_ids: list[bool] | None = None,
 ) -> EmbedsInput:
     """
     Construct [`EmbedsInput`][vllm.inputs.engine.EmbedsInput]
@@ -88,6 +103,10 @@ def embeds_input(
         inputs["prompt"] = prompt
     if cache_salt is not None:
         inputs["cache_salt"] = cache_salt
+    if prompt_token_ids is not None:
+        inputs["prompt_token_ids"] = prompt_token_ids
+    if is_token_ids is not None:
+        inputs["is_token_ids"] = is_token_ids
 
     return inputs
 
diff --git a/vllm/inputs/llm.py b/vllm/inputs/llm.py
index ff22af819a28..918098b758ca 100644
--- a/vllm/inputs/llm.py
+++ b/vllm/inputs/llm.py
@@ -125,6 +125,17 @@ class EmbedsPrompt(_PromptOptions):
     prompt: NotRequired[str]
     """The prompt text corresponding to the token embeddings, if available."""
 
+    prompt_token_ids: NotRequired[list[int]]
+    """Token IDs for mixed-mode inputs (chat completion with
+    `prompt_embeds` content parts). The tokens at positions where 
+    `prompt_is_token_ids` is `False` are placeholder tokens that 
+    get replaced by entries from `prompt_embeds` in the forward pass."""
+
+    prompt_is_token_ids: NotRequired[list[bool]]
+    """Per-position mask, `True` uses the real token ID, `False` uses
+    the corresponding entry from `prompt_embeds`. 
+    Must be the same length as `prompt_token_ids` when both are set."""
+
 
 DecoderOnlyPrompt: TypeAlias = (
     str | TextPrompt | list[int] | TokensPrompt | EmbedsPrompt
diff --git a/vllm/ir/op.py b/vllm/ir/op.py
index 5d7c01be1bbc..841df0f9adf4 100644
--- a/vllm/ir/op.py
+++ b/vllm/ir/op.py
@@ -4,7 +4,7 @@
 import inspect
 from collections.abc import Callable
 from pathlib import Path
-from typing import Any, ClassVar, overload
+from typing import Any, ClassVar, Literal, overload
 
 import torch
 from torch.library import Library, infer_schema
@@ -46,35 +46,51 @@ def enable_torch_wrap(enable: bool = True):
         _ENABLE_TORCH_WRAP = old
 
 
-# 0-param decorator overload
+# 0-param decorator overload (no inplace)
 @overload
 def register_op(f: Callable[..., Any]) -> "IrOp": ...
 
 
-# parametrized decorator overload
+# parametrized decorator with allow_inplace=False (default)
 @overload
 def register_op(
     *,
     name: str | None = None,
+    activations: list[str] | None = None,
+    allow_inplace: Literal[False] = False,
 ) -> Callable[[Callable[..., Any]], "IrOp"]: ...
 
 
+# parametrized decorator with allow_inplace=True
+@overload
+def register_op(
+    *,
+    name: str | None = None,
+    activations: list[str] | None = None,
+    allow_inplace: Literal[True],
+) -> Callable[[Callable[..., Any]], "IrOpInplace"]: ...
+
+
 def register_op(
     f: Callable | None = None,
     *,
     name: str | None = None,
+    activations: list[str] | None = None,
+    allow_inplace: bool = False,
 ) -> "IrOp | Callable[[Callable], IrOp]":
     """
     Register a new vLLM IR op.
 
     :param f: the native implementation of the op
     :param name: the name of the op, defaults to the function name
+    :param activations: list of activation params, defaults to params starting with 'x'
+    :param allow_inplace: add a maybe_inplace overload that allows inplace impls
     :return: the IrOp object if f is provided, otherwise a decorator
 
     Example usage:
     ```python
     @vllm.ir.register_op
-    def my_op(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    def my_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         return x + y
 
 
@@ -85,7 +101,10 @@ def multiply(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     def decorator(_f: Callable):
         op_name: str = _f.__name__ if name is None else name
         assert op_name not in IrOp.registry
-        op = IrOp(op_name, _f)
+        if allow_inplace:
+            op: IrOp = IrOpInplace(op_name, _f, activations)
+        else:
+            op = IrOp(op_name, _f, activations)
         IrOp.registry[op_name] = op
         return op
 
@@ -100,8 +119,14 @@ class IrOp:
 
     name: str
     impls: dict[str, "IrOpImpl"]
+    allow_inplace: bool = False
 
-    def __init__(self, name: str, native_impl: Callable):
+    def __init__(
+        self,
+        name: str,
+        native_impl: Callable,
+        activations: list[str] | None = None,
+    ):
         self._py_signature = inspect.signature(native_impl)
         if any(
             p.kind == inspect.Parameter.KEYWORD_ONLY
@@ -112,8 +137,22 @@ def __init__(self, name: str, native_impl: Callable):
                 f"supported. That's because kwargs are not allowed during lowering."
             )
 
+        # By convention, we consider parameters starting with 'x' as activations.
+        if activations is None:
+            activations = [
+                p.name
+                for p in self._py_signature.parameters.values()
+                if p.name.startswith("x")
+            ]
+
         self.name = name
         self.impls: dict[str, IrOpImpl] = {}
+        self.activations = activations
+        self.activation_indices = [
+            i
+            for i, p in enumerate(self._py_signature.parameters.values())
+            if p.name in activations
+        ]
         self._priority_impls: list[IrOpImpl] = []
         self._schema_str = infer_schema(native_impl, mutates_args=[])
         self._input_generator: InputGenerator | None = None
@@ -121,7 +160,12 @@ def __init__(self, name: str, native_impl: Callable):
 
         # native implementation
         self.impls["native"] = IrOpImpl(
-            self, "native", native_impl, supported=True, supports_args=None
+            self,
+            "native",
+            native_impl,
+            # always supported
+            supported=True,
+            supports_args=None,
         )
 
         # By default, fake routes directly to native,
@@ -161,12 +205,14 @@ def register_impl(
         *,
         supported: bool = True,
         supports_args: Callable[..., bool] | None = None,
+        inplace: bool = False,
     ):
         """
         Register an implementation for this custom op.
         :param provider: The name of the provider, must be unique.
         :param supported: Static support check, use this to check platform support.
         :param supports_args: Dynamic arg support check, used for types and shapes.
+        :param inplace: Does this op reuse activation input memory for outputs
         :return: A decorator that registers the implementation.
 
         The decorated function must have the same semantics and signature as
@@ -193,7 +239,7 @@ def my_provider_impl(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: ...
         )
 
         def _register_impl(f: Callable):
-            impl = IrOpImpl(self, provider, f, supported, supports_args)
+            impl = IrOpImpl(self, provider, f, supported, supports_args, inplace)
             self.impls[provider] = impl
 
             if self.get_priority():
@@ -213,7 +259,10 @@ def _inner_call(self, *args, **kwargs) -> Any:
         __call__ routes straight here instead of going through torch op dispatching.
         """
         impl = self.dispatch(*args, **kwargs)
-        return impl.impl_fn(*args, **kwargs)
+
+        # Default overload must be functional,
+        # use func_impl_fn to correctly handle inplace impls.
+        return impl.func_impl_fn(*args, **kwargs)
 
     def apply_arg_defaults(self, args) -> tuple:
         """
@@ -314,6 +363,11 @@ def filter_priority_impls(p_list: list[str]) -> list[IrOpImpl]:
         old_priority_impls = self._priority_impls
         try:
             self._priority_impls = filter_priority_impls(priority)
+            logger.debug(
+                "Priority for vllm.ir.%s set to %s",
+                self.name,
+                lazy(lambda: [p.provider for p in self._priority_impls]),
+            )
             yield
         finally:
             self._priority_impls = old_priority_impls
@@ -354,6 +408,66 @@ def get_tolerance(self, dtype: torch.dtype) -> dict[str, float]:
         )
 
 
+class IrOpInplace(IrOp):
+    """IR op with inplace support via maybe_inplace."""
+
+    maybe_inplace: "IrOpInplaceOverload"
+    allow_inplace: bool = True
+
+    def __init__(
+        self,
+        name: str,
+        native_impl: Callable,
+        activations: list[str] | None = None,
+    ):
+        super().__init__(name, native_impl, activations)
+
+        # Create the inplace overload
+        self.maybe_inplace = IrOpInplaceOverload(self)
+
+
+class IrOpInplaceOverload:
+    def __init__(self, op: IrOp):
+        params, returns = op._schema_str.split(" -> ")
+        n_outputs = returns.count("Tensor")
+
+        assert returns.count("Tensor") == len(op.activations), (
+            "Inplace overload requires the same number of outputs as activations."
+        )
+
+        assert returns.count(",") == n_outputs - 1, (
+            "Inplace overload only supports Tensor outputs for now."
+        )
+
+        self.op = op
+        self.name = f"{op.name}.maybe_inplace"
+        self._schema_str = infer_schema(
+            op.impls["native"].impl_fn, mutates_args=op.activations
+        )
+
+        # torch registration
+        vllm_ir_lib.define(self.name + self._schema_str)
+        vllm_ir_lib.impl(
+            self.name, self._inner_call, dispatch_key="CompositeExplicitAutograd"
+        )
+        # fake goes to default overload for now
+        vllm_ir_lib._register_fake(self.name, self.op._fake_call)
+
+        assert hasattr(getattr(torch.ops.vllm_ir, self.op.name), "maybe_inplace")
+        self.torch_op = getattr(torch.ops.vllm_ir, self.op.name).maybe_inplace
+
+    def __call__(self, *args, **kwargs) -> Any:
+        if not _ENABLE_TORCH_WRAP:
+            return self._inner_call(*args, **kwargs)
+
+        return self.torch_op(*args, **kwargs)
+
+    def _inner_call(self, *args, **kwargs) -> Any:
+        # Calling the maybe_inplace overload means we can use inplace impls directly.
+        impl = self.op.dispatch(*args, **kwargs)
+        return impl.impl_fn(*args, **kwargs)
+
+
 class IrOpImpl:
     def __init__(
         self,
@@ -362,6 +476,7 @@ def __init__(
         impl_fn: Callable,
         supported: bool,
         supports_args: Callable[..., bool] | None,
+        inplace: bool = False,
     ):
         assert provider not in op.impls, (
             f"Implementation for provider {provider} already registered."
@@ -420,11 +535,18 @@ def __init__(
                         f"native default {op_p.default}'"
                     )
 
+        if inplace:
+            assert op.allow_inplace, (
+                f"Inplace implementation cannot be registered for op {op.name}"
+                f" that does not allow inplace."
+            )
+
         self.op = op
         self.provider = provider
         self.impl_fn = impl_fn
         self.supported = supported
         self._supports_args = supports_args
+        self.inplace = inplace
 
     @property
     def supports_all_args(self) -> bool:
@@ -449,3 +571,19 @@ def uuid(self):
         """
         sources = [Path(inspect.getfile(self.impl_fn))]
         return hash_source(*sources)
+
+    def func_impl_fn(self, *args, **kwargs) -> Any:
+        """
+        Copy any inputs in activations if this is an inplace impl,
+        to ensure functional semantics.
+        """
+        if not self.inplace:
+            return self.impl_fn(*args, **kwargs)
+
+        # copy activations to ensure functional semantics
+        new_args = list(args)
+        for i in self.op.activation_indices:
+            assert isinstance(args[i], torch.Tensor)
+            new_args[i] = args[i].clone()
+
+        return self.impl_fn(*new_args, **kwargs)
diff --git a/vllm/ir/ops/__init__.py b/vllm/ir/ops/__init__.py
index 25ad27c8a078..d4d71afef723 100644
--- a/vllm/ir/ops/__init__.py
+++ b/vllm/ir/ops/__init__.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from .layernorm import rms_norm
+from .layernorm import fused_add_rms_norm, rms_norm
 
-__all__ = ["rms_norm"]
+__all__ = ["rms_norm", "fused_add_rms_norm"]
diff --git a/vllm/ir/ops/layernorm.py b/vllm/ir/ops/layernorm.py
index 981d5e3bd836..33a71b8f853f 100644
--- a/vllm/ir/ops/layernorm.py
+++ b/vllm/ir/ops/layernorm.py
@@ -27,10 +27,46 @@ def _rms_norm_input_generator(
 ) -> tuple:
     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
     weight = torch.randn(hidden_size, dtype=dtype)
-    return (x, weight, epsilon)
+    return x, weight, epsilon
 
 
 # Reductions in rms_norm accumulate rounding error at large shapes
 # (e.g. 32768x16384), causing a few elements out of millions to exceed
 # the default float16 tolerance.
 rms_norm.override_tolerance(torch.float16, atol=1e-2, rtol=2e-3)
+
+
+@register_op(allow_inplace=True)
+def fused_add_rms_norm(
+    x: Tensor,
+    x_residual: Tensor,
+    weight: Tensor | None,
+    epsilon: float,
+    variance_size: int | None = None,
+) -> tuple[Tensor, Tensor]:
+    """Fused add and weighted root-mean-square layer normalization"""
+    orig_dtype = x.dtype
+    x = x.to(torch.float32)
+    x = x + x_residual.to(torch.float32)
+    x_residual = x.to(orig_dtype)
+
+    x_var = x if variance_size is None else x[..., :variance_size]
+    variance = x_var.pow(2).mean(dim=-1, keepdim=True)
+    x = x * torch.rsqrt(variance + epsilon)
+    if weight is not None:
+        x = x.to(weight.dtype) * weight
+    return x.to(orig_dtype), x_residual
+
+
+# fused_add_rms_norm has similar rounding error accumulation as rms_norm
+fused_add_rms_norm.override_tolerance(torch.float16, atol=1e-2, rtol=2e-3)
+
+
+@fused_add_rms_norm.register_input_generator
+def _fused_add_rms_norm_input_generator(
+    num_tokens: int, hidden_size: int, dtype: torch.dtype, epsilon: float = 1e-5
+) -> tuple:
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    x_residual = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    weight = torch.randn(hidden_size, dtype=dtype)
+    return x, x_residual, weight, epsilon
diff --git a/vllm/kernels/aiter_ops.py b/vllm/kernels/aiter_ops.py
index 14c2b87fbbdb..273bc58935b7 100644
--- a/vllm/kernels/aiter_ops.py
+++ b/vllm/kernels/aiter_ops.py
@@ -75,3 +75,72 @@ def _rms_norm_fake(x: Tensor, weight: Tensor, variance_epsilon: float) -> Tensor
 direct_register_aiter_op(
     op_name="rms_norm", op_func=_rms_norm_impl, fake_impl=_rms_norm_fake
 )
+
+rms_add_no_var_16bit_only = (
+    lambda x, x_residual, weight, epsilon, variance_size=None: variance_size is None
+    and x.dtype in (torch.float16, torch.bfloat16)
+    and (weight is None or weight.dtype == x.dtype)
+)
+"""
+AITER fused_add_rms_norm only supports 16-bit activations and no var_size override.
+Requires weight dtype to match x dtype.
+"""
+
+
+@ir.ops.fused_add_rms_norm.register_impl(
+    "aiter", supports_args=rms_add_no_var_16bit_only, supported=AITER_SUPPORTED
+)
+def fused_add_rms_norm(
+    x: Tensor,
+    x_residual: Tensor,
+    weight: Tensor | None,
+    epsilon: float,
+    variance_size: int | None = None,
+) -> tuple[Tensor, Tensor]:
+    assert variance_size is None
+    assert x.dtype in (torch.float16, torch.bfloat16)
+    if weight is None:
+        weight = torch.ones(x.shape[-1], device=x.device, dtype=x.dtype)
+    return torch.ops.vllm_aiter.fused_add_rms_norm(x, x_residual, weight, epsilon)
+
+
+def _rocm_aiter_rmsnorm2d_fwd_with_add_impl(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    variance_epsilon: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    from aiter import rmsnorm2d_fwd_with_add
+
+    # TODO can out = x and residual_out = residual to save memory?
+    #  Need to check if the kernel supports in-place residual output
+    #  (if yes set mutates_args and inplace)
+    residual_out = torch.empty_like(residual)
+    out = torch.empty_like(x)
+    rmsnorm2d_fwd_with_add(
+        out,  # output
+        x,  # input
+        residual,  # residual input
+        residual_out,  # residual output
+        weight,
+        variance_epsilon,
+    )
+    return out, residual_out
+
+
+def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    variance_epsilon: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    residual_out = torch.empty_like(residual)
+    out = torch.empty_like(x)
+    return out, residual_out
+
+
+direct_register_aiter_op(
+    op_name="fused_add_rms_norm",
+    op_func=_rocm_aiter_rmsnorm2d_fwd_with_add_impl,
+    fake_impl=_rocm_aiter_rmsnorm2d_fwd_with_add_fake,
+)
diff --git a/vllm/kernels/oink_ops.py b/vllm/kernels/oink_ops.py
index e8e3cb91f857..835cd062d037 100644
--- a/vllm/kernels/oink_ops.py
+++ b/vllm/kernels/oink_ops.py
@@ -1,6 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""This file registers Oink implementations for vLLM IR ops.
+
+vLLM does not depend on the external Oink repository/package. When an external
+plugin registers torch.library.custom_op entrypoints under the `oink::`
+namespace (e.g. via vLLM's general_plugins mechanism), these ops will be marked
+ as supported. To dispatch to those ops, set kernel_config.ir_op_priority.<op> to oink.
+Alternatively, `VLLM_USE_OINK_OPS=1` will add this to priority by default.
+"""
+
 import torch
+from torch import Tensor
 
 from vllm import ir
 from vllm.platforms import current_platform
@@ -15,7 +25,7 @@ def has_oink_op(name: str) -> bool:
     return OINK_AVAILABLE and hasattr(torch.ops.oink, name)
 
 
-def _can_view_as_2d(x: torch.Tensor) -> bool:
+def _can_view_as_2d(x: Tensor) -> bool:
     """Return True if x.view(-1, x.shape[-1]) is viewable (no copy)."""
     if x.dim() < 2:
         return False
@@ -32,7 +42,7 @@ def _can_view_as_2d(x: torch.Tensor) -> bool:
     return True
 
 
-def _is_oink_stride_compatible_2d(x_2d: torch.Tensor) -> bool:
+def _is_oink_stride_compatible_2d(x_2d: Tensor) -> bool:
     """Return True if x_2d meets Oink's pointer-path stride constraints."""
     if x_2d.dim() != 2:
         return False
@@ -67,11 +77,51 @@ def _is_oink_stride_compatible_2d(x_2d: torch.Tensor) -> bool:
     "oink", supports_args=oink_rms_supported, supported=has_oink_op("rmsnorm")
 )
 def rms_norm(
-    x: torch.Tensor,
-    weight: torch.Tensor | None,
+    x: Tensor,
+    weight: Tensor | None,
     epsilon: float,
     variance_size: int | None = None,
-) -> torch.Tensor:
+) -> Tensor:
     assert variance_size is None
     x_2d = x.view(-1, x.shape[-1])
     return torch.ops.oink.rmsnorm(x_2d, weight, epsilon).view_as(x)
+
+
+oink_add_rms_supported = (
+    lambda x, x_residual, weight, epsilon, variance_size=None: variance_size is None
+    and weight is not None
+    and x.dim() >= 2
+    and x.dtype == weight.dtype
+    and weight.is_contiguous()
+    and _can_view_as_2d(x)
+    and _is_oink_stride_compatible_2d(x.view(-1, x.shape[-1]))
+    # residual must have 2d-compatible strides and match x shape/dtype
+    and x.dtype == x_residual.dtype
+    and x.shape == x_residual.shape
+    and _can_view_as_2d(x_residual)
+    and _is_oink_stride_compatible_2d(x_residual.view(-1, x_residual.shape[-1]))
+)
+"""
+Oink fused_add_rms_norm has the same constraints as rms_norm,
+and residual must be 2d-like with compatible strides.
+"""
+
+
+@ir.ops.fused_add_rms_norm.register_impl(
+    "oink",
+    supports_args=oink_add_rms_supported,
+    supported=has_oink_op("fused_add_rms_norm"),
+    inplace=True,
+)
+def fused_add_rms_norm(
+    x: Tensor,
+    x_residual: Tensor,
+    weight: Tensor | None,
+    epsilon: float,
+    variance_size: int | None = None,
+) -> tuple[Tensor, Tensor]:
+    assert variance_size is None
+    x_2d = x.view(-1, x.shape[-1])
+    residual_2d = x_residual.view(-1, x_residual.shape[-1])
+    torch.ops.oink.fused_add_rms_norm(x_2d, residual_2d, weight, epsilon)
+    return x, x_residual
diff --git a/vllm/kernels/vllm_c.py b/vllm/kernels/vllm_c.py
index 124b02e4e27a..5c602c39843b 100644
--- a/vllm/kernels/vllm_c.py
+++ b/vllm/kernels/vllm_c.py
@@ -31,3 +31,33 @@ def rms_norm(
     output = torch.empty(x.shape, device=x.device, dtype=x.dtype)
     torch.ops._C.rms_norm(output, x, weight, epsilon)
     return output
+
+
+rms_add_no_var_size = (
+    lambda x, x_residual, weight, epsilon, variance_size=None: variance_size is None
+    and (weight is None or weight.dtype == x.dtype)
+)
+"""vLLM Kernel does not support variance_size parameter and requires
+matching input/weight dtype."""
+
+
+@ir.ops.fused_add_rms_norm.register_impl(
+    "vllm_c",
+    supports_args=rms_add_no_var_size,
+    supported=CUDA_ALIKE,
+    inplace=True,
+)
+def fused_add_rms_norm(
+    x: Tensor,
+    x_residual: Tensor,
+    weight: Tensor | None,
+    epsilon: float,
+    variance_size: int | None = None,
+) -> tuple[Tensor, Tensor]:
+    if weight is None:
+        # Kernel requires weight tensor, pass ones
+        weight = torch.ones(x.shape[-1], device=x.device, dtype=x.dtype)
+
+    assert variance_size is None
+    torch.ops._C.fused_add_rms_norm(x, x_residual, weight, epsilon)
+    return x, x_residual
diff --git a/vllm/kernels/xpu_ops.py b/vllm/kernels/xpu_ops.py
index c680c542c1df..5e7f90f70868 100644
--- a/vllm/kernels/xpu_ops.py
+++ b/vllm/kernels/xpu_ops.py
@@ -36,3 +36,31 @@ def rms_norm(
     output = torch.empty(x.shape, device=x.device, dtype=x.dtype)
     torch.ops._C.rms_norm(output, x, weight, epsilon)
     return output
+
+
+rms_add_no_var_size = (
+    lambda x, x_residual, weight, epsilon, variance_size=None: variance_size is None
+    and (weight is None or weight.dtype == x.dtype)
+)
+
+
+@ir.ops.fused_add_rms_norm.register_impl(
+    "xpu_kernels",
+    supports_args=rms_add_no_var_size,
+    supported=XPU_KERNELS_SUPPORTED,
+    inplace=True,
+)
+def fused_add_rms_norm(
+    x: Tensor,
+    x_residual: Tensor,
+    weight: Tensor | None,
+    epsilon: float,
+    variance_size: int | None = None,
+) -> tuple[Tensor, Tensor]:
+    if weight is None:
+        # Kernel requires weight tensor, pass ones
+        weight = torch.ones(x.shape[-1], device=x.device, dtype=x.dtype)
+
+    assert variance_size is None
+    torch.ops._C.fused_add_rms_norm(x, x_residual, weight, epsilon)
+    return x, x_residual
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 6d8ef2db51ad..5b9bf2d76fbb 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -17,11 +17,7 @@
 )
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.request import LoRARequest
-from vllm.lora.utils import (
-    get_adapter_absolute_path,
-    is_in_target_modules,
-    is_supported_lora_module,
-)
+from vllm.lora.utils import get_adapter_absolute_path
 
 logger = init_logger(__name__)
 
@@ -146,34 +142,6 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
                 skip_prefixes=lora_skip_prefixes,
             )
 
-            # Warn about adapter modules that will be ignored.
-            target_modules = self.lora_config.target_modules
-            expected_lora_modules_lst = list(expected_lora_modules)
-            for module_name in lora.loras:
-                if not is_supported_lora_module(module_name, expected_lora_modules_lst):
-                    logger.warning_once(
-                        "LoRA module '%s' in adapter '%s' is not in the "
-                        "model's supported LoRA target modules [%s]. "
-                        "These parameters will be ignored, which may "
-                        "cause abnormal model behavior.",
-                        module_name,
-                        lora_request.lora_path,
-                        ", ".join(sorted(expected_lora_modules_lst)),
-                    )
-                elif not is_in_target_modules(
-                    module_name,
-                    target_modules,
-                    packed_modules_mapping,
-                ):
-                    logger.warning_once(
-                        "LoRA module '%s' in adapter '%s' is not in the "
-                        "deployment-time target_modules restriction [%s]."
-                        " These parameters will be ignored.",
-                        module_name,
-                        lora_request.lora_path,
-                        ", ".join(sorted(target_modules)),
-                    )
-
         except FileNotFoundError as e:
             # FileNotFoundError should be raised if both
             # - No adapter found to download from huggingface (or in
diff --git a/vllm/model_executor/kernels/linear/__init__.py b/vllm/model_executor/kernels/linear/__init__.py
index 5d513f767f03..b9a96a035309 100644
--- a/vllm/model_executor/kernels/linear/__init__.py
+++ b/vllm/model_executor/kernels/linear/__init__.py
@@ -110,6 +110,7 @@
     AiterPreshuffledPerTokenFp8ScaledMMLinearKernel,
 )
 from vllm.model_executor.kernels.linear.scaled_mm.cpu import (
+    CPUFp8BlockScaledMMKernel,
     CPUInt8ScaledMMLinearKernel,
 )
 from vllm.model_executor.kernels.linear.scaled_mm.cutlass import (
@@ -199,6 +200,9 @@
         AiterFp8BlockScaledMMKernel,
         TritonFp8BlockScaledMMKernel,
     ],
+    PlatformEnum.CPU: [
+        CPUFp8BlockScaledMMKernel,
+    ],
 }
 
 _POSSIBLE_WFP8A16_KERNELS: dict[PlatformEnum, list[type[FP8ScaledMMLinearKernel]]] = {
diff --git a/vllm/model_executor/kernels/linear/scaled_mm/__init__.py b/vllm/model_executor/kernels/linear/scaled_mm/__init__.py
index e86684b2f8a1..f8f12f7b0cba 100644
--- a/vllm/model_executor/kernels/linear/scaled_mm/__init__.py
+++ b/vllm/model_executor/kernels/linear/scaled_mm/__init__.py
@@ -8,6 +8,7 @@
     Fp8BlockScaledMMLinearKernel,
 )
 from vllm.model_executor.kernels.linear.scaled_mm.cpu import (
+    CPUFp8BlockScaledMMKernel,
     CPUInt8ScaledMMLinearKernel,
 )
 from vllm.model_executor.kernels.linear.scaled_mm.cutlass import (
@@ -58,4 +59,5 @@
     "ROCmFP8ScaledMMLinearKernel",
     "TritonInt8ScaledMMLinearKernel",
     "Fp8BlockScaledMMLinearKernel",
+    "CPUFp8BlockScaledMMKernel",
 ]
diff --git a/vllm/model_executor/kernels/linear/scaled_mm/aiter.py b/vllm/model_executor/kernels/linear/scaled_mm/aiter.py
index 8a8650d22135..5ded5ca798ad 100644
--- a/vllm/model_executor/kernels/linear/scaled_mm/aiter.py
+++ b/vllm/model_executor/kernels/linear/scaled_mm/aiter.py
@@ -312,6 +312,21 @@ def apply_block_scaled_mm(
         As: torch.Tensor,
         Bs: torch.Tensor,
     ) -> torch.Tensor:
+        if As.dtype != Bs.dtype:
+            from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+                _upcast_e8m0_to_fp32,
+            )
+
+            if As.dtype == torch.float8_e8m0fnu:
+                As = _upcast_e8m0_to_fp32(As).contiguous()
+            else:
+                As = As.to(torch.float32)
+
+            if Bs.dtype == torch.float8_e8m0fnu:
+                Bs = _upcast_e8m0_to_fp32(Bs).contiguous()
+            else:
+                Bs = Bs.to(torch.float32)
+
         out_dtype = self.config.out_dtype
         if self.use_triton:
             gemm_a8w8_blockscale_op = rocm_aiter_ops.triton_gemm_a8w8_blockscale
diff --git a/vllm/model_executor/kernels/linear/scaled_mm/cpu.py b/vllm/model_executor/kernels/linear/scaled_mm/cpu.py
index 3d67a73af433..083cb473aaca 100644
--- a/vllm/model_executor/kernels/linear/scaled_mm/cpu.py
+++ b/vllm/model_executor/kernels/linear/scaled_mm/cpu.py
@@ -14,6 +14,10 @@
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
 
+from .BlockScaledMMLinearKernel import (
+    Fp8BlockScaledMMLinearKernel,
+    FP8ScaledMMLinearLayerConfig,
+)
 from .ScaledMMLinearKernel import (
     Int8ScaledMMLinearKernel,
     Int8ScaledMMLinearLayerConfig,
@@ -215,3 +219,109 @@ def _apply_weights_sgl(
             x.dtype,
             True,
         )
+
+
+class CPUFp8BlockScaledMMKernel(Fp8BlockScaledMMLinearKernel):
+    """FP8 W8A16 block-quantized GEMM via AMX BRGEMM on CPU."""
+
+    # Input stays BF16 — no FP8 activation quantization.
+    apply_input_quant = False
+
+    @classmethod
+    def is_supported(
+        cls, compute_capability: int | None = None
+    ) -> tuple[bool, str | None]:
+        if not current_platform.is_cpu():
+            return False, "requires CPU platform."
+        if not torch.cpu._is_amx_tile_supported():
+            return False, "requires AMX tile support (Sapphire Rapids or newer)."
+        if not ops._supports_cpu_fp8_w8a16:
+            return False, "fp8_scaled_mm_cpu op not available."
+        return True, None
+
+    @classmethod
+    def can_implement(
+        cls, config: FP8ScaledMMLinearLayerConfig
+    ) -> tuple[bool, str | None]:
+        # Validate weight block shape
+        weight_gs = config.weight_quant_key.scale.group_shape
+        if weight_gs.col <= 0 or weight_gs.col != 128:
+            return False, (
+                "CPU FP8 kernel requires K-dimension block size of 128, "
+                f"got {weight_gs.col}."
+            )
+        if weight_gs.row <= 0 or weight_gs.row % 32 != 0:
+            return False, (
+                "CPU FP8 kernel requires N-dimension block size to be "
+                f"a positive multiple of 32, got {weight_gs.row}."
+            )
+        if config.out_dtype not in (torch.bfloat16, torch.float32):
+            return False, "Only bfloat16/float32 output dtype supported."
+        return True, None
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # Skip the base class process (FP8 padding / fnuz normalization)
+        # which is GPU-oriented.  Instead, VNNI-prepack weights for AMX.
+        params = self._get_layer_params(layer)
+        packed_weight = torch.ops._C.convert_weight_packed(params.weight)
+        replace_parameter(
+            layer,
+            params.WEIGHT,
+            torch.nn.Parameter(packed_weight, requires_grad=False),
+        )
+
+        # Re-wrap scale as a plain Parameter so the kernel can read it
+        # without weight-loader metadata interfering.
+        scale_attr = (
+            params.WEIGHT_SCALE_INV
+            if params.weight_scale_inv is not None
+            else params.WEIGHT_SCALE
+        )
+        weight_scale = (
+            params.weight_scale_inv
+            if params.weight_scale_inv is not None
+            else params.weight_scale
+        )
+        assert weight_scale is not None
+        replace_parameter(
+            layer,
+            scale_attr,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False),
+        )
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        params = self._get_layer_params(layer)
+        weight_scale = (
+            params.weight_scale_inv
+            if params.weight_scale_inv is not None
+            else params.weight_scale
+        )
+
+        x_2d = x.reshape(-1, x.shape[-1]) if x.dim() > 2 else x
+        out = torch.ops._C.fp8_scaled_mm_cpu(
+            x_2d,
+            params.weight,
+            weight_scale,
+            list(self.weight_group_shape),
+            bias,
+            x.dtype,
+            True,  # is_vnni (weight already prepacked)
+        )
+        return out.reshape(x.shape[:-1] + (out.size(-1),)) if x.dim() > 2 else out
+
+    def apply_block_scaled_mm(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        As: torch.Tensor,
+        Bs: torch.Tensor,
+    ) -> torch.Tensor:
+        raise NotImplementedError(
+            "CPUFp8BlockScaledMMKernel overrides apply_weights directly."
+        )
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 59cc95f18c58..df9459012ae8 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -169,7 +169,9 @@ class SiluAndMulWithClamp(CustomOp):
     def __init__(self, swiglu_limit: float, *, compile_native: bool = True):
         super().__init__(compile_native=compile_native)
         self.swiglu_limit = float(swiglu_limit)
-        if current_platform.is_cuda_alike() or current_platform.is_xpu():
+        if current_platform.is_rocm():
+            self._forward_method = self.forward_native
+        elif current_platform.is_cuda_alike() or current_platform.is_xpu():
             self.op = torch.ops._C.silu_and_mul_with_clamp
         elif current_platform.is_cpu():
             self._forward_method = self.forward_native
diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py
index 4afe2319570e..9cf7ebf5cdfc 100644
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -189,12 +189,9 @@
 
 import functools
 from abc import abstractmethod
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from enum import Enum
-from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast
-
-if TYPE_CHECKING:
-    from flashinfer import BatchPrefillWithRaggedKVCacheWrapper
+from typing import ClassVar, Generic, TypeVar, cast
 
 import torch
 import torch.nn as nn
@@ -242,7 +239,7 @@
     kNvfp4Dynamic,
 )
 from vllm.platforms import current_platform
-from vllm.utils.flashinfer import has_flashinfer, has_nvidia_artifactory
+from vllm.utils.flashinfer import has_flashinfer
 from vllm.utils.math_utils import cdiv, round_down
 from vllm.utils.torch_utils import (
     LayerNameType,
@@ -262,11 +259,12 @@
     MLAAttentionImpl,
     SparseMLAAttentionImpl,
 )
-from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
+from vllm.v1.attention.backends.mla.prefill import (
+    MLAPrefillBackend,
+    get_mla_prefill_backend,
+)
 from vllm.v1.attention.backends.utils import (
     get_dcp_local_seq_lens,
-    get_per_layer_parameters,
-    infer_global_hyperparameters,
     split_decodes_and_prefills,
 )
 from vllm.v1.attention.ops.common import cp_lse_ag_out_rs
@@ -456,20 +454,32 @@ def __init__(
         self.q_pad_num_heads = getattr(self.impl, "q_pad_num_heads", None)
         self.use_direct_call = not current_platform.opaque_attention_op()
 
-        compilation_config = get_current_vllm_config().compilation_config
+        vllm_config = get_current_vllm_config()
+        compilation_config = vllm_config.compilation_config
         if prefix in compilation_config.static_forward_context:
             raise ValueError(f"Duplicate layer name: {prefix}")
         compilation_config.static_forward_context[prefix] = self
 
+        prefill_backend_cls = get_mla_prefill_backend(vllm_config)
+        self.prefill_backend = prefill_backend_cls(
+            num_heads=self.num_heads,
+            scale=self.scale,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            v_head_dim=self.v_head_dim,
+            vllm_config=vllm_config,
+        )
+
         self.kv_cache = torch.tensor([])
 
         self.use_sparse = use_sparse
 
-        vllm_config = get_current_vllm_config_or_none()
+        _vllm_config = get_current_vllm_config_or_none()
         self.dcp_a2a = (
-            vllm_config is not None
-            and vllm_config.parallel_config.decode_context_parallel_size > 1
-            and vllm_config.parallel_config.dcp_comm_backend == "a2a"
+            _vllm_config is not None
+            and _vllm_config.parallel_config.decode_context_parallel_size > 1
+            and _vllm_config.parallel_config.dcp_comm_backend == "a2a"
         )
 
         # Initialize q/k/v range constants.
@@ -1123,33 +1133,6 @@ class QueryLenSupport(Enum):
     VARLEN = "varlen"
 
 
-try:
-    from vllm.vllm_flash_attn import (  # type: ignore[attr-defined]
-        flash_attn_varlen_func,
-    )
-
-    is_vllm_fa = True
-except ImportError:
-    is_vllm_fa = False
-    flash_attn_varlen_func = None  # type: ignore[assignment]
-    # On ROCm, vllm_flash_attn is not available, try upstream flash_attn instead.
-    # On CUDA, vllm_flash_attn should always be available (built with vLLM),
-    # so we don't attempt the fallback there.
-    if current_platform.is_rocm():
-        try:
-            from flash_attn import flash_attn_varlen_func  # type: ignore[no-redef]
-        except ImportError:
-            logger.debug(
-                "flash_attn not available on ROCm; "
-                "MLA models using TRITON_MLA will require flash_attn. "
-                "AITER_MLA backends use aiter kernels instead."
-            )
-    elif current_platform.is_xpu():
-        from vllm._xpu_ops import xpu_ops
-
-        flash_attn_varlen_func = xpu_ops.flash_attn_varlen_func  # type: ignore[no-redef,attr-defined,assignment]
-
-
 def dynamic_per_batched_tensor_quant(
     x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn
 ):
@@ -1161,9 +1144,6 @@ def dynamic_per_batched_tensor_quant(
     return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
 
 
-logger = init_logger(__name__)
-
-
 @CustomOp.register(
     "mla_decode_concat_quant_fp8",
     dynamic_arg_dims={"decode_ql_nope": 0, "decode_q_pe": 0},
@@ -1197,9 +1177,6 @@ def forward(
     forward_hip = _make_forward(QuantFP8.forward_hip)  # type: ignore[arg-type]
 
 
-CUDNN_WORKSPACE_SIZE = 12800
-
-
 class MLACommonBackend(AttentionBackend):
     @staticmethod
     def get_name() -> str:
@@ -1268,26 +1245,9 @@ class ChunkedContextMetadata:
     query_start_loc: torch.Tensor
     max_query_len: int
     chunked_context: ChunkedContextMetadata | None = None
-    query_seq_lens: torch.Tensor | None = None
-    workspace_buffer: torch.Tensor | None = None
     q_data_type: torch.dtype | None = None
     output_dtype: torch.dtype | None = None
-
-
-@dataclass
-class FlashInferPrefillMetadata(MLACommonPrefillMetadata):
-    prefill_main: "BatchPrefillWithRaggedKVCacheWrapper | None" = None
-    prefill_chunks: "list[BatchPrefillWithRaggedKVCacheWrapper]" = field(
-        default_factory=list
-    )
-
-
-@dataclass
-class CudnnPrefillMetadata(MLACommonPrefillMetadata):
-    class ChunkedContextMetadata(MLACommonPrefillMetadata.ChunkedContextMetadata):
-        seq_lens: torch.Tensor
-
-    cudnn_workspace: torch.Tensor | None = None
+    prefill_backend: MLAPrefillBackend | None = None
 
 
 @dataclass
@@ -1333,13 +1293,8 @@ class MLACommonMetadata(AttentionMetadata, Generic[D]):
     # The dimension of the attention heads
     head_dim: int | None = None
 
+    prefill: MLACommonPrefillMetadata | None = None
     decode: D | None = None
-    prefill: (
-        MLACommonPrefillMetadata
-        | FlashInferPrefillMetadata
-        | CudnnPrefillMetadata
-        | None
-    ) = None
 
     def __post_init__(self):
         if self.head_dim is not None and not MLACommonBackend.supports_head_size(
@@ -1352,64 +1307,6 @@ def __post_init__(self):
 A = TypeVar("A", bound=AttentionMetadata)
 
 
-def is_deepseek_r1_mla_compatible(vllm_config: VllmConfig) -> bool:
-    # Check if model has DeepSeek R1 compatible MLA dimensions:
-    # qk_nope_head_dim = 128, qk_rope_head_dim = 64, v_head_dim = 128
-    # which results in query/key head dim = 192.
-    if vllm_config.model_config is None:
-        return False
-    hf_text_config = vllm_config.model_config.hf_text_config
-    qk_nope_head_dim = getattr(hf_text_config, "qk_nope_head_dim", 1)
-    qk_rope_head_dim = getattr(hf_text_config, "qk_rope_head_dim", 1)
-    v_head_dim = getattr(hf_text_config, "v_head_dim", 1)
-    return qk_nope_head_dim == 128 and qk_rope_head_dim == 64 and v_head_dim == 128
-
-
-@functools.cache
-def use_flashinfer_prefill() -> bool:
-    from vllm.config import get_current_vllm_config
-
-    vllm_config = get_current_vllm_config()
-    if not (
-        not vllm_config.attention_config.disable_flashinfer_prefill
-        and has_flashinfer()
-        and not vllm_config.attention_config.use_cudnn_prefill
-        and current_platform.is_device_capability_family(100)
-    ):
-        return False
-
-    return is_deepseek_r1_mla_compatible(vllm_config)
-
-
-@functools.cache
-def use_cudnn_prefill() -> bool:
-    from vllm.config import get_current_vllm_config
-
-    vllm_config = get_current_vllm_config()
-    return (
-        has_flashinfer()
-        and vllm_config.attention_config.use_cudnn_prefill
-        and current_platform.is_device_capability_family(100)
-        and has_nvidia_artifactory()
-    )
-
-
-@functools.cache
-def use_trtllm_ragged_deepseek_prefill() -> bool:
-    """Check if TRT-LLM ragged DeepSeek prefill should be used."""
-    from vllm.config import get_current_vllm_config
-
-    vllm_config = get_current_vllm_config()
-    if not (
-        has_flashinfer()
-        and vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill
-        and current_platform.is_device_capability_family(100)
-    ):
-        return False
-
-    return is_deepseek_r1_mla_compatible(vllm_config)
-
-
 @dataclass
 class MLADims:
     q_lora_rank: int | None
@@ -1447,15 +1344,14 @@ def get_mla_dims(model_config: ModelConfig) -> MLADims:
 
 @functools.cache
 def backend_supports_prefill_query_quantization() -> bool:
-    """Check if the selected MLA backend supports prefill query quantization.
+    """Check if the selected MLA prefill backend supports query quantization.
 
     Currently supported backends:
-    - FlashInfer prefill
-    - TRT-LLM ragged DeepSeek prefill
+    - FlashInfer
+    - TRT-LLM Ragged
 
     Not supported:
-    - cuDNN Prefill
-    - FlashAttention
+    - FlashAttention (FA3/FA4)
     - Non-GB200 devices (FP8 prefill requires device capability 100)
     """
     # FP8 prefill query quantization requires GB200 (device capability 100)
@@ -1463,7 +1359,15 @@ def backend_supports_prefill_query_quantization() -> bool:
     if not current_platform.is_device_capability_family(100):
         return False
 
-    return use_flashinfer_prefill() or use_trtllm_ragged_deepseek_prefill()
+    from vllm.config import get_current_vllm_config
+    from vllm.v1.attention.backends.mla.prefill import get_mla_prefill_backend
+
+    vllm_config = get_current_vllm_config()
+    backend_cls = get_mla_prefill_backend(vllm_config)
+    return backend_cls.get_name() in (
+        "FLASHINFER",
+        "TRTLLM_RAGGED",
+    )
 
 
 class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
@@ -1574,7 +1478,6 @@ def __init__(
             metadata_cls if metadata_cls is not None else MLACommonMetadata
         )
         self.kv_cache_spec = kv_cache_spec
-        scheduler_config = vllm_config.scheduler_config
         self.model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
         self.compilation_config = vllm_config.compilation_config
@@ -1634,139 +1537,21 @@ def __init__(
                 device=device,
             )
 
-        self._use_cudnn_prefill = use_cudnn_prefill()
-        self._use_fi_prefill = use_flashinfer_prefill()
-        self._use_trtllm_ragged_prefill = use_trtllm_ragged_deepseek_prefill()
-        self.prefill_metadata_cls = (
-            FlashInferPrefillMetadata
-            if self._use_fi_prefill
-            else CudnnPrefillMetadata
-            if self._use_cudnn_prefill
-            else MLACommonPrefillMetadata
-        )
-
-        if self._use_fi_prefill:
-            self._workspace_buffer = torch.empty(
-                envs.VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE,
-                dtype=torch.uint8,
-                device=device,
-            )
-
-            self._fi_prefill_main: BatchPrefillWithRaggedKVCacheWrapper | None = None
-            self._fi_prefill_chunks: list[BatchPrefillWithRaggedKVCacheWrapper] = []
-
-            self._global_hyperparameters = infer_global_hyperparameters(
-                get_per_layer_parameters(vllm_config, layer_names, MLACommonImpl)  # type: ignore[type-abstract]
-            )
-
-        if self._use_trtllm_ragged_prefill:
-            self._workspace_buffer = torch.empty(
-                envs.VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE,
-                dtype=torch.uint8,
-                device=device,
-            )
-
-        if self._use_cudnn_prefill:
-            self.cudnn_workspace = torch.empty(
-                CUDNN_WORKSPACE_SIZE * scheduler_config.max_num_seqs,
-                dtype=torch.int8,
-                device=device,
-            )
+        self._prefill_backend = self.compilation_config.static_forward_context[
+            layer_names[0]
+        ].prefill_backend
 
         supports_spec_decode = self.query_len_support != QueryLenSupport.SINGLE_ONLY
         self._init_reorder_batch_threshold(
             self.reorder_batch_threshold, supports_spec_decode, supports_dcp_with_varlen
         )
 
-        # Validate consistency between query_len_support and reorder_batch_threshold
         if self.query_len_support == QueryLenSupport.SINGLE_ONLY:
             assert self.reorder_batch_threshold == 1, (
                 f"reorder_batch_threshold must be 1 when query_len_support is "
                 f"SINGLE_ONLY, got {self.reorder_batch_threshold}"
             )
 
-    def _build_fi_prefill_wrappers(self, prefill: FlashInferPrefillMetadata):
-        qo_indptr = prefill.query_start_loc
-
-        has_context = False
-        if prefill.chunked_context is not None:
-            chunked_context = prefill.chunked_context
-            has_context = True
-
-        if self._fi_prefill_main is None:
-            from flashinfer import BatchPrefillWithRaggedKVCacheWrapper
-
-            self._fi_prefill_main = BatchPrefillWithRaggedKVCacheWrapper(
-                self._workspace_buffer, "NHD", backend="cutlass"
-            )
-
-        if has_context:
-            num_chunks = chunked_context.cu_seq_lens.shape[0]
-            # Allocate more prefill chunk wrappers if needed
-            if len(self._fi_prefill_chunks) < num_chunks:
-                from flashinfer import BatchPrefillWithRaggedKVCacheWrapper
-
-                for _ in range(len(self._fi_prefill_chunks), num_chunks):
-                    self._fi_prefill_chunks.append(
-                        BatchPrefillWithRaggedKVCacheWrapper(
-                            self._workspace_buffer, "NHD", backend="cutlass"
-                        )
-                    )
-            assert num_chunks <= len(self._fi_prefill_chunks)
-
-        # In MLA, the non-latent num_qo_heads == num_kv_heads
-        num_qo_heads = self.num_heads
-        num_kv_heads = num_qo_heads
-
-        # Sanity: Verify that num_kv_heads == 1 since it is latent space
-        assert self.kv_cache_spec.num_kv_heads == 1
-
-        # Get non-latent head_dim_qk and head_dim_vo
-        head_dim_qk = self.mla_dims.qk_nope_head_dim + self.mla_dims.qk_rope_head_dim
-        head_dim_vo = self.mla_dims.v_head_dim
-
-        # For main run, qo_indptr == kv_indptr
-        kv_indptr = qo_indptr.clone()
-
-        # Prepare main prefill
-        self._fi_prefill_main.plan(
-            qo_indptr=qo_indptr,
-            kv_indptr=kv_indptr,
-            num_qo_heads=num_qo_heads,
-            num_kv_heads=num_kv_heads,
-            head_dim_qk=head_dim_qk,
-            head_dim_vo=head_dim_vo,
-            causal=True,  # This is main run
-            sm_scale=self._global_hyperparameters.sm_scale,
-            window_left=self._global_hyperparameters.window_left,
-            logits_soft_cap=self._global_hyperparameters.logits_soft_cap,
-            q_data_type=self.q_data_type,
-            o_data_type=prefill.output_dtype,
-        )
-
-        # Prepare context prefills
-        if has_context:
-            for i in range(num_chunks):
-                kv_indptr_chunk = chunked_context.cu_seq_lens[i]
-
-                self._fi_prefill_chunks[i].plan(
-                    qo_indptr=qo_indptr,
-                    kv_indptr=kv_indptr_chunk,
-                    num_qo_heads=num_qo_heads,
-                    num_kv_heads=num_kv_heads,
-                    head_dim_qk=head_dim_qk,
-                    head_dim_vo=head_dim_vo,
-                    causal=False,  # This is context run
-                    sm_scale=self._global_hyperparameters.sm_scale,
-                    window_left=self._global_hyperparameters.window_left,
-                    logits_soft_cap=self._global_hyperparameters.logits_soft_cap,
-                    q_data_type=self.q_data_type,
-                    o_data_type=prefill.output_dtype,
-                )
-
-        prefill.prefill_main = self._fi_prefill_main
-        prefill.prefill_chunks = self._fi_prefill_chunks
-
     def _build_decode(
         self,
         block_table_tensor: torch.Tensor,
@@ -1972,18 +1757,14 @@ def build(
                         dtype=torch.int32,
                     )
 
-                chunked_context_metadata_cls = (
-                    CudnnPrefillMetadata.ChunkedContextMetadata
-                    if self._use_cudnn_prefill
-                    else MLACommonPrefillMetadata.ChunkedContextMetadata
-                )
                 prefill_tokens_with_context = None
                 if num_prefills_with_context_cpu > 0:
                     prefill_tokens_with_context = prefill_query_start_loc_cpu[
                         num_prefills_with_context_cpu
                     ].item()
+                _ChunkedMetadata = MLACommonPrefillMetadata.ChunkedContextMetadata
                 if self.dcp_world_size > 1:
-                    chunked_context_metadata = chunked_context_metadata_cls(
+                    chunked_context_metadata = _ChunkedMetadata(
                         cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True),
                         starts=local_chunk_starts.to(device, non_blocking=True),
                         seq_tot=padded_local_chunk_seq_lens.sum(dim=1).tolist(),
@@ -2004,7 +1785,7 @@ def build(
                         prefill_tokens_with_context=prefill_tokens_with_context,
                     )
                 else:
-                    chunked_context_metadata = chunked_context_metadata_cls(
+                    chunked_context_metadata = _ChunkedMetadata(
                         cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True),
                         starts=chunk_starts.to(device, non_blocking=True),
                         seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
@@ -2018,35 +1799,22 @@ def build(
                         prefill_tokens_with_context=prefill_tokens_with_context,
                     )
 
-                if self._use_cudnn_prefill:
-                    chunked_context_metadata.seq_lens = chunk_seq_lens
-
                 assert (
                     max(chunked_context_metadata.max_seq_lens)
                     <= self.chunked_prefill_workspace_size
                 )
 
-            prefill_metadata = self.prefill_metadata_cls(
+            prefill_metadata = MLACommonPrefillMetadata(
                 block_table=block_table_tensor[reqs_start:, ...],
                 query_start_loc=prefill_query_start_loc,
                 max_query_len=max_query_len,
                 chunked_context=chunked_context_metadata,
                 output_dtype=self.model_config.dtype,
                 q_data_type=self.q_data_type,
+                prefill_backend=self._prefill_backend,
             )
 
-            if self._use_cudnn_prefill:
-                assert isinstance(prefill_metadata, CudnnPrefillMetadata)
-                prefill_metadata.query_seq_lens = (
-                    prefill_query_start_loc[1:] - prefill_query_start_loc[:-1]
-                )
-                prefill_metadata.cudnn_workspace = self.cudnn_workspace
-
-            if self._use_trtllm_ragged_prefill:
-                prefill_metadata.query_seq_lens = (
-                    prefill_query_start_loc[1:] - prefill_query_start_loc[:-1]
-                )
-                prefill_metadata.workspace_buffer = self._workspace_buffer
+            self._prefill_backend.prepare_metadata(prefill_metadata)
 
         decode_metadata = None
         if num_decodes > 0:
@@ -2091,10 +1859,6 @@ def build(
             decode=decode_metadata,
         )
 
-        if self._use_fi_prefill and num_prefills > 0:
-            assert isinstance(attn_metadata.prefill, FlashInferPrefillMetadata)
-            self._build_fi_prefill_wrappers(attn_metadata.prefill)
-
         return attn_metadata  # type: ignore[return-value]
 
 
@@ -2240,308 +2004,12 @@ def __init__(
             and (self.qk_rope_head_dim == 64)
         )
 
-        if use_trtllm_ragged_deepseek_prefill():
-            logger.info_once("Using TRT-LLM ragged DeepSeek prefill for MLA")
-            self._run_prefill_context_chunk = (
-                self._run_prefill_context_chunk_trtllm_ragged
-            )
-            self._run_prefill_new_tokens = self._run_prefill_new_tokens_trtllm_ragged
-            self._pad_v = False
-        elif use_flashinfer_prefill():
-            logger.info_once("Using FlashInfer prefill for MLA")
-            self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
-            self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
-            self._pad_v = False
-        elif use_cudnn_prefill():
-            logger.info_once("Using CUDNN prefill for MLA")
-            self._run_prefill_context_chunk = self._run_prefill_context_chunk_cudnn
-            self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn
-            self._pad_v = False
-        else:  # Use FlashAttention
-            if flash_attn_varlen_func is None:
-                raise RuntimeError(
-                    "MLA attention requires FlashAttention but it is not "
-                    "available. Please install flash_attn or use "
-                    "--attention-backend ROCM_AITER_MLA."
-                )
-            logger.info_once("Using FlashAttention prefill for MLA")
-            self._run_prefill_context_chunk = self._run_prefill_context_chunk_fa
-            self._run_prefill_new_tokens = self._run_prefill_new_tokens_fa
-
-            # Handle the differences between the flash_attn_varlen from
-            # flash_attn and the one from vllm_flash_attn. The former is used on
-            # RoCM and the latter has an additional parameter to control
-            # FA2 vs FA3
-            self.flash_attn_varlen_func = flash_attn_varlen_func
-            self.vllm_flash_attn_version = get_flash_attn_version(
-                head_size=self.qk_head_dim
-            )
-            if self.vllm_flash_attn_version is not None:
-                self.flash_attn_varlen_func = functools.partial(
-                    flash_attn_varlen_func, fa_version=self.vllm_flash_attn_version
-                )
-
-            # For MLA the v head dim is smaller than qk head dim so we pad out
-            # v with 0s to match the qk head dim for attention backends that do
-            # not support different headdims.
-            # FA3 on Hopper (SM90) and FA4 natively handle diff headdims.
-            device_capability = current_platform.get_device_capability()
-            self._pad_v = self.vllm_flash_attn_version is None or not (
-                (
-                    self.vllm_flash_attn_version == 3
-                    and device_capability is not None
-                    and device_capability[0] == 9
-                )
-                or self.vllm_flash_attn_version == 4
-            )
-
         self.dcp_world_size: int = -1
 
         self.cp_kv_cache_interleave_size: int = (
             get_current_vllm_config().parallel_config.cp_kv_cache_interleave_size
         )
 
-    def _flash_attn_varlen_diff_headdims(
-        self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs
-    ):
-        maybe_padded_v = v
-        if self._pad_v:
-            maybe_padded_v = torch.nn.functional.pad(
-                v, [0, q.shape[-1] - v.shape[-1]], value=0
-            )
-
-        if is_vllm_fa:
-            kwargs["return_softmax_lse"] = return_softmax_lse
-        else:
-            # ROCm leverages the upstream flash_attn, which takes a parameter
-            # called "return_attn_probs" instead of return_softmax_lse
-            kwargs["return_attn_probs"] = return_softmax_lse
-        if envs.VLLM_BATCH_INVARIANT:
-            kwargs["num_splits"] = 1
-
-        attn_out = self.flash_attn_varlen_func(
-            q=q,
-            k=k,
-            v=maybe_padded_v,
-            softmax_scale=softmax_scale,
-            **kwargs,
-        )
-
-        # Unpack the output if there is multiple results
-        lse = None
-        if isinstance(attn_out, tuple):
-            attn_out, lse = attn_out[0], attn_out[1]
-
-        # Remain consistent with old `flash_attn_varlen_func` where there
-        # is only one output tensor if `return_softmax_lse` is False.
-        if return_softmax_lse:
-            return attn_out, lse
-        return attn_out
-
-    def _run_prefill_new_tokens_fa(
-        self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse
-    ):
-        return self._flash_attn_varlen_diff_headdims(
-            q=q,
-            k=k,
-            v=v,
-            cu_seqlens_q=prefill.query_start_loc,
-            cu_seqlens_k=prefill.query_start_loc,
-            max_seqlen_q=prefill.max_query_len,
-            max_seqlen_k=prefill.max_query_len,
-            softmax_scale=self.scale,
-            causal=True,
-            return_softmax_lse=return_softmax_lse,
-        )
-
-    def _run_prefill_new_tokens_fi(
-        self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse
-    ):
-        assert isinstance(prefill, FlashInferPrefillMetadata)
-        assert prefill.prefill_main is not None
-
-        ret = prefill.prefill_main.run(
-            q=q,
-            k=k,
-            v=v,
-            return_lse=return_softmax_lse,
-        )
-
-        if isinstance(ret, tuple):
-            return ret[0], ret[1].transpose(0, 1).contiguous()
-        return ret
-
-    def _run_prefill_new_tokens_cudnn(
-        self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse
-    ):
-        assert isinstance(prefill, CudnnPrefillMetadata)
-        assert prefill.query_seq_lens is not None
-        from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache
-
-        output, lse = cudnn_batch_prefill_with_kv_cache(
-            q=q,
-            k_cache=k,
-            v_cache=v,
-            scale=self.scale,
-            workspace_buffer=prefill.cudnn_workspace,
-            max_token_per_sequence=prefill.max_query_len,
-            max_sequence_kv=prefill.max_query_len,
-            actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
-            actual_seq_lens_kv=prefill.query_seq_lens.view(-1, 1, 1, 1),
-            causal=True,
-            # Do not support False for now
-            return_lse=True,
-            # Indicates actual_seq_lens are on GPU or CPU.
-            is_cuda_graph_compatible=True,
-        )
-        if return_softmax_lse:
-            return output, lse
-        return output
-
-    def _run_prefill_context_chunk_fa(
-        self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v
-    ):
-        assert prefill.chunked_context is not None
-        return self._flash_attn_varlen_diff_headdims(
-            q=q,
-            k=k,
-            v=v,
-            cu_seqlens_q=prefill.query_start_loc,
-            cu_seqlens_k=prefill.chunked_context.cu_seq_lens[chunk_idx],
-            max_seqlen_q=prefill.max_query_len,
-            max_seqlen_k=prefill.chunked_context.max_seq_lens[chunk_idx],
-            softmax_scale=self.scale,
-            causal=False,  # Context is unmasked
-            return_softmax_lse=True,
-        )
-
-    def _run_prefill_context_chunk_fi(
-        self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v
-    ):
-        assert isinstance(prefill, FlashInferPrefillMetadata)
-
-        attn_out, lse = prefill.prefill_chunks[chunk_idx].run(
-            q=q,
-            k=k,
-            v=v,
-            return_lse=True,
-        )
-
-        # Convert from (q_len, num_heads) to (num_heads, q_len)
-        return attn_out, lse.transpose(0, 1).contiguous()
-
-    def _run_prefill_context_chunk_cudnn(
-        self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v
-    ):
-        assert isinstance(prefill, CudnnPrefillMetadata)
-        assert prefill.chunked_context is not None
-        assert prefill.chunked_context.seq_lens[chunk_idx] is not None
-        assert prefill.query_seq_lens is not None
-        from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache
-
-        return cudnn_batch_prefill_with_kv_cache(
-            q=q,
-            k_cache=k,
-            v_cache=v,
-            scale=self.scale,
-            workspace_buffer=prefill.cudnn_workspace,
-            max_token_per_sequence=prefill.max_query_len,
-            max_sequence_kv=prefill.chunked_context.max_seq_lens[chunk_idx],
-            actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
-            actual_seq_lens_kv=prefill.chunked_context.seq_lens[chunk_idx].view(
-                -1, 1, 1, 1
-            ),
-            causal=False,
-            return_lse=True,
-            # Indicates actual_seq_lens are on GPU or CPU.
-            is_cuda_graph_compatible=True,
-        )
-
-    def _run_prefill_new_tokens_trtllm_ragged(
-        self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse
-    ):
-        """TRT-LLM ragged attention for new tokens (causal)."""
-        from flashinfer.prefill import trtllm_ragged_attention_deepseek
-
-        assert prefill.query_seq_lens is not None
-        assert prefill.workspace_buffer is not None
-        # allocate BF16 / FP16 output tensor for TRT-LLM ragged attention
-        out = torch.empty(
-            q.shape[0],
-            q.shape[1],
-            v.shape[2],
-            device=q.device,
-            dtype=prefill.output_dtype,
-        )
-
-        ret = trtllm_ragged_attention_deepseek(
-            query=q,
-            key=k,
-            value=v,
-            workspace_buffer=prefill.workspace_buffer,
-            seq_lens=prefill.query_seq_lens,
-            max_q_len=prefill.max_query_len,
-            max_kv_len=prefill.max_query_len,
-            bmm1_scale=self.scale,
-            bmm2_scale=1.0,
-            o_sf_scale=1.0,
-            batch_size=prefill.query_seq_lens.shape[0],
-            window_left=-1,
-            cum_seq_lens_q=prefill.query_start_loc,
-            cum_seq_lens_kv=prefill.query_start_loc,
-            enable_pdl=False,
-            is_causal=True,
-            return_lse=return_softmax_lse,
-            out=out,
-        )
-
-        if isinstance(ret, tuple):
-            # Convert from (q_len, num_heads) to (num_heads, q_len)
-            return ret[0], ret[1].transpose(0, 1).contiguous()
-        return ret
-
-    def _run_prefill_context_chunk_trtllm_ragged(
-        self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v
-    ):
-        """TRT-LLM ragged attention for context chunks (non-causal)."""
-        from flashinfer.prefill import trtllm_ragged_attention_deepseek
-
-        assert prefill.chunked_context is not None
-        assert prefill.chunked_context.seq_lens[chunk_idx] is not None
-        assert prefill.workspace_buffer is not None
-
-        out = torch.empty(
-            q.shape[0],
-            q.shape[1],
-            v.shape[2],
-            device=q.device,
-            dtype=prefill.output_dtype,
-        )
-
-        attn_out, lse = trtllm_ragged_attention_deepseek(
-            query=q,
-            key=k,
-            value=v,
-            workspace_buffer=prefill.workspace_buffer,
-            seq_lens=prefill.chunked_context.seq_lens[chunk_idx],
-            max_q_len=prefill.max_query_len,
-            max_kv_len=prefill.chunked_context.max_seq_lens[chunk_idx],
-            bmm1_scale=self.scale,
-            bmm2_scale=1.0,
-            o_sf_scale=1.0,
-            batch_size=prefill.chunked_context.seq_lens[chunk_idx].shape[0],
-            window_left=-1,
-            cum_seq_lens_q=prefill.query_start_loc,
-            cum_seq_lens_kv=prefill.chunked_context.cu_seq_lens[chunk_idx],
-            enable_pdl=False,
-            is_causal=False,
-            return_lse=True,
-            out=out,
-        )
-
-        # Convert from (q_len, num_heads) to (num_heads, q_len)
-        return attn_out, lse.transpose(0, 1).contiguous()
-
     def _concat_k_nope_k_pe(
         self, k_nope: torch.Tensor, k_pe: torch.Tensor
     ) -> torch.Tensor:
@@ -2582,6 +2050,7 @@ def _compute_prefill_context(
     ):
         assert attn_metadata.prefill is not None
         prefill_metadata = attn_metadata.prefill
+        assert prefill_metadata.prefill_backend is not None
         assert prefill_metadata.chunked_context is not None
 
         use_fp8_prefill = prefill_metadata.q_data_type == current_platform.fp8_dtype()
@@ -2649,12 +2118,13 @@ def _compute_prefill_context(
 
             k = self._concat_k_nope_k_pe(k_nope, k_pe)
 
-            attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
-                prefill=prefill_metadata,
-                chunk_idx=i,
-                q=q,
-                k=k,
-                v=v,
+            attn_output, attn_softmax_lse = (
+                prefill_metadata.prefill_backend.run_prefill_context_chunk(
+                    chunk_idx=i,
+                    q=q,
+                    k=k,
+                    v=v,
+                )
             )
 
             if output is None:
@@ -2687,6 +2157,7 @@ def _context_parallel_compute_prefill_context(
         assert k_scale is None, "DCP not support scaled kvcache now."
         assert attn_metadata.prefill is not None
         prefill_metadata = attn_metadata.prefill
+        assert prefill_metadata.prefill_backend is not None
         assert prefill_metadata.chunked_context is not None
         assert prefill_metadata.chunked_context.padded_local_chunk_seq_lens is not None
         assert prefill_metadata.chunked_context.local_context_lens_allranks is not None
@@ -2753,12 +2224,13 @@ def _context_parallel_compute_prefill_context(
             k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
             k = self._concat_k_nope_k_pe(k_nope, k_pe)
 
-            attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
-                prefill=prefill_metadata,
-                chunk_idx=i,
-                q=q,
-                k=k,
-                v=v,
+            attn_output, attn_softmax_lse = (
+                prefill_metadata.prefill_backend.run_prefill_context_chunk(
+                    chunk_idx=i,
+                    q=q,
+                    k=k,
+                    v=v,
+                )
             )
 
             if output is None:
@@ -2790,11 +2262,11 @@ def forward_mha(
         k_scale: torch.Tensor,
         output: torch.Tensor,
     ) -> None:
-        # TODO (zyongye): Prefill function here
         assert attn_metadata.prefill is not None
         assert self.dcp_world_size != -1
 
         prefill_metadata = attn_metadata.prefill
+        assert prefill_metadata.prefill_backend is not None
         use_fp8_prefill = prefill_metadata.q_data_type == current_platform.fp8_dtype()
 
         # Convert q to FP8 if FP8 prefill attention is enabled
@@ -2813,8 +2285,7 @@ def forward_mha(
             k = k.to(prefill_metadata.q_data_type)
             v = v.to(prefill_metadata.q_data_type)
 
-        output_prefill = self._run_prefill_new_tokens(
-            prefill=prefill_metadata,
+        output_prefill = prefill_metadata.prefill_backend.run_prefill_new_tokens(
             q=q,
             k=k,
             v=v,
@@ -2839,11 +2310,6 @@ def forward_mha(
                     q, kv_c_and_k_pe_cache, attn_metadata, k_scale
                 )
 
-            # unpad if necessary
-            if self._pad_v:
-                context_output = context_output[..., : v.shape[-1]]
-                suffix_output = suffix_output[..., : v.shape[-1]]
-
             output = output.view(-1, self.num_heads, self.v_head_dim)
             merge_attn_states(
                 output=output,
@@ -2854,7 +2320,8 @@ def forward_mha(
                 prefill_tokens_with_context=prefill_metadata.chunked_context.prefill_tokens_with_context,
             )
         else:
-            output_prefill = output_prefill[..., : v.shape[-1]].flatten(start_dim=-2)
+            assert isinstance(output_prefill, torch.Tensor)
+            output_prefill = output_prefill.flatten(start_dim=-2)
             output.copy_(output_prefill)
 
     @abstractmethod
diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py
index 3831f7aa9658..bcdd30500329 100644
--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -930,22 +930,17 @@ def enable_batch_invariant_mode():
     _batch_invariant_MODE = True
     _batch_invariant_LIB = torch.library.Library("aten", "IMPL")
 
-    if current_platform.is_device_capability_family(
-        100
-    ) or current_platform.is_device_capability_family(80):
-        # For PyTorch 2.9, B200 uses GEMV for bs=1
-        # Requires https://github.com/pytorch/pytorch/pull/166735
+    if current_platform.is_device_capability_family(80):
+        # SM80 (Ampere) cannot rely on cuBLASLt-only determinism; install the
+        # triton persistent matmul overrides for mm/addmm/matmul/linear.
         _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")
         _batch_invariant_LIB.impl("aten::addmm", addmm_batch_invariant, "CUDA")
         _batch_invariant_LIB.impl("aten::matmul", matmul_batch_invariant, "CUDA")
         _batch_invariant_LIB.impl("aten::linear", linear_batch_invariant, "CUDA")
-
-        # Query the shared memory size and set block size
-        # accordingly to avoid triton OutOfResources
-        _fp16_block_size_n = 256 if get_max_shared_memory_bytes() > 106496 else 128
     else:
-        # Only source of batch invariance for Hopper is split-k, can disable through
-        # cuBLAS workspace config
+        # Hopper (SM90) and Blackwell (SM100): the only source of batch
+        # variance is split-k, which we disable via the cuBLAS workspace
+        # config.
         _original_cublas_workspace_cfg = os.environ.get("CUBLAS_WORKSPACE_CONFIG", None)
         _original_cublaslt_workspace_size = os.environ.get(
             "CUBLASLT_WORKSPACE_SIZE", None
@@ -953,6 +948,11 @@ def enable_batch_invariant_mode():
         os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
         os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1"
 
+    # Triton bmm/persistent-matmul kernels read this for the FP16 N-tile size;
+    # set unconditionally because bmm is overridden on all CUDA platforms.
+    if current_platform.is_cuda():
+        _fp16_block_size_n = 256 if get_max_shared_memory_bytes() > 106496 else 128
+
     _batch_invariant_LIB.impl(
         "aten::_log_softmax", _log_softmax_batch_invariant, "CUDA"
     )
diff --git a/vllm/model_executor/layers/deepseek_compressor.py b/vllm/model_executor/layers/deepseek_compressor.py
index 1bf4a4ac52e0..48628fec46e0 100644
--- a/vllm/model_executor/layers/deepseek_compressor.py
+++ b/vllm/model_executor/layers/deepseek_compressor.py
@@ -14,9 +14,8 @@
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
 )
-from vllm.model_executor.layers.utils import cublas_gemm_bf16_bf16_fp32
 from vllm.platforms import current_platform
-from vllm.triton_utils import maybe_launch_pdl, tl, triton
+from vllm.triton_utils import tl, triton
 from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionCGSupport,
@@ -271,16 +270,12 @@ def __init__(
 
     def forward(
         self,
-        # [num_tokens, hidden_size]
-        x: torch.Tensor,
+        # [num_tokens, 2 * self.coff * self.head_dim]
+        kv_score: torch.Tensor,
         # [num_tokens]
         positions: torch.Tensor,
         rotary_emb,
     ) -> None:
-        num_tokens, _ = x.shape
-        # bf16 weights/activations but fp32 output for numerical stability of
-        # the downstream compressor math.
-        kv_score = cublas_gemm_bf16_bf16_fp32(x, self.fused_wkv_wgate.weight)
         # Each of shape [num_tokens, coff * self.head_dim]
         # input bf16, output are fp32
         kv, score = kv_score.split(
@@ -305,6 +300,7 @@ def forward(
         state_cache = self.state_cache.kv_cache
         # kv_state stored in first half, score_state stored in second half
         state_width = state_cache.shape[-1] // 2
+        pdl_kwargs = {} if current_platform.is_rocm() else {"launch_pdl": False}
 
         # Store the KV and score (with fused APE addition) in the state.
         # NOTE: PDL is disabled — both this kernel and _fused_kernel below
@@ -329,10 +325,7 @@ def forward(
             TRITON_BLOCK_SIZE=triton.next_power_of_2(kv.shape[-1]),
             STATE_WIDTH=state_width,
             COMPRESS_RATIO=self.compress_ratio,
-            # PDL is a NVIDIA Hopper-only Triton launch attribute; omit
-            # on other backends (e.g. ROCm) to avoid KeyError in
-            # JITKernel. See note above re: read-after-write race.
-            **maybe_launch_pdl(),
+            **pdl_kwargs,
         )
 
         # Fused: compress → RMSNorm → RoPE → FP8 quant → KV cache write.
@@ -381,7 +374,7 @@ def forward(
             SCALE_DIM=self._scale_dim,
             KV_BLOCK_STRIDE=kv_cache.stride(0),
             num_warps=self._num_warps,
-            **maybe_launch_pdl(),
+            **pdl_kwargs,
         )
 
 
diff --git a/vllm/model_executor/layers/deepseek_v4_attention.py b/vllm/model_executor/layers/deepseek_v4_attention.py
index 1e574bfe7646..92dc0b6dc12b 100644
--- a/vllm/model_executor/layers/deepseek_v4_attention.py
+++ b/vllm/model_executor/layers/deepseek_v4_attention.py
@@ -4,14 +4,16 @@
 DeepseekV4 MLA Attention Layer
 """
 
+from collections.abc import Callable
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from transformers import DeepseekV2Config, DeepseekV3Config
 
+import vllm.envs as envs
 from vllm.model_executor.layers.linear import (
     ReplicatedLinear,
 )
@@ -27,6 +29,11 @@
     fused_q_kv_rmsnorm,
     quantize_and_insert_k_cache,
 )
+from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
+    rocm_forward_decode_fallback,
+    rocm_inv_rope_einsum,
+    rocm_sparse_attn_prefill,
+)
 
 if TYPE_CHECKING:
     from vllm.v1.attention.backends.mla.sparse_swa import (
@@ -52,7 +59,11 @@
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
 )
-from vllm.utils.multi_stream_utils import maybe_execute_in_parallel
+from vllm.platforms import current_platform
+from vllm.utils.multi_stream_utils import (
+    execute_in_parallel,
+    maybe_execute_in_parallel,
+)
 from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata
 from vllm.v1.attention.backends.mla.flashmla_sparse import (
     DeepseekV4FlashMLASparseBackend,
@@ -175,7 +186,7 @@ class DeepseekV4MLAModules:
     indexer: torch.nn.Module | None
     indexer_rotary_emb: torch.nn.Module
     topk_indices_buffer: torch.Tensor | None
-    aux_stream: torch.cuda.Stream | None = None
+    aux_stream_list: list[torch.cuda.Stream] | None = None
 
 
 # --8<-- [start:multi_head_latent_attention]
@@ -274,8 +285,6 @@ def __init__(
         # Pick fp8_einsum recipe based on GPU arch:
         # SM90: FP32 block scales stay [g, r/128, d/128] → sfb_gran_mn=128
         # SM100: INT32 packed scales become [g, r, ...] → sfb_gran_mn=1
-        from vllm.platforms import current_platform
-
         cap = current_platform.get_device_capability()
         assert cap is not None, "DeepseekV4 attention requires a CUDA device"
         self._einsum_recipe = (1, 128, 128) if cap.major <= 9 else (1, 1, 128)
@@ -298,8 +307,12 @@ def __init__(
             + 1  # 1B pad
         )
 
-        self.aux_stream = mla_modules.aux_stream
-        self.ln_events = [torch.cuda.Event(), torch.cuda.Event()]
+        # Will be None on ROCm for now.
+        self.aux_stream_list = mla_modules.aux_stream_list
+        # [0]: GEMM start / post-GEMM event0. [1..3]: GEMM done events;
+        # [1] doubles as post-GEMM event1. Reuse is safe: GEMM fully joins
+        # before post-GEMM starts.
+        self.ln_events = [torch.cuda.Event() for _ in range(4)]
 
         assert cache_config is not None, "DeepseekV4 attention requires cache_config"
         self.swa_cache_layer = DeepseekV4SWACache(
@@ -358,9 +371,6 @@ def forward(
         hidden_states: torch.Tensor,
         llama_4_scaling: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        qr_kv, _ = self.fused_wqa_wkv(hidden_states)
-        qr, kv = qr_kv.split([self.q_lora_rank, self.head_dim], dim=-1)
-
         # Pre-allocate attention output with FlashMLA-padded head count.
         # The op writes into `o_padded`; we slice to n_local_heads after.
         num_tokens = hidden_states.shape[0]
@@ -373,14 +383,25 @@ def forward(
         # Attention (inside custom op for torch.compile boundary)
         torch.ops.vllm.deepseek_v4_attention(
             hidden_states,
-            qr,
-            kv,
             positions,
             o_padded,
             self.layer_name,
         )
         o = o_padded[:, : self.n_local_heads, :]
 
+        # Keep ROCm on the BF16 reference wo_a path util kernel ready.
+        if current_platform.is_rocm():
+            z = rocm_inv_rope_einsum(
+                self.rotary_emb,
+                o,
+                positions,
+                self.rope_head_dim,
+                self.n_local_groups,
+                self.o_lora_rank,
+                self.wo_a,
+            )
+            return self.wo_b(z.flatten(1))
+
         # O projection: inverse RoPE + FP8 quant + einsum + wo_b
         o_fp8, o_scale = fused_inv_rope_fp8_quant(
             o,
@@ -413,17 +434,80 @@ def forward(
 
         return self.wo_b(z.flatten(1))
 
+    def attn_gemm_parallel_execute(self, hidden_states) -> tuple[Any, ...]:
+        aux_streams = self.aux_stream_list
+        if aux_streams is not None:
+            assert len(aux_streams) >= 3
+            aux_streams = aux_streams[:3]
+
+        # fused_wqa_wkv (heaviest) on default; the three lighter input GEMMs
+        # on aux streams 0..2 when their owning module exists. ln_events[0]
+        # is the fan-out start event; ln_events[1..3] are per-aux done events.
+        # On ROCm, aux_streams is None and execute_in_parallel runs serially.
+        aux_fns: list[Callable[[], Any] | None] = [None, None, None]
+
+        if self.compressor is not None:
+            # Local ref so the closure keeps a non-None type for mypy.
+            compressor = self.compressor
+
+            def compressor_kv_score() -> torch.Tensor:
+                return torch.mm(
+                    hidden_states,
+                    compressor.fused_wkv_wgate.weight.T,
+                    out_dtype=torch.float32,
+                )
+
+            aux_fns[0] = compressor_kv_score
+
+        if self.indexer is not None:
+            indexer = self.indexer
+
+            def indexer_weights_proj() -> torch.Tensor:
+                # ReplicatedLinear returns (output, bias); bias is None.
+                weights, _ = indexer.weights_proj(hidden_states)
+                return weights
+
+            def indexer_compressor_kv_score() -> torch.Tensor:
+                return torch.mm(
+                    hidden_states,
+                    indexer.compressor.fused_wkv_wgate.weight.T,
+                    out_dtype=torch.float32,
+                )
+
+            aux_fns[1] = indexer_weights_proj
+            aux_fns[2] = indexer_compressor_kv_score
+
+        def fused_wqa_wkv() -> torch.Tensor:
+            # MergedColumnParallelLinear returns (output, bias); bias is None.
+            qr_kv, _ = self.fused_wqa_wkv(hidden_states)
+            return qr_kv
+
+        qr_kv, (kv_score, indexer_weights, indexer_kv_score) = execute_in_parallel(
+            fused_wqa_wkv,
+            aux_fns,
+            self.ln_events[0],
+            self.ln_events[1:4],
+            aux_streams,
+            enable=hidden_states.shape[0]
+            <= envs.VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD,
+        )
+
+        return qr_kv, kv_score, indexer_kv_score, indexer_weights
+
     def attention_impl(
         self,
         hidden_states: torch.Tensor,
-        qr: torch.Tensor,
-        kv: torch.Tensor,
         positions: torch.Tensor,
         out: torch.Tensor,  # [num_tokens, padded_heads, head_dim], written in place
     ) -> None:
         forward_context = get_forward_context()
         attn_metadata = forward_context.attn_metadata
 
+        qr_kv, kv_score, indexer_kv_score, indexer_weights = (
+            self.attn_gemm_parallel_execute(hidden_states)
+        )
+
+        qr, kv = qr_kv.split([self.q_lora_rank, self.head_dim], dim=-1)
         qr, kv = fused_q_kv_rmsnorm(
             qr,
             kv,
@@ -431,42 +515,62 @@ def attention_impl(
             self.kv_norm.weight.data,
             self.eps,
         )
-        q = self.wq_b(qr).view(-1, self.n_local_heads, self.head_dim)
 
-        # Overlap kv_insert with whichever of indexer/compressor is present.
-        # Indexer implies compressor; when both exist, compressor rides on the
-        # aux stream alongside kv_insert so the heavy indexer owns default.
+        # wq_b + kv_insert (+ MLA compressor when an indexer is present) ride
+        # on the default stream so q stays on its consumer stream (mla_attn
+        # downstream reads q on default). Indexer/compressor go on aux for
+        # overlap with default's GEMM + cache write.
         if self.indexer is not None:
+            aux_stream = (
+                self.aux_stream_list[0] if self.aux_stream_list is not None else None
+            )
             indexer = self.indexer
             # Local ref so the closure keeps a non-None type for mypy.
             assert self.compressor is not None
             compressor = self.compressor
 
-            def kv_insert_and_compress() -> None:
+            def wq_b_kv_insert_and_compress() -> torch.Tensor:
+                q = self.wq_b(qr).view(-1, self.n_local_heads, self.head_dim)
                 self._fused_qnorm_rope_kv_insert(q, kv, positions, attn_metadata)
-                compressor(hidden_states, positions, self.rotary_emb)
-
-            maybe_execute_in_parallel(
-                lambda: indexer(hidden_states, qr, positions, self.indexer_rotary_emb),
-                kv_insert_and_compress,
+                compressor(kv_score, positions, self.rotary_emb)
+                return q
+
+            q, _ = maybe_execute_in_parallel(
+                wq_b_kv_insert_and_compress,
+                lambda: indexer(
+                    hidden_states,
+                    qr,
+                    indexer_kv_score,
+                    indexer_weights,
+                    positions,
+                    self.indexer_rotary_emb,
+                ),
                 self.ln_events[0],
                 self.ln_events[1],
-                self.aux_stream,
+                aux_stream,
             )
         elif self.compressor is not None:
-            # Compressor on default, kv_insert on aux.
+            # wq_b + kv_insert on default, compressor on aux.
+            aux_stream = (
+                self.aux_stream_list[0] if self.aux_stream_list is not None else None
+            )
             compressor = self.compressor
-            maybe_execute_in_parallel(
-                lambda: compressor(hidden_states, positions, self.rotary_emb),
-                lambda: self._fused_qnorm_rope_kv_insert(
-                    q, kv, positions, attn_metadata
-                ),
+
+            def wq_b_kv_insert() -> torch.Tensor:
+                q = self.wq_b(qr).view(-1, self.n_local_heads, self.head_dim)
+                self._fused_qnorm_rope_kv_insert(q, kv, positions, attn_metadata)
+                return q
+
+            q, _ = maybe_execute_in_parallel(
+                wq_b_kv_insert,
+                lambda: compressor(kv_score, positions, self.rotary_emb),
                 self.ln_events[0],
                 self.ln_events[1],
-                self.aux_stream,
+                aux_stream,
             )
         else:
             # SWA-only layer: no compressor, no overlap.
+            q = self.wq_b(qr).view(-1, self.n_local_heads, self.head_dim)
             self._fused_qnorm_rope_kv_insert(q, kv, positions, attn_metadata)
 
         # Handle dummy run (no metadata).
@@ -532,7 +636,21 @@ def _fused_qnorm_rope_kv_insert(
         block_sz = swa_metadata.block_size
         slot_map = swa_metadata.slot_mapping
 
-        if fused_op is not None:
+        # Commit 628c43630 wired the
+        # kernel into the ROCm build, but its FP8 dtype is selected at
+        # *compile time* via ``HIP_FP8_TYPE_OCP`` whereas MI300X (gfx942)
+        # is FNUZ-only at runtime — a mismatch silently corrupts every K
+        # byte written to the SWA cache. Force the Python reference on
+        # ROCm under ``VLLM_ROCM_USE_V4_TRITON_FALLBACK`` so we match the
+        # pre-rebase numerics; flip the env var to "0" to opt back into
+        # the upstream C++ kernel for bisection.
+        # TODO: fix in the next commit.
+        use_torch_ref = (
+            current_platform.is_rocm()
+            and envs.VLLM_ROCM_USE_V4_TRITON_FALLBACK
+        )
+
+        if fused_op is not None and not use_torch_ref:
             fused_op(q, kv, swa_kv_cache_2d, slot_map, pos_i64, cos_sin, self.eps, block_sz)
         else:
             _deepseek_v4_qnorm_rope_kv_insert_reference(
@@ -551,21 +669,17 @@ def _fused_qnorm_rope_kv_insert(
 
 def deepseek_v4_attention(
     hidden_states: torch.Tensor,
-    qr: torch.Tensor,
-    kv: torch.Tensor,
     positions: torch.Tensor,
     out: torch.Tensor,
     layer_name: str,
 ) -> None:
     forward_context: ForwardContext = get_forward_context()
     self = forward_context.no_compile_layers[layer_name]
-    self.attention_impl(hidden_states, qr, kv, positions, out)
+    self.attention_impl(hidden_states, positions, out)
 
 
 def deepseek_v4_attention_fake(
     hidden_states: torch.Tensor,
-    qr: torch.Tensor,
-    kv: torch.Tensor,
     positions: torch.Tensor,
     out: torch.Tensor,
     layer_name: str,
@@ -792,7 +906,7 @@ def __init__(
             vllm_config.scheduler_config.max_num_batched_tokens
         )
         self.max_model_len = vllm_config.model_config.max_model_len
-        # DeepseekV4 only supports fp8 kv-cache format for now
+        # DeepseekV4 only supports fp8 kv-cache format for now.
         kv_cache_dtype = cache_config.cache_dtype if cache_config is not None else "fp8"
 
         assert kv_cache_dtype.startswith("fp8"), (
@@ -940,6 +1054,34 @@ def _forward_decode(
         swa_indices = swa_metadata.decode_swa_indices
         swa_lens = swa_metadata.decode_swa_lens
 
+        # When VLLM_ROCM_USE_V4_TRITON_FALLBACK is enabled (default on ROCm),
+        # we deliberately skip the upstream `rocm_forward_decode_fallback` and
+        # let the standard `flash_mla_with_kvcache` call below run. That call
+        # is mapped by `vllm.v1.attention.ops.flashmla` to our pre-rebase
+        # `flash_mla_with_kvcache_rocm` Triton/online-softmax fallback, which
+        # is the path that produced 95% GSM8K accuracy. The upstream torch
+        # reference (`rocm_ref_sparse_attn_decode`) has its own bugs that
+        # collapse generation to the base-model prior, so we keep it gated as
+        # an opt-in fallback for bisection only.
+        if current_platform.is_rocm() and not envs.VLLM_ROCM_USE_V4_TRITON_FALLBACK:
+            rocm_forward_decode_fallback(
+                q=q,
+                kv_cache=kv_cache,
+                swa_k_cache=self.swa_cache_layer.kv_cache,
+                swa_only=swa_only,
+                topk_indices=topk_indices,
+                topk_lens=topk_lens,
+                swa_indices=swa_indices,
+                swa_lens=swa_lens,
+                attn_sink=self.attn_sink,
+                scale=self.scale,
+                head_dim=self.head_dim,
+                nope_head_dim=self.nope_head_dim,
+                rope_head_dim=self.rope_head_dim,
+                output=output,
+            )
+            return
+
         # We treat queries in the same seq as different queries
         # and later we only attend by generated indices.
         # q arrives pre-padded to self.padded_heads by the outer wrapper.
@@ -969,12 +1111,19 @@ def _forward_decode(
                 f"Unsupported compress_ratio={self.compress_ratio}; "
                 "expected 1, 4, or 128."
             )
-        assert tile_metadata is not None, (
-            "swa_metadata missing tile_sched entry for "
-            f"compress_ratio={self.compress_ratio}; "
-            "DeepseekSparseSWAMetadataBuilder.build_tile_scheduler did not "
-            "allocate one for this layer type."
-        )
+        # FlashMLA's tile-scheduler metadata is an NVIDIA-only planner state
+        # consumed by the C++/CUDA kernel. Our ROCm fallback
+        # (`flash_mla_with_kvcache_rocm`) discards `tile_scheduler_metadata`
+        # entirely, and `DeepseekSparseSWAMetadataBuilder.build_tile_scheduler`
+        # (correctly) skips allocating it on ROCm — so a `None` here is
+        # expected on AMD and only an error on CUDA.
+        if not current_platform.is_rocm():
+            assert tile_metadata is not None, (
+                "swa_metadata missing tile_sched entry for "
+                f"compress_ratio={self.compress_ratio}; "
+                "DeepseekSparseSWAMetadataBuilder.build_tile_scheduler did "
+                "not allocate one for this layer type."
+            )
 
         out, _ = flash_mla_with_kvcache(
             q=q,
@@ -1104,15 +1253,37 @@ def _forward_prefill(
                 N,
             )
 
-            output_chunk, _, _ = flash_mla_sparse_fwd(
-                q=q[query_start:query_end],
-                kv=kv.view(-1, 1, q.shape[-1]),
-                indices=combined_indices.unsqueeze(1),
-                sm_scale=self.scale,
-                attn_sink=self.attn_sink,
-                topk_length=combined_lens,
-                out=output[query_start:query_end],
-            )
+            # See the matching comment in `_forward_decode`: by default
+            # (VLLM_ROCM_USE_V4_TRITON_FALLBACK=True) we send the prefill
+            # forward through `flash_mla_sparse_fwd`, which on ROCm is bound
+            # to our pre-rebase `flash_mla_sparse_fwd_rocm` chunked-online-
+            # softmax kernel via `vllm.v1.attention.ops.flashmla`. Set the env
+            # var to "0" to opt back into upstream's `rocm_sparse_attn_prefill`
+            # torch reference (kept for bisection / regression testing).
+            if (
+                current_platform.is_rocm()
+                and not envs.VLLM_ROCM_USE_V4_TRITON_FALLBACK
+            ):
+                rocm_sparse_attn_prefill(
+                    q=q[query_start:query_end],
+                    kv=kv.view(-1, 1, q.shape[-1]),
+                    indices=combined_indices.unsqueeze(1),
+                    topk_length=combined_lens,
+                    scale=self.scale,
+                    head_dim=self.head_dim,
+                    attn_sink=self.attn_sink,
+                    output=output[query_start:query_end],
+                )
+            else:
+                output_chunk, _, _ = flash_mla_sparse_fwd(
+                    q=q[query_start:query_end],
+                    kv=kv.view(-1, 1, q.shape[-1]),
+                    indices=combined_indices.unsqueeze(1),
+                    sm_scale=self.scale,
+                    attn_sink=self.attn_sink,
+                    topk_length=combined_lens,
+                    out=output[query_start:query_end],
+                )
 
 
 class DeepseekV4IndexerCache(torch.nn.Module, AttentionLayerBase):
@@ -1182,7 +1353,7 @@ def __init__(
         self.compress_ratio = compress_ratio
         self.use_fp4_kv = self.vllm_config.attention_config.use_fp4_indexer_cache
         logger.info_once(
-            "Using %s indexer cache for Lighening Indexer.",
+            "Using %s indexer cache for Lightning Indexer.",
             "MXFP4" if self.use_fp4_kv else "FP8",
         )
 
@@ -1258,18 +1429,20 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         qr: torch.Tensor,
+        compressed_kv_score: torch.Tensor,
+        indexer_weights: torch.Tensor,
         positions: torch.Tensor,
         rotary_emb: nn.Module,
     ) -> torch.Tensor:
+        # ReplicatedLinear returns (output, bias); bias is None.
         q, _ = self.wq_b(qr)
         q = q.view(-1, self.n_head, self.head_dim)
-        k = self.compressor(hidden_states, positions, rotary_emb)
-        weights, _ = self.weights_proj(hidden_states)
+        k = self.compressor(compressed_kv_score, positions, rotary_emb)
         q_quant, weights = fused_indexer_q_rope_quant(
             positions,
             q,
             rotary_emb.cos_sin_cache,
-            weights,
+            indexer_weights,
             self.softmax_scale,
             self.n_head**-0.5,
             use_fp4=self.use_fp4_kv,
diff --git a/vllm/model_executor/layers/fla/ops/kda.py b/vllm/model_executor/layers/fla/ops/kda.py
index 67cd0231d6e9..1a6f7cc0f5ae 100644
--- a/vllm/model_executor/layers/fla/ops/kda.py
+++ b/vllm/model_executor/layers/fla/ops/kda.py
@@ -1076,10 +1076,10 @@ def chunk_gla_fwd_kernel_o(
         )
         p_h = tl.make_block_ptr(
             h + (i_tg * H + i_h) * K * V,
-            (K, V),
-            (V, 1),
-            (i_k * BK, i_v * BV),
-            (BK, BV),
+            (V, K),
+            (K, 1),
+            (i_v * BV, i_k * BK),
+            (BV, BK),
             (1, 0),
         )
 
@@ -1090,12 +1090,11 @@ def chunk_gla_fwd_kernel_o(
         b_g = tl.load(p_g, boundary_check=(0, 1))
         # [BT, BK]
         b_qg = (b_q * exp(b_g)).to(b_q.dtype)
-        # [BK, BV]
+        # [BV, BK]
         b_h = tl.load(p_h, boundary_check=(0, 1))
-        # works but dkw, owing to divine benevolence
         # [BT, BV]
         if i_k >= 0:
-            b_o += tl.dot(b_qg, b_h.to(b_qg.dtype))
+            b_o += tl.dot(b_qg, tl.trans(b_h).to(b_qg.dtype))
     p_v = tl.make_block_ptr(
         v + (bos * H + i_h) * V,
         (T, V),
diff --git a/vllm/model_executor/layers/fused_moe/activation.py b/vllm/model_executor/layers/fused_moe/activation.py
index 3112b3054fcd..b2e67e6220a9 100644
--- a/vllm/model_executor/layers/fused_moe/activation.py
+++ b/vllm/model_executor/layers/fused_moe/activation.py
@@ -15,6 +15,7 @@ class MoEActivation(Enum):
     # and produce output of shape [..., d]
     SILU = "silu"
     GELU = "gelu"
+    GELU_TANH = "gelu_tanh"
     RELU2 = "relu2"
     SWIGLUOAI = "swigluoai"
     SWIGLUSTEP = "swiglustep"
@@ -24,6 +25,7 @@ class MoEActivation(Enum):
     # NOTE: Non-gated activations require the "_no_mul" suffix to be present.
     SILU_NO_MUL = "silu_no_mul"
     GELU_NO_MUL = "gelu_no_mul"
+    GELU_TANH_NO_MUL = "gelu_tanh_no_mul"
     RELU2_NO_MUL = "relu2_no_mul"
 
     @property
@@ -53,6 +55,7 @@ def without_mul(self) -> "MoEActivation":
     @classmethod
     def from_str(cls, s: str) -> "MoEActivation":
         """Parse from string for backward compatibility."""
+        s = _STR_ALIASES.get(s, s)
         for member in cls:
             if member.value == s:
                 return member
@@ -61,20 +64,27 @@ def from_str(cls, s: str) -> "MoEActivation":
 
 
 # Module-level lookup tables used by MoEActivation functions.
+_STR_ALIASES: dict[str, str] = {
+    "gelu_pytorch_tanh": "gelu_tanh",
+}
+
 _CUSTOM_OP_NAMES: dict[MoEActivation, str] = {
     MoEActivation.SILU: "silu_and_mul",
     MoEActivation.GELU: "gelu_and_mul",
+    MoEActivation.GELU_TANH: "gelu_tanh_and_mul",
     MoEActivation.SWIGLUOAI: "swigluoai_and_mul",
     MoEActivation.SWIGLUSTEP: "swiglustep_and_mul",
     MoEActivation.RELU2: "relu2",
     MoEActivation.SILU_NO_MUL: "silu_and_mul",
     MoEActivation.GELU_NO_MUL: "gelu_and_mul",
+    MoEActivation.GELU_TANH_NO_MUL: "gelu_tanh_and_mul",
     MoEActivation.RELU2_NO_MUL: "relu2",
 }
 
 _WITHOUT_MUL: dict[MoEActivation, MoEActivation] = {
     MoEActivation.SILU: MoEActivation.SILU_NO_MUL,
     MoEActivation.GELU: MoEActivation.GELU_NO_MUL,
+    MoEActivation.GELU_TANH: MoEActivation.GELU_TANH_NO_MUL,
     MoEActivation.RELU2: MoEActivation.RELU2_NO_MUL,
 }
 
@@ -115,6 +125,8 @@ def apply_moe_activation(
         torch.ops._C.silu_and_mul(output, input)
     elif activation == MoEActivation.GELU:
         torch.ops._C.gelu_and_mul(output, input)
+    elif activation == MoEActivation.GELU_TANH:
+        torch.ops._C.gelu_tanh_and_mul(output, input)
     elif activation == MoEActivation.SWIGLUOAI:
         torch.ops._C.swigluoai_and_mul(output, input)
     elif activation == MoEActivation.SWIGLUSTEP:
@@ -127,6 +139,8 @@ def apply_moe_activation(
         output.copy_(F.silu(input))
     elif activation == MoEActivation.GELU_NO_MUL:
         output.copy_(F.gelu(input))
+    elif activation == MoEActivation.GELU_TANH_NO_MUL:
+        output.copy_(F.gelu(input, approximate="tanh"))
     elif activation == MoEActivation.RELU2_NO_MUL:
         F.relu(input, inplace=True)
         torch.square(input, out=output)
diff --git a/vllm/model_executor/layers/fused_moe/all2all_utils.py b/vllm/model_executor/layers/fused_moe/all2all_utils.py
index fba1d4c692af..2a6f0c71d936 100644
--- a/vllm/model_executor/layers/fused_moe/all2all_utils.py
+++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py
@@ -228,23 +228,37 @@ def maybe_make_prepare_finalize(
 
     elif moe.use_fi_nvl_one_sided_kernels:
         assert quant_config is not None
-        if quant_config.quant_dtype != "nvfp4":
-            raise ValueError(
-                "The 'flashinfer_nvlink_one_sided' all2all backend only "
-                "supports nvfp4 activation quantization, but got "
-                f"quant_dtype={quant_config.quant_dtype!r}. Use a different "
-                "all2all backend (e.g. 'flashinfer_nvlink_two_sided' or "
-                "'allgather_reducescatter') for non-nvfp4 models."
-            )
         max_num_tokens = (
             get_current_vllm_config().scheduler_config.max_num_batched_tokens
         )
+        if quant_config.quant_dtype is None:
+            dispatch_dtype_bytes_per_elem = 2
+            dispatch_scale_bytes_per_token = 0
+        elif quant_config.quant_dtype == "nvfp4":
+            dispatch_dtype_bytes_per_elem = 0
+            dispatch_scale_bytes_per_token = moe.hidden_dim // 16
+        elif quant_config.quant_dtype == "mxfp8":
+            dispatch_dtype_bytes_per_elem = 1
+            align = quant_config.mx_alignment
+            if align > 0:
+                padded_k = ((moe.hidden_dim + align - 1) // align) * align
+            else:
+                padded_k = moe.hidden_dim
+            dispatch_scale_bytes_per_token = padded_k // 32
+        else:
+            raise NotImplementedError(
+                "flashinfer_nvlink_one_sided dispatch supports nvfp4, mxfp8, "
+                "and bf16 (quant_dtype=None) today; got "
+                f"quant_dtype={quant_config.quant_dtype!r}"
+            )
         prepare_finalize = FlashInferNVLinkOneSidedPrepareAndFinalize(
             max_num_tokens=max_num_tokens,
             top_k=moe.experts_per_token,
             num_experts=moe.num_experts,
             hidden_size=moe.hidden_dim,
             num_dispatchers=all2all_manager.world_size,
+            dispatch_dtype_bytes_per_elem=dispatch_dtype_bytes_per_elem,
+            dispatch_scale_bytes_per_token=dispatch_scale_bytes_per_token,
         )
 
     elif moe.use_ag_rs_all2all_kernels and allow_new_interface:
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index 565df1324f62..8ffa5cffb551 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -254,6 +254,8 @@ class FusedMoEQuantConfig:
     gemm1_beta: float | None = None
     gemm1_clamp_limit: float | None = None
 
+    mx_alignment: int = 0
+
     def __post_init__(self):
         assert not self.per_act_token_quant or self.block_shape is None, (
             "illegal quantization"
@@ -712,6 +714,7 @@ def mxfp4_mxfp8_moe_quant_config(
     gemm1_alpha: float | None = None,
     gemm1_beta: float | None = None,
     gemm1_clamp_limit: float | None = None,
+    mx_alignment: int = 0,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for mxfp4 activations and mxfp4 weights.
@@ -724,6 +727,7 @@ def mxfp4_mxfp8_moe_quant_config(
         gemm1_alpha=gemm1_alpha,
         gemm1_beta=gemm1_beta,
         gemm1_clamp_limit=gemm1_clamp_limit,
+        mx_alignment=mx_alignment,
     )
 
 
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=2880,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=2880,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000000..2d53aedbed48
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=2880,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.6.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
index a2d267bd7490..df69fa328ca7 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
@@ -140,6 +140,8 @@ def _fwd_kernel_ep_scatter_2(
     offset_in_s = tl.arange(0, SCALE_HIDDEN_SIZE_PAD)
     mask_s = offset_in_s < SCALE_HIDDEN_SIZE
 
+    output_tensor_stride0 = output_tensor_stride0.to(tl.int64)
+
     for token_id in range(start_token_id, total_token_num, grid_num):
         to_copy = tl.load(recv_x + token_id * recv_x_stride0 + offset_in, mask=mask)
         to_copy_s = tl.load(
@@ -154,12 +156,13 @@ def _fwd_kernel_ep_scatter_2(
 
             if expert_id >= 0:
                 dest_token_index = tl.atomic_add(expert_start_loc + expert_id, 1)
+                dest_token_index_i64 = dest_token_index.to(tl.int64)
                 tl.store(
                     output_index + token_id * output_index_stride0 + topk_index,
                     dest_token_index,
                 )
                 output_tensor_ptr = (
-                    output_tensor + dest_token_index * output_tensor_stride0
+                    output_tensor + dest_token_index_i64 * output_tensor_stride0
                 )
                 output_tensor_scale_ptr = (
                     output_tensor_scale + dest_token_index * output_tensor_scale_stride0
diff --git a/vllm/model_executor/layers/fused_moe/experts/aiter_mxfp4_w4a8_moe.py b/vllm/model_executor/layers/fused_moe/experts/aiter_mxfp4_w4a8_moe.py
new file mode 100644
index 000000000000..3906a7e057ca
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/experts/aiter_mxfp4_w4a8_moe.py
@@ -0,0 +1,292 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm._aiter_ops import rocm_aiter_ops
+from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEParallelConfig,
+    FusedMoEQuantConfig,
+    RoutingMethodType,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    QuantKey,
+    kFp8StaticTensorSym,
+    kMxfp4Static,
+)
+
+__all__ = [
+    "AiterW4A8ExpertsMonolithic",
+    "aiter_triton_kernel_w4a8_moe_forward",
+]
+
+
+def aiter_triton_kernel_w4a8_moe_forward(
+    hidden_states: torch.Tensor,
+    w1,  # Tensor or triton_kernels.Tensor
+    w2,  # Tensor or triton_kernels.Tensor
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    activation: MoEActivation = MoEActivation.SWIGLUOAI,
+    quant_config: FusedMoEQuantConfig | None = None,
+    apply_router_weight_on_input: bool = False,
+    global_num_experts: int = -1,
+    expert_map: torch.Tensor | None = None,
+    unpadded_N_w1=None,
+    unpadded_K_w1=None,
+    unpadded_N_w2=None,
+    unpadded_K_w2=None,
+):
+    assert (
+        quant_config is not None
+        and quant_config.use_mxfp4_w4a8
+        and rocm_aiter_ops.is_enabled()
+    )
+    from aiter.ops.triton.moe_routing.routing import routing as aiter_routing
+
+    routing_data, gather_idx, scatter_idx = aiter_routing(
+        gating_output, topk, sm_first=not renormalize
+    )
+    return triton_kernel_fused_mxfp4_w4a8_experts(
+        None,
+        hidden_states,
+        w1,
+        w2,
+        routing_data,
+        gather_idx,
+        scatter_idx,
+        activation=activation.value,
+        quant_config=quant_config,
+        apply_router_weight_on_input=apply_router_weight_on_input,
+        global_num_experts=global_num_experts,
+        expert_map=expert_map,
+        unpadded_N_w1=unpadded_N_w1,
+        unpadded_K_w1=unpadded_K_w1,
+        unpadded_N_w2=unpadded_N_w2,
+        unpadded_K_w2=unpadded_K_w2,
+    )
+
+
+def triton_kernel_fused_mxfp4_w4a8_experts(
+    output_tensor: torch.Tensor,
+    hidden_states: torch.Tensor,
+    w1,  # Tensor or triton_kernels.Tensor
+    w2,  # Tensor or triton_kernels.Tensor
+    routing_data,  # RoutingData
+    gather_indx,  # GatherIndx
+    scatter_indx,  # ScatterIndx
+    activation: str = "silu",
+    quant_config: FusedMoEQuantConfig | None = None,
+    swiglu_alpha: float = 1.702,
+    swiglu_limit: float = 7.0,
+    apply_router_weight_on_input: bool = False,
+    global_num_experts: int = -1,
+    expert_map: torch.Tensor | None = None,
+    a1q_scale: torch.Tensor | None = None,
+    unpadded_N_w1=None,
+    unpadded_K_w1=None,
+    unpadded_N_w2=None,
+    unpadded_K_w2=None,
+) -> torch.Tensor:
+    assert quant_config is not None
+    # type check, uint8 means mxfp4
+    assert hidden_states.dtype == torch.bfloat16
+    assert quant_config.w1_bias is None or quant_config.w1_bias.dtype == torch.float32
+    assert quant_config.w2_bias is None or quant_config.w2_bias.dtype == torch.float32
+
+    # Shape check: weights are padded (e.g. hidden_size padded for
+    # GFX950 swizzle).
+    assert hidden_states.shape[-1] == w1.shape[-2]
+    assert w2.shape[-1] == w1.shape[1]
+
+    E, _, N = w1.shape
+
+    if global_num_experts == -1:
+        global_num_experts = E
+
+    gammas = routing_data.gate_scal if routing_data else None
+
+    from aiter.ops.triton.moe_op_gemm_a8w4 import moe_gemm_a8w4
+    from aiter.ops.triton.quant_moe import downcast_to_static_fp8
+
+    assert quant_config.w1_precision is not None, (
+        "w1_precision in quant config can't be None"
+    )
+    assert quant_config.w2_precision is not None, (
+        "w2_precision in quant config can't be None"
+    )
+
+    hidden_states = downcast_to_static_fp8(
+        hidden_states, quant_config.w1_precision.flex_ctx.lhs_data.scale
+    )
+
+    intermediate_cache1 = moe_gemm_a8w4(
+        hidden_states,
+        w1.storage.data,
+        None,
+        quant_config.w1_precision.weight_scale.storage.data,
+        quant_config.w1_precision.flex_ctx.lhs_data.scale,
+        quant_config.w2_precision.flex_ctx.lhs_data.scale,
+        quant_config.w1_bias,
+        routing_data,
+        gather_indx=gather_indx,
+        gammas=gammas if apply_router_weight_on_input else None,
+        swizzle_mx_scale="CDNA4_SCALE",
+        out_dtype=torch.float8_e4m3fn,
+        apply_swiglu=True,
+        alpha=swiglu_alpha,
+        limit=swiglu_limit,
+        unpadded_N=unpadded_N_w1,
+        unpadded_K=unpadded_K_w1,
+    )
+
+    intermediate_cache3 = moe_gemm_a8w4(
+        intermediate_cache1,
+        w2.storage.data,
+        None,
+        quant_config.w2_precision.weight_scale.storage.data,
+        quant_config.w2_precision.flex_ctx.lhs_data.scale,
+        None,
+        quant_config.w2_bias,
+        routing_data,
+        scatter_indx=scatter_indx,
+        gammas=None if apply_router_weight_on_input else gammas,
+        swizzle_mx_scale="CDNA4_SCALE",
+        unpadded_N=unpadded_N_w2,
+        unpadded_K=unpadded_K_w2,
+    )
+
+    return intermediate_cache3
+
+
+class AiterW4A8ExpertsMonolithic(mk.FusedMoEExpertsMonolithic):
+    """
+    Monolithic MXFP4 W4A8 expert using AITER triton kernels.
+
+    This backend uses:
+    - aiter.ops.triton.moe_routing.routing for routing
+    - aiter.ops.triton.moe_op_gemm_a8w4.moe_gemm_a8w4 for computation
+
+    Weight format: MXFP4 weights with GFX950 swizzle
+    Activation: Static FP8 quantization
+    """
+
+    def __init__(
+        self,
+        moe_config: FusedMoEConfig,
+        quant_config: FusedMoEQuantConfig,
+    ):
+        super().__init__(moe_config, quant_config)
+        self.topk = moe_config.experts_per_token
+        self.renormalize = moe_config.routing_method in (
+            RoutingMethodType.Renormalize,
+            RoutingMethodType.RenormalizeNaive,
+        )
+
+    @staticmethod
+    def activation_format() -> mk.FusedMoEActivationFormat:
+        return mk.FusedMoEActivationFormat.Standard
+
+    @staticmethod
+    def _supports_current_device() -> bool:
+        # Requires AITER and GFX950
+        if not rocm_aiter_ops.is_enabled():
+            return False
+        from vllm.platforms.rocm import on_gfx950
+
+        return on_gfx950()
+
+    @staticmethod
+    def _supports_no_act_and_mul() -> bool:
+        return False
+
+    @staticmethod
+    def _supports_quant_scheme(
+        weight_key: QuantKey | None,
+        activation_key: QuantKey | None,
+    ) -> bool:
+        # W4A8: MXFP4 weights with static FP8 activations
+        SUPPORTED_W_A = [
+            (kMxfp4Static, kFp8StaticTensorSym),
+        ]
+        return (weight_key, activation_key) in SUPPORTED_W_A
+
+    @staticmethod
+    def _supports_activation(activation: MoEActivation) -> bool:
+        # Only SILU activation (swiglu) is supported
+        return activation == MoEActivation.SWIGLUOAI
+
+    @staticmethod
+    def _supports_parallel_config(
+        moe_parallel_config: FusedMoEParallelConfig,
+    ) -> bool:
+        return (
+            not moe_parallel_config.use_all2all_kernels
+            and not moe_parallel_config.enable_eplb
+            and moe_parallel_config.dp_size <= 1
+        )
+
+    @staticmethod
+    def _supports_routing_method(
+        routing_method: RoutingMethodType,
+        weight_key: QuantKey | None,
+        activation_key: QuantKey | None,
+    ) -> bool:
+        return routing_method in [
+            RoutingMethodType.Renormalize,
+            RoutingMethodType.RenormalizeNaive,
+        ]
+
+    @staticmethod
+    def _supports_router_logits_dtype(
+        router_logits_dtype: torch.dtype | None,
+        routing_method: RoutingMethodType,
+    ) -> bool:
+        return True
+
+    def supports_expert_map(self) -> bool:
+        return False  # Expert parallelism not yet supported
+
+    @property
+    def expects_unquantized_inputs(self) -> bool:
+        return True
+
+    def apply(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        router_logits: torch.Tensor,
+        activation: MoEActivation,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        apply_router_weight_on_input: bool,
+        # grouped topk + fused topk bias parameters
+        num_expert_group: int | None = None,
+        e_score_correction_bias: torch.Tensor | None = None,
+        routed_scaling_factor: float | None = None,
+        topk_group: int | None = None,
+    ) -> torch.Tensor:
+        assert self.moe_config.intermediate_size_per_partition_unpadded is not None
+        assert self.moe_config.hidden_dim_unpadded is not None
+        return aiter_triton_kernel_w4a8_moe_forward(
+            hidden_states=hidden_states,
+            w1=w1,
+            w2=w2,
+            gating_output=router_logits,
+            topk=self.topk,
+            renormalize=self.renormalize,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            quant_config=self.quant_config,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            unpadded_N_w1=self.moe_config.intermediate_size_per_partition_unpadded * 2,
+            unpadded_K_w1=self.moe_config.hidden_dim_unpadded,
+            unpadded_N_w2=self.moe_config.hidden_dim_unpadded,
+            unpadded_K_w2=self.moe_config.intermediate_size_per_partition_unpadded,
+        )
diff --git a/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py
index ac317ac7762c..e10514debd08 100644
--- a/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py
@@ -5,7 +5,6 @@
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
-from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
 from vllm.model_executor.layers.fused_moe.config import (
@@ -286,35 +285,6 @@ def triton_kernel_moe_forward(
     unpadded_N_w2=None,
     unpadded_K_w2=None,
 ) -> torch.Tensor:
-    if (
-        quant_config is not None
-        and quant_config.use_mxfp4_w4a8
-        and rocm_aiter_ops.is_enabled()
-    ):
-        from aiter.ops.triton.moe_routing.routing import routing as aiter_routing
-
-        routing_data, gather_idx, scatter_idx = aiter_routing(
-            gating_output, topk, sm_first=not renormalize
-        )
-        return triton_kernel_fused_mxfp4_w4a8_experts(
-            None,
-            hidden_states,
-            w1,
-            w2,
-            routing_data,
-            gather_idx,
-            scatter_idx,
-            activation=activation.value,
-            quant_config=quant_config,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            unpadded_N_w1=unpadded_N_w1,
-            unpadded_K_w1=unpadded_K_w1,
-            unpadded_N_w2=unpadded_N_w2,
-            unpadded_K_w2=unpadded_K_w2,
-        )
-
     from triton_kernels.topk import topk as topk_fn
 
     sm_first = not renormalize
@@ -471,99 +441,6 @@ def triton_kernel_fused_experts(
     return output_tensor
 
 
-# This is a triton implementation of the fused_experts function
-def triton_kernel_fused_mxfp4_w4a8_experts(
-    output_tensor: torch.Tensor,
-    hidden_states: torch.Tensor,
-    w1,  # Tensor or triton_kernels.Tensor
-    w2,  # Tensor or triton_kernels.Tensor
-    routing_data,  # RoutingData
-    gather_indx,  # GatherIndx
-    scatter_indx,  # ScatterIndx
-    activation: str = "silu",
-    quant_config: FusedMoEQuantConfig | None = None,
-    swiglu_alpha: float = 1.702,
-    swiglu_limit: float = 7.0,
-    apply_router_weight_on_input: bool = False,
-    global_num_experts: int = -1,
-    expert_map: torch.Tensor | None = None,
-    a1q_scale: torch.Tensor | None = None,
-    unpadded_N_w1=None,
-    unpadded_K_w1=None,
-    unpadded_N_w2=None,
-    unpadded_K_w2=None,
-) -> torch.Tensor:
-    assert quant_config is not None
-    # type check, uint8 means mxfp4
-    assert hidden_states.dtype == torch.bfloat16
-    assert quant_config.w1_bias is None or quant_config.w1_bias.dtype == torch.float32
-    assert quant_config.w2_bias is None or quant_config.w2_bias.dtype == torch.float32
-
-    # Shape check: weights are padded (e.g. hidden_size padded for
-    # GFX950 swizzle).
-    assert hidden_states.shape[-1] == w1.shape[-2]
-    assert w2.shape[-1] == w1.shape[1]
-
-    E, _, N = w1.shape
-
-    if global_num_experts == -1:
-        global_num_experts = E
-
-    gammas = routing_data.gate_scal if routing_data else None
-
-    from aiter.ops.triton.moe_op_gemm_a8w4 import moe_gemm_a8w4
-    from aiter.ops.triton.quant_moe import downcast_to_static_fp8
-
-    assert quant_config.w1_precision is not None, (
-        "w1_precision in quant config can't be None"
-    )
-    assert quant_config.w2_precision is not None, (
-        "w2_precision in quant config can't be None"
-    )
-
-    hidden_states = downcast_to_static_fp8(
-        hidden_states, quant_config.w1_precision.flex_ctx.lhs_data.scale
-    )
-
-    intermediate_cache1 = moe_gemm_a8w4(
-        hidden_states,
-        w1.storage.data,
-        None,
-        quant_config.w1_precision.weight_scale.storage.data,
-        quant_config.w1_precision.flex_ctx.lhs_data.scale,
-        quant_config.w2_precision.flex_ctx.lhs_data.scale,
-        quant_config.w1_bias,
-        routing_data,
-        gather_indx=gather_indx,
-        gammas=gammas if apply_router_weight_on_input else None,
-        swizzle_mx_scale="CDNA4_SCALE",
-        out_dtype=torch.float8_e4m3fn,
-        apply_swiglu=True,
-        alpha=swiglu_alpha,
-        limit=swiglu_limit,
-        unpadded_N=unpadded_N_w1,
-        unpadded_K=unpadded_K_w1,
-    )
-
-    intermediate_cache3 = moe_gemm_a8w4(
-        intermediate_cache1,
-        w2.storage.data,
-        None,
-        quant_config.w2_precision.weight_scale.storage.data,
-        quant_config.w2_precision.flex_ctx.lhs_data.scale,
-        None,
-        quant_config.w2_bias,
-        routing_data,
-        scatter_indx=scatter_indx,
-        gammas=None if apply_router_weight_on_input else gammas,
-        swizzle_mx_scale="CDNA4_SCALE",
-        unpadded_N=unpadded_N_w2,
-        unpadded_K=unpadded_K_w2,
-    )
-
-    return intermediate_cache3
-
-
 def make_routing_data(
     topk_ids: torch.Tensor,
     topk_weights: torch.Tensor,
diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py
index f7af9aea70ad..69e5b7fe4f0e 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py
@@ -44,6 +44,9 @@ def __init__(
             moe_config.intermediate_size_per_partition
         )
         self.hidden_dim = moe_config.hidden_dim
+        self.hidden_dim_unpadded = (
+            moe_config.hidden_dim_unpadded or moe_config.hidden_dim
+        )
         self.local_num_experts = moe_config.num_local_experts
         self.ep_rank = moe_config.moe_parallel_config.ep_rank
 
@@ -82,9 +85,6 @@ def __init__(
             get_current_vllm_config().compilation_config.max_cudagraph_capture_size
         )
 
-        # P1-5 fix: use public quant_dtype property instead of private _a1
-        self.use_mxfp8_input = quant_config.quant_dtype == "mxfp8"
-
     @staticmethod
     def _supports_current_device() -> bool:
         p = current_platform
@@ -121,8 +121,7 @@ def supports_expert_map(self) -> bool:
 
     @property
     def expects_unquantized_inputs(self) -> bool:
-        # Expert handles MXFP8 quantization internally if needed
-        return True
+        return False
 
 
 class TrtLlmMxfp4ExpertsMonolithic(
@@ -181,24 +180,19 @@ def apply(
     ) -> torch.Tensor:
         from flashinfer import trtllm_fp4_block_scale_moe
 
-        # Handle input quantization
-        if self.use_mxfp8_input:
-            from flashinfer import mxfp8_quantize
-
-            x_quant, x_scale = mxfp8_quantize(
-                hidden_states,
-                is_sf_swizzled_layout=False,
-                alignment=256,
-            )
-            x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
-                *hidden_states.shape[:-1], -1
-            )
+        if a1q_scale is not None:
+            x_quant = hidden_states
+            x_scale = a1q_scale.view(torch.float8_e4m3fn)
         else:
             assert hidden_states.dtype == torch.bfloat16
             x_quant = hidden_states
             x_scale = None
-
-        output = torch.empty_like(hidden_states)
+        output = torch.empty(
+            *hidden_states.shape[:-1],
+            self.hidden_dim_unpadded,
+            dtype=torch.bfloat16,
+            device=hidden_states.device,
+        )
 
         from vllm.utils.flashinfer import _is_fi_autotuning, autotune
 
@@ -244,10 +238,6 @@ class TrtLlmMxfp4ExpertsModular(TrtLlmMxfp4ExpertsBase, mk.FusedMoEExpertsModula
     Moved from trtllm_moe.py.
     """
 
-    @property
-    def expects_unquantized_inputs(self) -> bool:
-        return True
-
     @staticmethod
     def _supports_parallel_config(
         moe_parallel_config: FusedMoEParallelConfig,
@@ -284,7 +274,7 @@ def workspace_shapes(
         # The workspaces for this implementation are managed by flashinfer.
         workspace1 = (0,)
         workspace2 = (0,)
-        output = (M, K)
+        output = (M, self.hidden_dim_unpadded)
         return (workspace1, workspace2, output)
 
     def apply(
@@ -310,18 +300,9 @@ def apply(
         intermediate_size = self.intermediate_size_per_partition
         local_expert_offset = self.moe_config.ep_rank * local_num_experts
 
-        # Handle input quantization
-        if self.use_mxfp8_input:
-            from flashinfer import mxfp8_quantize
-
-            x_quant, x_scale = mxfp8_quantize(
-                hidden_states,
-                is_sf_swizzled_layout=False,
-                alignment=256,
-            )
-            x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
-                *hidden_states.shape[:-1], -1
-            )
+        if a1q_scale is not None:
+            x_quant = hidden_states
+            x_scale = a1q_scale.view(torch.float8_e4m3fn)
         else:
             assert hidden_states.dtype == torch.bfloat16
             x_quant = hidden_states
diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
index baa7d3fd3eef..96202cad622e 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
@@ -112,8 +112,12 @@ def _supports_quant_scheme(
 
     @staticmethod
     def _supports_activation(activation: MoEActivation) -> bool:
-        """Supports only SiLU and RELU^2 non-gated activation."""
-        return activation in [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL]
+        """Supports only SiLU, RELU^2 non-gated and GELU activation."""
+        return activation in [
+            MoEActivation.SILU,
+            MoEActivation.RELU2_NO_MUL,
+            MoEActivation.GELU,
+        ]
 
     @staticmethod
     def _supports_shape(hidden_dim: int) -> bool:
@@ -190,7 +194,7 @@ def apply(
     ):
         import flashinfer
 
-        assert activation in [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL]
+        assert self._supports_activation(activation)
         assert a1q_scale is not None
         assert self.quant_config.w1_scale is not None
         assert self.quant_config.w2_scale is not None
@@ -263,6 +267,7 @@ def _supports_routing_method(
             RoutingMethodType.SigmoidRenorm,
             RoutingMethodType.MiniMax2,
             RoutingMethodType.Simulated,
+            RoutingMethodType.SigmoidRenorm,
         ]
 
     @staticmethod
@@ -291,7 +296,7 @@ def apply(
     ) -> torch.Tensor:
         import flashinfer
 
-        assert activation in [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL]
+        assert self._supports_activation(activation)
         assert a1q_scale is not None
         assert self.quant_config.w1_scale is not None
         assert self.quant_config.w2_scale is not None
diff --git a/vllm/model_executor/layers/fused_moe/experts/xpu_moe.py b/vllm/model_executor/layers/fused_moe/experts/xpu_moe.py
index e10be4af8680..d6bd2b140087 100644
--- a/vllm/model_executor/layers/fused_moe/experts/xpu_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/xpu_moe.py
@@ -62,7 +62,7 @@ def _supports_current_device() -> bool:
 
     @staticmethod
     def _supports_no_act_and_mul() -> bool:
-        return False
+        return True
 
     @staticmethod
     def _supports_activation(activation: MoEActivation) -> bool:
@@ -70,6 +70,7 @@ def _supports_activation(activation: MoEActivation) -> bool:
             MoEActivation.SILU,
             MoEActivation.GELU,
             MoEActivation.SWIGLUOAI,
+            MoEActivation.RELU2_NO_MUL,
         ]
 
     @staticmethod
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
index 26409804c48d..ea1b48b4b25e 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
@@ -129,7 +129,7 @@ def _supports_current_device() -> bool:
             and (
                 p.is_device_capability(90)
                 or p.is_device_capability_family(100)
-                or p.is_device_capability_family(110)
+                # SM110 excluded: flashinfer-ai/flashinfer#3134
                 or p.is_device_capability_family(120)
             )
             and has_flashinfer_cutlass_fused_moe()
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index bd54cd636b00..8638a11466c0 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -786,9 +786,11 @@ def _supports_activation(activation: MoEActivation) -> bool:
         return activation in [
             MoEActivation.SILU,
             MoEActivation.GELU,
+            MoEActivation.GELU_TANH,
             MoEActivation.SWIGLUOAI,
             MoEActivation.SILU_NO_MUL,
             MoEActivation.GELU_NO_MUL,
+            MoEActivation.GELU_TANH_NO_MUL,
             MoEActivation.RELU2_NO_MUL,
         ]
 
diff --git a/vllm/model_executor/layers/fused_moe/fused_humming_moe.py b/vllm/model_executor/layers/fused_moe/fused_humming_moe.py
index 6a2417cd4d31..5876f1c87ed6 100644
--- a/vllm/model_executor/layers/fused_moe/fused_humming_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_humming_moe.py
@@ -4,7 +4,7 @@
 
 import json
 import math
-from typing import TYPE_CHECKING, Any
+from typing import Any
 
 import torch
 from humming import dtypes
@@ -16,7 +16,11 @@
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
-from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEParallelConfig,
+    FusedMoEQuantConfig,
+)
 from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
     moe_align_block_size,
 )
@@ -34,21 +38,16 @@
 from vllm.platforms import current_platform
 from vllm.v1.worker.workspace import current_workspace_manager
 
-if TYPE_CHECKING:
-    from vllm.model_executor.layers.quantization.humming import HummingMoEMethod
-
-
 logger = init_logger(__name__)
 
 
 def get_humming_moe_gemm_type() -> str:
     env_gemm_type: str = envs.VLLM_HUMMING_MOE_GEMM_TYPE or ""
     env_gemm_type = env_gemm_type.lower()
-    if env_gemm_type in ["indexed", "grouped"]:
+    if env_gemm_type == "indexed":
         gemm_type = env_gemm_type
-    elif current_platform.has_device_capability(90):
-        # for device that supports TMA, use grouped gemm
-        gemm_type = "grouped"
+    elif env_gemm_type in ["grouped_contiguous", "grouped"]:
+        gemm_type = "grouped_contiguous"
     else:
         gemm_type = "indexed"
 
@@ -60,49 +59,44 @@ class HummingExpertsBase(mk.FusedMoEExpertsModular):
     def __init__(
         self,
         layer: torch.nn.Module,
-        quant_method: "HummingMoEMethod",
-        prepare_finalize: mk.FusedMoEPrepareAndFinalizeModular | None = None,
+        moe_config: FusedMoEConfig,
+        quant_config: FusedMoEQuantConfig,
+        max_num_tokens: int | None = None,
+        num_dispatchers: int | None = None,
     ):
         self.layer = layer
         self.num_experts = self.layer.num_experts
         self.global_num_experts = self.layer.global_num_experts
         self.init_humming_moe()
 
-        if prepare_finalize is not None:
-            max_num_tokens: int | None = None
-            num_dispatchers: int | None = None
-            if self.is_batched:
-                max_num_tokens = prepare_finalize.max_num_tokens_per_rank()
-                num_dispatchers = prepare_finalize.num_dispatchers()
-
-            assert quant_method.moe_quant_config is not None
-            super().__init__(
-                moe_config=quant_method.moe,
-                quant_config=quant_method.moe_quant_config,
-                max_num_tokens=max_num_tokens,
-                num_dispatchers=num_dispatchers,
-            )
-        else:
-            assert not self.is_batched
+        if self.is_batched():
+            assert max_num_tokens is not None and num_dispatchers is not None
+
+        super().__init__(
+            moe_config=moe_config,
+            quant_config=quant_config,
+            max_num_tokens=max_num_tokens,
+            num_dispatchers=num_dispatchers,
+        )
 
     def init_humming_moe(self):
         self.compute_config = {
             "use_batch_invariant": envs.VLLM_BATCH_INVARIANT,
             "use_f16_accum": envs.VLLM_HUMMING_USE_F16_ACCUM,
-            "gemm_type": self.humming_gemm_type.value,
+            "gemm_type": self.humming_gemm_type().value,
         }
         self.w13_tuning_config = HummingMethod.get_default_tuning_configs(
             layer=self.layer,
             use_f16_accum=envs.VLLM_HUMMING_USE_F16_ACCUM,
             use_batch_invariant=envs.VLLM_BATCH_INVARIANT,
-            gemm_type=self.humming_gemm_type,
+            gemm_type=self.humming_gemm_type(),
             sublayer_name="w13",
         )
         self.w2_tuning_config = HummingMethod.get_default_tuning_configs(
             layer=self.layer,
             use_f16_accum=envs.VLLM_HUMMING_USE_F16_ACCUM,
             use_batch_invariant=envs.VLLM_BATCH_INVARIANT,
-            gemm_type=self.humming_gemm_type,
+            gemm_type=self.humming_gemm_type(),
             sublayer_name="w2",
         )
         self.compute_config_str = json.dumps(self.compute_config)
@@ -124,13 +118,13 @@ def estimate_local_valid_shape_m(self, topk_ids: torch.Tensor):
         global_num_experts = self.global_num_experts
         return math.ceil(global_valid_shape_m * num_experts / global_num_experts)
 
-    @property
-    def humming_gemm_type(self) -> HummingGemmType:
+    @staticmethod
+    def humming_gemm_type() -> HummingGemmType:
         raise NotImplementedError
 
-    @property
-    def is_batched(self) -> bool:
-        return self.activation_format() == mk.FusedMoEActivationFormat.BatchedExperts
+    @classmethod
+    def is_batched(cls) -> bool:
+        return cls.activation_format() == mk.FusedMoEActivationFormat.BatchedExperts
 
     @staticmethod
     def _supports_quant_scheme(
@@ -158,10 +152,12 @@ def _supports_activation(activation: MoEActivation) -> bool:
         return activation in [
             MoEActivation.SILU,
             MoEActivation.GELU,
+            MoEActivation.GELU_TANH,
             MoEActivation.SWIGLUOAI,
             MoEActivation.SWIGLUSTEP,
             MoEActivation.SILU_NO_MUL,
             MoEActivation.GELU_NO_MUL,
+            MoEActivation.GELU_TANH_NO_MUL,
             MoEActivation.RELU2_NO_MUL,
         ]
 
@@ -189,7 +185,7 @@ def moe_problem_size(
         assert w1.size(0) == num_experts
         assert w2.size(0) == num_experts
 
-        if not self.is_batched:
+        if not self.is_batched():
             num_tokens = a1.size(0)
             assert topk_ids.size(0) == num_tokens
         else:
@@ -201,7 +197,7 @@ def moe_problem_size(
 
     def get_buffer_metas(self, M: int, topk: int, activation: MoEActivation):
         num_experts = self.num_experts
-        N = self.layer.intermediate_size
+        N = self.layer.intermediate_size_per_partition
         K = self.layer.hidden_size
         assert isinstance(num_experts, int)
         assert isinstance(N, int)
@@ -218,7 +214,7 @@ def get_buffer_metas(self, M: int, topk: int, activation: MoEActivation):
         # The output must be derived from workspace1.
 
         output_shape: tuple[int, ...]
-        if self.is_batched:
+        if self.is_batched():
             max_num_tokens = self.max_num_tokens
             num_dispatchers = self.num_dispatchers
             assert max_num_tokens is not None and num_dispatchers is not None
@@ -227,7 +223,7 @@ def get_buffer_metas(self, M: int, topk: int, activation: MoEActivation):
             output_shape = (num_experts, max_num_tokens * num_dispatchers, K)
         else:
             input_shape_m = M
-            if self.humming_gemm_type != HummingGemmType.INDEXED:
+            if self.humming_gemm_type() != HummingGemmType.INDEXED:
                 input_shape_m = M * topk
             real_shape_m = M * topk
             output_shape = (M, K)
@@ -262,7 +258,7 @@ def get_buffer_metas(self, M: int, topk: int, activation: MoEActivation):
                 "dtype": torch_dtype_map[a_dtype],
             },
             "down_output": {
-                "shape": output_shape if self.is_batched else (real_shape_m, K),
+                "shape": output_shape if self.is_batched() else (real_shape_m, K),
                 "dtype": torch_dtype_map[c_dtype],
             },
             "output": {
@@ -288,7 +284,7 @@ def get_buffer_metas(self, M: int, topk: int, activation: MoEActivation):
             ]
 
         # batched moe use down_output as output
-        if not self.is_batched:
+        if not self.is_batched():
             required_buffers.append("output")
 
         return buffer_metas, required_buffers
@@ -308,7 +304,7 @@ def _workspace_shapes(self, M: int, topk: int, activation: MoEActivation):
             else:
                 workspace2_nbytes = max(workspace2_nbytes, nbytes)
 
-        output_key = "down_output" if self.is_batched else "output"
+        output_key = "down_output" if self.is_batched() else "output"
         output_shape = buffer_metas[output_key]["shape"]
 
         return (workspace1_nbytes // 2,), (workspace2_nbytes // 2,), output_shape
@@ -395,6 +391,33 @@ def main_apply(
     ):
         raise NotImplementedError
 
+    @staticmethod
+    def is_supported_config(
+        cls: type[mk.FusedMoEExperts],
+        moe_config: FusedMoEConfig,
+        weight_key: QuantKey | None,
+        activation_key: QuantKey | None,
+        activation_format: mk.FusedMoEActivationFormat,
+    ) -> tuple[bool, str | None]:
+        if activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
+            supported = cls.activation_format() == activation_format
+            reason = "activation_format mismatched"
+        elif activation_format == mk.FusedMoEActivationFormat.Standard:
+            if cls.activation_format() != mk.FusedMoEActivationFormat.Standard:
+                supported = False
+                reason = "activation_format mismatched"
+            else:
+                assert hasattr(cls, "humming_gemm_type")
+                gemm_type = cls.humming_gemm_type().value.lower()
+                preferred_gemm_type = get_humming_moe_gemm_type().lower()
+                supported = preferred_gemm_type == gemm_type
+                reason = "preferred gemm type mismatched"
+        else:
+            supported = False
+            reason = "unsupported activation_format"
+
+        return supported, None if supported else reason
+
 
 class HummingIndexedExperts(HummingExpertsBase):
     def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
@@ -404,8 +427,8 @@ def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
     def activation_format() -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard
 
-    @property
-    def humming_gemm_type(self) -> HummingGemmType:
+    @staticmethod
+    def humming_gemm_type() -> HummingGemmType:
         return HummingGemmType.INDEXED
 
     def prepare_humming_moe_kwargs(
@@ -526,8 +549,8 @@ def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
     def activation_format() -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard
 
-    @property
-    def humming_gemm_type(self) -> HummingGemmType:
+    @staticmethod
+    def humming_gemm_type() -> HummingGemmType:
         return HummingGemmType.GROUPED_CONTIGUOUS
 
     def main_apply(
@@ -619,8 +642,8 @@ def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
     def activation_format() -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.BatchedExperts
 
-    @property
-    def humming_gemm_type(self) -> HummingGemmType:
+    @staticmethod
+    def humming_gemm_type() -> HummingGemmType:
         return HummingGemmType.GROUPED_MASKED
 
     def main_apply(
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index ebd330197099..3487ac1766e6 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -613,10 +613,12 @@ def _supports_activation(activation: MoEActivation) -> bool:
         return activation in [
             MoEActivation.SILU,
             MoEActivation.GELU,
+            MoEActivation.GELU_TANH,
             MoEActivation.SWIGLUOAI,
             MoEActivation.SWIGLUSTEP,
             MoEActivation.SILU_NO_MUL,
             MoEActivation.GELU_NO_MUL,
+            MoEActivation.GELU_TANH_NO_MUL,
             MoEActivation.RELU2_NO_MUL,
         ]
 
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 7e7bcc709921..1a655934259a 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1941,10 +1941,12 @@ def _supports_activation(activation: MoEActivation) -> bool:
         return activation in [
             MoEActivation.SILU,
             MoEActivation.GELU,
+            MoEActivation.GELU_TANH,
             MoEActivation.SWIGLUOAI,
             MoEActivation.SWIGLUSTEP,
             MoEActivation.SILU_NO_MUL,
             MoEActivation.GELU_NO_MUL,
+            MoEActivation.GELU_TANH_NO_MUL,
             MoEActivation.RELU2_NO_MUL,
         ]
 
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 7174cdd88f25..456f40bbf7a3 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -538,9 +538,11 @@ def _get_quant_method() -> FusedMoEMethodBase:
         # for heuristic purposes, so it must be initialized first.
         self.quant_method: FusedMoEMethodBase = _get_quant_method()
 
-        if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike():
+        if not self.moe_config.is_act_and_mul and not (
+            current_platform.is_cuda_alike() or current_platform.is_xpu()
+        ):
             raise NotImplementedError(
-                "is_act_and_mul=False is supported only for CUDA and ROCm for now"
+                "is_act_and_mul=False is supported only for CUDA and XPU for now"
             )
 
         if self.enable_eplb and not self.quant_method.supports_eplb:
@@ -1103,9 +1105,6 @@ def weight_loader(
         return_success: bool = False,
     ) -> bool | None:
         quant_config_name = self.quant_config and self.quant_config.get_name()
-        if quant_config_name == "humming":
-            assert hasattr(self.quant_method, "weight_schema")
-            quant_config_name = self.quant_method.weight_schema.quant_method
         if quant_config_name == "gpt_oss_mxfp4":
             # (FIXME) for gpt-oss all experts are combined
             if "bias" in weight_name:
@@ -1487,15 +1486,19 @@ def _maybe_make_contiguous(
             "w2_input_scale",
         }
 
+        # Parameters of non-expert submodules that live inside runner (MoERunner).
+        # These must be excluded from EPLB weight rearrangement.
+        NON_EXPERT_PREFIXES = (
+            "runner._shared_experts.",
+            "runner.gate.",
+            "runner.routed_input_transform.",
+            "runner.routed_output_transform.",
+        )
+
         assert all(
             weight.is_contiguous()
             for name, weight in weights
-            if not (
-                name.startswith("_shared_experts.")
-                or name.startswith("_gate.")
-                or name.startswith("_routed_input_transform.")
-                or name.startswith("_routed_output_transform.")
-            )
+            if not name.startswith(NON_EXPERT_PREFIXES)
             and name not in NON_EXPERT_WEIGHTS
         )
 
@@ -1504,12 +1507,7 @@ def _maybe_make_contiguous(
             for name, weight in weights
             if name not in NON_EXPERT_WEIGHTS
             and weight.shape != torch.Size([])
-            and not name.startswith("_shared_experts.")
-            # exclude parameters from non-expert submodules,
-            # e.g. gate/shared/transforms.
-            and not name.startswith("_gate.")
-            and not name.startswith("_routed_input_transform.")
-            and not name.startswith("_routed_output_transform.")
+            and not name.startswith(NON_EXPERT_PREFIXES)
         ]
 
     def set_eplb_state(
diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
index f476d980d555..7c596d52a653 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
@@ -18,7 +18,9 @@
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     FusedMoEQuantDesc,
+    RoutingMethodType,
     mxfp4_mxfp8_moe_quant_config,
+    mxfp4_w4a8_moe_quant_config,
     mxfp4_w4a16_moe_quant_config,
     ocp_mx_moe_quant_config,
 )
@@ -26,9 +28,11 @@
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
     kFp8Dynamic128Sym,
+    kFp8StaticTensorSym,
     kMxfp4Static,
     kMxfp8Dynamic,
 )
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import all_close_1d
 from vllm.platforms import current_platform
 from vllm.utils.import_utils import has_triton_kernels
 from vllm.utils.math_utils import round_up
@@ -59,8 +63,11 @@ class Mxfp4MoeBackend(Enum):
     # Marlin
     BATCHED_MARLIN = "BATCHED_MARLIN"
     MARLIN = "MARLIN"
-    # ROCm AITER
-    AITER = "AITER"
+    # ROCm AITER backends
+    AITER_MXFP4_BF16 = "AITER_MXFP4_BF16"  # W4A16: CK kernel
+    # Keep the legacy name as an alias while the ROCm split backend rename settles.
+    AITER = "AITER_MXFP4_BF16"
+    AITER_MXFP4_FP8 = "AITER_MXFP4_FP8"  # W4A8: triton kernel
     # Triton
     TRITON = "TRITON"
     TRITON_UNFUSED = "TRITON_UNFUSED"
@@ -68,6 +75,15 @@ class Mxfp4MoeBackend(Enum):
     XPU = "XPU"
     # Emulation
     EMULATION = "EMULATION"
+    # Humming
+    HUMMING = "HUMMING"
+
+
+# AITER backends group
+AITER_BACKENDS = (
+    Mxfp4MoeBackend.AITER_MXFP4_BF16,
+    Mxfp4MoeBackend.AITER_MXFP4_FP8,
+)
 
 
 # Backends that share the same TRTLLM weight format
@@ -130,6 +146,19 @@ def backend_to_kernel_cls(
 
         return [UnfusedOAITritonExperts]
 
+    elif backend == Mxfp4MoeBackend.HUMMING:
+        from vllm.model_executor.layers.fused_moe.fused_humming_moe import (
+            BatchedHummingGroupedExperts,
+            HummingGroupedExperts,
+            HummingIndexedExperts,
+        )
+
+        return [
+            BatchedHummingGroupedExperts,
+            HummingGroupedExperts,
+            HummingIndexedExperts,
+        ]
+
     elif backend == Mxfp4MoeBackend.MARLIN:
         from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
             MarlinExperts,
@@ -144,13 +173,20 @@ def backend_to_kernel_cls(
 
         return [BatchedMarlinExperts]
 
-    elif backend == Mxfp4MoeBackend.AITER:
+    elif backend == Mxfp4MoeBackend.AITER_MXFP4_BF16:
         from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
             AiterExperts,
         )
 
         return [AiterExperts]
 
+    elif backend == Mxfp4MoeBackend.AITER_MXFP4_FP8:
+        from vllm.model_executor.layers.fused_moe.experts.aiter_mxfp4_w4a8_moe import (
+            AiterW4A8ExpertsMonolithic,
+        )
+
+        return [AiterW4A8ExpertsMonolithic]
+
     elif backend == Mxfp4MoeBackend.XPU:
         from vllm.model_executor.layers.fused_moe.experts.xpu_moe import XPUExpertsMXFp4
 
@@ -177,8 +213,10 @@ def map_mxfp4_backend(runner_backend: MoEBackend) -> Mxfp4MoeBackend:
         "flashinfer_cutlass_afp8": Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
         "triton": Mxfp4MoeBackend.TRITON,
         "triton_unfused": Mxfp4MoeBackend.TRITON_UNFUSED,
+        "humming": Mxfp4MoeBackend.HUMMING,
         "marlin": Mxfp4MoeBackend.MARLIN,
-        "aiter": Mxfp4MoeBackend.AITER,
+        "aiter": Mxfp4MoeBackend.AITER_MXFP4_BF16,
+        "aiter_mxfp4_fp8": Mxfp4MoeBackend.AITER_MXFP4_FP8,
         "xpu": Mxfp4MoeBackend.XPU,
         "emulation": Mxfp4MoeBackend.EMULATION,
     }
@@ -197,7 +235,8 @@ def _get_priority_backends_for_gpt_oss() -> list[Mxfp4MoeBackend]:
     """
     _AVAILABLE_BACKENDS = [
         Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16,
-        Mxfp4MoeBackend.AITER,
+        Mxfp4MoeBackend.AITER_MXFP4_BF16,
+        Mxfp4MoeBackend.AITER_MXFP4_FP8,
         Mxfp4MoeBackend.TRITON,
         Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
         # TRITON_UNFUSED has bug with MTP support
@@ -217,6 +256,8 @@ def _get_priority_backends() -> list[Mxfp4MoeBackend]:
     TRTLLM MXFP8; SM90 falls through to Triton_unfused or Marlin (the
     backend-level ``is_supported_config`` check filters by device capability).
     """
+    if current_platform.is_rocm():
+        return [Mxfp4MoeBackend.AITER_MXFP4_BF16]
     _AVAILABLE_BACKENDS = [
         Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8,
         Mxfp4MoeBackend.DEEPGEMM_MXFP4,
@@ -238,16 +279,28 @@ def _backend_activation_key(backend: Mxfp4MoeBackend) -> QuantKey | None:
         Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
     ):
         return kMxfp8Dynamic
-    return None
+    if backend == Mxfp4MoeBackend.AITER_MXFP4_FP8:
+        return kFp8StaticTensorSym
+    return None  # BF16 activation
 
 
-def select_gpt_oss_mxfp4_moe_backend(
+def select_mxfp4_moe_backend(
     config: FusedMoEConfig,
+    activation_key: QuantKey | None = None,
 ) -> tuple[Mxfp4MoeBackend, type[mk.FusedMoEExperts] | None]:
     """
     Select the primary MXFP4 MoE backend.
+
+    Args:
+        config: MoE configuration
+        activation_key: Optional activation quantization key. If provided,
+            overrides the default activation key for backend selection.
+            Use kFp8StaticTensorSym for W4A8 scheme.
+
     Note: Shape-specific fallbacks may still occur at runtime.
     """
+    # If activation_key is explicitly provided (e.g., W4A8), use it
+    requested_activation_key = activation_key
     device_capability = current_platform.get_device_capability()
     triton_kernels_supported = (
         has_triton_kernels()
@@ -316,11 +369,17 @@ def _return_or_raise(
             and requested_backend == Mxfp4MoeBackend.MARLIN
         ):
             requested_backend = Mxfp4MoeBackend.BATCHED_MARLIN
+        # Use requested_activation_key if provided, otherwise use backend default
+        act_key = (
+            requested_activation_key
+            if requested_activation_key is not None
+            else _backend_activation_key(requested_backend)
+        )
         return _return_or_raise(
             requested_backend,
             config,
             kMxfp4Static,
-            _backend_activation_key(requested_backend),
+            act_key,
             activation_format,
         )
 
@@ -392,10 +451,15 @@ def _return_or_raise(
         )
 
     for backend in AVAILABLE_BACKENDS:
-        activation_key = _backend_activation_key(backend)
+        # Use requested_activation_key if provided, otherwise use backend default
+        act_key = (
+            requested_activation_key
+            if requested_activation_key is not None
+            else _backend_activation_key(backend)
+        )
         for k_cls in backend_to_kernel_cls(backend):
             supported, reason = k_cls.is_supported_config(
-                k_cls, config, kMxfp4Static, activation_key, activation_format
+                k_cls, config, kMxfp4Static, act_key, activation_format
             )
             if supported:
                 logger.info_once(_make_log_backend(backend))
@@ -422,7 +486,7 @@ def _return_or_raise(
     return Mxfp4MoeBackend.NONE, None
 
 
-def select_mxfp4_moe_backend(
+def select_deepseek_v4_mxfp4_moe_backend(
     config: FusedMoEConfig,
 ) -> tuple[Mxfp4MoeBackend, type[mk.FusedMoEExperts] | None]:
     """
@@ -484,8 +548,22 @@ def _return_or_raise(
             activation_format,
         )
 
+    # DeepSeek-V4 on ROCm is more accurate with the unfused Triton MXFP4 path
+    # than the default AITER path. Prefer Triton-unfused for this routing mode,
+    # while keeping AITER as a fallback if Triton-unfused rejects the config.
+    if (
+        current_platform.is_rocm()
+        and config.routing_method == RoutingMethodType.DeepseekV4
+    ):
+        priority_backends = [
+            Mxfp4MoeBackend.TRITON_UNFUSED,
+            Mxfp4MoeBackend.AITER_MXFP4_BF16,
+        ]
+    else:
+        priority_backends = _get_priority_backends()
+
     # Iterate priority backends: TRTLLM MXFP8, then Triton.
-    for backend in _get_priority_backends():
+    for backend in priority_backends:
         activation_key = _backend_activation_key(backend)
         for k_cls in backend_to_kernel_cls(backend):
             supported, reason = k_cls.is_supported_config(
@@ -573,7 +651,21 @@ def convert_gpt_oss_weight_to_mxfp4_moe_kernel_format(
 
     sf_block_size = 32  # mxfp4 block size
 
-    if mxfp4_backend in (
+    if mxfp4_backend == Mxfp4MoeBackend.HUMMING:
+        from vllm.model_executor.layers.quantization.utils.humming_utils import (
+            prepare_humming_moe_layer,
+        )
+
+        prepare_humming_moe_layer(layer, {"quant_method": "gpt_oss_mxfp4"})
+        return (
+            layer.w13_weight,
+            layer.w2_weight,
+            layer.w13_weight_scale,
+            layer.w2_weight_scale,
+            getattr(layer, "w13_bias", None),
+            getattr(layer, "w2_bias", None),
+        )
+    elif mxfp4_backend in (
         Mxfp4MoeBackend.MARLIN,
         Mxfp4MoeBackend.BATCHED_MARLIN,
     ):
@@ -806,7 +898,7 @@ def _interleave_mxfp4_cutlass_sm90(w):
                 w2_bias,
             )
 
-    elif mxfp4_backend == Mxfp4MoeBackend.AITER:
+    elif mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_BF16:
         from vllm._aiter_ops import rocm_aiter_ops
 
         if w13_bias is not None:
@@ -868,6 +960,63 @@ def _interleave_mxfp4_cutlass_sm90(w):
             w2_bias,
         )
 
+    elif mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_FP8:
+        # W4A8: MXFP4 weights + static FP8 activations (triton kernel)
+        from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
+        from triton_kernels.numerics import InFlexData
+
+        if w13_bias is not None:
+            w13_bias = w13_bias.to(torch.float32)
+        if w2_bias is not None:
+            w2_bias = w2_bias.to(torch.float32)
+
+        # Process static FP8 input scales (reduce to scalar, warn if not uniform)
+        w13_input_scale = layer.w13_input_scale
+        w2_input_scale = layer.w2_input_scale
+        if w13_input_scale is None or w2_input_scale is None:
+            raise ValueError(
+                "W4A8 (AITER_MXFP4_FP8) requires static input scales, but found "
+                "w13_input_scale or w2_input_scale is None."
+            )
+        if not all_close_1d(w13_input_scale) or not all_close_1d(w2_input_scale):
+            logger.warning_once(
+                "Found input_scales that are not equal for "
+                "fp8 MoE layer. Using the maximum across experts "
+                "for each layer."
+            )
+        w13_input_scale = w13_input_scale.max().to(torch.float32)
+        w2_input_scale = w2_input_scale.max().to(torch.float32)
+
+        # Swizzle weights for GFX950
+        w13_weight, w13_flex, w13_scale = _swizzle_mxfp4(w13_weight, w13_weight_scale)
+        w2_weight, w2_flex, w2_scale = _swizzle_mxfp4(w2_weight, w2_weight_scale)
+
+        # Create InFlexData for activation scales
+        lhs_data13 = InFlexData(scale=w13_input_scale)
+        lhs_data2 = InFlexData(scale=w2_input_scale)
+
+        # Create PrecisionConfig with both weight and activation info
+        w13_precision_config = PrecisionConfig(
+            weight_scale=w13_scale,
+            flex_ctx=FlexCtx(rhs_data=w13_flex, lhs_data=lhs_data13),
+        )
+        w2_precision_config = PrecisionConfig(
+            weight_scale=w2_scale,
+            flex_ctx=FlexCtx(rhs_data=w2_flex, lhs_data=lhs_data2),
+        )
+
+        del layer.w13_weight
+        del layer.w2_weight
+
+        return (
+            w13_weight,
+            w2_weight,
+            w13_precision_config,
+            w2_precision_config,
+            w13_bias,
+            w2_bias,
+        )
+
     elif mxfp4_backend in TRITON_BACKENDS:
         from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
 
@@ -970,6 +1119,21 @@ def convert_weight_to_mxfp4_moe_kernel_format(
             w2_bias,
         )
 
+    if mxfp4_backend == Mxfp4MoeBackend.HUMMING:
+        from vllm.model_executor.layers.quantization.utils.humming_utils import (
+            prepare_humming_moe_layer,
+        )
+
+        prepare_humming_moe_layer(layer, {"quant_method": "mxfp4"})
+        return (
+            layer.w13_weight,
+            layer.w2_weight,
+            layer.w13_weight_scale,
+            layer.w2_weight_scale,
+            getattr(layer, "w13_bias", None),
+            getattr(layer, "w2_bias", None),
+        )
+
     if mxfp4_backend in (Mxfp4MoeBackend.MARLIN, Mxfp4MoeBackend.BATCHED_MARLIN):
         from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
             prepare_moe_mxfp4_layer_for_marlin,
@@ -1107,6 +1271,64 @@ def convert_weight_to_mxfp4_moe_kernel_format(
             w2_bias,
         )
 
+    elif mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_BF16:
+        from vllm._aiter_ops import rocm_aiter_ops
+
+        if w13_bias is not None:
+            w13_bias = w13_bias.data.to(torch.float32)
+        if w2_bias is not None:
+            w2_bias = w2_bias.data.to(torch.float32)
+
+        e, n, k = w13_weight.shape
+
+        w13_weight.view(torch.uint8).copy_(
+            w13_weight.data.view(torch.uint8)
+            .view(e, n // 2, 2, k)
+            .permute(0, 2, 1, 3)
+            .contiguous()
+            .view(e, n, k)
+        )
+        w13_weight_scale.data = (
+            w13_weight_scale.data.view(e, n // 2, 2, -1)
+            .permute(0, 2, 1, 3)
+            .contiguous()
+            .view(e, n, -1)
+        )
+
+        w13_weight.data = w13_weight.data.view(torch.float4_e2m1fn_x2)
+        w2_weight.data = w2_weight.data.view(torch.float4_e2m1fn_x2)
+
+        w13_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(w13_weight, 16, True)
+        shuffled_w13_scale = rocm_aiter_ops.shuffle_scale_a16w4(
+            w13_weight_scale.view(-1, w13_weight_scale.shape[-1]),
+            num_experts,
+            True,
+        )
+
+        w2_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(w2_weight, 16, False)
+        shuffled_w2_scale = rocm_aiter_ops.shuffle_scale_a16w4(
+            w2_weight_scale.view(-1, w2_weight_scale.shape[-1]),
+            num_experts,
+            False,
+        )
+
+        if w13_bias is not None:
+            w13_bias = (
+                w13_bias.data.view(-1, n // 2, 2)
+                .permute(0, 2, 1)
+                .contiguous()
+                .view(-1, n)
+            )
+
+        return (
+            w13_weight,
+            w2_weight,
+            shuffled_w13_scale,
+            shuffled_w2_scale,
+            w13_bias,
+            w2_bias,
+        )
+
     elif mxfp4_backend in TRITON_BACKENDS:
         from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
 
@@ -1162,7 +1384,7 @@ def shuffle_weight(w: torch.Tensor) -> torch.Tensor:
     else:
         raise ValueError(
             f"Unsupported mxfp4_backend for Mxfp4MoEMethod: {mxfp4_backend}. "
-            f"Expected TRTLLM or Triton backend."
+            f"Expected TRTLLM, Triton, or AITER backend."
         )
 
 
@@ -1175,6 +1397,9 @@ def make_mxfp4_moe_quant_config(
     swiglu_limit: float | None = None,
     w1_bias: torch.Tensor | None = None,
     w2_bias: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    layer: torch.nn.Module | None = None,
 ) -> FusedMoEQuantConfig | None:
     """Create a FusedMoEQuantConfig for the given MXFP4 backend."""
     if mxfp4_backend == Mxfp4MoeBackend.DEEPGEMM_MXFP4:
@@ -1195,10 +1420,7 @@ def make_mxfp4_moe_quant_config(
             gemm1_beta=gemm1_beta,
             gemm1_clamp_limit=swiglu_limit,
         )
-    elif mxfp4_backend in (
-        Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8,
-        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
-    ):
+    elif mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8:
         return mxfp4_mxfp8_moe_quant_config(
             w1_bias=w1_bias,
             w2_bias=w2_bias,
@@ -1207,6 +1429,28 @@ def make_mxfp4_moe_quant_config(
             gemm1_alpha=gemm1_alpha,
             gemm1_beta=gemm1_beta,
             gemm1_clamp_limit=swiglu_limit,
+            mx_alignment=256,
+        )
+    elif mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8:
+        return mxfp4_mxfp8_moe_quant_config(
+            w1_bias=w1_bias,
+            w2_bias=w2_bias,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            gemm1_alpha=gemm1_alpha,
+            gemm1_beta=gemm1_beta,
+            gemm1_clamp_limit=swiglu_limit,
+        )
+    elif mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_FP8:
+        # W4A8: MXFP4 weights + static FP8 activations
+        return mxfp4_w4a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            w1_bias=w1_bias,
+            w2_bias=w2_bias,
+            block_shape=None,
         )
     elif mxfp4_backend in (
         Mxfp4MoeBackend.MARLIN,
@@ -1215,7 +1459,7 @@ def make_mxfp4_moe_quant_config(
         Mxfp4MoeBackend.TRITON_UNFUSED,
         Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16,
         Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
-        Mxfp4MoeBackend.AITER,
+        Mxfp4MoeBackend.AITER_MXFP4_BF16,
     ):
         return mxfp4_w4a16_moe_quant_config(
             w1_bias=w1_bias,
@@ -1226,6 +1470,14 @@ def make_mxfp4_moe_quant_config(
             gemm1_beta=gemm1_beta,
             gemm1_clamp_limit=swiglu_limit,
         )
+    elif mxfp4_backend == Mxfp4MoeBackend.HUMMING:
+        from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+        from vllm.model_executor.layers.quantization.utils.humming_utils import (
+            get_humming_moe_quant_config,
+        )
+
+        assert isinstance(layer, FusedMoE)
+        return get_humming_moe_quant_config(layer)
     else:
         return ocp_mx_moe_quant_config(
             quant_dtype="mxfp4",
@@ -1246,11 +1498,11 @@ def make_mxfp4_moe_kernel(
     mxfp4_backend: Mxfp4MoeBackend,
     routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
     shared_experts: torch.nn.Module | None = None,
+    layer: torch.nn.Module | None = None,
 ) -> mk.FusedMoEKernel:
     """Create a FusedMoEKernel for the given MXFP4 backend."""
     is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic)
 
-    # Create Prepare/Finalize.
     prepare_finalize = maybe_make_prepare_finalize(
         moe=moe_config,
         quant_config=moe_quant_config,
@@ -1262,6 +1514,11 @@ def make_mxfp4_moe_kernel(
 
     logger.info_once("Using %s", prepare_finalize.__class__.__name__)
 
+    extra_kwargs = {}
+    if mxfp4_backend == Mxfp4MoeBackend.HUMMING:
+        assert layer is not None
+        extra_kwargs["layer"] = layer
+
     # Create Experts.
     if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
         max_num_tokens = prepare_finalize.max_num_tokens_per_rank()
@@ -1271,11 +1528,13 @@ def make_mxfp4_moe_kernel(
             quant_config=moe_quant_config,
             max_num_tokens=max_num_tokens,
             num_dispatchers=prepare_finalize.num_dispatchers(),
+            **extra_kwargs,
         )
     else:
         experts = experts_cls(
             moe_config=moe_config,
             quant_config=moe_quant_config,
+            **extra_kwargs,
         )
 
     kernel = mk.FusedMoEKernel(
diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
index db6d56e3c3ac..f4796243e013 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
@@ -31,6 +31,9 @@
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
     prepare_nvfp4_moe_layer_for_marlin,
 )
+from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (
+    kE2M1ToFloat_handle,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
 )
@@ -376,6 +379,10 @@ def convert_to_nvfp4_moe_kernel_format(
             is_act_and_mul=is_act_and_mul,
         )
     elif nvfp4_backend == NvFp4MoeBackend.EMULATION:
+        # Move the E2M1 lookup table to the device now, because
+        # `.to(device)` is not allowed during CUDA graph capture.
+        kE2M1ToFloat_handle.val = kE2M1ToFloat_handle.val.to(w13.device)
+
         if a13_scale is None or a2_scale is None:
             raise ValueError(
                 "Activation global scales should not be None, got"
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py
index 058d09d23bf2..e5d2b601a768 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py
@@ -16,7 +16,6 @@
     moe_kernel_quantize_input,
     normalize_batched_scales_shape,
 )
-from vllm.platforms import current_platform
 from vllm.v1.worker.ubatching import (
     dbo_current_ubatch_id,
     dbo_enabled,
@@ -290,46 +289,29 @@ def prepare_async(
 
         # Dispatch
         dispatch_topk_ids = self._map_global_to_physical_ids(topk_ids)
-        if current_platform.is_rocm():
-            (
-                expert_x,
-                expert_num_tokens,
-                handle,
-                _,
-                hook,
-            ) = self.buffer.low_latency_dispatch(
-                a1,
-                dispatch_topk_ids,
-                self.max_tokens_per_rank,
-                num_experts,
-                use_fp8=self.use_fp8_dispatch,
-                async_finish=False,
-                return_recv_hook=True,
-            )
-        else:
-            (
-                expert_x,
-                expert_num_tokens,
-                handle,
-                _,
-                hook,
-            ) = self.buffer.low_latency_dispatch(
-                a1,
-                dispatch_topk_ids,
-                self.max_tokens_per_rank,
-                num_experts,
-                use_fp8=self.use_fp8_dispatch,
-                round_scale=self.use_ue8m0_dispatch,
-                use_ue8m0=self.use_ue8m0_dispatch,
-                **(dict(use_nvfp4=True) if use_nvfp4 else dict()),
-                **(
-                    dict(x_global_scale=qc_a1_gscale_or_scale)
-                    if qc_a1_gscale_or_scale is not None and nvfp4_dispatch
-                    else dict()
-                ),
-                async_finish=False,
-                return_recv_hook=True,
-            )
+        (
+            expert_x,
+            expert_num_tokens,
+            handle,
+            _,
+            hook,
+        ) = self.buffer.low_latency_dispatch(
+            a1,
+            dispatch_topk_ids,
+            self.max_tokens_per_rank,
+            num_experts,
+            use_fp8=self.use_fp8_dispatch,
+            round_scale=self.use_ue8m0_dispatch,
+            use_ue8m0=self.use_ue8m0_dispatch,
+            **(dict(use_nvfp4=True) if use_nvfp4 else dict()),
+            **(
+                dict(x_global_scale=qc_a1_gscale_or_scale)
+                if qc_a1_gscale_or_scale is not None and nvfp4_dispatch
+                else dict()
+            ),
+            async_finish=False,
+            return_recv_hook=True,
+        )
         self.handles[a2a_idx] = handle
 
         return (
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py
index a04ff3b8b68f..6cc0d01cde6b 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py
@@ -31,6 +31,8 @@ def __init__(
         num_experts: int,
         hidden_size: int,
         num_dispatchers: int = 1,
+        dispatch_dtype_bytes_per_elem: int = 0,
+        dispatch_scale_bytes_per_token: int = 0,
     ):
         super().__init__()
         self.max_num_tokens = max_num_tokens
@@ -38,6 +40,7 @@ def __init__(
         self.num_experts = num_experts
         self.hidden_size = hidden_size
         self.num_dispatchers_ = num_dispatchers
+        self.scale_elems_per_token = dispatch_scale_bytes_per_token
 
         device_communicator = get_ep_group().device_communicator
         assert device_communicator is not None
@@ -49,6 +52,8 @@ def __init__(
             top_k=self.top_k,
             num_experts=self.num_experts,
             hidden_size=self.hidden_size,
+            dispatch_dtype_bytes_per_elem=dispatch_dtype_bytes_per_elem,
+            dispatch_scale_bytes_per_token=dispatch_scale_bytes_per_token,
         )
 
     @property
@@ -92,19 +97,24 @@ def prepare(
             else a1.shape[0]
         )
 
-        a1q, a1q_scale = moe_kernel_quantize_input(
-            a1,
-            quant_config.a1_gscale,
-            quant_config.quant_dtype,
-            quant_config.per_act_token_quant,
-            quant_config.block_shape,
-            is_fp4_scale_swizzled=False,  # delay swizzle to after comm
-        )
+        if defer_input_quant:
+            a1q, a1q_scale = a1, None
+        else:
+            a1q, a1q_scale = moe_kernel_quantize_input(
+                a1,
+                quant_config.a1_gscale,
+                quant_config.quant_dtype,
+                quant_config.per_act_token_quant,
+                quant_config.block_shape,
+                is_fp4_scale_swizzled=False,  # delay swizzle to after comm
+                mx_alignment=quant_config.mx_alignment,
+            )
 
         payloads = []
         payloads.append(a1q)
         if a1q_scale is not None:
             payloads.append(a1q_scale)
+        topk_ids_payload_index = len(payloads)
         payloads.append(topk_ids)
         payloads.append(topk_weights)
 
@@ -113,6 +123,8 @@ def prepare(
             token_selected_experts=topk_ids,
             input_payloads=payloads,
             runtime_max_tokens_per_rank=self.runtime_max_tokens_per_rank,
+            invalid_token_expert_id=-1,  # Follow TRTLLM Pattern
+            expert_id_payload_index=topk_ids_payload_index,
         )
         if a1q_scale is not None:
             a1q_recv, a1q_scale_recv, topk_ids_recv, topk_weights_recv = recv_payloads
@@ -124,7 +136,8 @@ def prepare(
                 a1q_scale_recv = a1q_scale_recv.view(-1, a1q_scale_recv.shape[-1])
                 a1q_scale_recv = a1q_scale_recv.view(torch.uint8)
                 a1q_scale_recv = nvfp4_block_scale_interleave(a1q_scale_recv)
-            a1q_scale_recv = a1q_scale_recv.view(-1, self.hidden_size // 16)
+            assert self.scale_elems_per_token > 0
+            a1q_scale_recv = a1q_scale_recv.view(-1, self.scale_elems_per_token)
         else:
             a1q_recv, topk_ids_recv, topk_weights_recv = recv_payloads
             a1q_scale_recv = None
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py
index 47fe293d511e..78be414759f7 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py
@@ -174,6 +174,7 @@ def flashinfer_alltoall_dispatch(
             # the hidden states, breaking the A2A kernel. So, we
             # delay the swizzling until after the A2A.
             is_fp4_scale_swizzled=False,
+            mx_alignment=quant_config.mx_alignment,
         )
 
         x = MnnvlMoe.mnnvl_moe_alltoallv(
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py
index 2b21e2db9f68..5b3325ad0195 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py
@@ -40,6 +40,7 @@ def _quantize_and_setup_dispatch(
             per_act_token_quant=quant_config.per_act_token_quant,
             block_shape=quant_config.block_shape,
             is_fp4_scale_swizzled=False,
+            mx_alignment=quant_config.mx_alignment,
         )
 
     # Skip gathering scales if we have static quantization
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py
index b9d57da08326..31a35bd60218 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py
@@ -31,6 +31,7 @@ def _quantize_input(
         per_act_token_quant=quant_config.per_act_token_quant,
         block_shape=quant_config.block_shape,
         is_fp4_scale_swizzled=quant_config.is_nvfp4_scale_swizzled,
+        mx_alignment=quant_config.mx_alignment,
     )
 
     return a1q, a1q_scale
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index 495b9daaff45..d9d888296b74 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -252,7 +252,8 @@ def rocm_aiter_fused_experts(
 
     else:
         quant_method = QuantMethod.NO.value
-        # mxfp4: both w4a4 (quark) and w4a16 (oracle CK) use BLOCK_1X32
+        # mxfp4 i.e. w4a4, w4a16 uses BLOCK_1X32
+        # mxfp6 and mxfp8 are unsupported in AITER currently and use emulation instead
         if quant_config.use_mxfp4_w4a4 or quant_config.use_mxfp4_w4a16:
             quant_method = QuantMethod.BLOCK_1X32.value
         # w8a8 block-scaled
diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py
index 5b93b3d5c6ea..bf4d02782a11 100644
--- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py
+++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py
@@ -24,6 +24,24 @@
 
 logger = logging.getLogger(__name__)
 
+
+def _get_num_experts_per_tok(hf_config) -> int:
+    """Resolve the per-token expert count from the HF config.
+
+    Different model families store this under different attribute names
+    (e.g. ``num_experts_per_tok`` for DeepSeek, ``top_k_experts`` for Gemma 4).
+    """
+    val = getattr(hf_config, "num_experts_per_tok", None)
+    if val is None:
+        val = getattr(hf_config, "top_k_experts", None)
+    if val is None:
+        raise ValueError(
+            "Cannot determine num_experts_per_tok: HF config has neither "
+            "'num_experts_per_tok' nor 'top_k_experts'"
+        )
+    return val
+
+
 # Constants
 _TMP_DIR = tempfile.gettempdir()
 _LOCK_FILE_PREFIX = os.path.join(_TMP_DIR, "vllm_routed_experts")
@@ -127,7 +145,7 @@ def init_buffer(
 
         hf_config = vllm_config.model_config.hf_text_config
         num_layers = hf_config.num_hidden_layers
-        num_experts_per_tok = hf_config.num_experts_per_tok
+        num_experts_per_tok = _get_num_experts_per_tok(hf_config)
 
         # Initialize device buffer
         self._device_buffer = torch.zeros(
@@ -300,7 +318,7 @@ def attach_buffer(
         shape = (
             max_num_kv_tokens,
             hf_config.num_hidden_layers,
-            hf_config.num_experts_per_tok,
+            _get_num_experts_per_tok(hf_config),
         )
 
         self.dp_rank = vllm_config.parallel_config.data_parallel_rank
diff --git a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py
index c1bd7a6993ab..8e35169d9005 100644
--- a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py
+++ b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py
@@ -34,11 +34,15 @@ def __init__(
 
     @property
     def routing_method_type(self) -> RoutingMethodType:
+        from vllm.model_executor.models.cohere_moe import token_choice_with_bias
         from vllm.model_executor.models.llama4 import Llama4MoE
 
         # NOTE: FLASHINFER_TRTLLM support the Llama4 router.
         if self.custom_routing_function == Llama4MoE.custom_routing_function:
             return RoutingMethodType.Llama4
+        # Cohere MoE uses a sigmoid -> top-k -> renormalize routing function.
+        if self.custom_routing_function == token_choice_with_bias:
+            return RoutingMethodType.SigmoidRenorm
         return RoutingMethodType.Custom
 
     def _compute_routing(
diff --git a/vllm/model_executor/layers/fused_moe/router/gate_linear.py b/vllm/model_executor/layers/fused_moe/router/gate_linear.py
index 77d8e756026d..a868c9c8487e 100644
--- a/vllm/model_executor/layers/fused_moe/router/gate_linear.py
+++ b/vllm/model_executor/layers/fused_moe/router/gate_linear.py
@@ -105,7 +105,7 @@ def forward(
 
         # Tier 2: cuBLAS bf16→fp32
         if self.allow_cublas_router_gemm and x.dtype == torch.bfloat16:
-            output = ops.router_gemm_bf16_fp32(x, self.weight)
+            output = torch.mm(x, self.weight.T, out_dtype=torch.float32)
             return output, None
 
         # Tier 3: F.linear (ReplicatedLinear)
diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py
index bf8641b060f2..2eee8acf6b8f 100644
--- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py
+++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py
@@ -220,7 +220,7 @@ def __init__(
         self.routed_output_transform = routed_output_transform
         self.routed_scaling_factor = routed_scaling_factor
         self.gate = gate
-        self.quant_method = quant_method
+        self._quant_method = quant_method
         self.enable_dbo = enable_dbo
 
         self._shared_experts: SharedExperts | None = None
@@ -263,7 +263,7 @@ def shared_experts(self) -> SharedExperts | None:
     def _replace_quant_method(self, quant_method: FusedMoEMethodBase):
         if self._shared_experts is not None:
             self._shared_experts._quant_method = quant_method
-        self.quant_method = quant_method
+        self._quant_method = quant_method
 
     def is_internal_router(self) -> bool:
         return self.gate is not None
@@ -330,8 +330,8 @@ def _maybe_apply_routed_scale_to_output(
     @property
     def _fused_output_is_reduced(self) -> bool:
         return (
-            self.quant_method.moe_kernel is not None
-            and self.quant_method.moe_kernel.output_is_reduced()
+            self._quant_method.moe_kernel is not None
+            and self._quant_method.moe_kernel.output_is_reduced()
         )
 
     def _maybe_reduce_shared_expert_output(
@@ -407,7 +407,7 @@ def _maybe_pad_hidden_states(
         )
         transformed_hidden_dim = hidden_states.shape[-1]
         if (
-            not self.quant_method.skip_forward_padding
+            not self._quant_method.skip_forward_padding
             and self.moe_config.hidden_dim != transformed_hidden_dim
         ):
             hidden_states = F.pad(
@@ -451,8 +451,8 @@ def _apply_quant_method(
             shared_experts_input, SharedExpertsOrder.NO_OVERLAP
         )
 
-        if self.quant_method.is_monolithic:
-            fused_out = self.quant_method.apply_monolithic(
+        if self._quant_method.is_monolithic:
+            fused_out = self._quant_method.apply_monolithic(
                 layer=layer,
                 x=hidden_states,
                 router_logits=router_logits,
@@ -467,7 +467,7 @@ def _apply_quant_method(
 
             # Passing shared_experts_input in case SharedExpertsOrder is
             # MK_INTERNAL_OVERLAPPED.
-            fused_out = self.quant_method.apply(
+            fused_out = self._quant_method.apply(
                 layer=layer,
                 x=hidden_states,
                 topk_weights=topk_weights,
@@ -618,7 +618,7 @@ def forward(
     @property
     def do_naive_dispatch_combine(self) -> bool:
         return (
-            self.moe_config.dp_size > 1 and not self.quant_method.supports_internal_mk
+            self.moe_config.dp_size > 1 and not self._quant_method.supports_internal_mk
         )
 
     def _maybe_dispatch(
diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py
index 9a6c37aa3983..e3b239ca60fa 100644
--- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py
+++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py
@@ -4,6 +4,7 @@
 
 import torch
 
+from vllm.model_executor.custom_op import PluggableLayer
 from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
     FusedMoEMethodBase,
 )
@@ -12,7 +13,7 @@
 )
 
 
-class MoERunnerInterface(ABC):
+class MoERunnerInterface(PluggableLayer, ABC):
     """
     Abstract base class for Mixture of Experts (MoE) runners.
 
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index 89697033403d..91de16f79c68 100644
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -154,32 +154,52 @@ def _setup_kernel(
         w2: torch.Tensor,
     ) -> None:
         # Shuffle weights to runtime format.
-        w13, w2 = convert_to_unquantized_kernel_format(
+        w13_new, w2_new = convert_to_unquantized_kernel_format(
             self.unquantized_backend,
             layer=layer,
             w13_weight=w13,
             w2_weight=w2,
         )
-        replace_parameter(layer, "w13_weight", w13)
-        replace_parameter(layer, "w2_weight", w2)
+        # `moe_kernel` is initialized to None in FusedMoEMethodBase.__init__;
+        # On the first call we replace the parameter normally. On subsequent
+        # calls (e.g. RL weight updates that re-trigger
+        # process_weights_after_loading) the moe kernel has already been set
+        # up and CUDA graphs may have captured the parameter addresses, so
+        # we copy the shuffled data into the existing storage instead of
+        # re-registering a new Parameter.
+        is_weight_update = self.moe_kernel is not None  # type: ignore[has-type]
+        replace_parameter(layer, "w13_weight", w13_new, prefer_copy=is_weight_update)
+        replace_parameter(layer, "w2_weight", w2_new, prefer_copy=is_weight_update)
 
-        # Setup moe kernel.
-        self.moe_quant_config = self.get_fused_moe_quant_config(layer)
-        assert self.moe_quant_config is not None
-        assert self.experts_cls is not None
-        self.moe_kernel = make_unquantized_moe_kernel(
-            quant_config=self.moe_quant_config,
-            moe_config=self.moe,
-            backend=self.unquantized_backend,
-            experts_cls=self.experts_cls,
-            routing_tables=layer._maybe_init_expert_routing_tables(),
-            shared_experts=layer.shared_experts,
-        )
+        if not is_weight_update:
+            # Setup moe kernel only on the first call. For the unquantized
+            # method, moe_quant_config is either the constant
+            # FUSED_MOE_UNQUANTIZED_CONFIG or biased_moe_quant_config(...)
+            # which references layer.w{13,2}_bias; since weight updates
+            # mutate those bias tensors in place, the kernel does not need
+            # to be re-built.
+            self.moe_quant_config = self.get_fused_moe_quant_config(layer)
+            assert self.moe_quant_config is not None
+            assert self.experts_cls is not None
+            self.moe_kernel = make_unquantized_moe_kernel(
+                quant_config=self.moe_quant_config,
+                moe_config=self.moe,
+                backend=self.unquantized_backend,
+                experts_cls=self.experts_cls,
+                routing_tables=layer._maybe_init_expert_routing_tables(),
+                shared_experts=layer.shared_experts,
+            )
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         super().process_weights_after_loading(layer)
 
-        # Padding the weight for better performance on ROCm
+        # Padding the weight for better performance on ROCm.
+        # _maybe_pad_weight is idempotent: on the first call it allocates a
+        # padded storage and returns a strided view; on subsequent calls
+        # (weight updates) the stride condition no longer matches so it
+        # returns the input unchanged. The reassignment to .data is therefore
+        # a no-op on updates and preserves the storage address (data_ptr)
+        # used by captured CUDA graphs.
         layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
         layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
 
diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py
index ffab3ca0bfa9..5cf57c4ffb08 100644
--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
@@ -208,11 +208,12 @@ def _mxfp8_e4m3_quantize(
     per_act_token_quant: bool,
     block_shape: list[int] | None = None,
     is_sf_swizzled_layout: bool = False,
+    mx_alignment: int = 0,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert A_scale is None
     assert not per_act_token_quant
     assert block_shape is None or block_shape == [1, 32]
-    return mxfp8_e4m3_quantize(A, is_sf_swizzled_layout)
+    return mxfp8_e4m3_quantize(A, is_sf_swizzled_layout, mx_alignment)
 
 
 def _mxfp6_e3m2_quantize(
@@ -258,6 +259,7 @@ def moe_kernel_quantize_input(
     is_fp4_scale_swizzled: bool = True,
     ocp_mx_scheme: str | None = None,
     quantization_emulation: bool = False,
+    mx_alignment: int = 0,
 ) -> tuple[torch.Tensor, torch.Tensor | None]:
     # Handle OCP MX scheme that requires QDQ (quantize-dequantize) for emulation
     if ocp_mx_scheme is not None:
@@ -298,7 +300,8 @@ def moe_kernel_quantize_input(
                 A, A_scale, is_sf_swizzled_layout=is_fp4_scale_swizzled
             )
         else:
-            return ref_nvfp4_quant_dequant(A, A_scale, block_size=16)
+            A = ref_nvfp4_quant_dequant(A, A_scale, block_size=16)
+            return A, None
     elif quant_dtype == "mxfp4":
         if not quantization_emulation:
             raise NotImplementedError(
@@ -319,7 +322,8 @@ def moe_kernel_quantize_input(
             A_scale,
             per_act_token_quant,
             block_shape,
-            is_sf_swizzled_layout=is_fp4_scale_swizzled,
+            is_sf_swizzled_layout=False,
+            mx_alignment=mx_alignment,
         )
     elif quant_dtype == "mxfp6_e3m2":
         if not quantization_emulation:
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index d9184bb77070..a5d4e4db79fe 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -8,68 +8,15 @@
 
 # Import kernels
 import vllm.kernels  # noqa: F401
-from vllm import _oink_ops, envs, ir
-from vllm._aiter_ops import rocm_aiter_ops
+from vllm import envs, ir
+from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.batch_invariant import (
-    rms_norm_batch_invariant,
-)
-from vllm.platforms import current_platform
+from vllm.model_executor.layers.batch_invariant import rms_norm_batch_invariant
 
 logger = init_logger(__name__)
 
 
-def _can_view_as_2d(x: torch.Tensor) -> bool:
-    """Return True if x.view(-1, x.shape[-1]) is viewable (no copy)."""
-    if x.dim() < 2:
-        return False
-    if x.dim() == 2:
-        return True
-    # For a view(-1, N) to be valid, all leading dims must be contiguous with
-    # respect to each other (size-1 dims are ignored).
-    for dim in range(x.dim() - 1):
-        # Strides for size-1 dims are irrelevant and can be arbitrary.
-        if x.size(dim + 1) != 1 and x.stride(dim) != x.stride(dim + 1) * x.size(
-            dim + 1
-        ):
-            return False
-    return True
-
-
-def _is_oink_stride_compatible_2d(x_2d: torch.Tensor) -> bool:
-    """Return True if x_2d meets Oink's pointer-path stride constraints."""
-    if x_2d.dim() != 2:
-        return False
-    if x_2d.stride(1) != 1:
-        return False
-    # Match Oink's vectorization constraint: stride(0) divisible by 256b.
-    if x_2d.dtype in (torch.float16, torch.bfloat16):
-        divby = 16
-    elif x_2d.dtype == torch.float32:
-        divby = 8
-    else:
-        return False
-    return (x_2d.stride(0) % divby) == 0
-
-
-def fused_add_rms_norm(
-    x: torch.Tensor,
-    residual: torch.Tensor,
-    weight: torch.Tensor,
-    variance_epsilon: float,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    from vllm import _custom_ops as ops
-
-    ops.fused_add_rms_norm(
-        x,
-        residual,
-        weight,
-        variance_epsilon,
-    )
-    return x, residual
-
-
 def poly_norm(
     x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, variance_epsilon: float
 ) -> torch.Tensor:
@@ -86,18 +33,6 @@ def poly_norm(
     return out
 
 
-def dispatch_rocm_rmsnorm_func(dtype: torch.dtype, use_aiter: bool = False):
-    use_aiter = use_aiter and dtype in [
-        torch.float16,
-        torch.bfloat16,
-    ]
-
-    if use_aiter:
-        return rocm_aiter_ops.rms_norm2d_with_add
-    else:
-        return fused_add_rms_norm
-
-
 # --8<-- [start:rms_norm]
 @CustomOp.register("rms_norm")
 class RMSNorm(CustomOp):
@@ -130,105 +65,19 @@ def __init__(
         if self.has_weight:
             self.weight = nn.Parameter(self.weight)
 
-        if current_platform.is_rocm():
-            aiter_rmsnorm_enabled = rocm_aiter_ops.is_rmsnorm_enabled()
-            self.rocm_norm_func_with_add = dispatch_rocm_rmsnorm_func(
-                dtype=weight_dtype, use_aiter=aiter_rmsnorm_enabled
-            )
-
-        # Optional: enable Oink Blackwell RMSNorm custom-op fast path on
-        # compatible CUDA devices (e.g., SM100) when the external Oink
-        # package is available. This is detected once at construction time
-        # to avoid per-call device queries in the hot path.
-        self._use_oink_fused_add_rmsnorm = False
-        if (
-            not current_platform.is_rocm()
-            and torch.cuda.is_available()
-            and bool(getattr(envs, "VLLM_USE_OINK_OPS", False))
-        ):
-            # NOTE: vLLM disables custom ops by default when using Inductor.
-            # If this op is disabled, CustomOp will dispatch to forward_native,
-            # and the Oink path in forward_cuda will never run.
-            if getattr(self._forward_method, "__func__", None) is getattr(
-                self.forward_native, "__func__", None
-            ):
-                try:
-                    from vllm.config import get_cached_compilation_config
-
-                    custom_ops = get_cached_compilation_config().custom_ops
-                except Exception:
-                    custom_ops = ["<unknown>"]
-                logger.warning_once(
-                    "VLLM_USE_OINK_OPS=1 but the `rms_norm` custom op is "
-                    "disabled (CompilationConfig.custom_ops=%s). Enable it via "
-                    "`compilation_config={'custom_ops': ['none', '+rms_norm']}` "
-                    "(or `['all']`) to let vLLM call into torch.ops.oink.*.",
-                    custom_ops,
-                )
-                # Custom op disabled => forward_cuda won't run. Avoid doing any
-                # external Oink initialization work in this case.
-            else:
-                try:
-                    device_index = torch.accelerator.current_device_index()
-                    if _oink_ops.is_oink_available_for_device(device_index):
-                        self._use_oink_fused_add_rmsnorm = (
-                            _oink_ops.has_fused_add_rms_norm()
-                        )
-                except Exception as e:
-                    # If anything goes wrong (no Oink install, CPU-only env, etc.),
-                    # silently fall back to the built-in RMSNorm path.
-                    logger.warning_once(
-                        "VLLM_USE_OINK_OPS=1 but failed to initialize Oink "
-                        "RMSNorm; falling back to vLLM RMSNorm. Error: %s",
-                        e,
-                    )
-                    self._use_oink_fused_add_rmsnorm = False
-
-    @staticmethod
-    def forward_static(
-        x: torch.Tensor,
-        variance_epsilon: float,
-        hidden_size: int,
-        orig_dtype: torch.dtype,
-        weight: torch.Tensor | None = None,
-        residual: torch.Tensor | None = None,
-        variance_size_override: int | None = None,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        """PyTorch-native implementation equivalent to forward()."""
-        x = x.to(torch.float32)
-        if residual is not None:
-            # residual promoted f16->f32 automatically,
-            # otherwise Inductor eliminates the casts to and from f16,
-            # increasing memory usage (and complicating pattern matching)
-            x = x + residual
-            residual = x.to(orig_dtype)
-
-        if x.shape[-1] != hidden_size:
-            raise ValueError(
-                f"Expected hidden_size to be {hidden_size}, but found: {x.shape[-1]}"
-            )
-
-        if variance_size_override is None:
-            x_var = x
-        else:
-            if hidden_size < variance_size_override:
-                raise ValueError(
-                    "Expected hidden_size to be at least "
-                    f"{variance_size_override}, but found: {hidden_size}"
-                )
-
-            x_var = x[:, :, :variance_size_override]
-
-        variance = x_var.pow(2).mean(dim=-1, keepdim=True)
-
-        x = x * torch.rsqrt(variance + variance_epsilon)
-        x = x.to(orig_dtype)
-        if weight is not None:
-            x = x * weight
-        if residual is None:
-            return x
-        else:
-            return x, residual
+        # Do not pass identity weight to native implementation (causes issue on TPU).
+        # Other implementations require weight to be passed even if all ones.
+        # Cheat and predict if native will be dispatched to:
+        #  1) if native is first in priority list
+        #  2) if variance_size_override is given (only supported by native impl)
+        # TODO(luka): address weight passing inconsistency:
+        # https://github.com/vllm-project/vllm/issues/39370
+        priority = get_current_vllm_config().kernel_config.ir_op_priority
+        var_override = self.variance_size_override is not None
+        native_rms_norm = priority.rms_norm[0] == "native" or var_override
+        native_add_rms_norm = priority.fused_add_rms_norm[0] == "native" or var_override
+        self.pass_weight = self.has_weight or not native_rms_norm
+        self.pass_weight_add = self.has_weight or not native_add_rms_norm
 
     def forward_native(
         self,
@@ -237,106 +86,34 @@ def forward_native(
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
         if residual is None:
-            # TODO(luka): address the weight=None passing issue more generally
             return ir.ops.rms_norm(
                 x,
-                self.weight.data if self.has_weight else None,
+                self.weight.data if self.pass_weight else None,
+                self.variance_epsilon,
+                self.variance_size_override,
+            )
+        else:
+            return ir.ops.fused_add_rms_norm.maybe_inplace(
+                x,
+                residual,
+                self.weight.data if self.pass_weight_add else None,
                 self.variance_epsilon,
                 self.variance_size_override,
             )
-
-        return self.forward_static(
-            x,
-            self.variance_epsilon,
-            self.hidden_size,
-            x.dtype,
-            self.weight.data if self.has_weight else None,
-            residual,
-            self.variance_size_override,
-        )
 
     def forward_cuda(
         self,
         x: torch.Tensor,
         residual: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        if residual is None and not envs.VLLM_BATCH_INVARIANT:
-            return ir.ops.rms_norm(
-                x, self.weight.data, self.variance_epsilon, self.variance_size_override
-            )
-
-        if self.variance_size_override is not None:
-            return self.forward_native(x, residual)
-
-        # Optional Oink SM100 fast path (fused residual-add + RMSNorm, in-place).
-        # This mirrors vLLM's fused_add_rms_norm semantics by mutating both
-        # `x` (normalized output) and `residual` (residual-out buffer).
         if (
-            residual is not None
-            and getattr(self, "_use_oink_fused_add_rmsnorm", False)
-            and x.is_cuda
-            and residual.is_cuda
-            and x.shape == residual.shape
-            and x.dtype == residual.dtype
-            and x.dim() >= 2
-            and self.has_weight
-            and not envs.VLLM_BATCH_INVARIANT
-            and self.weight.data.dtype == x.dtype
-            and self.weight.data.is_contiguous()
+            envs.VLLM_BATCH_INVARIANT
+            and residual is None
+            and self.variance_size_override is None
         ):
-            orig_shape = x.shape
-            hidden_size = orig_shape[-1]
-            if _can_view_as_2d(x) and _can_view_as_2d(residual):
-                x_2d = x.view(-1, hidden_size)
-                res_2d = residual.view(-1, hidden_size)
-
-                # The Oink in-place pointer path supports the common vLLM
-                # layout where:
-                # - `x` may be strided/padded row-major (stride(1) == 1), and
-                # - `residual` is contiguous row-major ([M, N] with stride(0) == N).
-                # If these conditions are not met, fall back to vLLM's built-in
-                # fused kernel.
-                if (
-                    _is_oink_stride_compatible_2d(x_2d)
-                    and _is_oink_stride_compatible_2d(res_2d)
-                    and res_2d.is_contiguous()
-                ):
-                    _oink_ops.fused_add_rms_norm_(
-                        x_2d,
-                        res_2d,
-                        self.weight.data,
-                        self.variance_epsilon,
-                    )
-                    return x, residual
-
-        if residual is not None:
-            return fused_add_rms_norm(
-                x, residual, self.weight.data, self.variance_epsilon
-            )
-        else:
-            assert envs.VLLM_BATCH_INVARIANT
             return rms_norm_batch_invariant(x, self.weight.data, self.variance_epsilon)
 
-    def forward_hip(
-        self,
-        x: torch.Tensor,
-        residual: torch.Tensor | None = None,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        if residual is None and not envs.VLLM_BATCH_INVARIANT:
-            return ir.ops.rms_norm(
-                x, self.weight.data, self.variance_epsilon, self.variance_size_override
-            )
-
-        if self.variance_size_override is not None:
-            return self.forward_native(x, residual)
-
-        if residual is not None:
-            return self.rocm_norm_func_with_add(
-                x, residual, self.weight.data, self.variance_epsilon
-            )
-        else:
-            assert envs.VLLM_BATCH_INVARIANT
-            return rms_norm_batch_invariant(x, self.weight.data, self.variance_epsilon)
+        return self.forward_native(x, residual)
 
     def forward_xpu(
         self,
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 6a4c1f3c47ef..765e79331d1e 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -213,6 +213,9 @@ def create_weights(
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if current_platform.is_cpu():
+            if layer.weight.ndim != 2:
+                # this is not a linear layer
+                return
             from vllm.model_executor.layers.utils import dispatch_cpu_unquantized_gemm
 
             dispatch_cpu_unquantized_gemm(layer, remove_weight=True)
@@ -268,10 +271,13 @@ def __init__(
         self.quant_config = quant_config
         self.prefix = prefix
         self.allow_fp8_block_shape_mismatch = False
+        self.quant_method: QuantizeMethodBase
         if quant_config is None:
-            self.quant_method: QuantizeMethodBase | None = UnquantizedLinearMethod()
+            self.quant_method = UnquantizedLinearMethod()
+        elif quant_method := quant_config.get_quant_method(self, prefix=prefix):
+            self.quant_method = quant_method
         else:
-            self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
+            raise ValueError("All linear layers should support quant method.")
         self.return_bias = return_bias
         self.disable_tp = disable_tp
         self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0
@@ -335,8 +341,6 @@ def __init__(
             disable_tp=disable_tp,
         )
 
-        # All the linear layer supports quant method.
-        assert self.quant_method is not None
         self.quant_method.create_weights(
             self,
             self.input_size,
@@ -389,7 +393,6 @@ def forward(
         x: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
         bias = self.bias if not self.skip_bias_add else None
-        assert self.quant_method is not None
 
         output = self.quant_method.apply(self, x, bias)
 
@@ -474,7 +477,6 @@ def __init__(
         self._maybe_allow_fp8_block_shape_mismatch()
         self.gather_output = gather_output
 
-        assert self.quant_method is not None
         self.quant_method.create_weights(
             layer=self,
             input_size_per_partition=self.input_size_per_partition,
@@ -583,7 +585,6 @@ def forward(
         bias = self.bias if not self.skip_bias_add else None
 
         # Matrix multiply.
-        assert self.quant_method is not None
         output_parallel = self.quant_method.apply(self, input_, bias)
 
         if self.gather_output and self.tp_size > 1:
@@ -1463,7 +1464,6 @@ def __init__(
         self.input_is_parallel = input_is_parallel
         self.reduce_results = reduce_results
 
-        assert self.quant_method is not None
         self.quant_method.create_weights(
             layer=self,
             input_size_per_partition=self.input_size_per_partition,
@@ -1553,7 +1553,6 @@ def forward(
             input_parallel = split_input[self.tp_rank].contiguous()
 
         # Matrix multiply.
-        assert self.quant_method is not None
         # Only fuse bias add into GEMM for rank 0 (this ensures that
         # bias will not get added more than once in TP>1 case)
         bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
diff --git a/vllm/model_executor/layers/mamba/gdn_linear_attn.py b/vllm/model_executor/layers/mamba/gdn_linear_attn.py
index a621ab962f0a..b4699d4f0060 100644
--- a/vllm/model_executor/layers/mamba/gdn_linear_attn.py
+++ b/vllm/model_executor/layers/mamba/gdn_linear_attn.py
@@ -271,9 +271,16 @@ def __init__(
             else 0
         )
         self.gqa_interleaved_layout = gqa_interleaved_layout
-        self._forward_method = (
-            self.forward_xpu if current_platform.is_xpu() else self.forward_cuda
-        )
+        self._forward_method = self.forward_cuda
+        if current_platform.is_xpu():
+            self._forward_method = self.forward_xpu
+        elif current_platform.is_cpu():
+            from vllm.model_executor.layers.mamba.ops.cpu.gdn_attention import (
+                register_cpu_gdn_attention_ops,
+            )
+
+            register_cpu_gdn_attention_ops()
+            self._forward_method = self.forward_cpu
 
         # QKV
         self.conv_dim = self.key_dim * 2 + self.value_dim
@@ -645,6 +652,56 @@ def forward_xpu(
         core_attn_out = rearrange(core_attn_out, "... h d -> ... (h d)")
         output[:num_tokens], _ = self.out_proj(core_attn_out)
 
+    def forward_cpu(
+        self,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+    ):
+        assert not hasattr(self, "in_proj_qkv"), "lora isn't supported on CPU."
+
+        mixed_qkvz, _ = self.in_proj_qkvz(hidden_states)
+        ba, _ = self.in_proj_ba(hidden_states)
+
+        if self.gqa_interleaved_layout:
+            # Qwen3-Next: unpack the interleaved GQA layout
+            query, key, value, z, b, a = self.fix_query_key_value_ordering(
+                mixed_qkvz, ba
+            )
+            query, key, value = map(
+                lambda x: rearrange(x, "l p d -> l (p d)"), (query, key, value)
+            )
+            mixed_qkv = torch.cat((query, key, value), dim=-1)
+        else:
+            # Qwen3.5: weights are already in [q, k, v, z] and [b, a] order
+            qkv_size = (self.key_dim * 2 + self.value_dim) // self.tp_size
+            z_size = self.value_dim // self.tp_size
+            mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
+            z = z.reshape(z.size(0), -1, self.head_v_dim)
+            b, a = ba.chunk(2, dim=-1)
+
+        num_tokens = hidden_states.size(0)
+        core_attn_out = torch.zeros(
+            (num_tokens, self.num_v_heads // self.tp_size, self.head_v_dim),
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+
+        torch.ops.vllm.cpu_gdn_attention_core(
+            mixed_qkv,
+            b,
+            a,
+            core_attn_out,
+            _encode_layer_name(self.prefix),
+        )
+
+        z_shape_og = z.shape
+        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
+        z = z.reshape(-1, z.shape[-1])
+        core_attn_out = self.norm(core_attn_out, z)
+        core_attn_out = core_attn_out.reshape(z_shape_og)
+        core_attn_out = rearrange(core_attn_out, "... h d -> ... (h d)")
+        output[:num_tokens], _ = self.out_proj(core_attn_out)
+
     def _warmup_prefill_kernels(self, mixed_qkv: torch.Tensor) -> None:
         """Warm up GDN prefill kernels during V1 profiling.
 
diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py
index a5a30502b218..c1fd81e40e34 100644
--- a/vllm/model_executor/layers/mamba/mamba_utils.py
+++ b/vllm/model_executor/layers/mamba/mamba_utils.py
@@ -55,9 +55,6 @@ def linear_attention_state_dtype(
         model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
-        # TODO (tdoublep) requires testing
-        if mamba_cache_dtype == "float32":
-            raise ValueError("fp32 state for minimax is not yet supported")
         state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
         return (state_dtype,)
 
diff --git a/vllm/model_executor/layers/mamba/ops/cpu/__init__.py b/vllm/model_executor/layers/mamba/ops/cpu/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/vllm/model_executor/layers/mamba/ops/cpu/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/cpu/causal_conv1d.py
new file mode 100644
index 000000000000..b047ca6d6169
--- /dev/null
+++ b/vllm/model_executor/layers/mamba/ops/cpu/causal_conv1d.py
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import torch
+import torch.nn.functional as F
+
+
+# for prefill
+def causal_conv1d_torch(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor | None,
+    conv_states: torch.Tensor,
+    query_start_loc: torch.Tensor,
+    cache_indices: torch.Tensor,
+    has_initial_state: torch.Tensor,
+    activation: str | None = "silu",
+) -> torch.Tensor:
+    out = torch.empty_like(x)
+    state_len = weight.shape[1] - 1
+    assert activation in {None, "silu", "swish"}
+
+    seq_begin_end_idx = [
+        (int(query_start_loc[idx].item()), int(query_start_loc[idx + 1].item()))
+        for idx in range(query_start_loc.shape[0] - 1)
+    ]
+    weight = weight.unsqueeze(1)
+    for seq_idx, (bos, eos) in enumerate(seq_begin_end_idx):
+        slot = int(cache_indices[seq_idx].item())
+
+        seq_x = x[:, bos:eos].unsqueeze(0)
+        if bool(has_initial_state[seq_idx].item()):
+            initial_state = conv_states[slot, :, :state_len].unsqueeze(0)
+        else:
+            initial_state = torch.zeros(
+                1,
+                weight.shape[0],
+                state_len,
+                device=seq_x.device,
+                dtype=seq_x.dtype,
+            )
+
+        conv_input = torch.cat([initial_state, seq_x], dim=-1).to(weight.dtype)
+        seq_out = F.conv1d(
+            conv_input,
+            weight,
+            bias,
+            padding=0,
+            groups=weight.shape[0],
+        )
+        seq_out = seq_out[..., -seq_x.shape[-1] :].to(dtype=x.dtype)
+        if activation in ("silu", "swish"):
+            seq_out = F.silu(seq_out)
+
+        out[:, bos:eos] = seq_out.squeeze(0)
+        conv_states[slot, :, :state_len].copy_(conv_input[..., -state_len:].squeeze(0))
+
+    return out
+
+
+# for decode
+def causal_conv1d_update_torch(
+    x: torch.Tensor,
+    conv_state: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor | None = None,
+    activation: str | None = None,
+) -> torch.Tensor:
+    assert activation in {None, "silu", "swish"}
+
+    _, dim, seq_len = x.shape
+    state_len = conv_state.shape[-1]
+
+    x_new = torch.cat([conv_state, x], dim=-1).to(weight.dtype)
+    conv_state.copy_(x_new[:, :, -state_len:])
+
+    out = F.conv1d(
+        x_new,
+        weight.unsqueeze(1),
+        bias,
+        padding=0,
+        groups=dim,
+    )[:, :, -seq_len:]
+    if activation in ("silu", "swish"):
+        out = F.silu(out)
+    return out
diff --git a/vllm/model_executor/layers/mamba/ops/cpu/gdn_attention.py b/vllm/model_executor/layers/mamba/ops/cpu/gdn_attention.py
new file mode 100644
index 000000000000..8b4122cc487b
--- /dev/null
+++ b/vllm/model_executor/layers/mamba/ops/cpu/gdn_attention.py
@@ -0,0 +1,208 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import torch
+
+from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.model_executor.layers.mamba.mamba_utils import is_conv_state_dim_first
+from vllm.model_executor.layers.mamba.ops.cpu.causal_conv1d import (
+    causal_conv1d_torch,
+    causal_conv1d_update_torch,
+)
+from vllm.model_executor.layers.mamba.ops.cpu.recurrent_gated_delta_rule import (
+    chunk_gated_delta_rule,
+    gdn_gating,
+    recurrent_gated_delta_rule,
+)
+from vllm.utils.torch_utils import (
+    LayerNameType,
+    _resolve_layer_name,
+    direct_register_custom_op,
+)
+from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
+
+_CPU_GDN_ATTENTION_OPS_REGISTERED = False
+
+
+def cpu_gdn_attention_core(
+    mixed_qkv: torch.Tensor,
+    b: torch.Tensor,
+    a: torch.Tensor,
+    core_attn_out: torch.Tensor,
+    layer_name: LayerNameType,
+) -> None:
+    """CPU custom op for the core GDN attention computation."""
+    layer_name = _resolve_layer_name(layer_name)
+    forward_context: ForwardContext = get_forward_context()
+    layer = forward_context.no_compile_layers[layer_name]
+
+    attn_metadata = forward_context.attn_metadata
+
+    if attn_metadata is None:
+        return
+
+    assert isinstance(attn_metadata, dict)
+    attn_metadata_i = attn_metadata[layer.prefix]
+    assert isinstance(attn_metadata_i, GDNAttentionMetadata)
+
+    if attn_metadata_i.num_actual_tokens == 0:
+        return
+
+    assert (
+        attn_metadata_i.spec_sequence_masks is None
+        and attn_metadata_i.num_accepted_tokens is None
+    ), "speculative decode not supported in CPU GDN attention."
+
+    state_indices_tensor = attn_metadata_i.non_spec_state_indices_tensor
+    query_start_loc = attn_metadata_i.non_spec_query_start_loc
+    assert state_indices_tensor is not None
+    assert query_start_loc is not None
+
+    # [num_allocated_slots, conv_dim, kernel - 1]
+    conv_state = layer.kv_cache[0]
+    if not is_conv_state_dim_first():
+        conv_state = conv_state.transpose(-1, -2)
+
+    # [num_allocated_slots, num_v_heads / tp_size, v_dim, k_dim]
+    ssm_state = layer.kv_cache[1]
+
+    num_decodes = attn_metadata_i.num_decodes
+    num_decode_tokens = attn_metadata_i.num_decode_tokens
+    num_prefills = attn_metadata_i.num_prefills
+    num_prefill_tokens = attn_metadata_i.num_prefill_tokens
+
+    conv_weights = layer.conv1d.weight.view(
+        layer.conv1d.weight.size(0), layer.conv1d.weight.size(2)
+    )
+
+    # all decode requests (batched)
+    if num_decodes > 0:
+        decode_mixed_qkv = mixed_qkv[:num_decode_tokens]
+        decode_b = b[:num_decode_tokens]
+        decode_a = a[:num_decode_tokens]
+        decode_state_indices = state_indices_tensor[:num_decodes]
+        decode_conv_state = conv_state[decode_state_indices].contiguous()
+
+        decode_mixed_qkv = causal_conv1d_update_torch(
+            # [B, dim] -> [B, dim, 1]
+            x=decode_mixed_qkv.unsqueeze(-1),
+            conv_state=decode_conv_state,
+            weight=conv_weights,
+            bias=layer.conv1d.bias,
+            activation=layer.activation,
+        ).squeeze(-1)
+        conv_state[decode_state_indices] = decode_conv_state
+
+        query, key, value = layer.rearrange_mixed_qkv(decode_mixed_qkv)
+
+        # [1, L, H, D] -> [B, 1, H, D] for batched decode
+        query = query.transpose(0, 1).contiguous()
+        key = key.transpose(0, 1).contiguous()
+        value = value.transpose(0, 1).contiguous()
+
+        g, beta_output = gdn_gating(
+            A_log=layer.A_log,
+            a=decode_a,
+            b=decode_b,
+            dt_bias=layer.dt_bias,
+        )
+        if g.ndim == 2:
+            g = g.unsqueeze(1)
+            beta_output = beta_output.unsqueeze(1)
+
+        initial_state = ssm_state[decode_state_indices].contiguous()
+        attn_out, last_recurrent_state = recurrent_gated_delta_rule(
+            query=query,
+            key=key,
+            value=value,
+            g=g,
+            beta=beta_output,
+            initial_state=initial_state,
+            scale=None,
+            use_qk_l2norm_in_kernel=True,
+        )
+        ssm_state[decode_state_indices] = last_recurrent_state.to(
+            ssm_state.dtype
+        ).contiguous()
+        core_attn_out[:num_decode_tokens] = attn_out.squeeze(1)
+
+    # all prefill requests: (varlen) currently naively loops over sequences
+    if num_prefills > 0:
+        has_initial_state = attn_metadata_i.has_initial_state
+        assert has_initial_state is not None
+
+        prefill_token_start = num_decode_tokens
+        prefill_token_end = prefill_token_start + num_prefill_tokens
+        prefill_mixed_qkv = mixed_qkv[prefill_token_start:prefill_token_end]
+        prefill_b = b[prefill_token_start:prefill_token_end]
+        prefill_a = a[prefill_token_start:prefill_token_end]
+        prefill_state_indices = state_indices_tensor[
+            num_decodes : num_decodes + num_prefills
+        ]
+        prefill_query_start_loc = (
+            query_start_loc[num_decodes : num_decodes + num_prefills + 1]
+            - num_decode_tokens
+        )
+        prefill_has_initial_state = has_initial_state[
+            num_decodes : num_decodes + num_prefills
+        ]
+
+        prefill_mixed_qkv = causal_conv1d_torch(
+            x=prefill_mixed_qkv.transpose(0, 1),
+            weight=conv_weights,
+            bias=layer.conv1d.bias,
+            conv_states=conv_state,
+            query_start_loc=prefill_query_start_loc,
+            cache_indices=prefill_state_indices,
+            has_initial_state=prefill_has_initial_state,
+            activation=layer.activation,
+        ).transpose(0, 1)
+
+        query, key, value = layer.rearrange_mixed_qkv(prefill_mixed_qkv)
+        g, beta = gdn_gating(layer.A_log, prefill_a, prefill_b, layer.dt_bias)
+        if g.ndim == 2:
+            g = g.unsqueeze(0)
+            beta = beta.unsqueeze(0)
+
+        initial_state = ssm_state[prefill_state_indices].contiguous()
+        initial_state[~prefill_has_initial_state, ...] = 0
+        attn_out, last_recurrent_state = chunk_gated_delta_rule(
+            q=query,
+            k=key,
+            v=value,
+            g=g,
+            beta=beta,
+            scale=None,
+            initial_state=initial_state,
+            cu_seqlens=prefill_query_start_loc,
+            use_qk_l2norm_in_kernel=True,
+        )
+        ssm_state[prefill_state_indices] = last_recurrent_state.to(ssm_state.dtype)
+        core_attn_out[prefill_token_start:prefill_token_end] = attn_out.squeeze(0)
+
+
+def cpu_gdn_attention_core_fake(
+    mixed_qkv: torch.Tensor,
+    b: torch.Tensor,
+    a: torch.Tensor,
+    core_attn_out: torch.Tensor,
+    layer_name: LayerNameType,
+) -> None:
+    """Fake implementation for torch.compile."""
+    return
+
+
+def register_cpu_gdn_attention_ops() -> None:
+    global _CPU_GDN_ATTENTION_OPS_REGISTERED
+    if _CPU_GDN_ATTENTION_OPS_REGISTERED:
+        return
+
+    direct_register_custom_op(
+        op_name="cpu_gdn_attention_core",
+        op_func=cpu_gdn_attention_core,
+        mutates_args=["core_attn_out"],
+        fake_impl=cpu_gdn_attention_core_fake,
+    )
+    _CPU_GDN_ATTENTION_OPS_REGISTERED = True
diff --git a/vllm/model_executor/layers/mamba/ops/cpu/recurrent_gated_delta_rule.py b/vllm/model_executor/layers/mamba/ops/cpu/recurrent_gated_delta_rule.py
new file mode 100644
index 000000000000..30fca3423a38
--- /dev/null
+++ b/vllm/model_executor/layers/mamba/ops/cpu/recurrent_gated_delta_rule.py
@@ -0,0 +1,223 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import torch
+import torch.nn.functional as F
+
+
+def l2norm(
+    x: torch.Tensor,
+    dim: int = -1,
+    eps: float = 1e-6,
+) -> torch.Tensor:
+    inv_norm = torch.rsqrt((x * x).sum(dim=dim, keepdim=True) + eps)
+    return x * inv_norm
+
+
+def recurrent_gated_delta_rule(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    initial_state: torch.Tensor,
+    scale: float | None = None,
+    use_qk_l2norm_in_kernel: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    initial_dtype = query.dtype
+    if use_qk_l2norm_in_kernel:
+        query = l2norm(query, dim=-1, eps=1e-6)
+        key = l2norm(key, dim=-1, eps=1e-6)
+
+    if query.shape[2] != value.shape[2]:
+        repeat_factor = value.shape[2] // query.shape[2]
+        query = query.repeat_interleave(repeat_factor, dim=2)
+        key = key.repeat_interleave(repeat_factor, dim=2)
+
+    query, key, value, beta, g = [
+        x.transpose(1, 2).contiguous().to(torch.float32)
+        for x in (query, key, value, beta, g)
+    ]
+
+    batch_size, num_heads, sequence_length, _ = key.shape
+    v_head_dim = value.shape[-1]
+    if scale is None:
+        scale = 1 / (query.shape[-1] ** 0.5)
+    query = query * scale
+
+    core_attn_out = torch.empty(
+        batch_size,
+        num_heads,
+        sequence_length,
+        v_head_dim,
+        dtype=value.dtype,
+    )
+    last_recurrent_state = initial_state.to(value)
+
+    for token_idx in range(sequence_length):
+        q_t = query[:, :, token_idx]
+        k_t = key[:, :, token_idx]
+        v_t = value[:, :, token_idx]
+        g_t = g[:, :, token_idx].exp().unsqueeze(-1).unsqueeze(-1)
+        beta_t = beta[:, :, token_idx].unsqueeze(-1)
+
+        last_recurrent_state = last_recurrent_state * g_t
+        kv_mem = (last_recurrent_state * k_t.unsqueeze(-2)).sum(dim=-1)
+        delta = (v_t - kv_mem) * beta_t
+        last_recurrent_state = last_recurrent_state + delta.unsqueeze(
+            -1
+        ) * k_t.unsqueeze(-2)
+        core_attn_out[:, :, token_idx] = (last_recurrent_state * q_t.unsqueeze(-2)).sum(
+            dim=-1
+        )
+
+    core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
+    return core_attn_out, last_recurrent_state
+
+
+def gdn_gating(
+    A_log: torch.Tensor,
+    a: torch.Tensor,
+    b: torch.Tensor,
+    dt_bias: torch.Tensor,
+    beta: float = 1.0,
+    threshold: float = 20.0,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    softplus_x = F.softplus(a.float() + dt_bias.float(), beta=beta, threshold=threshold)
+    g = -torch.exp(A_log.float()) * softplus_x
+    beta_output = torch.sigmoid(b.float()).to(dtype=b.dtype)
+    return g, beta_output
+
+
+def chunk_gated_delta_rule(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    *,
+    initial_state: torch.Tensor,
+    scale: float | None = None,
+    cu_seqlens: torch.Tensor,
+    use_qk_l2norm_in_kernel: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    output = torch.empty_like(v)
+    state_dtype = initial_state.dtype
+    chunk_size = 128
+    sequence_bounds = [
+        (
+            seq_idx,
+            int(cu_seqlens[seq_idx].item()),
+            int(cu_seqlens[seq_idx + 1].item()),
+        )
+        for seq_idx in range(len(cu_seqlens) - 1)
+    ]
+    chunk_eye = torch.eye(chunk_size, dtype=torch.float32)
+    num_sequences = len(sequence_bounds)
+    num_value_heads = v.shape[2]
+    value_head_dim = v.shape[3]
+    key_head_dim = k.shape[3]
+    final_state = torch.empty(
+        (num_sequences, num_value_heads, value_head_dim, key_head_dim),
+        dtype=state_dtype,
+    )
+
+    for seq_idx, begin, end in sequence_bounds:
+        q_seq = q[:, begin:end]
+        k_seq = k[:, begin:end]
+        v_seq = v[:, begin:end]
+        g_seq = g[:, begin:end]
+        beta_seq = beta[:, begin:end]
+
+        initial_dtype = q_seq.dtype
+        if use_qk_l2norm_in_kernel:
+            q_seq = l2norm(q_seq, dim=-1, eps=1e-6)
+            k_seq = l2norm(k_seq, dim=-1, eps=1e-6)
+
+        num_qk_heads = q_seq.shape[2]
+        num_value_heads = v_seq.shape[2]
+        if num_qk_heads != num_value_heads:
+            repeat_factor = num_value_heads // num_qk_heads
+            q_seq = q_seq.repeat_interleave(repeat_factor, dim=2)
+            k_seq = k_seq.repeat_interleave(repeat_factor, dim=2)
+
+        q_seq, k_seq, v_seq, beta_seq, g_seq = [
+            x.transpose(1, 2).contiguous().to(torch.float32)
+            for x in (q_seq, k_seq, v_seq, beta_seq, g_seq)
+        ]
+        seq_batch_size, num_heads, seq_len, qk_head_dim = q_seq.shape
+        value_head_dim = v_seq.shape[-1]
+
+        if scale is None:
+            scale = 1 / (qk_head_dim**0.5)
+
+        q_seq = q_seq * scale
+
+        seq_state = initial_state[seq_idx : seq_idx + 1].to(v_seq)
+        seq_output = torch.empty(
+            seq_batch_size,
+            num_heads,
+            seq_len,
+            value_head_dim,
+            dtype=v_seq.dtype,
+        )
+
+        for chunk_start in range(0, seq_len, chunk_size):
+            chunk_end = min(chunk_start + chunk_size, seq_len)
+            q_chunk = q_seq[:, :, chunk_start:chunk_end]
+            k_chunk = k_seq[:, :, chunk_start:chunk_end]
+            v_chunk = v_seq[:, :, chunk_start:chunk_end]
+            beta_chunk = beta_seq[:, :, chunk_start:chunk_end]
+            g_chunk = g_seq[:, :, chunk_start:chunk_end]
+            chunk_len = chunk_end - chunk_start
+
+            cum_g = g_chunk.cumsum(dim=-1)
+            exp_cum_g = cum_g.exp()
+            decay = (cum_g.unsqueeze(-1) - cum_g.unsqueeze(-2)).exp()
+
+            interaction = (k_chunk * beta_chunk.unsqueeze(-1)) @ k_chunk.transpose(
+                -1, -2
+            )
+            interaction = torch.tril(interaction * decay, diagonal=-1)
+            system = interaction + chunk_eye[:chunk_len, :chunk_len]
+
+            solved_values = torch.linalg.solve_triangular(
+                system,
+                v_chunk * beta_chunk.unsqueeze(-1),
+                upper=False,
+            )
+            solved_keys = torch.linalg.solve_triangular(
+                system,
+                (k_chunk * beta_chunk.unsqueeze(-1)) * exp_cum_g.unsqueeze(-1),
+                upper=False,
+            )
+
+            incoming_memory = torch.einsum("bhvk,bhck->bhcv", seq_state, solved_keys)
+            transformed_values = solved_values - incoming_memory
+
+            # Each chunk contributes both from the incoming recurrent state and
+            # from its own in-chunk interactions.
+            inter_chunk = torch.einsum(
+                "bhvk,bhck->bhcv",
+                seq_state,
+                q_chunk * exp_cum_g.unsqueeze(-1),
+            )
+            intra_chunk = torch.tril((q_chunk @ k_chunk.transpose(-1, -2)) * decay)
+            seq_output[:, :, chunk_start:chunk_end] = (
+                inter_chunk + intra_chunk @ transformed_values
+            )
+
+            # Carry the recurrent state forward to the next chunk boundary.
+            end_decay = (cum_g[:, :, -1:] - cum_g).exp().unsqueeze(-1)
+            decayed_keys = k_chunk * end_decay
+            seq_state = seq_state * exp_cum_g[:, :, -1, None, None] + torch.einsum(
+                "bhcv,bhck->bhvk", transformed_values, decayed_keys
+            )
+
+        output[0, begin:end].copy_(
+            seq_output.transpose(1, 2).contiguous().to(initial_dtype).squeeze(0)
+        )
+        final_state[seq_idx].copy_(seq_state.squeeze(0).to(state_dtype).contiguous())
+
+    return output, final_state
diff --git a/vllm/model_executor/layers/mhc.py b/vllm/model_executor/layers/mhc.py
index 96f894c790c3..cbc5ec2962ec 100644
--- a/vllm/model_executor/layers/mhc.py
+++ b/vllm/model_executor/layers/mhc.py
@@ -6,38 +6,23 @@
 
 import torch
 
-from vllm.logger import logger
 from vllm.platforms import current_platform
 from vllm.utils.import_utils import has_tilelang
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import direct_register_custom_op
 
-# tilelang only ships kernels for NVIDIA CUDA targets and the mHC kernels
-# in this file additionally rely on Hopper-only PDL primitives
-# (T.pdl_sync/T.pdl_trigger) and PTXAS register tuning. On non-CUDA
-# platforms (e.g. ROCm), fall back to a torch reference implementation.
-_USE_TILELANG = (
-    TYPE_CHECKING or current_platform.is_cuda()
-) and has_tilelang()
-
-if _USE_TILELANG:
+# tilelang is only available on CUDA platforms
+if TYPE_CHECKING or current_platform.is_cuda_alike():
+    if not has_tilelang():
+        raise ImportError(
+            "tilelang is required for mhc but is not installed. Install it with "
+            "`pip install tilelang`."
+        )
     import tilelang
     import tilelang.language as T
 else:
     tilelang = None  # type: ignore[assignment]
     T = None  # type: ignore[assignment]
-    if current_platform.is_cuda() and not has_tilelang():
-        # Preserve the previous CUDA-only requirement: tilelang is the
-        # canonical fast path on NVIDIA. Surface the missing dependency
-        # loudly there so users do not silently fall onto the slow path.
-        raise ImportError(
-            "tilelang is required for mhc but is not installed. Install it with "
-            "`pip install tilelang`."
-        )
-    logger.info_once(
-        "tilelang is unavailable on this platform; using torch reference "
-        "implementation for DeepSeek-V4 mHC pre/post blocks."
-    )
 
 
 @cache
@@ -53,27 +38,12 @@ def compute_num_split(block_k: int, k: int | None, grid_size: int) -> int:
     return split_k
 
 
-def _tilelang_jit(*args, **kwargs):
-    """Decorator that becomes a no-op when tilelang is unavailable."""
-    if _USE_TILELANG:
-        return tilelang.jit(*args, **kwargs)
-
-    def _decorator(fn):
-        return fn
-
-    return _decorator
-
-
-@_tilelang_jit(
-    pass_configs=(
-        {
-            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
-            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
-            tilelang.PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL: 10,
-        }
-        if _USE_TILELANG
-        else {}
-    ),
+@tilelang.jit(
+    pass_configs={
+        tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+        tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+        tilelang.PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL: 10,
+    },
 )
 def mhc_pre_big_fuse_tilelang(
     gemm_out_mul,
@@ -208,74 +178,6 @@ def mhc_pre_big_fuse_tilelang(
         T.pdl_trigger()
 
 
-def _mhc_pre_torch(
-    residual: torch.Tensor,
-    fn: torch.Tensor,
-    hc_scale: torch.Tensor,
-    hc_base: torch.Tensor,
-    rms_eps: float,
-    hc_pre_eps: float,
-    hc_sinkhorn_eps: float,
-    hc_post_mult_value: float,
-    sinkhorn_repeat: int,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Pure-torch reference for ``mhc_pre``.
-
-    Mirrors ``mhc_pre_big_fuse_tilelang`` (RMS-norm scaling of a fused
-    GEMM, then sigmoid+bias for pre/post mixes, softmax+Sinkhorn for the
-    comb mix, and a residual-blend to produce ``layer_input``). Used on
-    platforms without a working tilelang/DeepGEMM stack (e.g. ROCm).
-    """
-    hc_mult = residual.shape[-2]
-    hidden_size = residual.shape[-1]
-    hc_mult2 = hc_mult * hc_mult
-    hc_mult3 = hc_mult * 2 + hc_mult2
-    outer_shape = residual.shape[:-2]
-
-    residual_flat = residual.reshape(-1, hc_mult, hidden_size)
-    num_tokens = residual_flat.shape[0]
-
-    x_f32 = residual_flat.reshape(num_tokens, hc_mult * hidden_size).to(
-        torch.float32
-    )
-    mixes = torch.matmul(x_f32, fn.t())
-    sqrsum = x_f32.square().sum(dim=-1)
-
-    rms = torch.rsqrt(sqrsum / (hc_mult * hidden_size) + rms_eps)
-    mixes = mixes * rms.unsqueeze(-1)
-
-    pre_part = mixes[:, :hc_mult]
-    post_part = mixes[:, hc_mult : 2 * hc_mult]
-    comb_part = mixes[:, 2 * hc_mult :].reshape(num_tokens, hc_mult, hc_mult)
-
-    post_base = hc_base[hc_mult : 2 * hc_mult]
-    post_mix = (
-        torch.sigmoid(post_part * hc_scale[1] + post_base) * hc_post_mult_value
-    )
-
-    comb_base = hc_base[2 * hc_mult :].reshape(hc_mult, hc_mult)
-    cm = comb_part * hc_scale[2] + comb_base
-    cm = torch.softmax(cm, dim=-1) + hc_sinkhorn_eps
-    cm = cm / (cm.sum(dim=-2, keepdim=True) + hc_sinkhorn_eps)
-    for _ in range(max(0, sinkhorn_repeat - 1)):
-        cm = cm / (cm.sum(dim=-1, keepdim=True) + hc_sinkhorn_eps)
-        cm = cm / (cm.sum(dim=-2, keepdim=True) + hc_sinkhorn_eps)
-    comb_mix_flat = cm.reshape(num_tokens, hc_mult2)
-
-    pre_base = hc_base[:hc_mult]
-    pre_mix = torch.sigmoid(pre_part * hc_scale[0] + pre_base) + hc_pre_eps
-
-    layer_input_f32 = torch.einsum(
-        "bn,bnh->bh", pre_mix, residual_flat.to(torch.float32)
-    )
-    layer_input = layer_input_f32.to(torch.bfloat16)
-
-    post_mix = post_mix.view(*outer_shape, hc_mult, 1)
-    comb_mix = comb_mix_flat.view(*outer_shape, hc_mult, hc_mult)
-    layer_input = layer_input.view(*outer_shape, hidden_size)
-    return post_mix, comb_mix, layer_input
-
-
 def mhc_pre(
     residual: torch.Tensor,
     fn: torch.Tensor,
@@ -326,25 +228,45 @@ def mhc_pre(
     assert hc_scale.shape == (3,)
     assert hc_base.shape == (hc_mult3,)
 
-    if not _USE_TILELANG:
-        return _mhc_pre_torch(
-            residual,
-            fn,
-            hc_scale,
-            hc_base,
-            rms_eps,
-            hc_pre_eps,
-            hc_sinkhorn_eps,
-            hc_post_mult_value,
-            sinkhorn_repeat,
-        )
-
     outer_shape = residual.shape[:-2]
 
     residual_flat = residual.view(-1, hc_mult, hidden_size)
     num_tokens = residual_flat.shape[0]
     fn_flat = fn
 
+    if current_platform.is_rocm():
+        x = residual_flat.view(num_tokens, hc_mult * hidden_size).to(torch.float32)
+        mixes = torch.matmul(x, fn_flat.t())
+        sqrsum = x.square().sum(dim=-1, keepdim=True)
+        mixes = mixes * torch.rsqrt(sqrsum / (hc_mult * hidden_size) + rms_eps)
+
+        pre_logits = mixes[:, :hc_mult] * hc_scale[0] + hc_base[:hc_mult]
+        pre_mix = torch.sigmoid(pre_logits) + hc_pre_eps
+
+        post_logits = (
+            mixes[:, hc_mult : 2 * hc_mult] * hc_scale[1]
+            + hc_base[hc_mult : 2 * hc_mult]
+        )
+        post_mix = torch.sigmoid(post_logits) * hc_post_mult_value
+
+        comb_logits = mixes[:, 2 * hc_mult :].view(
+            num_tokens, hc_mult, hc_mult
+        ) * hc_scale[2] + hc_base[2 * hc_mult :].view(1, hc_mult, hc_mult)
+        comb_mix = torch.softmax(comb_logits, dim=-1) + hc_sinkhorn_eps
+        comb_mix = comb_mix / (comb_mix.sum(dim=-2, keepdim=True) + hc_sinkhorn_eps)
+        for _ in range(sinkhorn_repeat - 1):
+            comb_mix = comb_mix / (comb_mix.sum(dim=-1, keepdim=True) + hc_sinkhorn_eps)
+            comb_mix = comb_mix / (comb_mix.sum(dim=-2, keepdim=True) + hc_sinkhorn_eps)
+
+        layer_input = torch.sum(
+            pre_mix.unsqueeze(-1) * residual_flat.to(torch.float32), dim=1
+        ).to(torch.bfloat16)
+        return (
+            post_mix.view(*outer_shape, hc_mult, 1),
+            comb_mix.view(*outer_shape, hc_mult, hc_mult),
+            layer_input.view(*outer_shape, hidden_size),
+        )
+
     # these number are from deepgemm kernel impl
     block_k = 64
     block_m = 64
@@ -460,16 +382,12 @@ def _mhc_pre_fake(
     return post_mix, comb_mix, layer_input
 
 
-@_tilelang_jit(
-    pass_configs=(
-        {
-            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
-            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
-            tilelang.PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL: 10,
-        }
-        if _USE_TILELANG
-        else {}
-    ),
+@tilelang.jit(
+    pass_configs={
+        tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+        tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+        tilelang.PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL: 10,
+    },
 )
 def mhc_post_tilelang(
     a,
@@ -481,7 +399,7 @@ def mhc_post_tilelang(
     hidden: int,
     n_thr: int = 128,
     h_blk: int = 1024,
-):
+) -> tilelang.JITKernel:
     # rename for shorter code
     n = T.dynamic("num_tokens")
     h = hidden
@@ -523,37 +441,20 @@ def mhc_post_tilelang(
         T.pdl_trigger()
 
 
-def _mhc_post_torch(
-    x: torch.Tensor,
-    residual: torch.Tensor,
-    post_layer_mix: torch.Tensor,
-    comb_res_mix: torch.Tensor,
-) -> torch.Tensor:
-    """Pure-torch reference for ``mhc_post``.
-
-    Mirrors ``mhc_post_tilelang``:
-        out[..., i_hco, h] = post_layer_mix[..., i_hco, 0] * x[..., h]
-                           + sum_{i_hci}(comb_res_mix[..., i_hci, i_hco]
-                                         * residual[..., i_hci, h])
-
-    Equivalently: ``post * x + comb.transpose(-1,-2) @ residual``.
-    """
-    x_f32 = x.to(torch.float32).unsqueeze(-2)
-    residual_f32 = residual.to(torch.float32)
-    term1 = post_layer_mix * x_f32
-    term2 = torch.matmul(comb_res_mix.transpose(-1, -2), residual_f32)
-    return (term1 + term2).to(torch.bfloat16)
-
-
 def mhc_post(
     x: torch.Tensor,
     residual: torch.Tensor,
     post_layer_mix: torch.Tensor,
     comb_res_mix: torch.Tensor,
 ) -> torch.Tensor:
-    if not _USE_TILELANG:
-        return _mhc_post_torch(x, residual, post_layer_mix, comb_res_mix)
-
+    if current_platform.is_rocm():
+        mixed_residual = torch.einsum(
+            "...ij,...ih->...jh",
+            comb_res_mix.to(torch.float32),
+            residual.to(torch.float32),
+        )
+        post_term = post_layer_mix.to(torch.float32) * x.unsqueeze(-2).to(torch.float32)
+        return (mixed_residual + post_term).to(residual.dtype)
     out = torch.empty_like(residual)
     mhc_post_tilelang(
         comb_res_mix,
@@ -588,3 +489,199 @@ def _mhc_post_fake(
     mutates_args=[],
     fake_impl=_mhc_post_fake,
 )
+
+
+@tilelang.jit(
+    pass_configs={
+        tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+        tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+        tilelang.PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL: 10,
+    },
+)
+def hc_head_fuse_tilelang(
+    residual,
+    fn,
+    hc_scale,
+    hc_base,
+    out,
+    hidden_size: int,
+    rms_eps: float,
+    hc_eps: float,
+    hc_mult: int = 4,
+    n_thr: int = 128,
+    h_blk: int = 1024,
+):
+    """Two-pass fused kernel for hc_head.
+
+    Pass 1: accumulate per-token squared sum and hc_mult dot-products
+            (projections onto fn rows) using cross-thread reducers.
+    Pass 2: apply sigmoid-gated weighted sum of residual channels to output.
+
+    Avoids materialising mixes / rsqrt / pre tensors to global memory.
+    """
+    num_tokens = T.dynamic("num_tokens")
+    hc_dim = hc_mult * hidden_size
+    h_block = math.gcd(h_blk, hidden_size)
+    n_h = hidden_size // h_block
+
+    residual: T.Tensor[[num_tokens, hc_mult, hidden_size], T.bfloat16]  # type: ignore[no-redef,valid-type]
+    fn: T.Tensor[[hc_mult, hc_dim], T.float32]  # type: ignore[no-redef,valid-type]
+    hc_scale: T.Tensor[[1], T.float32]  # type: ignore[no-redef,valid-type]
+    hc_base: T.Tensor[[hc_mult], T.float32]  # type: ignore[no-redef,valid-type]
+    out: T.Tensor[[num_tokens, hidden_size], T.bfloat16]  # type: ignore[no-redef,valid-type]
+
+    with T.Kernel(num_tokens, threads=n_thr) as i:
+        T.pdl_sync()
+
+        # ------------------------------------------------------------------
+        # Pass 1 – for each residual channel m_c and h_block:
+        #   • accumulate squared sum (for RMS norm denominator)
+        #   • accumulate hc_mult dot-products with fn rows
+        # ------------------------------------------------------------------
+        sqrsum_r = T.alloc_reducer((1,), T.float32, replication="all")
+        mixes_r = T.alloc_reducer((hc_mult,), T.float32, replication="all")
+        T.fill(sqrsum_r, 0.0)
+        T.fill(mixes_r, 0.0)
+
+        for m_c in T.serial(hc_mult):
+            for i_h in T.serial(n_h):
+                x_local = T.alloc_fragment(h_block, T.float32)
+                T.copy(residual[i, m_c, i_h * h_block], x_local)
+
+                for k in T.Parallel(h_block):
+                    sqrsum_r[0] += x_local[k] * x_local[k]
+
+                for m_m in T.unroll(hc_mult):
+                    fn_local = T.alloc_fragment(h_block, T.float32)
+                    T.copy(fn[m_m, m_c * hidden_size + i_h * h_block], fn_local)
+                    for k in T.Parallel(h_block):
+                        mixes_r[m_m] += x_local[k] * fn_local[k]
+
+        T.finalize_reducer(sqrsum_r)
+        T.finalize_reducer(mixes_r)
+
+        # ------------------------------------------------------------------
+        # Compute pre_mix = sigmoid(mix * rsqrt * scale + base) + eps
+        # ------------------------------------------------------------------
+        pre_mix_shared = T.alloc_shared(hc_mult, T.float32)
+        rsqrt_val = T.alloc_fragment(1, T.float32)
+        rsqrt_val[0] = T.rsqrt(sqrsum_r[0] / hc_dim + rms_eps)
+        for m in T.Parallel(hc_mult):
+            pre_mix_shared[m] = (
+                T.sigmoid(mixes_r[m] * rsqrt_val[0] * hc_scale[0] + hc_base[m]) + hc_eps
+            )
+
+        # ------------------------------------------------------------------
+        # Pass 2 – apply_mix: pipelined weighted sum over residual channels
+        # ------------------------------------------------------------------
+        for i0_h in T.Pipelined(n_h, num_stages=2):
+            xs = T.alloc_shared((hc_mult, h_block), T.bfloat16)
+            xl = T.alloc_fragment((hc_mult, h_block), T.float32)
+            T.copy(residual[i, 0, i0_h * h_block], xs, disable_tma=True)
+            T.copy(xs, xl)
+
+            ol = T.alloc_fragment(h_block, T.float32)
+            T.clear(ol)
+            for i_hc in T.serial(hc_mult):
+                pre = pre_mix_shared[i_hc]
+                for i1_h in T.Parallel(h_block):
+                    ol[i1_h] += pre * xl[i_hc, i1_h]
+
+            T.copy(ol, out[i, i0_h * h_block], disable_tma=True)
+
+        T.pdl_trigger()
+
+
+def _hc_head_fused_reference(
+    hs_flat: torch.Tensor,
+    fn: torch.Tensor,
+    hc_scale: torch.Tensor,
+    hc_base: torch.Tensor,
+    out: torch.Tensor,
+    hidden_size: int,
+    rms_eps: float,
+    hc_eps: float,
+    hc_mult: int,
+) -> None:
+    """Pure-PyTorch reference for `hc_head_fuse_tilelang`.
+
+    Used on platforms where the tilelang HIP/CUDA backend is not available
+    (e.g. ROCm builds shipping a tilelang wheel without `target.build.tilelang_hip`).
+    Mirrors the math of the tilelang kernel exactly:
+
+        x      = hs_flat.flatten(-2, -1)                # (T, hc_mult * H), fp32
+        mixes  = x @ fn.T                               # (T, hc_mult)
+        rsqrt  = 1 / sqrt(||x||^2 / (hc_mult * H) + rms_eps)
+        pre[m] = sigmoid(mixes[m] * rsqrt * hc_scale[0] + hc_base[m]) + hc_eps
+        out    = sum_m pre[m] * hs_flat[:, m, :]        # cast back to bf16
+
+    `out` is mutated in place to keep the same op contract
+    (`mutates_args=["out"]`).
+    """
+    num_tokens = hs_flat.shape[0]
+    if num_tokens == 0:
+        return
+    x = hs_flat.reshape(num_tokens, hc_mult * hidden_size).to(torch.float32)
+    # fn: (hc_mult, hc_mult * hidden_size) → mixes: (T, hc_mult)
+    mixes = torch.matmul(x, fn.t())
+    sqrsum = x.square().sum(dim=-1, keepdim=True)
+    rsqrt = torch.rsqrt(sqrsum / (hc_mult * hidden_size) + rms_eps)
+    # hc_scale has shape (1,); hc_base has shape (hc_mult,)
+    pre_mix = torch.sigmoid(mixes * rsqrt * hc_scale[0] + hc_base) + hc_eps
+    # weighted sum over the hc_mult channel dim
+    result = torch.sum(pre_mix.unsqueeze(-1) * hs_flat.to(torch.float32), dim=1).to(
+        out.dtype
+    )
+    out.copy_(result)
+
+
+def _hc_head_fused_kernel(
+    hs_flat: torch.Tensor,
+    fn: torch.Tensor,
+    hc_scale: torch.Tensor,
+    hc_base: torch.Tensor,
+    out: torch.Tensor,
+    hidden_size: int,
+    rms_eps: float,
+    hc_eps: float,
+    hc_mult: int,
+) -> None:
+    """Fill pre-allocated `out` (T, H) in-place with the hc_head result."""
+    if hs_flat.shape[0] == 0:
+        return
+    if current_platform.is_rocm():
+        # tilelang ships only the CUDA codegen in upstream wheels, so the HIP
+        # FFI target (`target.build.tilelang_hip`) is missing and the JIT call
+        # would raise `ValueError: Cannot find global function ...`. Use a
+        # numerically equivalent torch fallback instead. `mhc_pre` and
+        # `mhc_post` already follow this same pattern above.
+        _hc_head_fused_reference(
+            hs_flat,
+            fn,
+            hc_scale,
+            hc_base,
+            out,
+            hidden_size,
+            rms_eps,
+            hc_eps,
+            hc_mult,
+        )
+        return
+    hc_head_fuse_tilelang(
+        hs_flat,
+        fn,
+        hc_scale,
+        hc_base,
+        out,
+        hidden_size,
+        rms_eps,
+        hc_eps,
+        hc_mult,
+    )
+
+
+direct_register_custom_op(
+    op_name="hc_head_fused_kernel",
+    op_func=_hc_head_fused_kernel,
+    mutates_args=["out"],
+)
diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py
index 1d3e987b7e17..856f6bb8a3cf 100644
--- a/vllm/model_executor/layers/mla.py
+++ b/vllm/model_executor/layers/mla.py
@@ -64,6 +64,7 @@ def __init__(
         cache_config: CacheConfig | None = None,
         quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        skip_topk: bool = False,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -87,6 +88,11 @@ def __init__(
         self.indexer_rope_emb = mla_modules.indexer_rotary_emb
         self.is_sparse = mla_modules.is_sparse
 
+        # Whether to skip top-k token selection computation in this layer.
+        # When True, the indexer will not be called, and the layer will reuse
+        # the topk_tokens buffer written by a previous layer in the same pass.
+        # Refer: https://arxiv.org/abs/2603.12201 for more details.
+        self.skip_topk = skip_topk
         if self.indexer is not None:
             assert hasattr(self.indexer, "topk_tokens")
             self.topk_tokens = self.indexer.topk_tokens
@@ -159,10 +165,8 @@ def forward(
                 positions, q[..., self.qk_nope_head_dim :], k_pe
             )
 
-        if self.indexer and self.is_sparse:
-            _topk_indices = self.indexer(
-                hidden_states, q_c, positions, self.indexer_rope_emb
-            )
+        if self.indexer and self.is_sparse and not self.skip_topk:
+            self.indexer(hidden_states, q_c, positions, self.indexer_rope_emb)
 
         if llama_4_scaling is not None:
             q *= llama_4_scaling
diff --git a/vllm/model_executor/layers/pooler/seqwise/methods.py b/vllm/model_executor/layers/pooler/seqwise/methods.py
index b967ff4ede7b..82170b5fbdc4 100644
--- a/vllm/model_executor/layers/pooler/seqwise/methods.py
+++ b/vllm/model_executor/layers/pooler/seqwise/methods.py
@@ -68,21 +68,23 @@ def forward(
             "partial prefill not supported with MEAN pooling"
         )
 
-        prompt_lens = pooling_cursor.prompt_lens_cpu.to(
-            hidden_states.device, dtype=torch.int64, non_blocking=True
-        )
-
-        num_seqs = prompt_lens.numel()
+        prompt_lens_cpu = pooling_cursor.prompt_lens_cpu
+        num_seqs = prompt_lens_cpu.numel()
         hidden_size = hidden_states.shape[-1]
 
         if num_seqs == 0:
             # early return for empty batch
             return hidden_states.new_empty((0, hidden_size), dtype=torch.float32)
 
-        # eg. [2, 1, 3] -> [0, 0, 1, 2, 2, 2]
+        # Build segment_ids on CPU so repeat_interleave doesn't need to sync
+        # GPU->CPU to learn its data-dependent output length, then upload
+        # non-blocking. eg. [2, 1, 3] -> [0, 0, 1, 2, 2, 2]
         segment_ids = torch.repeat_interleave(
-            torch.arange(num_seqs, device=hidden_states.device, dtype=torch.long),
-            prompt_lens,
+            torch.arange(num_seqs, dtype=torch.long),
+            prompt_lens_cpu,
+        ).to(hidden_states.device, non_blocking=True)
+        prompt_lens = prompt_lens_cpu.to(
+            hidden_states.device, dtype=torch.int64, non_blocking=True
         )
         segment_sums = torch.zeros(
             (num_seqs, hidden_size),
diff --git a/vllm/model_executor/layers/pooler/special.py b/vllm/model_executor/layers/pooler/special.py
index d06663b5b947..ae5926cd62ff 100644
--- a/vllm/model_executor/layers/pooler/special.py
+++ b/vllm/model_executor/layers/pooler/special.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import dataclasses
 from collections.abc import Mapping, Set
 from itertools import groupby
 
@@ -80,9 +81,11 @@ def forward(
         pooling_metadata: PoolingMetadata,
     ) -> PoolerOutput:
         poolers_by_task = self.poolers_by_task
+        cursor = pooling_metadata.pooling_cursor
 
         outputs = list[torch.Tensor | None]()
         offset = 0
+        token_offset = 0
         for task, group in groupby(pooling_metadata.tasks):
             if not (pooler := poolers_by_task.get(task)):
                 raise ValueError(
@@ -91,10 +94,37 @@ def forward(
                 )
 
             num_items = len(list(group))
-            group_output: PoolerOutput = pooler(
-                hidden_states,
-                pooling_metadata[offset : offset + num_items],
-            )
+            group_metadata = pooling_metadata[offset : offset + num_items]
+            if cursor is None:
+                group_hidden_states = hidden_states
+            else:
+                # Slice out this group's tokens so sub-poolers see only their
+                # portion of the batch. Token offset is computed from the CPU
+                # `num_scheduled_tokens_cpu` to avoid a GPU->CPU sync.
+                group_cursor = group_metadata.pooling_cursor
+                num_group_tokens = int(group_cursor.num_scheduled_tokens_cpu.sum())
+                group_hidden_states = hidden_states[
+                    token_offset : token_offset + num_group_tokens
+                ]
+                if token_offset:
+                    # Shift first/last indices to be relative to the slice
+                    # so seqwise poolers (which index `hidden_states` directly)
+                    # remain correct.
+                    pooling_cursor = dataclasses.replace(
+                        group_cursor,
+                        first_token_indices_gpu=(
+                            group_cursor.first_token_indices_gpu - token_offset
+                        ),
+                        last_token_indices_gpu=(
+                            group_cursor.last_token_indices_gpu - token_offset
+                        ),
+                    )
+                    group_metadata = dataclasses.replace(
+                        group_metadata, pooling_cursor=pooling_cursor
+                    )
+                token_offset += num_group_tokens
+
+            group_output: PoolerOutput = pooler(group_hidden_states, group_metadata)
 
             outputs.extend(group_output)
             offset += num_items
diff --git a/vllm/model_executor/layers/pooler/tokwise/methods.py b/vllm/model_executor/layers/pooler/tokwise/methods.py
index 9ee6e8527c9a..59b7234661b5 100644
--- a/vllm/model_executor/layers/pooler/tokwise/methods.py
+++ b/vllm/model_executor/layers/pooler/tokwise/methods.py
@@ -47,13 +47,12 @@ def forward(
         pooling_metadata: PoolingMetadata,
     ) -> list[TokenPoolingMethodOutputItem]:
         pooling_cursor = pooling_metadata.get_pooling_cursor()
-        hidden_states_lst = [
-            hidden_states[first : last + 1]
-            for first, last in zip(
-                pooling_cursor.first_token_indices_gpu.tolist(),
-                pooling_cursor.last_token_indices_gpu.tolist(),
-            )
-        ]
+        # Use the already-CPU num_scheduled_tokens tensor so `.tolist()`
+        # doesn't trigger a GPU->CPU sync. torch.split produces the same
+        # consecutive slices as indexing with first/last per-sequence indices.
+        hidden_states_lst = list(
+            torch.split(hidden_states, pooling_cursor.num_scheduled_tokens_cpu.tolist())
+        )
 
         if not self.enable_chunked_prefill:
             return hidden_states_lst
@@ -91,12 +90,14 @@ def forward(
         pooling_metadata: PoolingMetadata,
     ) -> list[TokenPoolingMethodOutputItem]:
         pooled_data_lst = super().forward(hidden_states, pooling_metadata)
-        prompt_token_ids = pooling_metadata.get_prompt_token_ids()
+        # Use the CPU copy of prompt_token_ids so the step_tag_id mask can be
+        # resolved to indices without a d2h sync from boolean indexing.
+        prompt_token_ids_cpu = pooling_metadata.get_prompt_token_ids_cpu()
         pooling_params = pooling_metadata.pooling_params
 
         pooled_data = list[torch.Tensor | None]()
-        for data, token_id, pooling_param in zip(
-            pooled_data_lst, prompt_token_ids, pooling_params
+        for data, token_id_cpu, pooling_param in zip(
+            pooled_data_lst, prompt_token_ids_cpu, pooling_params
         ):
             # for unfinished chunked prefill
             if data is None:
@@ -109,9 +110,11 @@ def forward(
                     data = data[:, returned_token_ids]
 
                 if step_tag_id is not None:
-                    data = data[token_id == step_tag_id]
+                    idx_cpu = (token_id_cpu == step_tag_id).nonzero(as_tuple=True)[0]
+                    idx = idx_cpu.to(data.device, non_blocking=True)
+                    data = data[idx]
 
-            pooled_data.append(data)
+                pooled_data.append(data)
 
         return pooled_data
 
diff --git a/vllm/model_executor/layers/quantization/humming.py b/vllm/model_executor/layers/quantization/humming.py
index 59f9c2ee9b97..79a1057c6003 100644
--- a/vllm/model_executor/layers/quantization/humming.py
+++ b/vllm/model_executor/layers/quantization/humming.py
@@ -9,11 +9,9 @@
 import torch
 
 from vllm import envs
-from vllm.model_executor.layers.fused_moe.activation import MoEActivation
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
-    FusedMoEQuantDesc,
 )
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE,
@@ -32,7 +30,6 @@
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.parameter import (
     BasevLLMParameter,
@@ -215,6 +212,15 @@ def from_config(cls, config: dict[str, Any]) -> "HummingConfig":
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant, hf_config=None
     ) -> QuantizationMethods | None:
+        if user_quant == "humming" and hf_config is not None:
+            model_type = hf_config.model_type
+            quant_method = hf_quant_cfg.get("quant_method", None)
+            if model_type == "gpt_oss" and quant_method == "mxfp4":
+                msg = (
+                    "For gpt-oss model, use '--moe-backend humming' "
+                    "instead of '--quantization humming'."
+                )
+                raise ValueError(msg)
         return "humming" if user_quant == "humming" else None
 
     def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
@@ -299,8 +305,6 @@ def get_quant_config_for_layer(
                 force_weight_schema = schema
 
         if weight_schema is not None:
-            if weight_schema.quant_method == "gpt_oss_mxfp4" and layer_type != "moe":
-                return None
             input_schema = None
             force_input_schema = None
 
@@ -335,12 +339,6 @@ def get_quant_method(
         elif isinstance(layer, LinearBase):
             layer_type = "linear"
 
-        # TODO: remove this after humming moe backend is ready
-        quant_method = self.full_config.get("quant_method", None)
-        moe_activation = getattr(layer, "activation", None)
-        if quant_method == "mxfp4" and moe_activation == MoEActivation.SWIGLUOAI:
-            self.full_config["quan_method"] = "gpt_oss_mxfp4"
-
         quant_config = self.get_quant_config_for_layer(prefix, layer_type)
         if quant_config is None:
             if isinstance(layer, FusedMoE):
@@ -760,62 +758,18 @@ def create_weights(
         layer.register_buffer("locks", locks)
 
     def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig:
-        self.process_weights_after_loading(layer)
-
-        input_schema = self.input_schemas["w13"]
-        weight_schema = self.weight_schemas["w13"]
-
-        a_dtype = input_schema.a_dtype
-        if a_dtype is None or a_dtype.num_bits == 16:
-            a_quant_desc = FusedMoEQuantDesc(dtype=None)
-        else:
-            shape = GroupShape(row=1, col=-1)
-            a_quant_desc = FusedMoEQuantDesc(dtype=str(a_dtype), shape=shape)
-
-        weight_scale_group_size = weight_schema.weight_scale_group_size
-        weight_scale_group_size_n = weight_schema.weight_scale_group_size_n
-        weight_group_shape: tuple[int, ...] = ()
-        if weight_scale_group_size_n > 1:
-            weight_group_shape = GroupShape(
-                row=weight_scale_group_size,
-                col=weight_scale_group_size_n,
-            )
-        elif weight_scale_group_size == 0:
-            weight_group_shape = GroupShape(row=-1, col=1)
-        else:
-            weight_group_shape = GroupShape(row=weight_scale_group_size, col=1)
-
-        w1_quant_desc = FusedMoEQuantDesc(
-            dtype=str(weight_schema.b_dtype),
-            shape=weight_group_shape,
-            scale=getattr(layer, "w13_weight_scale", None),
-            alpha_or_gscale=getattr(layer, "w13_global_scale", None),
-            zp=getattr(layer, "w13_zero_point", None),
-            bias=getattr(layer, "w13_bias", None),
-        )
-
-        w2_quant_desc = FusedMoEQuantDesc(
-            dtype=str(weight_schema.b_dtype),
-            shape=weight_group_shape,
-            scale=getattr(layer, "w2_weight_scale", None),
-            alpha_or_gscale=getattr(layer, "w2_global_scale", None),
-            zp=getattr(layer, "w2_zero_point", None),
-            bias=getattr(layer, "w2_bias", None),
+        from vllm.model_executor.layers.quantization.utils.humming_utils import (
+            get_humming_moe_quant_config,
         )
 
-        return FusedMoEQuantConfig(
-            _a1=a_quant_desc,
-            _a2=a_quant_desc,
-            _w1=w1_quant_desc,
-            _w2=w2_quant_desc,
-        )
+        return get_humming_moe_quant_config(layer)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if getattr(self, "processed", False):
             return
         self.processed = True
-        self.weight_schemas = {}
-        self.input_schemas = {}
+        layer.weight_schemas = {}
+        layer.input_schemas = {}
         for sublayer_name, configs in layer.sublayer_configs.items():
             input_schema = self.input_schema
             weight_schema = self.weight_schema
@@ -858,8 +812,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                     param = torch.nn.Parameter(tensor, requires_grad=False)
                     setattr(layer, name, param)
 
-                self.weight_schemas[sublayer_name] = weight_schema
-                self.input_schemas[sublayer_name] = input_schema
+                layer.weight_schemas[sublayer_name] = weight_schema
+                layer.input_schemas[sublayer_name] = input_schema
 
             # force requant (origin quant setting -> fp16/bf16 -> new_quant setting)
             assert isinstance(weight_schema, HummingWeightSchema)
@@ -913,10 +867,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
 
         # use moe modular
         experts: HummingIndexedExperts | HummingGroupedExperts
+        assert self.moe_quant_config is not None
         if get_humming_moe_gemm_type() == "indexed":
-            experts = HummingIndexedExperts(layer, self)
+            experts = HummingIndexedExperts(layer, self.moe, self.moe_quant_config)
         else:
-            experts = HummingGroupedExperts(layer, self)
+            experts = HummingGroupedExperts(layer, self.moe, self.moe_quant_config)
         self.experts = experts
 
     def select_gemm_impl(
@@ -927,12 +882,19 @@ def select_gemm_impl(
         from vllm.model_executor.layers.fused_moe import modular_kernel as mk
 
         activation_format = prepare_finalize.activation_format
+        assert self.moe_quant_config is not None
         if activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
-            return BatchedHummingGroupedExperts(layer, self, prepare_finalize)
+            return BatchedHummingGroupedExperts(
+                layer=layer,
+                moe_config=self.moe,
+                quant_config=self.moe_quant_config,
+                max_num_tokens=prepare_finalize.max_num_tokens_per_rank(),
+                num_dispatchers=prepare_finalize.num_dispatchers(),
+            )
         elif get_humming_moe_gemm_type() == "indexed":
-            return HummingIndexedExperts(layer, self, prepare_finalize)
+            return HummingIndexedExperts(layer, self.moe, self.moe_quant_config)
         else:
-            return HummingGroupedExperts(layer, self, prepare_finalize)
+            return HummingGroupedExperts(layer, self.moe, self.moe_quant_config)
 
     def apply(
         self,
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 242cc105e470..0862efbea294 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -114,12 +114,12 @@
     # MIXED_PRECISION,
     "MIXED_PRECISION",
 ]
-KV_CACHE_QUANT_ALGOS = ["FP8"]
+KV_CACHE_QUANT_ALGOS = ["FP8", "NVFP4"]
 
 
-class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
+class ModelOptKVCacheMethod(BaseKVCacheMethod):
     """
-    Supports loading kv-cache scaling factors from FP8 checkpoints.
+    Supports loading kv-cache scaling factors from FP8 or NVFP4 checkpoints.
     """
 
     def __init__(self, quant_config: "ModelOptQuantConfigBase"):
@@ -995,7 +995,7 @@ def apply(
 
 ModelOptFp8Config.LinearMethodCls = ModelOptFp8LinearMethod
 ModelOptFp8Config.FusedMoEMethodCls = ModelOptFp8MoEMethod
-ModelOptFp8Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod
+ModelOptFp8Config.KVCacheMethodCls = ModelOptKVCacheMethod
 
 
 class ModelOptNvFp4Config(ModelOptQuantConfigBase):
@@ -1488,7 +1488,7 @@ def apply(
 
 ModelOptNvFp4Config.LinearMethodCls = ModelOptNvFp4LinearMethod
 ModelOptNvFp4Config.FusedMoEMethodCls = ModelOptNvFp4FusedMoE
-ModelOptNvFp4Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod
+ModelOptNvFp4Config.KVCacheMethodCls = ModelOptKVCacheMethod
 
 
 class ModelOptMxFp8Config(ModelOptQuantConfigBase):
@@ -2018,7 +2018,7 @@ def apply(
 # Register the method classes for ModelOptMxFp8Config
 ModelOptMxFp8Config.LinearMethodCls = ModelOptMxFp8LinearMethod
 ModelOptMxFp8Config.FusedMoEMethodCls = ModelOptMxFp8FusedMoE
-ModelOptMxFp8Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod
+ModelOptMxFp8Config.KVCacheMethodCls = ModelOptKVCacheMethod
 
 
 class ModelOptMixedPrecisionConfig(ModelOptQuantConfigBase):
@@ -2166,7 +2166,7 @@ def get_quant_method(
         # KV-cache quantization
         if isinstance(layer, Attention):
             if self.kv_cache_quant_method:
-                return ModelOptFp8KVCacheMethod(self)
+                return ModelOptKVCacheMethod(self)
             return None
 
         # Excluded layers
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 0a516831c4ec..d6fef0b3d3d5 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -24,7 +24,7 @@
     make_mxfp4_moe_kernel,
     make_mxfp4_moe_quant_config,
     mxfp4_round_up_hidden_size_and_intermediate_size,
-    select_gpt_oss_mxfp4_moe_backend,
+    select_deepseek_v4_mxfp4_moe_backend,
     select_mxfp4_moe_backend,
 )
 from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
@@ -140,7 +140,7 @@ class GptOssMxfp4MoEMethod(FusedMoEMethodBase):
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
         self.weight_dtype = "gpt_oss_mxfp4"
-        self.mxfp4_backend, self.experts_cls = select_gpt_oss_mxfp4_moe_backend(moe)
+        self.mxfp4_backend, self.experts_cls = select_mxfp4_moe_backend(moe)
 
         self.max_capture_size = (
             get_current_vllm_config().compilation_config.max_cudagraph_capture_size
@@ -366,6 +366,7 @@ def _setup_kernel(
                 experts_cls=self.experts_cls,
                 routing_tables=layer._maybe_init_expert_routing_tables(),
                 shared_experts=layer.shared_experts,
+                layer=layer,
             )
 
     def process_weights_after_loading(self, layer):
@@ -404,6 +405,7 @@ def get_fused_moe_quant_config(
             gemm1_alpha=1.702,
             gemm1_beta=1.0,
             swiglu_limit=7.0,
+            layer=layer,
         )
 
     def select_gemm_impl(
@@ -466,7 +468,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
         self.weight_dtype = "mxfp4"
-        self.mxfp4_backend, self.experts_cls = select_mxfp4_moe_backend(moe)
+        self.mxfp4_backend, self.experts_cls = select_deepseek_v4_mxfp4_moe_backend(moe)
 
         self.max_capture_size = (
             get_current_vllm_config().compilation_config.max_cudagraph_capture_size
@@ -692,6 +694,7 @@ def _setup_kernel(
                 experts_cls=self.experts_cls,
                 routing_tables=layer._maybe_init_expert_routing_tables(),
                 shared_experts=layer.shared_experts,
+                layer=layer,
             )
 
     def process_weights_after_loading(self, layer):
@@ -729,6 +732,7 @@ def get_fused_moe_quant_config(
             w1_bias=w1_bias,
             w2_bias=w2_bias,
             swiglu_limit=swiglu_limit,
+            layer=layer,
         )
 
     def select_gemm_impl(
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index 6aaf9a645880..dd7f33e2619c 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -5,7 +5,6 @@
 from typing import TYPE_CHECKING, Any, cast
 
 import torch
-from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention import Attention
@@ -45,10 +44,6 @@
 
 logger = init_logger(__name__)
 
-# model_type values that use dynamic MXFP4 re-quantization for
-# OCP MX fp4 Quark checkpoints
-_DEEPSEEK_V3_FAMILY_MODEL_TYPES = frozenset({"deepseek_v3"})
-
 
 class QuarkConfig(QuantizationConfig):
     def __init__(
@@ -65,32 +60,12 @@ def __init__(
         self.kv_cache_group = kv_cache_group
         self.kv_cache_config = kv_cache_config
         self.pack_method = pack_method
+        # Note : this flag is kept disabled because the overhead of
+        # dynamic mxfp4 quantization negates the performance gains
+        # that come from shifting to mxfp4. It is left here in case
+        # we want to re-enable it in the future.
         self.dynamic_mxfp4_quant = False
 
-    def maybe_update_config(
-        self,
-        model_name: str,
-        hf_config: PretrainedConfig | None = None,
-        revision: str | None = None,
-    ):
-        """Enable dynamic MXFP4 only for DeepSeek-V3-family + fp4 Quark checkpoints."""
-
-        if (
-            getattr(hf_config, "model_type", None)
-            not in _DEEPSEEK_V3_FAMILY_MODEL_TYPES
-        ):
-            return
-
-        quant_config = getattr(hf_config, "quantization_config", None)
-        if quant_config is not None:
-            quant_dtype = (
-                quant_config.get("global_quant_config", {})
-                .get("weight", {})
-                .get("dtype")
-            )
-            if quant_dtype == "fp4":
-                self.dynamic_mxfp4_quant = True
-
     def get_linear_method(self) -> "QuarkLinearMethod":
         return QuarkLinearMethod(self)
 
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index d92acb85c265..a14bfbc9c19b 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -35,19 +35,19 @@
     make_mxfp4_moe_kernel,
     make_mxfp4_moe_quant_config,
     mxfp4_round_up_hidden_size_and_intermediate_size,
-    select_gpt_oss_mxfp4_moe_backend,
+    select_mxfp4_moe_backend,
 )
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     prepare_fp8_moe_layer_for_marlin,
 )
-from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
-    _swizzle_mxfp4,
-)
 from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
     OCP_MX_BLOCK_SIZE,
     OCP_MX_Scheme,
 )
-from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape,
+    kFp8StaticTensorSym,
+)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     all_close_1d,
     normalize_e4m3fn_to_e4m3fnuz,
@@ -62,7 +62,6 @@
 __all__ = [
     "QuarkMoEMethod",
     "QuarkOCP_MX_MoEMethod",
-    "QuarkOCP_MX_MoEMethod_OSS",
 ]
 
 
@@ -94,22 +93,9 @@ def get_moe_method(
         elif quant_config._is_fp8_w8a8(weight_config, input_config):
             return QuarkW8A8Fp8MoEMethod(weight_config, input_config, module.moe_config)
         elif quant_config._is_w_ocp_mx_a_x(weight_config, input_config):
-            emulate = not current_platform.supports_mx() or not (
-                rocm_aiter_ops.is_fused_moe_enabled()
-            )
-            if (
-                input_config is not None
-                and input_config.get("dtype") == "fp8_e4m3"
-                and not input_config.get("is_dynamic")
-                and not emulate
-            ):
-                return QuarkOCP_MX_MoEMethod_OSS(
-                    weight_config, input_config, module.moe_config
-                )
-            else:
-                return QuarkOCP_MX_MoEMethod(
-                    weight_config, input_config, module.moe_config
-                )
+            # All OCP MX schemes (W4A16, W4A8, etc.) handled by QuarkOCP_MX_MoEMethod
+            # Backend selection happens inside via oracle
+            return QuarkOCP_MX_MoEMethod(weight_config, input_config, module.moe_config)
         elif quant_config._is_static_tensor_w8a8(
             weight_config, input_config
         ) or quant_config._is_dynamic_per_token_w8a8(weight_config, input_config):
@@ -993,7 +979,7 @@ def __init__(
         self.experts_cls: type[mk.FusedMoEExperts] | None = None
         self.moe_kernel: mk.FusedMoEKernel | None = None
 
-        # Used for triton kernel precision configs
+        # Used for triton kernel precision configs (W4A8, TRITON backends)
         self.w13_precision_config = None
         self.w2_precision_config = None
 
@@ -1002,6 +988,17 @@ def __init__(
         else:
             self.static_input_scales = False
 
+        # Select backend based on OCP MX scheme
+        if self.ocp_mx_scheme == "w_mxfp4":
+            # W4A16: weight-only MXFP4
+            self.mxfp4_backend, self.experts_cls = select_mxfp4_moe_backend(moe)
+        elif self.ocp_mx_scheme == "w_mxfp4_a_fp8" and self.static_input_scales:
+            # W4A8: MXFP4 weights + static FP8 activations
+            self.mxfp4_backend, self.experts_cls = select_mxfp4_moe_backend(
+                moe, activation_key=kFp8StaticTensorSym
+            )
+
+        # Validation for unsupported schemes
         if any(
             self.ocp_mx_scheme.endswith(a_scheme)
             for a_scheme in ["a_mxfp4", "a_mxfp6_e3m2", "a_mxfp6_e2m3"]
@@ -1025,12 +1022,8 @@ def __init__(
             get_current_vllm_config().model_config.hf_config, "model_type", None
         )
 
-        # TODO(aiter): extend once rocm_aiter_fused_experts gains dispatch
-        # for the other OCP MX schemes. Today its CK MoE kernel only has an
-        # entry for `w_mxfp4` (w4a16); mixed schemes like `w_mxfp4_a_mxfp6_*`
-        # fall through to QuantMethod.NO and raise "Unsupported kernel config
-        # for moe heuristic dispatch".
-        _AITER_NATIVE_OCP_MX_SCHEMES = ("w_mxfp4",)
+        # TODO: Remove once all OCP MX schemes use the kernel abstraction
+        _AITER_NATIVE_OCP_MX_SCHEMES = ("w_mxfp4", "w_mxfp4_a_mxfp4", "w_mxfp4_a_fp8")
         self.emulate = (
             not current_platform.supports_mx()
             or self.ocp_mx_scheme not in _AITER_NATIVE_OCP_MX_SCHEMES
@@ -1038,9 +1031,6 @@ def __init__(
             self.mxfp4_backend is Mxfp4MoeBackend.NONE or not self.use_rocm_aiter_moe
         )
 
-        if self.ocp_mx_scheme == "w_mxfp4":
-            self.mxfp4_backend, self.experts_cls = select_gpt_oss_mxfp4_moe_backend(moe)
-
         if self.emulate:
             # We use the same code path between MXFP4/MXFP6 emulation.
             self.mxfp4_backend = Mxfp4MoeBackend.EMULATION
@@ -1050,7 +1040,12 @@ def __init__(
         if self.mxfp4_backend != Mxfp4MoeBackend.NONE:
             self.experts_cls = backend_to_kernel_cls(self.mxfp4_backend)[0]
 
-        if self.emulate:
+        # Log backend selection
+        if self.mxfp4_backend != Mxfp4MoeBackend.NONE:
+            logger.info_once(
+                f"Using {self.mxfp4_backend.value} backend for {self.ocp_mx_scheme}"
+            )
+        elif self.emulate:
             logger.warning_once(
                 f"The current mode (supports_mx={current_platform.supports_mx()}, "
                 f"use_rocm_aiter_moe={self.use_rocm_aiter_moe}, "
@@ -1060,10 +1055,6 @@ def __init__(
                 "QDQ (quantize and dequantize) will be used, with the linear "
                 "layers computed in high precision."
             )
-        else:
-            logger.warning_once(
-                "The current mode supports native MoE MXFP4 computation"
-            )
 
     def maybe_roundup_sizes(
         self,
@@ -1208,6 +1199,11 @@ def create_weights(
             layer.w2_input_scale = None
 
     def process_weights_after_loading(self, layer):
+        # For MXFP4 schemes with native backend, use oracle
+        if self.mxfp4_backend != Mxfp4MoeBackend.NONE:
+            self._setup_kernel(layer)
+            return
+
         if self.static_input_scales and self.input_dtype == "fp8":
             # firstly, process activations if fp8 static input
             if layer.w13_input_scale is None or layer.w2_input_scale is None:
@@ -1256,14 +1252,6 @@ def process_weights_after_loading(self, layer):
                         w2_input_scale, requires_grad=False
                     )
 
-        # For w_mxfp4, use oracle functions
-        if self.emulate or (
-            self.ocp_mx_scheme == "w_mxfp4"
-            and self.mxfp4_backend != Mxfp4MoeBackend.NONE
-        ):
-            self._setup_kernel_via_oracle(layer)
-            return
-
         # TODO(bowenbao): gradually migrate to oracles.
         # Existing AITER path for w_mxfp4_a_mxfp4 and other schemes
         from aiter.utility.fp4_utils import e8m0_shuffle
@@ -1302,46 +1290,48 @@ def process_weights_after_loading(self, layer):
         self.moe_quant_config = self.get_fused_moe_quant_config(layer)
         torch.accelerator.empty_cache()
 
-    def _setup_kernel_via_oracle(self, layer: FusedMoE):
-        """Setup kernel using oracle functions for w_mxfp4 scheme."""
-        w13 = layer.w13_weight
-        w2 = layer.w2_weight
-        w13_scale = layer.w13_weight_scale
-        w2_scale = layer.w2_weight_scale
+    def _setup_kernel(self, layer: FusedMoE):
+        """Setup kernel using oracle functions for MXFP4 schemes (W4A16, W4A8)."""
         w13_bias = getattr(layer, "w13_bias", None)
         w2_bias = getattr(layer, "w2_bias", None)
 
-        # Convert weights to kernel format
+        # Convert weights to kernel format (handles all backend-specific logic)
         w13, w2, w13_scale, w2_scale, w13_bias, w2_bias = (
             convert_gpt_oss_weight_to_mxfp4_moe_kernel_format(
                 mxfp4_backend=self.mxfp4_backend,
                 layer=layer,
-                w13_weight=w13,
-                w2_weight=w2,
-                w13_weight_scale=w13_scale,
-                w2_weight_scale=w2_scale,
+                w13_weight=layer.w13_weight,
+                w2_weight=layer.w2_weight,
+                w13_weight_scale=layer.w13_weight_scale,
+                w2_weight_scale=layer.w2_weight_scale,
                 w13_bias=w13_bias,
                 w2_bias=w2_bias,
             )
         )
 
-        # For TRITON backends, weights are wrapped tensors from triton_kernels
-        # that don't support .detach(). Manually assign parameters.
-        if self.mxfp4_backend not in TRITON_BACKENDS:
-            replace_parameter(layer, "w13_weight", w13)
-            replace_parameter(layer, "w2_weight", w2)
-            replace_parameter(layer, "w13_weight_scale", w13_scale)
-            replace_parameter(layer, "w2_weight_scale", w2_scale)
-        else:
+        # Handle weight/scale assignment based on backend type
+        if self.mxfp4_backend in TRITON_BACKENDS or self.mxfp4_backend in (
+            Mxfp4MoeBackend.AITER_MXFP4_FP8,
+        ):
+            # Triton-based backends: w13/w2 are triton_kernels.tensor.Tensor
+            # Store on layer for apply(), scales are PrecisionConfig
             layer.w13_weight = w13
             layer.w2_weight = w2
             self.w13_precision_config = w13_scale
             self.w2_precision_config = w2_scale
+        else:
+            # Standard backends: replace parameters
+            replace_parameter(layer, "w13_weight", w13)
+            replace_parameter(layer, "w2_weight", w2)
+            replace_parameter(layer, "w13_weight_scale", w13_scale)
+            replace_parameter(layer, "w2_weight_scale", w2_scale)
 
         if w13_bias is not None and w2_bias is not None:
             replace_parameter(layer, "w13_bias", w13_bias)
             replace_parameter(layer, "w2_bias", w2_bias)
 
+        torch.accelerator.empty_cache()
+
         # Build quant config and kernel
         self.moe_quant_config = self.get_fused_moe_quant_config(layer)
         if self.moe_quant_config is not None and self.experts_cls is not None:
@@ -1357,22 +1347,26 @@ def _setup_kernel_via_oracle(self, layer: FusedMoE):
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
     ) -> FusedMoEQuantConfig | None:
-        # For w_mxfp4 with oracle backend, use oracle function
-        if self.ocp_mx_scheme == "w_mxfp4" and self.mxfp4_backend not in (
-            Mxfp4MoeBackend.NONE,
-            Mxfp4MoeBackend.EMULATION,
-        ):
-            w1_scale = layer.w13_weight_scale
-            w2_scale = layer.w2_weight_scale
-            if self.mxfp4_backend in TRITON_BACKENDS:
+        # For oracle-based backends (W4A16, W4A8), use make_mxfp4_moe_quant_config
+        if self.mxfp4_backend not in (Mxfp4MoeBackend.NONE, Mxfp4MoeBackend.EMULATION):
+            # Determine scale source based on backend type
+            if self.mxfp4_backend in TRITON_BACKENDS or self.mxfp4_backend in (
+                Mxfp4MoeBackend.AITER_MXFP4_FP8,
+            ):
                 w1_scale = self.w13_precision_config
                 w2_scale = self.w2_precision_config
+            else:
+                w1_scale = layer.w13_weight_scale
+                w2_scale = layer.w2_weight_scale
+
             return make_mxfp4_moe_quant_config(
                 mxfp4_backend=self.mxfp4_backend,
                 w1_scale=w1_scale,
                 w2_scale=w2_scale,
                 w1_bias=getattr(layer, "w13_bias", None),
                 w2_bias=getattr(layer, "w2_bias", None),
+                a1_scale=getattr(layer, "w13_input_scale", None),
+                a2_scale=getattr(layer, "w2_input_scale", None),
             )
 
         # Emulation and other schemes
@@ -1425,7 +1419,7 @@ def apply(
         topk_ids: torch.Tensor,
         shared_experts_input: torch.Tensor | None,
     ) -> torch.Tensor:
-        # For oracle kernel or emulation kernel
+        # For oracle-based kernels (W4A16, W4A8) or emulation kernel
         if self.moe_kernel is not None:
             return self.moe_kernel.apply(
                 hidden_states=x,
@@ -1477,135 +1471,3 @@ def apply_monolithic(
             expert_map=layer.expert_map,
             apply_router_weight_on_input=layer.apply_router_weight_on_input,
         )
-
-
-class QuarkOCP_MX_MoEMethod_OSS(QuarkOCP_MX_MoEMethod):
-    def __init__(
-        self,
-        weight_config: dict[str, Any],
-        input_config: dict[str, Any],
-        moe: FusedMoEConfig,
-    ):
-        super().__init__(weight_config, input_config, moe)
-
-    def process_weights_after_loading(self, layer):
-        from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
-
-        w13_bias = layer.w13_bias.to(torch.float32)
-        w2_bias = layer.w2_bias.to(torch.float32)
-
-        layer.w13_bias = torch.nn.Parameter(w13_bias, requires_grad=False)
-        layer.w2_bias = torch.nn.Parameter(w2_bias, requires_grad=False)
-
-        # FIXME warp need to be adjusted based on batch size
-        # only apply to batched mode
-        if self.moe.use_ep:
-            num_warps = 4 if self.moe.max_num_tokens <= 512 else 8
-        else:
-            num_warps = 8
-
-        w13_weight, w13_flex, w13_scale = _swizzle_mxfp4(
-            layer.w13_weight, layer.w13_weight_scale, num_warps
-        )
-        w2_weight, w2_flex, w2_scale = _swizzle_mxfp4(
-            layer.w2_weight, layer.w2_weight_scale, num_warps
-        )
-
-        self.w13_weight_triton_tensor = w13_weight
-        self.w2_weight_triton_tensor = w2_weight
-
-        # need to delete the original weights to save memory on single GPU
-        del layer.w13_weight
-        del layer.w2_weight
-        layer.w13_weight = None
-        layer.w2_weight = None
-        torch.accelerator.empty_cache()
-
-        if self.static_input_scales:
-            if layer.w13_input_scale is None or layer.w2_input_scale is None:
-                raise ValueError(
-                    "QuantConfig has static quantization, but found "
-                    "activation scales are None."
-                )
-            if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
-                layer.w2_input_scale
-            ):
-                logger.warning_once(
-                    "Found input_scales that are not equal for "
-                    "fp8 MoE layer. Using the maximum across experts "
-                    "for each layer."
-                )
-
-            layer.w13_input_scale = torch.nn.Parameter(
-                layer.w13_input_scale.max().to(torch.float32), requires_grad=False
-            )
-            layer.w2_input_scale = torch.nn.Parameter(
-                layer.w2_input_scale.max().to(torch.float32), requires_grad=False
-            )
-
-            from triton_kernels.numerics import InFlexData
-
-            lhs_data13 = InFlexData(scale=layer.w13_input_scale)
-            lhs_data2 = InFlexData(scale=layer.w2_input_scale)
-
-            self.w13_precision_config = PrecisionConfig(
-                weight_scale=w13_scale,
-                flex_ctx=FlexCtx(rhs_data=w13_flex, lhs_data=lhs_data13),
-            )
-
-            self.w2_precision_config = PrecisionConfig(
-                weight_scale=w2_scale,
-                flex_ctx=FlexCtx(rhs_data=w2_flex, lhs_data=lhs_data2),
-            )
-
-    def get_fused_moe_quant_config(
-        self, layer: torch.nn.Module
-    ) -> FusedMoEQuantConfig | None:
-        return mxfp4_w4a8_moe_quant_config(
-            w1_scale=self.w13_precision_config,
-            w2_scale=self.w2_precision_config,
-            a1_scale=layer.w13_input_scale,
-            a2_scale=layer.w2_input_scale,
-            w1_bias=layer.w13_bias,
-            w2_bias=layer.w2_bias,
-            block_shape=None,
-        )
-
-    @property
-    def is_monolithic(self) -> bool:
-        return True
-
-    def apply_monolithic(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        router_logits: torch.Tensor,
-        expert_map: torch.Tensor | None = None,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        if layer.enable_eplb:
-            raise NotImplementedError(
-                "EPLB not supported for `QuarkW4MXFp4MoEMethod_OSS` yet."
-            )
-
-        from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import (  # noqa: E501
-            triton_kernel_moe_forward,
-        )
-
-        assert self.moe.hidden_dim_unpadded is not None
-        assert self.moe.intermediate_size_per_partition_unpadded is not None
-        return triton_kernel_moe_forward(
-            hidden_states=x,
-            w1=self.w13_weight_triton_tensor,
-            w2=self.w2_weight_triton_tensor,
-            gating_output=router_logits,
-            topk=layer.top_k,
-            renormalize=layer.renormalize,
-            global_num_experts=layer.global_num_experts,
-            expert_map=expert_map,
-            quant_config=self.moe_quant_config,
-            apply_router_weight_on_input=layer.apply_router_weight_on_input,
-            unpadded_N_w1=self.moe.intermediate_size_per_partition_unpadded * 2,
-            unpadded_K_w1=self.moe.hidden_dim_unpadded,
-            unpadded_N_w2=self.moe.hidden_dim_unpadded,
-            unpadded_K_w2=self.moe.intermediate_size_per_partition_unpadded,
-        )
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
index 620d29515d95..70a7e81cc455 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
@@ -96,21 +96,12 @@ def gemm_with_dynamic_quant(
                     x_q = x
                     x_s = x_scales
 
-                # 32 alignment is enough for dim0 padding of output for
-                # gemm_a4w4 kernel
-                y = torch.empty(
-                    (M + 31) // 32 * 32,
-                    weight.shape[0],
-                    device=x_q.device,
-                    dtype=out_dtype,
-                )
-
-                gemm_a4w4(
+                y = gemm_a4w4(
                     x_q,
                     weight.view(x_q.dtype),
                     x_s,
                     weight_scale.view(x_s.dtype),
-                    y,
+                    dtype=out_dtype,
                     bpreshuffle=True,
                 )
             return y[:M]
diff --git a/vllm/model_executor/layers/quantization/turboquant/config.py b/vllm/model_executor/layers/quantization/turboquant/config.py
index f9cfc89c0c1d..50beb8d1d9bf 100644
--- a/vllm/model_executor/layers/quantization/turboquant/config.py
+++ b/vllm/model_executor/layers/quantization/turboquant/config.py
@@ -2,8 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """TurboQuant configuration."""
 
+from __future__ import annotations
+
+import logging
 import math
 from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+
+logger = logging.getLogger(__name__)
 
 # Named TQ presets: each maps to frozen config parameters.
 # key_quant_bits: 8 = FP8 keys, 3-4 = MSE (Lloyd-Max) quantized keys.
@@ -159,12 +168,34 @@ def slot_size_aligned(self) -> int:
         return s + (s % 2)  # round up to even
 
     @staticmethod
-    def get_boundary_skip_layers(num_layers: int, n: int = 2) -> list[str]:
-        """Get layer indices to skip TQ compression (boundary protection).
-
-        Returns first N and last N layer indices as strings, suitable for
-        kv_cache_dtype_skip_layers.
+    def get_boundary_skip_layers(
+        model_config: ModelConfig,
+        n: int = 2,
+    ) -> list[str]:
+        """Layer indices to skip TQ compression (boundary protection).
+
+        For hybrid models (attention + Mamba/linear-attention), boundary
+        protection is disabled — hybrids typically have only 8-12
+        full-attention layers and a hard n=2 on each side would cover
+        ~40 % of them.  The dense GSM8K baselines that motivate n=2
+        don't apply to hybrids.
+
+        For dense models, skips first N and last N attention layers.
+        Empirically required for aggressive presets (k3v4_nc, 3bit_nc)
+        — without it GSM8K drops ~30 points on Qwen3-4B.
         """
+        if model_config.is_hybrid:
+            attn_indices = _get_full_attention_layer_indices(model_config)
+            if not attn_indices:
+                raise NotImplementedError(
+                    "TurboQuant KV cache requires identifiable "
+                    "full-attention layers, but none were found in "
+                    "the hybrid model config."
+                )
+            logger.info("TQ hybrid: full-attention layers %s", attn_indices)
+            return []
+
+        num_layers = model_config.hf_text_config.num_hidden_layers
         if n <= 0 or num_layers <= 0:
             return []
         n = min(n, num_layers // 2)  # don't skip more than half
@@ -175,7 +206,7 @@ def get_boundary_skip_layers(num_layers: int, n: int = 2) -> list[str]:
         return [str(i) for i in indices]
 
     @staticmethod
-    def from_cache_dtype(cache_dtype: str, head_dim: int) -> "TurboQuantConfig":
+    def from_cache_dtype(cache_dtype: str, head_dim: int) -> TurboQuantConfig:
         """Create config from a named preset.
 
         Valid presets: turboquant_k8v4, turboquant_4bit_nc, etc.
@@ -193,3 +224,31 @@ def from_cache_dtype(cache_dtype: str, head_dim: int) -> "TurboQuantConfig":
             value_quant_bits=preset["value_quant_bits"],
             norm_correction=preset["norm_correction"],
         )
+
+
+def _get_full_attention_layer_indices(model_config: ModelConfig) -> list[int]:
+    """Global indices of full-attention layers in a hybrid model.
+
+    Covers the conventions used across vLLM: ``layer_types`` (Qwen3.5/Next),
+    ``layers_block_type`` (Jamba/Zamba2), ``attn_type_list`` (Minimax).
+    """
+    text_cfg = model_config.hf_text_config
+    hf_cfg = model_config.hf_config
+
+    layer_types = getattr(text_cfg, "layer_types", None)
+    if layer_types is not None:
+        return [
+            i for i, t in enumerate(layer_types) if t in ("full_attention", "attention")
+        ]
+
+    layers_block_type = getattr(text_cfg, "layers_block_type", None)
+    if layers_block_type is not None:
+        return [
+            i for i, t in enumerate(layers_block_type) if t in ("attention", "hybrid")
+        ]
+
+    attn_type_list = getattr(hf_cfg, "attn_type_list", None)
+    if attn_type_list is not None:
+        return [i for i, t in enumerate(attn_type_list) if t == 1]
+
+    return []
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 9613b11d35e2..d9aab35c25f4 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -843,6 +843,15 @@ def w8a8_triton_block_scaled_mm(
     assert len(block_size) == 2
     block_n, block_k = block_size[0], block_size[1]
 
+    # Triton cannot currently bind E8M0 scale tensors directly. On ROCm,
+    # DeepSeek-V4 checkpoints store block scales in exponent-only E8M0 format,
+    # so decode them to fp32 before launching the kernel.
+    if current_platform.is_rocm():
+        if As.dtype == torch.float8_e8m0fnu:
+            As = _upcast_e8m0_to_fp32(As).contiguous()
+        if Bs.dtype == torch.float8_e8m0fnu:
+            Bs = _upcast_e8m0_to_fp32(Bs).contiguous()
+
     assert A.shape[-1] == B.shape[-1]
     assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
     assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
diff --git a/vllm/model_executor/layers/quantization/utils/humming_moe_utils.py b/vllm/model_executor/layers/quantization/utils/humming_moe_utils.py
deleted file mode 100644
index 82788a0e76e8..000000000000
--- a/vllm/model_executor/layers/quantization/utils/humming_moe_utils.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import torch
-
-from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
-    moe_align_block_size,
-)
-
-
-def humming_moe_align(
-    configs: list[int],
-    topk_ids: torch.Tensor,
-    num_experts: int,
-    expert_map: torch.Tensor | None = None,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    assert len(configs) > 0 and len(configs) % 3 == 0
-    # NOTE: we choose moe_block_size based on
-    #       num_tokens * top_k (= topk_ids.nelement())
-    shape_m = topk_ids.nelement()
-
-    for i in range(len(configs) // 3):
-        if shape_m > configs[i * 3] and shape_m <= configs[i * 3 + 1]:
-            block_size = configs[i * 3 + 2]
-            break
-    else:
-        raise ValueError(f"Could not find a matching block_size for shape_m={shape_m}")
-
-    return moe_align_block_size(
-        topk_ids=topk_ids,
-        block_size=block_size,
-        num_experts=num_experts,
-        expert_map=expert_map,
-        pad_sorted_ids=False,
-        ignore_invalid_experts=True,
-    )
diff --git a/vllm/model_executor/layers/quantization/utils/humming_utils.py b/vllm/model_executor/layers/quantization/utils/humming_utils.py
new file mode 100644
index 000000000000..f8c10bdcae16
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/humming_utils.py
@@ -0,0 +1,214 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+import regex as re
+import torch
+from humming.layer import HummingInputSchema, HummingMethod
+from humming.schema import BaseWeightSchema
+
+from vllm import envs
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig,
+    FusedMoEQuantDesc,
+)
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+from vllm.model_executor.layers.linear import LinearBase
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
+
+
+def humming_is_layer_skipped(config: dict[str, Any], prefix: str):
+    if not config:
+        return True
+
+    keys = ["ignored_layers", "ignore", "modules_to_not_convert"]
+    ignored_layers: list[str] = []
+    for key in keys:
+        ignored_layers = config.get(key, []) or []
+        if not ignored_layers:
+            break
+
+    if any(module_name in prefix for module_name in ignored_layers):
+        return True
+    if "lm_head" in prefix:
+        return True
+
+    for regex in config.get("dynamic", {}):
+        if regex[:1] != "-":
+            continue
+        if re.match(regex[2:], prefix):
+            return True
+
+    return False
+
+
+def prepare_humming_layer(layer: LinearBase, quant_config: dict):
+    weight_schema = BaseWeightSchema.from_config(quant_config)
+    input_schema = HummingInputSchema()
+
+    shape_k_stacks = [layer.input_size_per_partition]
+    shape_n_stacks = layer.output_partition_sizes
+
+    # Step 1: convert weight to humming standard format
+    weight_schema, tensors = weight_schema.convert_humming(
+        tensors=layer.named_parameters(),
+        shape_n_stacks=shape_n_stacks,
+        shape_k_stacks=shape_k_stacks,
+        param_dtype=layer.params_dtype,
+    )
+
+    layer.weight_schema = weight_schema
+
+    for name, _ in list(layer.named_parameters()):
+        delattr(layer, name)
+
+    for name, tensor in tensors.items():
+        param = torch.nn.Parameter(tensor, requires_grad=False)
+        setattr(layer, name, param)
+
+    # Step 2: transform weight (humming standard format) for forwarding
+    HummingMethod.prepare_layer_meta(
+        layer=layer,
+        shape_n=layer.output_partition_sizes_sum,
+        shape_k=layer.input_size_per_partition,
+        weight_schema=weight_schema,
+        input_schema=input_schema,
+        pad_n_to_multiple=256,
+        pad_k_to_multiple=128,
+        has_bias=layer.has_bias,
+        torch_dtype=layer.param_dtype,
+    )
+
+    HummingMethod.transform_humming_layer(layer)
+
+
+def prepare_humming_moe_layer(layer: FusedMoE, quant_config: dict):
+    weight_schema = BaseWeightSchema.from_config(quant_config)
+    input_quant_config = envs.VLLM_HUMMING_INPUT_QUANT_CONFIG or {}
+    if humming_is_layer_skipped(input_quant_config, layer.layer_name):
+        input_schema = HummingInputSchema()
+    else:
+        # TODO: read input_quant_config from quant_config
+        input_schema = HummingInputSchema.from_config(input_quant_config)
+
+    is_gated = layer.activation.is_gated
+    shape_config = {
+        "w13": (
+            layer.moe_config.intermediate_size_per_partition * 2,
+            layer.moe_config.hidden_dim,
+        ),
+        "w2": (
+            layer.moe_config.hidden_dim,
+            layer.moe_config.intermediate_size_per_partition * (1 if is_gated else 2),
+        ),
+    }
+
+    layer.weight_schemas = {}
+    layer.input_schemas = {}
+
+    for sublayer_name in shape_config:
+        # Step 1: convert weight to humming standard format
+        tensors: dict[str, torch.Tensor] = dict(
+            (key.removeprefix(sublayer_name + "_"), value)
+            for key, value in layer.state_dict().items()
+            if key.startswith(sublayer_name + "_")
+        )
+
+        shape_n, shape_k = shape_config[sublayer_name]
+        shape_n_stacks = [shape_n]
+        shape_k_stacks = [shape_k]
+        if sublayer_name == "w13":
+            shape_n_stacks = [shape_n // 2] * 2
+
+        weight_schema_new, tensors = weight_schema.convert_humming(
+            tensors=tensors,
+            shape_n_stacks=shape_n_stacks,
+            shape_k_stacks=shape_k_stacks,
+            num_experts=layer.local_num_experts,
+            param_dtype=layer.params_dtype,
+        )
+
+        layer.weight_schemas[sublayer_name] = weight_schema_new
+        layer.input_schemas[sublayer_name] = input_schema
+
+        for name, _ in list(layer.named_parameters()):
+            if not name.startswith(sublayer_name + "_"):
+                continue
+            delattr(layer, name)
+
+        for name, tensor in tensors.items():
+            name = f"{sublayer_name}_{name}"
+            param = torch.nn.Parameter(tensor, requires_grad=False)
+            setattr(layer, name, param)
+
+        # Step 2: transform weight (humming standard format) for forwarding
+        HummingMethod.prepare_layer_meta(
+            layer=layer,
+            shape_n=shape_n,
+            shape_k=shape_k,
+            pad_n_to_multiple=256,
+            pad_k_to_multiple=128,
+            input_schema=input_schema,
+            weight_schema=weight_schema_new,
+            has_bias=layer.moe_config.has_bias,
+            num_experts=layer.num_experts,
+            torch_dtype=layer.params_dtype,
+            sublayer_name=sublayer_name,
+        )
+
+        HummingMethod.transform_humming_layer(layer, sublayer_name=sublayer_name)
+
+    if not hasattr(layer, "locks"):
+        device = layer.w13_weight.device
+        locks = torch.zeros(1024, dtype=torch.int32, device=device)
+        layer.register_buffer("locks", locks)
+
+
+def get_humming_moe_quant_config(layer: FusedMoE):
+    input_schema = layer.input_schemas["w13"]
+    weight_schema = layer.weight_schemas["w13"]
+
+    a_dtype = input_schema.a_dtype
+    if a_dtype is None or a_dtype.num_bits == 16:
+        a_quant_desc = FusedMoEQuantDesc(dtype=None)
+    else:
+        shape = GroupShape(row=1, col=-1)
+        a_quant_desc = FusedMoEQuantDesc(dtype=str(a_dtype), shape=shape)
+
+    weight_scale_group_size = weight_schema.weight_scale_group_size
+    weight_scale_group_size_n = weight_schema.weight_scale_group_size_n
+    weight_group_shape: tuple[int, ...] = ()
+    if weight_scale_group_size_n > 1:
+        weight_group_shape = GroupShape(
+            row=weight_scale_group_size,
+            col=weight_scale_group_size_n,
+        )
+    elif weight_scale_group_size == 0:
+        weight_group_shape = GroupShape(row=-1, col=1)
+    else:
+        weight_group_shape = GroupShape(row=weight_scale_group_size, col=1)
+
+    w1_quant_desc = FusedMoEQuantDesc(
+        dtype=str(weight_schema.b_dtype),
+        shape=weight_group_shape,
+        scale=getattr(layer, "w13_weight_scale", None),
+        alpha_or_gscale=getattr(layer, "w13_global_scale", None),
+        zp=getattr(layer, "w13_zero_point", None),
+        bias=getattr(layer, "w13_bias", None),
+    )
+
+    w2_quant_desc = FusedMoEQuantDesc(
+        dtype=str(weight_schema.b_dtype),
+        shape=weight_group_shape,
+        scale=getattr(layer, "w2_weight_scale", None),
+        alpha_or_gscale=getattr(layer, "w2_global_scale", None),
+        zp=getattr(layer, "w2_zero_point", None),
+        bias=getattr(layer, "w2_bias", None),
+    )
+
+    return FusedMoEQuantConfig(
+        _a1=a_quant_desc,
+        _a2=a_quant_desc,
+        _w1=w1_quant_desc,
+        _w2=w2_quant_desc,
+    )
diff --git a/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py
index b9b7bd542738..a12918225348 100644
--- a/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py
@@ -85,7 +85,9 @@ def _mxfp8_e4m3_quantize_torch(
 
 
 def _mxfp8_e4m3_quantize_impl(
-    x: torch.Tensor, is_sf_swizzled_layout: bool = False
+    x: torch.Tensor,
+    is_sf_swizzled_layout: bool = False,
+    alignment: int = 0,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     from vllm.platforms import current_platform
 
@@ -93,7 +95,9 @@ def _mxfp8_e4m3_quantize_impl(
         from flashinfer import mxfp8_quantize as flashinfer_mxfp8_quantize
 
         x_q, x_scales = flashinfer_mxfp8_quantize(
-            x, is_sf_swizzled_layout=is_sf_swizzled_layout
+            x,
+            is_sf_swizzled_layout=is_sf_swizzled_layout,
+            alignment=alignment if alignment > 0 else 32,
         )
         if x_scales.ndim == 1 and x.ndim == 2 and not is_sf_swizzled_layout:
             x_scales = x_scales.view(x.size(0), -1)
@@ -103,9 +107,11 @@ def _mxfp8_e4m3_quantize_impl(
 
 
 def mxfp8_e4m3_quantize(
-    x: torch.Tensor, is_sf_swizzled_layout: bool = False
+    x: torch.Tensor,
+    is_sf_swizzled_layout: bool = False,
+    alignment: int = 0,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    return torch.ops.vllm.mxfp8_quantize(x, is_sf_swizzled_layout)
+    return torch.ops.vllm.mxfp8_quantize(x, is_sf_swizzled_layout, alignment)
 
 
 def dequant_mxfp8_to_bf16(x: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
@@ -125,7 +131,9 @@ def dequant_mxfp8_to_bf16(x: torch.Tensor, scales: torch.Tensor) -> torch.Tensor
 
 
 def mxfp8_e4m3_quantize_fake(
-    x: torch.Tensor, is_sf_swizzled_layout: bool = False
+    x: torch.Tensor,
+    is_sf_swizzled_layout: bool = False,
+    alignment: int = 0,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Fake implementation for torch.compile tracing."""
     fp_data = torch.empty_like(x, dtype=MXFP8_VALUE_DTYPE)
diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
index af5c6f2a7ab5..39c78a9062be 100644
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
@@ -4,7 +4,9 @@
 
 import torch
 
+from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
+from vllm.triton_utils import tl, triton
 
 __all__ = [
     "break_fp4_bytes",
@@ -20,6 +22,309 @@
 )
 
 
+@triton.jit
+def _e2m1_inline(magnitude):
+    """Inline E2M1 lookup using binary tree - 3 levels instead of 7 sequential.
+
+    Maps 3-bit magnitude to float: [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
+    Uses bit decomposition for fewer comparisons.
+    """
+    # Bit 2 (MSB): separates 0-3 from 4-7
+    # Bit 1: separates within groups
+    # Bit 0 (LSB): separates within pairs
+    b2 = (magnitude >> 2) & 1  # 0 for mag 0-3, 1 for mag 4-7
+    b1 = (magnitude >> 1) & 1  # middle bit
+    b0 = magnitude & 1  # LSB
+
+    # For mag 0-3: [0.0, 0.5, 1.0, 1.5]
+    low_group = tl.where(
+        b1 == 1, tl.where(b0 == 1, 1.5, 1.0), tl.where(b0 == 1, 0.5, 0.0)
+    )
+    # For mag 4-7: [2.0, 3.0, 4.0, 6.0]
+    high_group = tl.where(
+        b1 == 1, tl.where(b0 == 1, 6.0, 4.0), tl.where(b0 == 1, 3.0, 2.0)
+    )
+    return tl.where(b2 == 1, high_group, low_group)
+
+
+@triton.jit
+def _dequantize_nvfp4_kernel(
+    fp4_ptr,
+    scale_ptr,
+    global_scale_ptr,
+    output_ptr,
+    rows_per_batch: tl.constexpr,
+    num_blocks: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    has_batch_global_scale: tl.constexpr,
+    TILE_BLOCKS: tl.constexpr,
+):
+    """Triton kernel for NVFP4 dequantization (swizzle=False).
+
+    Optimized with 2D tile processing + interleave for coalesced stores.
+    """
+    BLOCK_PACKED: tl.constexpr = BLOCK_SIZE // 2
+
+    row_idx = tl.program_id(0)
+    tile_idx = tl.program_id(1)
+
+    if has_batch_global_scale:
+        batch_idx = row_idx // rows_per_batch
+        global_scale = tl.load(global_scale_ptr + batch_idx).to(tl.float32)
+    else:
+        global_scale = tl.load(global_scale_ptr).to(tl.float32)
+
+    fp4_row_offset = row_idx * num_blocks * BLOCK_PACKED
+    scale_row_offset = row_idx * num_blocks
+    output_row_offset = row_idx * num_blocks * BLOCK_SIZE
+
+    start_block = tile_idx * TILE_BLOCKS
+
+    # Load scales for this tile: [TILE_BLOCKS]
+    block_offsets = tl.arange(0, TILE_BLOCKS)
+    block_mask = (start_block + block_offsets) < num_blocks
+
+    raw_scales = tl.load(
+        scale_ptr + scale_row_offset + start_block + block_offsets,
+        mask=block_mask,
+        other=0,
+    )
+    scale_f32 = tl.cast(raw_scales, tl.float8e4nv, bitcast=True).to(tl.float32)
+    scale_values = (scale_f32 * global_scale)[:, None]
+
+    # Load [TILE_BLOCKS, BLOCK_PACKED] packed bytes
+    packed_offsets = tl.arange(0, BLOCK_PACKED)[None, :]
+    byte_indices = (
+        fp4_row_offset
+        + (start_block + block_offsets[:, None]) * BLOCK_PACKED
+        + packed_offsets
+    )
+    elem_mask = block_mask[:, None]
+    raw_bytes = tl.load(fp4_ptr + byte_indices, mask=elem_mask, other=0)
+
+    low_nibble = raw_bytes & 0x0F
+    high_nibble = (raw_bytes >> 4) & 0x0F
+
+    # Binary tree E2M1 decode
+    low_mag = low_nibble & 0x07
+    low_val = _e2m1_inline(low_mag)
+    low_sign = (low_nibble >> 3) & 1
+    low_result = tl.where(low_sign == 1, -low_val, low_val) * scale_values
+
+    high_mag = high_nibble & 0x07
+    high_val = _e2m1_inline(high_mag)
+    high_sign = (high_nibble >> 3) & 1
+    high_result = tl.where(high_sign == 1, -high_val, high_val) * scale_values
+
+    # Interleave for coalesced contiguous store
+    result = tl.interleave(low_result, high_result)
+
+    elem_offsets = tl.arange(0, BLOCK_SIZE)[None, :]
+    out_indices = (
+        output_row_offset
+        + (start_block + block_offsets[:, None]) * BLOCK_SIZE
+        + elem_offsets
+    )
+    tl.store(output_ptr + out_indices, result, mask=block_mask[:, None])
+
+
+@triton.jit
+def _e2m1_lookup(magnitude):
+    """Lookup E2M1 float value from 3-bit magnitude."""
+    result = tl.where(magnitude == 1, 0.5, 0.0)
+    result = tl.where(magnitude == 2, 1.0, result)
+    result = tl.where(magnitude == 3, 1.5, result)
+    result = tl.where(magnitude == 4, 2.0, result)
+    result = tl.where(magnitude == 5, 3.0, result)
+    result = tl.where(magnitude == 6, 4.0, result)
+    result = tl.where(magnitude == 7, 6.0, result)
+    return result
+
+
+@triton.jit
+def _round_to_fp4(x):
+    """Round float values to the nearest E2M1 representable value.
+
+    Matches the thresholds in the Python ``cast_to_fp4`` exactly.
+    """
+    sign = tl.where(x < 0.0, -1.0, 1.0)
+    abs_x = tl.abs(x)
+    result = tl.where(abs_x > 5.0, 6.0, 0.0)
+    result = tl.where((abs_x >= 3.5) & (abs_x <= 5.0), 4.0, result)
+    result = tl.where((abs_x > 2.5) & (abs_x < 3.5), 3.0, result)
+    result = tl.where((abs_x >= 1.75) & (abs_x <= 2.5), 2.0, result)
+    result = tl.where((abs_x > 1.25) & (abs_x < 1.75), 1.5, result)
+    result = tl.where((abs_x >= 0.75) & (abs_x <= 1.25), 1.0, result)
+    result = tl.where((abs_x > 0.25) & (abs_x < 0.75), 0.5, result)
+    return result * sign
+
+
+@triton.jit
+def _nvfp4_quant_dequant_kernel(
+    input_ptr,
+    output_ptr,
+    global_scale_ptr,
+    k: tl.constexpr,
+    num_blocks: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    FP4_MAX_RECIPROCAL: tl.constexpr,
+    TILE_BLOCKS: tl.constexpr,
+):
+    """Fused NVFP4 quantize-dequantize kernel.
+
+    Uses a 2D grid (rows x tiles) to parallelize across both rows
+    and quantization groups within a row. Each program handles
+    TILE_BLOCKS groups at once using vectorized 2D operations.
+    """
+    row_idx = tl.program_id(0)
+    tile_idx = tl.program_id(1)
+    global_scale = tl.load(global_scale_ptr).to(tl.float32)
+    row_offset = row_idx * k
+
+    start_block = tile_idx * TILE_BLOCKS
+    block_offsets = tl.arange(0, TILE_BLOCKS)
+    block_mask = (start_block + block_offsets) < num_blocks
+
+    # Load [TILE_BLOCKS, BLOCK_SIZE] elements
+    indices = (
+        row_offset
+        + (start_block + block_offsets[:, None]) * BLOCK_SIZE
+        + tl.arange(0, BLOCK_SIZE)[None, :]
+    )
+    mask_2d = block_mask[:, None]
+    x = tl.load(input_ptr + indices, mask=mask_2d, other=0.0).to(tl.float32)
+
+    # Per-group scale: [TILE_BLOCKS]
+    vec_max = tl.max(tl.abs(x), axis=1)
+    scale = global_scale * (vec_max * FP4_MAX_RECIPROCAL)
+    scale = tl.clamp(scale, -448.0, 448.0)
+    scale = scale.to(tl.float8e4nv).to(tl.float32)
+
+    # Safe reciprocal, broadcast to [TILE_BLOCKS, 1]
+    output_scale = tl.where(scale == 0.0, 0.0, global_scale / scale)[:, None]
+
+    # Quantize: scale, clamp, round to FP4
+    scaled_x = tl.clamp(x * output_scale, -6.0, 6.0)
+    fp4_val = _round_to_fp4(scaled_x)
+
+    # Dequantize: fp4_val * (scale / global_scale)
+    dequant_scale = (scale / global_scale)[:, None]
+    result = fp4_val * dequant_scale
+
+    tl.store(output_ptr + indices, result, mask=mask_2d)
+
+
+def _triton_nvfp4_quant_dequant(
+    x: torch.Tensor,
+    global_scale: torch.Tensor,
+    block_size: int,
+) -> torch.Tensor:
+    """Triton-accelerated NVFP4 quantize-dequantize."""
+    x_m, x_k = x.shape
+
+    if not torch.compiler.is_compiling():
+        assert x_k % block_size == 0, (
+            f"Weight shape K={x_k} is not divisible by block_size={block_size}"
+        )
+
+    output_dtype = x.dtype
+    num_blocks = x_k // block_size
+
+    output = torch.empty(x_m, x_k, dtype=output_dtype, device=x.device)
+
+    tile_blocks = min(64, triton.next_power_of_2(num_blocks))
+    num_tiles = (num_blocks + tile_blocks - 1) // tile_blocks
+    grid = (x_m, num_tiles)
+    _nvfp4_quant_dequant_kernel[grid](
+        x,
+        output,
+        global_scale,
+        x_k,
+        num_blocks,
+        block_size,
+        FLOAT4_E2M1_MAX_RECIPROCAL,
+        tile_blocks,
+    )
+
+    return output
+
+
+def _triton_dequantize_nvfp4(
+    tensor_fp4: torch.Tensor,
+    tensor_sf: torch.Tensor,
+    global_scale: torch.Tensor,
+    dtype: torch.dtype,
+    block_size: int = 16,
+) -> torch.Tensor:
+    """Dequantize NVFP4 using Triton (swizzle=False only).
+
+    Supports both 2D and 3D inputs:
+    - 2D: [m, packed_k] -> [m, k]
+    - 3D: [dim0, m, packed_k] -> [dim0, m, k]
+    """
+    assert tensor_fp4.dtype == torch.uint8
+
+    is_3d = tensor_fp4.ndim == 3
+    if is_3d:
+        dim0, m_per_batch, packed_k = tensor_fp4.shape
+        tensor_fp4_2d = tensor_fp4.reshape(-1, packed_k)
+        tensor_sf_2d = tensor_sf.reshape(-1, tensor_sf.shape[-1])
+        total_rows_flat = dim0 * m_per_batch
+    else:
+        m_per_batch, packed_k = tensor_fp4.shape
+        tensor_fp4_2d = tensor_fp4
+        tensor_sf_2d = tensor_sf
+        total_rows_flat = m_per_batch
+
+    k = packed_k * 2
+    num_blocks = k // block_size
+
+    output = torch.empty(total_rows_flat, k, dtype=dtype, device=tensor_fp4.device)
+
+    # View as uint8 so Triton can load raw bytes and bitcast to float8_e4m3fn
+    scale_raw = tensor_sf_2d.contiguous().view(torch.uint8)
+
+    # Shape-adaptive tile sizing: for large row counts (3D), process
+    # entire row in one tile. For small row counts (2D), use smaller
+    # tiles to increase parallelism across CUs.
+    np2 = triton.next_power_of_2(num_blocks)
+    if total_rows_flat >= 4096:
+        # Many rows: maximize work per CTA, one tile per row
+        tile_blocks = np2
+        nw = 1
+        ns = 2
+    elif total_rows_flat >= 2048:
+        # Medium-many rows: full row, 2 warps
+        tile_blocks = np2
+        nw = 2
+        ns = 2
+    else:
+        # Few rows: use moderate tiles for CU utilization
+        tile_blocks = min(64, np2)
+        nw = 4
+        ns = 2
+    num_tiles = (num_blocks + tile_blocks - 1) // tile_blocks
+    grid = (total_rows_flat, num_tiles)
+    _dequantize_nvfp4_kernel[grid](
+        tensor_fp4_2d,
+        scale_raw,
+        global_scale,
+        output,
+        m_per_batch,
+        num_blocks,
+        block_size,
+        is_3d,
+        tile_blocks,
+        num_warps=nw,
+        num_stages=ns,
+    )
+
+    if is_3d:
+        output = output.reshape(dim0, m_per_batch, k)
+
+    return output
+
+
 def break_fp4_bytes(a, dtype):
     assert a.dtype == torch.uint8
     m, n = a.shape
@@ -67,6 +372,11 @@ def dequantize_to_dtype(
     # Two fp4 values are packed into one uint8.
     assert tensor_fp4.dtype == torch.uint8
 
+    if not swizzle and current_platform.is_cuda_alike():
+        return _triton_dequantize_nvfp4(
+            tensor_fp4, tensor_sf, global_scale, dtype, block_size
+        )
+
     # We handle 3D tensors reshaping them to 2D.
     is_3d = tensor_fp4.ndim == 3
 
@@ -145,12 +455,15 @@ def ref_nvfp4_quant(x, global_scale, block_size):
 
 def ref_nvfp4_quant_dequant(
     x: torch.Tensor, global_scale: torch.Tensor, block_size: int
-) -> tuple[torch.Tensor, None]:
+) -> torch.Tensor:
     """
     NVFP4 quantize-dequantize operation.
 
     `global_scale` is expected to have a single element.
     """
+    if current_platform.is_cuda_alike():
+        return _triton_nvfp4_quant_dequant(x, global_scale, block_size)
+
     x_m, x_k = x.shape
     output_dtype = x.dtype
 
@@ -162,7 +475,7 @@ def ref_nvfp4_quant_dequant(
     x_blockscale = x_blockscale.unsqueeze(-1) / global_scale
     x_dq = (x_fp4 * x_blockscale).reshape(x_m, x_k).to(output_dtype)
 
-    return x_dq, None
+    return x_dq
 
 
 def run_nvfp4_emulations(
@@ -176,7 +489,7 @@ def run_nvfp4_emulations(
     output_dtype = x.dtype
     group_size = 16
 
-    x_dq, _ = ref_nvfp4_quant_dequant(x, input_global_scale, block_size=group_size)
+    x_dq = ref_nvfp4_quant_dequant(x, input_global_scale, block_size=group_size)
 
     # dequantize weight
     w_fp4 = weight.data.view(torch.uint8)
diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
index 6cb9101a78b1..9a06eedd0f7d 100644
--- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
@@ -45,6 +45,7 @@ def __init__(
         beta_slow: int = 1,
         mscale: float = 1,
         mscale_all_dim: float = 0,
+        init_cache: bool = True,
     ) -> None:
         self.scaling_factor = scaling_factor
         self.extrapolation_factor = extrapolation_factor
@@ -65,7 +66,13 @@ def __init__(
             and head_size in [64, 128, 256, 512]
         )
         super().__init__(
-            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+            head_size,
+            rotary_dim,
+            max_position_embeddings,
+            base,
+            is_neox_style,
+            dtype,
+            init_cache=init_cache,
         )
 
     def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
@@ -211,7 +218,9 @@ class DeepseekV4ScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding):
     """
 
     def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+        # Avoid compute cache repeatedly
+        kwargs.pop("init_cache", None)
+        super().__init__(*args, **kwargs, init_cache=False)
         cache_fp32 = self._compute_cos_sin_cache()
         self.register_buffer("cos_sin_cache", cache_fp32, persistent=False)
 
diff --git a/vllm/model_executor/layers/sparse_attn_indexer.py b/vllm/model_executor/layers/sparse_attn_indexer.py
index 3332f26c8c48..d73590638090 100644
--- a/vllm/model_executor/layers/sparse_attn_indexer.py
+++ b/vllm/model_executor/layers/sparse_attn_indexer.py
@@ -32,13 +32,6 @@
 elif current_platform.is_xpu():
     from vllm._xpu_ops import xpu_ops
 
-# Registers `vllm::rocm_sparse_attn_indexer_no_insert` (the V4 layout where the
-# compressor pre-inserts K and the indexer is called with k=None).
-# Keep this import at module scope so the op is visible at compile time, not
-# just on the first forward.
-if current_platform.is_rocm():
-    import vllm.v1.attention.ops.rocm_sparse_attn_indexer  # noqa: F401
-
 logger = init_logger(__name__)
 
 RADIX_TOPK_WORKSPACE_SIZE = 1024 * 1024
@@ -511,14 +504,18 @@ def forward_hip(
             "AMD sparse_attn_indexer expects a single FP8 q_quant tensor"
         )
 
-        if self.skip_k_cache_insert:
-            # DeepSeek-V4 layout: the compressor has already inserted the
-            # compressed K into the indexer's KV cache and passes k=None.
-            # The AITER op below always issues its own
-            # ``ops.indexer_k_quant_and_cache(k, ...)`` and dereferences ``k``,
-            # so it can't be reused here. Use the dedicated no-insert ROCm
-            # path that uses only ROCm-available helpers (and a Triton MQA
-            # kernel under the hood).
+        # We only take this path when the
+        # compressor has already inserted K (skip_k_cache_insert=True), AITER
+        # is off, and the env-var gate is on (default). Falls through to the
+        # upstream native path otherwise.
+        if (
+            self.skip_k_cache_insert
+            and not rocm_aiter_ops.is_enabled()
+            and envs.VLLM_ROCM_USE_V4_TRITON_FALLBACK
+        ):
+            # Import lazily so non-ROCm builds don't pay the import cost.
+            import vllm.v1.attention.ops.rocm_sparse_attn_indexer  # noqa: F401
+
             return torch.ops.vllm.rocm_sparse_attn_indexer_no_insert(
                 hidden_states,
                 _encode_layer_name(self.k_cache.prefix),
@@ -533,7 +530,27 @@ def forward_hip(
                 self.max_total_seq_len,
                 self.topk_indices_buffer,
             )
+        if self.skip_k_cache_insert or not rocm_aiter_ops.is_enabled():
+            from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
+                rocm_aiter_sparse_attn_indexer_native,
+            )
 
+            return rocm_aiter_sparse_attn_indexer_native(
+                hidden_states,
+                _encode_layer_name(self.k_cache.prefix),
+                self.k_cache.kv_cache,
+                q_quant,
+                k,
+                weights,
+                self.quant_block_size,
+                self.scale_fmt,
+                self.topk_tokens,
+                self.head_dim,
+                self.max_model_len,
+                self.max_total_seq_len,
+                self.topk_indices_buffer,
+                skip_k_cache_insert=self.skip_k_cache_insert,
+            )
         if rocm_aiter_ops.is_enabled():
             return torch.ops.vllm.rocm_aiter_sparse_attn_indexer(
                 hidden_states,
@@ -550,9 +567,4 @@ def forward_hip(
                 self.max_total_seq_len,
                 self.topk_indices_buffer,
             )
-        else:
-            raise RuntimeError(
-                "Sparse attention indexer ROCm custom op requires ROCm "
-                "Aiter ops to be enabled (or skip_k_cache_insert=True for "
-                "the V4 layout)."
-            )
+        raise RuntimeError("Sparse attention indexer ROCm path could not be selected.")
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
index 3aa0474c340c..63a79f668edf 100644
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -159,6 +159,7 @@ def rocm_unquantized_gemm_impl(
             and weight.is_contiguous()
         )
     )
+
     if use_skinny_reduce_counting:
         return ops.wvSplitKrc(x, weight, cu_count, bias)
 
@@ -174,17 +175,21 @@ def rocm_unquantized_gemm_impl(
         and k % 8 == 0
     )
 
-    if not use_skinny:
-        return torch.nn.functional.linear(x, weight, bias)
-
-    x_view = x.reshape(-1, x.size(-1))
-    if m > 8 and 0 < n <= 4:
-        cu_count = num_compute_units()
-        out = ops.wvSplitK(weight, x_view, cu_count, bias)
-        return out.reshape(*x.shape[:-1], weight.shape[0])
-    elif m % 4 == 0 and n == 1 and k <= 8192 and bias is None:
-        out = ops.LLMM1(weight, x_view, 4)
-        return out.reshape(*x.shape[:-1], weight.shape[0])
+    if use_skinny:
+        x_view = x.reshape(-1, x.size(-1))
+        if m > 8 and 0 < n <= 4:
+            cu_count = num_compute_units()
+            out = ops.wvSplitK(weight, x_view, cu_count, bias)
+            return out.reshape(*x.shape[:-1], weight.shape[0])
+        elif m % 4 == 0 and n == 1 and k <= 8192 and bias is None:
+            out = ops.LLMM1(weight, x_view, 4)
+            return out.reshape(*x.shape[:-1], weight.shape[0])
+
+    if rocm_aiter_ops.is_tgemm_enabled():
+        from aiter.tuned_gemm import tgemm
+
+        return tgemm.mm(x, weight, bias)
+
     return torch.nn.functional.linear(x, weight, bias)
 
 
@@ -299,22 +304,6 @@ def cpu_unquantized_gemm(
     return layer.cpu_linear(x, weight, bias)
 
 
-def cublas_gemm_bf16_bf16_fp32(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-):
-    # The fused C++ op (csrc/moe/router_gemm.cu, registered via
-    # torch_bindings.cpp's `router_gemm_bf16_fp32`) is gated behind
-    # `#ifndef USE_ROCM` and is only compiled into _moe_C.so on CUDA builds.
-    # On other backends (e.g. ROCm) we fall back to a torch GEMM with the
-    # same bf16-in / fp32-out contract. rocBLAS already does fp32 accumulation
-    # internally for bf16 GEMMs on MI300X, so casting the bf16 output to fp32
-    # matches the cuBLAS bf16 x bf16 -> fp32 path numerically.
-    if current_platform.is_cuda():
-        return ops.router_gemm_bf16_fp32(x, weight)
-    return torch.nn.functional.linear(x, weight).to(torch.float32)
-
-
 def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]:
     if current_platform.is_rocm():
         return rocm_unquantized_gemm
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index 037195b9063a..a76092028671 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -76,7 +76,11 @@ def __init__(self, load_config: LoadConfig):
         self.local_expert_ids: set[int] | None = None
 
         extra_config = load_config.model_loader_extra_config
-        allowed_keys = {"enable_multithread_load", "num_threads"}
+        allowed_keys = {
+            "enable_multithread_load",
+            "num_threads",
+            "enable_weights_track",
+        }
         unexpected_keys = set(extra_config.keys()) - allowed_keys
 
         if unexpected_keys:
@@ -86,6 +90,10 @@ def __init__(self, load_config: LoadConfig):
                 f"{unexpected_keys}"
             )
 
+        self.enable_weights_track: bool | None = extra_config.get(
+            "enable_weights_track", None
+        )
+
     def _prepare_weights(
         self,
         model_name_or_path: str,
@@ -377,7 +385,6 @@ def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
 
         self._init_ep_weight_filter(model_config)
 
-        weights_to_load = {name for name, _ in model.named_parameters()}
         loaded_weights = model.load_weights(self.get_all_weights(model_config, model))
 
         self.counter_after_loading_weights = time.perf_counter()
@@ -386,8 +393,36 @@ def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
             self.counter_after_loading_weights - self.counter_before_loading_weights,
         )
         # We only enable strict check for non-quantized models
-        # that have loaded weights tracking currently.
-        if model_config.quantization is None and loaded_weights is not None:
+        # that have loaded weights tracking by default.
+        default_enable_weights_track = (
+            model_config.quantization is None and loaded_weights is not None
+        )
+        enable_weights_track = (
+            self.enable_weights_track
+            if self.enable_weights_track is not None
+            else default_enable_weights_track
+        )
+        if enable_weights_track:
+            self.track_weights_loading(model, loaded_weights)
+
+    def track_weights_loading(
+        self, model: nn.Module, loaded_weights: set[str] | None
+    ) -> None:
+        weights_to_load = {name for name, _ in model.named_parameters()}
+        if loaded_weights is not None:
+            # ignore online quantization scales
+            for name, module in model.named_modules():
+                quant_method = getattr(module, "quant_method", None)
+                has_online_quant = getattr(quant_method, "uses_meta_device", False)
+                has_postprocess_quant = getattr(
+                    quant_method, "process_weights_after_loading", None
+                )
+                # ignore kv_cache scale and online quant scale,
+                # which can be missing in checkpoints
+                if has_online_quant or has_postprocess_quant:
+                    for param_name, _ in module.named_parameters():
+                        full_name = f"{name}.{param_name}" if name else param_name
+                        loaded_weights.add(full_name)
             weights_not_loaded = weights_to_load - loaded_weights
             if weights_not_loaded:
                 raise ValueError(
diff --git a/vllm/model_executor/model_loader/reload/layerwise.py b/vllm/model_executor/model_loader/reload/layerwise.py
index 2ebe25444781..f8d8304199ea 100644
--- a/vllm/model_executor/model_loader/reload/layerwise.py
+++ b/vllm/model_executor/model_loader/reload/layerwise.py
@@ -3,7 +3,7 @@
 import inspect
 from collections.abc import Callable
 from functools import wraps
-from weakref import WeakKeyDictionary
+from weakref import WeakKeyDictionary, WeakSet
 
 import torch
 
@@ -21,7 +21,13 @@
     restore_layer_on_meta,
 )
 from .types import LayerReloadingInfo
-from .utils import get_layer_params_buffers, get_layer_size, get_layer_tensors
+from .utils import (
+    get_info_size,
+    get_layer_params_buffers,
+    get_layer_size,
+    get_layer_tensors,
+    has_device_tensors,
+)
 
 logger = init_logger(__name__)
 
@@ -43,6 +49,9 @@
     WeakKeyDictionary()
 )
 
+# Global set used to track loading for logging purposes only
+LOADING_LAYERS: WeakSet[torch.nn.Module] = WeakSet()
+
 
 def get_layerwise_info(layer: torch.nn.Module) -> LayerReloadingInfo:
     """
@@ -174,11 +183,30 @@ def online_process_loader(*args, **kwargs):
             info.load_numel_total,
         )
 
+        # Do not online process attention layers, must wait until finalize
+        if isinstance(layer, (Attention, MLAAttention)):
+            return ret
+
+        # Log warnings allocating excessive buffers on device
+        if has_device_tensors(bound_args):
+            LOADING_LAYERS.add(layer)
+            if len(LOADING_LAYERS) >= 2:
+                names = sorted([layer.__class__.__name__ for layer in LOADING_LAYERS])
+                mem_used = sum(
+                    get_info_size(LAYERWISE_INFO[layer]) for layer in LOADING_LAYERS
+                )
+                logger.warning_once(
+                    "Allocating %.1f MB of device memory to buffers to load %s layers. "
+                    "This extra memory usage can be avoided by ordering weights "
+                    "by their parent layer when reloading.",
+                    mem_used / 1e6,
+                    str(list(names)),
+                )
+
         # Process and copy when all weights are loaded
-        if info.load_numel >= info.load_numel_total and not isinstance(  # type: ignore[operator]
-            layer, (Attention, MLAAttention)
-        ):
+        if info.load_numel >= info.load_numel_total:  # type: ignore[operator]
             _layerwise_process(layer, info)
+            LOADING_LAYERS.discard(layer)
 
         return ret
 
@@ -240,6 +268,8 @@ def finalize_layerwise_processing(model: torch.nn.Module, model_config: ModelCon
         _finalize_attention_layer(layer, info, model_config)
         info.reset()
 
+    LOADING_LAYERS.clear()
+
 
 def finalize_layerwise_reload(*args, **kwargs):
     finalize_layerwise_processing(*args, **kwargs)
diff --git a/vllm/model_executor/model_loader/reload/meta.py b/vllm/model_executor/model_loader/reload/meta.py
index 91fce6f57b3e..baa2081d58b2 100644
--- a/vllm/model_executor/model_loader/reload/meta.py
+++ b/vllm/model_executor/model_loader/reload/meta.py
@@ -102,7 +102,7 @@ def materialize_layer(layer: torch.nn.Module, info: LayerReloadingInfo):
 
     with info.restore_device:
         for name, tensor in get_layer_tensors(layer).items():
-            if name not in SKIP_TENSORS:
+            if name not in SKIP_TENSORS and tensor.is_meta:
                 setattr(layer, name, materialize_meta_tensor(tensor))
 
 
diff --git a/vllm/model_executor/model_loader/reload/utils.py b/vllm/model_executor/model_loader/reload/utils.py
index 463ff6422213..7a3d6873e101 100644
--- a/vllm/model_executor/model_loader/reload/utils.py
+++ b/vllm/model_executor/model_loader/reload/utils.py
@@ -1,14 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from inspect import BoundArguments
+
 import torch
 
-from .types import LayerTensors
+from .types import LayerReloadingInfo, LayerTensors
 
 __all__ = [
     "get_layer_tensors",
     "get_layer_params_buffers",
     "get_layer_size",
+    "has_device_tensors",
+    "get_info_size",
 ]
 
 
@@ -39,3 +43,31 @@ def get_layer_size(layer: torch.nn.Module) -> int:
         for name, tensor in get_layer_tensors(layer).items()
         if name not in SKIP_TENSORS
     )
+
+
+def has_device_tensors(bound_args: BoundArguments) -> bool:
+    """
+    Return True if the loaded weights exist on an accelerator device
+
+    :param bound_args: args to load weights
+    :return: True if weights are on accelerator device
+    """
+    return any(
+        isinstance(value, torch.Tensor) and value.device.type not in ("meta", "cpu")
+        for value in bound_args.arguments.values()
+    )
+
+
+def get_info_size(info: LayerReloadingInfo) -> int:
+    """
+    Calculate the number of bytes used by loaded weights for a given layer
+
+    :param info: layerwise info to get size of
+    :return: number of bytes used by loaded weights
+    """
+    return sum(
+        value.nbytes
+        for _, args in info.loaded_weights
+        for value in args.arguments.values()
+        if isinstance(value, torch.Tensor) and value.device.type not in ("meta", "cpu")
+    )
diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py
index 87b4b72db2a1..3f57fe7e0265 100644
--- a/vllm/model_executor/model_loader/sharded_state_loader.py
+++ b/vllm/model_executor/model_loader/sharded_state_loader.py
@@ -31,8 +31,8 @@ class ShardedStateLoader(BaseModelLoader):
     Model loader that directly loads each worker's model state dict, which
     enables a fast load path for large tensor-parallel models where each worker
     only needs to read its own shard rather than the entire checkpoint. See
-    `examples/offline_inference/save_sharded_state.py` for creating a sharded
-    checkpoint.
+    `examples/features/sharded_state/save_sharded_state_offline.py` for creating
+    a sharded checkpoint.
     """
 
     DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors"
diff --git a/vllm/model_executor/models/bailing_moe_linear.py b/vllm/model_executor/models/bailing_moe_linear.py
index e26adc17430e..bcf1200bd1b4 100644
--- a/vllm/model_executor/models/bailing_moe_linear.py
+++ b/vllm/model_executor/models/bailing_moe_linear.py
@@ -17,6 +17,7 @@
 )
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
+from vllm.model_executor.custom_op import PluggableLayer
 from vllm.model_executor.layers.fla.ops.layernorm_guard import (
     RMSNormGated,
     layernorm_fn,
@@ -204,14 +205,19 @@ def __init__(
             self.q_a_layernorm = None
             self.q_b_proj = None
 
-        rope_parameters = _build_rope_parameters(config)
+        rope_parameters = _build_rope_parameters(config) or {}
+        # MLA rotates the full qk_rope_head_dim,
+        # partial_rotary_factor is for the linear-attn head only.
+        rope_parameters = {
+            k: v for k, v in rope_parameters.items() if k != "partial_rotary_factor"
+        }
+        rope_parameters["rope_dim"] = self.qk_rope_head_dim
         max_position = getattr(config, "max_position_embeddings", 8192)
         self.rotary_emb = get_rope(
             head_size=self.qk_rope_head_dim,
             max_position=max_position,
             is_neox_style=False,
-            rope_parameters=rope_parameters or None,
-            dtype=torch.float32,
+            rope_parameters=rope_parameters,
         )
 
         # Build MLAModules for MultiHeadLatentAttentionWrapper
@@ -425,14 +431,18 @@ def _weight_loader(param: torch.nn.Parameter, loaded_weight: torch.Tensor) -> No
         param.data.copy_(loaded_weight[shard].contiguous())
 
 
-class BailingMoELinearAttention(nn.Module, MambaBase):
-    """
-    Bailing MoE Linear Attention implementation using minimax backend.
+# --8<-- [start:bailing_moe_linear_attention]
+@PluggableLayer.register("bailing_moe_linear_attention")
+class BailingMoELinearAttention(PluggableLayer, MambaBase):
+    """Pluggable Bailing MoE Linear Attention layer which allows OOT backends
+    to add custom implementations.
 
-    This implements the linear attention mechanism from sglang, adapted for vLLM's
-    v1 engine with MambaBase interface support.
+    This implements the linear attention mechanism from sglang, adapted for
+    vLLM's v1 engine with MambaBase interface support.
     """
 
+    # --8<-- [end:bailing_moe_linear_attention]
+
     @property
     def mamba_type(self) -> str:
         return "linear_attention"
@@ -569,7 +579,6 @@ def __init__(
             self.head_dim,
             max_position=self.max_position_embeddings,
             is_neox_style=True,
-            dtype=torch.float32,
             rope_parameters=rope_parameters or None,
         )
 
@@ -754,8 +763,6 @@ def _prefill_and_mix_infer(
 
     def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, attn_metadata):
         """Handle decode (single token per sequence)."""
-        num_prefill_tokens = attn_metadata.num_prefill_tokens
-        num_prefills = attn_metadata.num_prefills
         hidden = linear_attention_decode(
             q,
             k,
@@ -763,10 +770,10 @@ def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, attn_metadata):
             kv_cache,
             self.tp_slope,
             state_indices_tensor,
-            q_start=num_prefill_tokens,
-            q_end=None,
-            slot_start=num_prefills,
-            slot_end=None,
+            q_start=0,
+            q_end=attn_metadata.num_decode_tokens,
+            slot_start=0,
+            slot_end=attn_metadata.num_decodes,
             block_size=32,
         )
         return hidden
@@ -1149,6 +1156,7 @@ def __init__(
                 config.vocab_size,
                 config.hidden_size,
                 quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
             )
             self.logits_processor = LogitsProcessor(config.vocab_size)
         else:
diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py
index c3118ee7778d..cdb3bcbedb25 100644
--- a/vllm/model_executor/models/cohere2_vision.py
+++ b/vllm/model_executor/models/cohere2_vision.py
@@ -44,7 +44,12 @@
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsQuant,
+)
 from .siglip import SiglipVisionModel
 from .utils import (
     AutoWeightsLoader,
@@ -309,16 +314,22 @@ def get_replacement(item_idx: int):
     info=Cohere2VisionProcessingInfo,
     dummy_inputs=Cohere2VisionDummyInputsBuilder,
 )
-class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+class Cohere2VisionForConditionalGeneration(
+    nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
+):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_tower.": "vision_tower.",
             "model.multi_modal_projector.": "multi_modal_projector.",
             "model.language_model.": "language_model.model.",
-            "lm_head.": "language_model.lm_head.",
         }
     )
 
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config: Cohere2VisionConfig = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/cohere_asr.py b/vllm/model_executor/models/cohere_asr.py
index 81ba1483bff8..584793832dc7 100644
--- a/vllm/model_executor/models/cohere_asr.py
+++ b/vllm/model_executor/models/cohere_asr.py
@@ -3,6 +3,7 @@
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
+from typing import Any, ClassVar
 
 import torch
 import torch.nn.functional as F
@@ -14,7 +15,7 @@
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.config.speech_to_text import SpeechToTextParams
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.inputs import MultiModalDataDict, PromptType, TextPrompt
+from vllm.inputs import MultiModalDataDict, PromptType, TokensPrompt
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.attention import (
@@ -48,6 +49,7 @@
     PromptUpdate,
 )
 from vllm.renderers import TokenizeParams
+from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.transformers_utils.processors.cohere_asr import (
     INF_VAL,
     CohereASRFeatureExtractor,
@@ -2008,6 +2010,9 @@ class CohereAsrForConditionalGeneration(
     supported_languages = ISO639_1_SUPPORTED_LANGS
     skip_warmup_audio_preprocessing = True
     no_space_languages = {"ja", "zh"}
+    _default_prompt_token_ids_cache: ClassVar[
+        dict[tuple[str | None, str | None, str], tuple[int, ...]]
+    ] = {}
 
     @classmethod
     def validate_language(cls, language: str | None) -> str | None:
@@ -2025,30 +2030,81 @@ def get_generation_prompt(cls, stt_params: SpeechToTextParams) -> PromptType:
         audio = stt_params.audio
         stt_config = stt_params.stt_config
         language = stt_params.language
-        request_prompt = stt_params.request_prompt
+        model_config = stt_params.model_config
 
         if language is None:
             raise ValueError(
                 "Language must be specified when creating the CohereASR prompt"
             )
 
-        # NOTE: this function is used only by online inference and not offline inference
-        # CohereASR doesnt have encoder prompt
-        language_tag = f"<|{language}|><|{language}|>"
-        pnc = True  # TODO(ekagra): make this configurable later
-        pnc_tag = "<|pnc|>" if pnc else "<|nopnc|>"
-        default_prompt = (
-            f"<|startofcontext|><|startoftranscript|>"
-            f"<|emo:undefined|>{language_tag}{pnc_tag}"
-            f"<|noitn|><|notimestamp|><|nodiarize|>"
+        tokenizer = cached_tokenizer_from_config(model_config)
+
+        # prompt_text is None because CoherASR uses fast implementation of
+        # sentencepiece tokenizer which needs "▁" as the first token
+        # (which is different from "_") and encode("▁ABC") ignores the first token
+        # so the prompt_text is unreliable. However, prompt_token_ids can be used
+        # to get prompt_text but it wont have the first token "▁".
+        prompt_text = None
+        prompt_token_ids = cls._get_default_prompt_token_ids(
+            tokenizer,
+            model_config,
+            language,
         )
-        prompt_text = request_prompt if request_prompt else default_prompt
 
-        return TextPrompt(
+        return TokensPrompt(
             prompt=prompt_text,
+            prompt_token_ids=prompt_token_ids,
             multi_modal_data={"audio": (audio, stt_config.sample_rate)},
         )
 
+    @classmethod
+    def _get_default_prompt_tokens(cls, language: str) -> tuple[str, ...]:
+        # Use token-level control tags so fast tokenizers do not have to parse
+        # the raw string form of the decoder prefix.
+        return (
+            "▁",
+            "<|startofcontext|>",
+            "<|startoftranscript|>",
+            "<|emo:undefined|>",
+            f"<|{language}|>",
+            f"<|{language}|>",
+            "<|pnc|>",
+            "<|noitn|>",
+            "<|notimestamp|>",
+            "<|nodiarize|>",
+        )
+
+    @classmethod
+    def _get_default_prompt_token_ids(
+        cls,
+        tokenizer: Any,
+        model_config: ModelConfig,
+        language: str,
+    ) -> list[int]:
+        cache_key = (
+            getattr(model_config, "tokenizer", None),
+            getattr(model_config, "tokenizer_revision", None),
+            language,
+        )
+        prompt_token_ids = cls._default_prompt_token_ids_cache.get(cache_key)
+        if prompt_token_ids is None:
+            prompt_tokens = list(cls._get_default_prompt_tokens(language))
+            token_ids = tokenizer.convert_tokens_to_ids(prompt_tokens)
+            if not isinstance(token_ids, list):
+                token_ids = [token_ids]
+            unk_token_id = getattr(tokenizer, "unk_token_id", None)
+            if unk_token_id is not None and any(
+                token_id == unk_token_id for token_id in token_ids
+            ):
+                raise ValueError(
+                    "Failed to resolve the CohereASR decoder control tokens "
+                    "with the configured tokenizer."
+                )
+            prompt_token_ids = tuple(int(token_id) for token_id in token_ids)
+            cls._default_prompt_token_ids_cache[cache_key] = prompt_token_ids
+
+        return list(prompt_token_ids)
+
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         # Required as part of SupportsMultiModal interface.
diff --git a/vllm/model_executor/models/cohere_moe.py b/vllm/model_executor/models/cohere_moe.py
new file mode 100644
index 000000000000..a059d68c9d02
--- /dev/null
+++ b/vllm/model_executor/models/cohere_moe.py
@@ -0,0 +1,532 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Iterable
+from itertools import islice
+
+import torch
+from torch import nn
+from transformers import CohereConfig
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.platforms import current_platform
+from vllm.sequence import IntermediateTensors
+
+from .commandr import LayerNorm
+from .interfaces import SupportsPP, SupportsQuant
+from .utils import (
+    AutoWeightsLoader,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+
+@torch.compile(backend=current_platform.simple_compile_backend)
+def token_choice_with_bias(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+):
+    """Sigmoid -> top-k (-> renormalize) custom routing for CohereMoe."""
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+    scores = gating_output.float().sigmoid()
+    topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1, sorted=False)
+
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+
+    return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+
+
+class CohereMoeMLP(nn.Module):
+    """Cohere MLP used as shared experts in the MoE block."""
+
+    def __init__(
+        self,
+        config: CohereConfig,
+        intermediate_size: int | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = (
+            intermediate_size
+            if intermediate_size is not None
+            else config.intermediate_size
+        )
+        self.gate_up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=False,
+            prefix=f"{prefix}.down_proj",
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class CohereMoeAttention(nn.Module):
+    """Cohere MoE attention with sliding-window interleave."""
+
+    def __init__(
+        self,
+        config: CohereConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        tp_size = get_tensor_model_parallel_world_size()
+        self.config = config
+        self.layer_idx = extract_layer_index(prefix)
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tp_size
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = getattr(
+            config, "model_max_length", None
+        ) or getattr(config, "max_position_embeddings", 8192)
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            max_position=self.max_position_embeddings,
+            rope_parameters=config.rope_parameters,
+            is_neox_style=False,
+        )
+
+        self.sliding_window = None
+        layer_types = getattr(config, "layer_types", None)
+        if (
+            layer_types is not None
+            and layer_types[self.layer_idx] == "sliding_attention"
+        ):
+            self.sliding_window = config.sliding_window
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            per_layer_sliding_window=self.sliding_window,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if self.sliding_window:
+            q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class CohereMoe(nn.Module):
+    """Tensor-parallel MoE block for CohereMoe with shared experts."""
+
+    def __init__(
+        self,
+        config: CohereConfig,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}."
+            )
+
+        if (
+            hasattr(config, "expert_selection_fn")
+            and config.expert_selection_fn == "sigmoid"
+        ):
+            self.custom_routing_function = token_choice_with_bias
+        else:
+            self.custom_routing_function = None
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            params_dtype=params_dtype,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        if hasattr(config, "num_shared_experts") and config.num_shared_experts > 0:
+            self.shared_experts = CohereMoeMLP(
+                config=config,
+                intermediate_size=config.intermediate_size * config.num_shared_experts,
+                quant_config=quant_config,
+                prefix=f"{prefix}.shared_experts",
+            )
+            self.shared_expert_combination_strategy = getattr(
+                config, "shared_expert_combination_strategy", "sum"
+            )
+            assert self.shared_expert_combination_strategy in ("average", "sum"), (
+                "shared_expert_combination_strategy must be one of ['average', 'sum']"
+            )
+        else:
+            self.shared_experts = None
+            self.shared_expert_combination_strategy = None
+
+        self.experts = FusedMoE(
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            params_dtype=params_dtype,
+            renormalize=getattr(config, "norm_topk_prob", True),
+            quant_config=quant_config,
+            tp_size=tp_size,
+            prefix=f"{prefix}.experts",
+            custom_routing_function=self.custom_routing_function,
+            shared_experts=self.shared_experts,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        router_logits, _ = self.gate(hidden_states)
+        # FusedMoE handles shared expert overlap internally and returns
+        # shared_output + routed_output when shared_experts is set.
+        final_hidden_states = self.experts(hidden_states, router_logits)
+        if self.shared_expert_combination_strategy == "average":
+            final_hidden_states = final_hidden_states / 2
+        return final_hidden_states.view(orig_shape)
+
+
+class CohereMoeDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: CohereConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = CohereMoeAttention(
+            config,
+            cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        self.mlp = CohereMoe(
+            config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
+        self.input_layernorm = LayerNorm(
+            param_shape=(config.hidden_size,), eps=config.layer_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        residual = hidden_states
+        hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states_attention = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+        hidden_states_mlp = self.mlp(hidden_states)
+
+        hidden_states = residual + hidden_states_attention + hidden_states_mlp
+        return hidden_states, residual
+
+
+@support_torch_compile
+class CohereMoeModel(nn.Module):
+    """Transformer decoder for CohereMoe."""
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+        self.org_vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: CohereMoeDecoderLayer(
+                config, cache_config, quant_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
+        self.norm = LayerNorm(
+            param_shape=(config.hidden_size,), eps=config.layer_norm_eps
+        )
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
+            hidden_states, residual = layer(positions, hidden_states, residual)
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                if shard_name not in name:
+                    continue
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
+                        continue
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+        return loaded_params
+
+
+class CohereMoeForCausalLM(nn.Module, SupportsPP, SupportsQuant):
+    is_text_generation_model = True
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        assert getattr(config, "tie_word_embeddings", True)
+        self.unpadded_vocab_size = config.vocab_size
+        self.quant_config = quant_config
+        self.logits_scale = config.logit_scale
+        self.logits_processor = LogitsProcessor(
+            self.unpadded_vocab_size, config.vocab_size, scale=self.logits_scale
+        )
+        self.model = CohereMoeModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        return self.model(input_ids, positions, intermediate_tensors, inputs_embeds)
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        return self.logits_processor(self.model.embed_tokens, hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self, skip_prefixes=["lm_head."])
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index e8f5101b577d..459c16f8ec97 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -517,7 +517,7 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None:
                     "Nomic context extension is disabled. "
                     "Changing max_model_len from %s to %s. "
                     "To enable context extension, see: "
-                    "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.py",
+                    "https://github.com/vllm-project/vllm/tree/main/examples/features/context_extension/context_extension_offline.py",
                     max_model_len_before,
                     model_config.max_model_len,
                 )
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index fbbd5da1fd90..26a1903fa7b9 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -82,7 +82,10 @@
     default_weight_loader,
     maybe_remap_kv_scale_name,
 )
-from vllm.model_executor.models.utils import sequence_parallel_chunk
+from vllm.model_executor.models.utils import (
+    extract_layer_index,
+    sequence_parallel_chunk,
+)
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.utils.torch_utils import direct_register_custom_op
@@ -348,8 +351,9 @@ def __init__(
             self.is_rocm_aiter_moe_enabled
             and self.gate.e_score_correction_bias is not None
         ):
+            gate_out_dtype = self.gate.out_dtype or self.gate.weight.dtype
             self.gate.e_score_correction_bias.data = (
-                self.gate.e_score_correction_bias.data.to(self.gate.out_dtype)
+                self.gate.e_score_correction_bias.data.to(gate_out_dtype)
             )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -671,30 +675,45 @@ def forward(
     ) -> torch.Tensor:
         q, _ = self.wq_b(qr)
         q = q.view(-1, self.n_head, self.head_dim)
-        q_pe, q_nope = torch.split(
-            q, [self.rope_dim, self.head_dim - self.rope_dim], dim=-1
-        )
-        # Fused wk + weights_proj: one GEMM, then split
-        kw, _ = self.wk_weights_proj(hidden_states)
-        k = kw[:, : self.head_dim]
-        weights = kw[:, self.head_dim :]
-
-        k = self.k_norm(k)
-        k_pe, k_nope = torch.split(
-            k, [self.rope_dim, self.head_dim - self.rope_dim], dim=-1
-        )
 
-        q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1))
-        # Note: RoPE (NeoX) can introduce extra leading dimensions during compilation
-        # so we need to reshape back to token-flattened shapes
-        q_pe = q_pe.reshape(-1, self.n_head, self.rope_dim)
-        k_pe = k_pe.reshape(-1, 1, self.rope_dim)
+        if current_platform.is_rocm():
+            # This path should works on all platform, will remove extra
+            # branches in the future
+            # Fused wk + weights_proj: one GEMM, then split
+            kw, _ = self.wk_weights_proj(hidden_states)
+            k = kw[:, : self.head_dim]
+            weights = kw[:, self.head_dim :]
+
+            k = self.k_norm(k)
+
+            rotary_emb(
+                positions, q[..., : self.rope_dim], k[..., : self.rope_dim].unsqueeze(1)
+            )
+        else:
+            q_pe, q_nope = torch.split(
+                q, [self.rope_dim, self.head_dim - self.rope_dim], dim=-1
+            )
+            # Fused wk + weights_proj: one GEMM, then split
+            kw, _ = self.wk_weights_proj(hidden_states)
+            k = kw[:, : self.head_dim]
+            weights = kw[:, self.head_dim :]
+
+            k = self.k_norm(k)
+            k_pe, k_nope = torch.split(
+                k, [self.rope_dim, self.head_dim - self.rope_dim], dim=-1
+            )
+
+            q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1))
+            # Note: RoPE (NeoX) can introduce extra leading dimensions during
+            # compilation so we need to reshape back to token-flattened shapes
+            q_pe = q_pe.reshape(-1, self.n_head, self.rope_dim)
+            k_pe = k_pe.reshape(-1, 1, self.rope_dim)
 
-        # `rotary_emb` is shape-preserving; `q_pe` is already
-        # [num_tokens, n_head, rope_dim].
-        q = torch.cat([q_pe, q_nope], dim=-1)
-        # `k_pe` is [num_tokens, 1, rope_dim] (MQA).
-        k = torch.cat([k_pe.squeeze(-2), k_nope], dim=-1)
+            # `rotary_emb` is shape-preserving; `q_pe` is already
+            # [num_tokens, n_head, rope_dim].
+            q = torch.cat([q_pe, q_nope], dim=-1)
+            # `k_pe` is [num_tokens, 1, rope_dim] (MQA).
+            k = torch.cat([k_pe.squeeze(-2), k_nope], dim=-1)
 
         # we only quant q here since k quant is fused with cache insertion
         q = q.view(-1, self.head_dim)
@@ -963,6 +982,7 @@ def __init__(
 
         self.is_v32 = hasattr(config, "index_topk")
 
+        _skip_topk = False
         if self.is_v32:
             self.indexer_rope_emb = get_rope(
                 qk_rope_head_dim,
@@ -980,6 +1000,21 @@ def __init__(
                 topk_indices_buffer,
                 f"{prefix}.indexer",
             )
+
+            # Enable IndexCache for DeepSeek models to reduce redundant top-k
+            # token selection computations in sparse attention.
+            use_index_cache = getattr(config, "use_index_cache", False)
+            if use_index_cache:
+                # IndexCache config
+                # Refer: https://arxiv.org/abs/2603.12201 for more details.
+                _index_topk_freq = getattr(config, "index_topk_freq", 1)
+                _index_topk_pattern = getattr(config, "index_topk_pattern", None)
+                layer_id = extract_layer_index(prefix)
+                if _index_topk_pattern is None:
+                    _skip_topk = max(layer_id - 1, 0) % _index_topk_freq != 0
+                elif 0 <= layer_id < len(_index_topk_pattern):
+                    _skip_topk = _index_topk_pattern[layer_id] == "S"
+
         else:
             self.indexer_rope_emb = None
             self.indexer = None
@@ -1017,6 +1052,7 @@ def __init__(
             cache_config,
             quant_config,
             prefix,
+            skip_topk=_skip_topk,
         )
 
     def forward(
diff --git a/vllm/model_executor/models/deepseek_v4.py b/vllm/model_executor/models/deepseek_v4.py
index 0e7662579d24..36266dff9c15 100644
--- a/vllm/model_executor/models/deepseek_v4.py
+++ b/vllm/model_executor/models/deepseek_v4.py
@@ -7,7 +7,6 @@
 import regex as re
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig, get_current_vllm_config
@@ -54,7 +53,6 @@
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.triton_utils import tl, triton
-from vllm.utils.multi_stream_utils import AuxStreamType
 from vllm.utils.torch_utils import direct_register_custom_op
 
 from .utils import (
@@ -719,12 +717,15 @@ def __init__(
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.prefix = prefix
-        if vllm_config.parallel_config.enable_expert_parallel:
-            self.use_mega_moe = (
-                vllm_config.kernel_config.moe_backend == "deep_gemm_mega_moe"
+        self.use_mega_moe = (
+            vllm_config.kernel_config.moe_backend == "deep_gemm_mega_moe"
+        )
+        if self.use_mega_moe and not vllm_config.parallel_config.enable_expert_parallel:
+            raise NotImplementedError(
+                "DeepSeek V4 MegaMoE currently requires expert parallel. "
+                "Enable it with --enable-expert-parallel, or pick a different "
+                "moe backend."
             )
-        else:
-            self.use_mega_moe = False
 
         self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
         self.hidden_size = config.hidden_size
@@ -857,10 +858,9 @@ def _init_fused_moe_experts(
     def forward(
         self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None
     ) -> torch.Tensor:
-        if self.gate.tid2eid is not None:
-            if input_ids is None:
-                raise ValueError("DeepSeek V4 hash MoE routing requires input_ids.")
-            input_ids = input_ids.to(dtype=self.hash_indices_dtype)
+        if self.gate.tid2eid is not None and input_ids is None:
+            raise ValueError("DeepSeek V4 hash MoE routing requires input_ids.")
+
         if not self.use_mega_moe:
             return self._forward_fused_moe(hidden_states, input_ids)
 
@@ -928,7 +928,7 @@ def __init__(
         vllm_config: VllmConfig,
         prefix: str,
         topk_indices_buffer: torch.Tensor | None = None,
-        aux_stream: torch.cuda.Stream | None = None,
+        aux_stream_list: list[torch.cuda.Stream] | None = None,
     ):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -1030,7 +1030,6 @@ def __init__(
             max_position=self.max_position_embeddings,
             rope_parameters=rope_parameters,
             is_neox_style=False,
-            dtype=config.torch_dtype,
         )
 
         self.indexer = None
@@ -1061,7 +1060,7 @@ def __init__(
             indexer=self.indexer,
             indexer_rotary_emb=self.rotary_emb,
             topk_indices_buffer=topk_indices_buffer,
-            aux_stream=aux_stream,
+            aux_stream_list=aux_stream_list,
         )
         self.mla_attn = DeepseekV4MultiHeadLatentAttentionWrapper(
             hidden_size=self.hidden_size,
@@ -1097,9 +1096,14 @@ def __init__(
         vllm_config,
         prefix,
         topk_indices_buffer: torch.Tensor | None = None,
-        aux_stream_dict: dict[AuxStreamType, torch.cuda.Stream] | None = None,
+        aux_stream_list: list[torch.cuda.Stream] | None = None,
     ):
         super().__init__()
+
+        # Lazy import to avoid top-level tilelang dependency.
+        # Registers both torch.ops.vllm.mhc_pre and mhc_post
+        import vllm.model_executor.layers.mhc  # noqa: F401
+
         config = vllm_config.model_config.hf_config
         self.hidden_size = config.hidden_size
 
@@ -1108,9 +1112,7 @@ def __init__(
             vllm_config,
             prefix=f"{prefix}.attn",
             topk_indices_buffer=topk_indices_buffer,
-            aux_stream=aux_stream_dict.get(AuxStreamType.Attention)
-            if aux_stream_dict is not None
-            else None,
+            aux_stream_list=aux_stream_list,
         )
         self.ffn = DeepseekV4MoE(vllm_config, prefix=f"{prefix}.ffn")
 
@@ -1172,11 +1174,6 @@ def hc_pre(
         hc_scale: torch.Tensor,
         hc_base: torch.Tensor,
     ):
-        # Lazy import to avoid top-level tilelang dependency.
-        # Registers both torch.ops.vllm.mhc_pre and mhc_post,
-        # so hc_post() doesn't need its own import.
-        import vllm.model_executor.layers.mhc  # noqa: F401
-
         post_mix, res_mix, layer_input = torch.ops.vllm.mhc_pre(
             residual=x,
             fn=hc_fn,
@@ -1231,17 +1228,31 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
-
+        self.use_mega_moe = (
+            vllm_config.kernel_config.moe_backend == "deep_gemm_mega_moe"
+        )
+        if self.use_mega_moe and not vllm_config.parallel_config.enable_expert_parallel:
+            raise NotImplementedError(
+                "DeepSeek V4 MegaMoE currently requires expert parallel. "
+                "Enable it with --enable-expert-parallel, or pick a different "
+                "moe backend."
+            )
         self.vocab_size = config.vocab_size
         self.hc_eps = config.hc_eps
         self.hc_mult = config.hc_mult
         self.hc_dim = self.hc_mult * config.hidden_size
         self.rms_norm_eps = config.rms_norm_eps
 
-        aux_stream_list = [torch.cuda.Stream() for _ in range(1)]
-        self.aux_stream_dict = {
-            AuxStreamType.Attention: aux_stream_list[0],
-        }
+        # Three aux streams: one per non-default input GEMM in
+        # DeepseekV4MultiHeadLatentAttentionWrapper.attn_gemm_parallel_execute
+        # (compressor kv_score, indexer.weights_proj, indexer.compressor
+        # kv_score). fused_wqa_wkv stays on the default stream.
+        # Disable them on ROCm because of hang issues.
+        aux_stream_list = (
+            None
+            if current_platform.is_rocm()
+            else [torch.cuda.Stream() for _ in range(3)]
+        )
 
         self.device = current_platform.device_type
         # Reserved topk indices buffer for all Indexer layers to reuse.
@@ -1265,7 +1276,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 vllm_config,
                 prefix=prefix,
                 topk_indices_buffer=self.topk_indices_buffer,
-                aux_stream_dict=self.aux_stream_dict,
+                aux_stream_list=aux_stream_list,
             ),
             prefix=f"{prefix}.layers",
         )
@@ -1314,7 +1325,8 @@ def forward(
     ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.embed_input_ids(input_ids)
         hidden_states = hidden_states.unsqueeze(-2).repeat(1, self.hc_mult, 1)
-
+        if self.use_mega_moe:
+            input_ids = input_ids.to(torch.int64)
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states = layer(
                 hidden_states,
@@ -1456,14 +1468,25 @@ def hc_head(
     rms_norm_eps: float,
     hc_eps: float,
 ) -> torch.Tensor:
-    x = hidden_states
-    shape, dtype = x.size(), x.dtype
-    x = x.flatten(1).float()
-    rsqrt = torch.rsqrt(x.square().mean(-1, keepdim=True) + rms_norm_eps)
-    mixes = F.linear(x, hc_fn) * rsqrt
-    pre = torch.sigmoid(mixes * hc_scale + hc_base) + hc_eps
-    y = torch.sum(pre.unsqueeze(-1) * x.view(shape), dim=1)
-    return y.to(dtype)
+    hc_mult, hidden_size = hidden_states.shape[-2:]
+    outer_shape = hidden_states.shape[:-2]
+    hs_flat = hidden_states.view(-1, hc_mult, hidden_size)
+    num_tokens = hs_flat.shape[0]
+    out = torch.empty(
+        num_tokens, hidden_size, dtype=torch.bfloat16, device=hidden_states.device
+    )
+    torch.ops.vllm.hc_head_fused_kernel(
+        hs_flat,
+        hc_fn,
+        hc_scale,
+        hc_base,
+        out,
+        hidden_size,
+        rms_norm_eps,
+        hc_eps,
+        hc_mult,
+    )
+    return out.view(*outer_shape, hidden_size)
 
 
 def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
diff --git a/vllm/model_executor/models/deepseek_v4_mtp.py b/vllm/model_executor/models/deepseek_v4_mtp.py
index cb2ae6a55d84..195709c9dacf 100644
--- a/vllm/model_executor/models/deepseek_v4_mtp.py
+++ b/vllm/model_executor/models/deepseek_v4_mtp.py
@@ -35,7 +35,6 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils.multi_stream_utils import AuxStreamType
 
 from .deepseek_mtp import SharedHead
 from .deepseek_v2 import get_spec_layer_idx_from_weight_name
@@ -65,6 +64,7 @@ def __init__(
         vllm_config: VllmConfig,
         topk_indices_buffer: torch.Tensor,
         prefix: str,
+        aux_stream_list: list[torch.cuda.Stream] | None = None,
     ) -> None:
         super().__init__()
 
@@ -112,14 +112,11 @@ def __init__(
         self.shared_head = SharedHead(
             config=config, prefix=prefix, quant_config=quant_config
         )
-        self.aux_stream_dict = {
-            AuxStreamType.Attention: torch.cuda.Stream(),
-        }
         self.mtp_block = DeepseekV4DecoderLayer(
             vllm_config,
             prefix,
             topk_indices_buffer=topk_indices_buffer,
-            aux_stream_dict=self.aux_stream_dict,
+            aux_stream_list=aux_stream_list,
         )
 
     def forward(
@@ -169,6 +166,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             device=self.device,
         )
 
+        # Three aux streams shared across all MTP layers, mirroring
+        # DeepseekV4Model. ROCm runs the same work serially for now.
+        aux_stream_list = (
+            None
+            if current_platform.is_rocm()
+            else [torch.cuda.Stream() for _ in range(3)]
+        )
+
         # to map the exact layer index from weights
         self.layers = torch.nn.ModuleDict(
             {
@@ -176,6 +181,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                     vllm_config,
                     self.topk_indices_buffer,
                     f"{prefix}.layers.{idx}",
+                    aux_stream_list=aux_stream_list,
                 )
                 for idx in range(
                     self.mtp_start_layer_idx,
diff --git a/vllm/model_executor/models/gemma4.py b/vllm/model_executor/models/gemma4.py
index bb91fd601e70..31f2d6a28ddc 100644
--- a/vllm/model_executor/models/gemma4.py
+++ b/vllm/model_executor/models/gemma4.py
@@ -84,6 +84,10 @@
 logger = init_logger(__name__)
 
 
+def _remap_gemma4_expert_weight_name(name: str) -> str:
+    return re.sub(r"(?<!\.moe)\.experts\.(\d+)\.", r".moe.experts.\1.", name)
+
+
 @triton.jit
 def _gemma4_routing_kernel(
     gating_ptr,
@@ -356,7 +360,7 @@ def routing_function(
             quant_config=quant_config,
             prefix=f"{prefix}.experts",
             custom_routing_function=routing_function,
-            activation="gelu",
+            activation="gelu_tanh",
         )
 
     def forward(self, x: torch.Tensor, router_logits: torch.Tensor) -> torch.Tensor:
@@ -1144,11 +1148,6 @@ def _make_empty_intermediate_tensors(
                     dtype=dtype,
                     device=device,
                 ),
-                "residual": torch.zeros(
-                    (batch_size, hidden_size),
-                    dtype=dtype,
-                    device=device,
-                ),
             }
             if ple_dim and ple_dim > 0:
                 tensors["per_layer_inputs"] = torch.zeros(
@@ -1312,13 +1311,12 @@ def forward(
                 per_layer_inputs = self.project_per_layer_inputs(
                     hidden_states, per_layer_embeds
                 )
-            residual = None
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
-            residual = intermediate_tensors["residual"]
-            per_layer_inputs = intermediate_tensors.get("per_layer_inputs")
-
+            if per_layer_inputs is not None:
+                per_layer_inputs = intermediate_tensors["per_layer_inputs"]
+        residual = None
         aux_hidden_states = self._maybe_add_hidden_state([], 0, hidden_states, residual)
         for layer_idx, layer in enumerate(
             islice(self.layers, self.start_layer, self.end_layer)
@@ -1342,13 +1340,12 @@ def forward(
                 aux_hidden_states, layer_idx + 1, hidden_states, residual
             )
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors(
-                {
-                    "hidden_states": hidden_states,
-                    "residual": residual,
-                    "per_layer_inputs": per_layer_inputs,
-                }
-            )
+            tensors: dict[str, torch.Tensor] = {
+                "hidden_states": hidden_states,
+            }
+            if per_layer_inputs is not None:
+                tensors["per_layer_inputs"] = per_layer_inputs
+            return IntermediateTensors(tensors)
         # Gemma4 incorporates residual into hidden_states directly
         # Apply norm without residual fusion when possible.
         if residual is None:
@@ -1657,7 +1654,7 @@ def _weight_iterator():
                 # Remap individual 2D expert weights:
                 # .experts.{id}.{proj} → .moe.experts.{id}.{proj}
                 # (This handles per-expert 2D quantized weights)
-                name = re.sub(r"\.experts\.(\d+)\.", r".moe.experts.\1.", name)
+                name = _remap_gemma4_expert_weight_name(name)
 
                 # MoE expert weights: checkpoint stores as 3D packed
                 # tensors.  Explode into per-expert 2D weights for
diff --git a/vllm/model_executor/models/gemma4_mm.py b/vllm/model_executor/models/gemma4_mm.py
index cdc54609a652..029f4ff9bf51 100644
--- a/vllm/model_executor/models/gemma4_mm.py
+++ b/vllm/model_executor/models/gemma4_mm.py
@@ -81,10 +81,26 @@
 logger = init_logger(__name__)
 
 # Video constants — match transformers Gemma4VideoProcessor defaults.
+_SUPPORTED_SOFT_TOKENS = (70, 140, 280, 560, 1120)
 _VIDEO_MAX_SOFT_TOKENS = 70  # soft tokens per video frame (vs 280 for images)
 _VIDEO_MAX_FRAMES = 32  # max sampled frames per video
 
 
+def _get_max_soft_tokens(
+    merged_kwargs: Mapping[str, object],
+) -> tuple[object | None, bool]:
+    """Return configured image max_soft_tokens and whether it is top-level."""
+    val = merged_kwargs.get("max_soft_tokens")
+    if val is not None:
+        return val, True
+
+    images_kwargs = merged_kwargs.get("images_kwargs")
+    if isinstance(images_kwargs, Mapping):
+        return images_kwargs.get("max_soft_tokens"), False
+
+    return None, False
+
+
 # ---------------------------------------------------------------------------
 # Input schema
 # ---------------------------------------------------------------------------
@@ -216,10 +232,14 @@ def get_mm_max_tokens_per_item(
         self, seq_len: int, mm_counts: Mapping[str, int]
     ) -> Mapping[str, int] | None:
         config = self.get_hf_config()
-        # Upper bound: the pooler outputs default_output_length slots
-        # per image (280).  After padding is stripped the actual count
-        # is ≤ this value, but vLLM needs the max for memory planning.
+        # Upper bound: the pooler outputs max_soft_tokens slots per image.
+        # After padding is stripped the actual count is ≤ this value, but
+        # vLLM needs the max for memory planning.
         tokens_per_image = config.vision_config.default_output_length
+        merged_kwargs = self.ctx.get_merged_mm_kwargs({})
+        val, _ = _get_max_soft_tokens(merged_kwargs)
+        if isinstance(val, int) and val in _SUPPORTED_SOFT_TOKENS:
+            tokens_per_image = val
         tokens: dict[str, int] = {"image": tokens_per_image}
         if config.audio_config is not None:
             # Audio max tokens from the processor's audio_seq_length.
@@ -265,7 +285,14 @@ def _compute_num_soft_tokens(
         target_h = max(unit, int(math.floor(image_height * scale / unit)) * unit)
         target_w = max(unit, int(math.floor(image_width * scale / unit)) * unit)
         num_patches = (target_h // patch_size) * (target_w // patch_size)
-        return num_patches // (pooling_kernel_size**2)
+        # Clamp to ``max_soft_tokens``: extreme aspect ratios (e.g. 3x900)
+        # cause the floor() above to round one dim up to ``unit`` while the
+        # other scales freely, which over-shoots ``max_patches``. The HF
+        # Gemma 4 image processor caps its vision-tower output at
+        # ``max_soft_tokens``, so without this clamp the prompt-side
+        # placeholder count exceeds the encoder output and
+        # ``_merge_multimodal_embeddings`` crashes.
+        return min(num_patches // (pooling_kernel_size**2), max_soft_tokens)
 
     def get_image_repl(
         self,
@@ -485,13 +512,8 @@ def _call_hf_processor(
         mm_kwargs: Mapping[str, object],
         tok_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-        # Validate max_soft_tokens early and exit cleanly on bad values.
-        _SUPPORTED_SOFT_TOKENS = (70, 140, 280, 560, 1120)
-
         merged_kwargs = self.info.ctx.get_merged_mm_kwargs(mm_kwargs)
-        val = merged_kwargs.get("max_soft_tokens")
-        if val is None:
-            val = merged_kwargs.get("images_kwargs", {}).get("max_soft_tokens")
+        val, is_top_level_max_soft_tokens = _get_max_soft_tokens(merged_kwargs)
 
         if val is not None and val not in _SUPPORTED_SOFT_TOKENS:
             raise ValueError(
@@ -638,7 +660,7 @@ def _call_hf_processor(
         # HF side (Gemma4ProcessorKwargs.images_kwargs) so that
         # _merge_kwargs routes max_soft_tokens into images_kwargs.
         patched_mm_kwargs = dict(mm_kwargs)
-        if val is not None:
+        if val is not None and is_top_level_max_soft_tokens:
             patched_mm_kwargs["max_soft_tokens"] = val
 
         processed_outputs = super()._call_hf_processor(
@@ -748,7 +770,12 @@ def get_replacement_image(item_idx: int):
                 merged_kwargs = self.info.ctx.get_merged_mm_kwargs(
                     hf_processor_mm_kwargs,
                 )
-                max_soft_tokens = merged_kwargs.get("max_soft_tokens")
+                val, _ = _get_max_soft_tokens(merged_kwargs)
+                max_soft_tokens = (
+                    val
+                    if isinstance(val, int) and val in _SUPPORTED_SOFT_TOKENS
+                    else None
+                )
                 return self.info.get_image_repl(
                     image_width=image_size.width,
                     image_height=image_size.height,
diff --git a/vllm/model_executor/models/gemma4_mtp.py b/vllm/model_executor/models/gemma4_mtp.py
new file mode 100644
index 000000000000..c294ffc6f9a7
--- /dev/null
+++ b/vllm/model_executor/models/gemma4_mtp.py
@@ -0,0 +1,603 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Inference-only Gemma4 MTP (Multi-Token Prediction) model.
+
+The Gemma4 assistant model is a lightweight decoder that shares KV cache
+with the target (backbone) model.  All assistant decoder layers are
+KV-shared: they only have Q projections (no K/V projections or norms),
+and read K/V from the target model's cache at runtime.
+
+Checkpoint layout (``gemma4_assistant``)::
+
+    model.embed_tokens.*          -- token embeddings
+    model.layers.{i}.*            -- decoder layers (Q-only attention + MLP)
+    model.norm.*                  -- final RMSNorm
+    pre_projection.*              -- Linear(2 * backbone_hidden_size, hidden_size)
+    post_projection.*             -- Linear(hidden_size, backbone_hidden_size)
+    lm_head.*                     -- language model head (tied to embed_tokens)
+    masked_embedding.centroids.*  -- centroid projection (when use_ordered_embeddings)
+    masked_embedding.token_ordering -- token-to-centroid mapping buffer
+"""
+
+from collections.abc import Iterable
+
+import torch
+from torch import nn
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.sequence import IntermediateTensors
+
+from .gemma4 import Gemma4MLP, _get_text_config
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    extract_layer_index,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+class Gemma4MTPMaskedEmbedder(nn.Module):
+    """Sparse logit computation via centroid-based vocabulary masking.
+
+    Instead of computing logits against the full vocabulary, projects
+    hidden states to centroid scores, selects top-K centroids, and
+    computes logits only for the ~top_k * (vocab_size / num_centroids)
+    tokens belonging to those centroids.
+    """
+
+    token_ordering: torch.Tensor
+
+    def __init__(
+        self,
+        hidden_size: int,
+        vocab_size: int,
+        num_centroids: int,
+        centroid_intermediate_top_k: int,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.num_centroids = num_centroids
+        self.centroid_intermediate_top_k = centroid_intermediate_top_k
+        self.vocab_size_per_centroid = vocab_size // num_centroids
+        self.num_selected = centroid_intermediate_top_k * self.vocab_size_per_centroid
+
+        self.centroids = nn.Linear(hidden_size, num_centroids, bias=False)
+        self.register_buffer(
+            "token_ordering",
+            torch.empty(vocab_size, dtype=torch.long),
+        )
+
+    def _select_and_score(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head_weight: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Centroid selection + sparse dot product.
+
+        Returns:
+            logits: (num_tokens, num_selected) sparse logits.
+            indices: (num_tokens, num_selected) corresponding vocab indices.
+        """
+        num_tokens = hidden_states.shape[0]
+        _, top_k_indices = torch.topk(
+            self.centroids(hidden_states),
+            k=self.centroid_intermediate_top_k,
+            dim=-1,
+        )
+        clusters = self.token_ordering.view(
+            self.num_centroids,
+            self.vocab_size_per_centroid,
+        )
+        selected = clusters[top_k_indices]
+        embeddings = lm_head_weight[selected.reshape(-1)].view(
+            num_tokens,
+            self.num_selected,
+            self.hidden_size,
+        )
+        logits = torch.einsum("td,tsd->ts", hidden_states, embeddings)
+        return logits, selected.view(num_tokens, -1)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head_weight: torch.Tensor,
+    ) -> torch.Tensor:
+        """Full-vocab logits with non-selected positions masked to -inf."""
+        logits, indices = self._select_and_score(hidden_states, lm_head_weight)
+        output = torch.full(
+            (hidden_states.shape[0], self.vocab_size),
+            fill_value=torch.finfo(hidden_states.dtype).min,
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        return output.scatter_(-1, indices, logits)
+
+    def get_top_tokens(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head_weight: torch.Tensor,
+    ) -> torch.Tensor:
+        """Sparse argmax — returns vocab token IDs without full-vocab tensor."""
+        logits, indices = self._select_and_score(hidden_states, lm_head_weight)
+        return indices.gather(-1, logits.argmax(-1, keepdim=True)).squeeze(-1)
+
+
+class Gemma4MTPAttention(nn.Module):
+    """Q-only attention for Gemma4 MTP layers.
+
+    K/V come from the target model's KV cache via
+    ``kv_sharing_target_layer_name`` (set by the proposer after
+    model construction).
+    """
+
+    def __init__(
+        self,
+        config,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        max_position_embeddings: int,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        attn_logits_soft_cap: float | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = hidden_size
+
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.scaling = 1.0
+
+        self.q_proj = ColumnParallelLinear(
+            hidden_size,
+            self.total_num_heads * self.head_dim,
+            bias=config.attention_bias,
+            quant_config=None,
+            prefix=f"{prefix}.q_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=config.attention_bias,
+            quant_config=None,
+            prefix=f"{prefix}.o_proj",
+        )
+        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        layer_idx = extract_layer_index(prefix)
+        layer_type = config.layer_types[layer_idx]
+        self.is_sliding = layer_type == "sliding_attention"
+        sliding_window = config.sliding_window if self.is_sliding else None
+
+        if layer_type in config.rope_parameters:
+            rope_parameters = dict(config.rope_parameters[layer_type])
+        else:
+            rope_parameters = dict(config.rope_parameters.copy())
+            if self.is_sliding:
+                rope_parameters["rope_theta"] = getattr(
+                    config, "rope_local_base_freq", 10000.0
+                )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            max_position=max_position_embeddings,
+            rope_parameters=rope_parameters,
+            is_neox_style=True,
+        )
+
+        # kv_sharing_target_layer_name is set after model construction
+        # by Gemma4Proposer._setup_gemma4_kv_sharing().
+        self.is_kv_shared_layer = True
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            logits_soft_cap=attn_logits_soft_cap,
+            per_layer_sliding_window=sliding_window,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        q, _ = self.q_proj(hidden_states)
+
+        q = q.unflatten(-1, (self.num_heads, self.head_dim))
+        q = self.q_norm(q)
+        q = q.flatten(-2, -1)
+
+        q, _ = self.rotary_emb(positions, q, None)
+
+        # Attention reads K/V from the target's cache via KV sharing;
+        # these dummy tensors are never consumed but required by the API.
+        num_tokens = q.shape[0]
+        kv_dummy = torch.empty(
+            num_tokens,
+            self.num_kv_heads * self.head_dim,
+            dtype=q.dtype,
+            device=q.device,
+        )
+        attn_output = self.attn(q, kv_dummy, kv_dummy)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Gemma4MTPDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        layer_idx = extract_layer_index(prefix)
+        layer_type = config.layer_types[layer_idx]
+        is_full_attention = layer_type == "full_attention"
+        head_dim = (
+            getattr(config, "global_head_dim", config.head_dim)
+            if is_full_attention
+            else config.head_dim
+        )
+
+        self.self_attn = Gemma4MTPAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            head_dim=head_dim,
+            max_position_embeddings=config.max_position_embeddings,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            attn_logits_soft_cap=getattr(config, "attn_logit_softcapping", None),
+            prefix=f"{prefix}.self_attn",
+        )
+
+        text_config = _get_text_config(config)
+        self.mlp = Gemma4MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=text_config.intermediate_size,
+            hidden_activation=text_config.hidden_activation,
+            quant_config=None,
+            prefix=f"{prefix}.mlp",
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.pre_feedforward_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_feedforward_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.register_buffer("layer_scalar", torch.ones(1))
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(residual)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            **kwargs,
+        )
+
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = hidden_states + residual
+        residual = hidden_states
+
+        hidden_states = self.pre_feedforward_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states * self.layer_scalar
+        return hidden_states, None
+
+
+class Gemma4MultiTokenPredictor(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.speculative_config.draft_model_config.hf_config
+        text_config = _get_text_config(config)
+        self.config = text_config
+
+        self.hidden_size = text_config.hidden_size
+        self.backbone_hidden_size = getattr(
+            config, "backbone_hidden_size", self.hidden_size
+        )
+        self.vocab_size = text_config.vocab_size
+        self.num_mtp_layers = text_config.num_hidden_layers
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            self.hidden_size,
+        )
+
+        self.pre_projection = ColumnParallelLinear(
+            2 * self.backbone_hidden_size,
+            self.hidden_size,
+            bias=False,
+            gather_output=True,
+            prefix=f"{prefix}.pre_projection",
+        )
+
+        self.post_projection = RowParallelLinear(
+            self.hidden_size,
+            self.backbone_hidden_size,
+            bias=False,
+            input_is_parallel=False,
+            prefix=f"{prefix}.post_projection",
+        )
+
+        self.layers = nn.ModuleList(
+            Gemma4MTPDecoderLayer(
+                text_config,
+                cache_config=vllm_config.cache_config,
+                quant_config=vllm_config.quant_config,
+                prefix=f"{prefix}.layers.{idx}",
+            )
+            for idx in range(self.num_mtp_layers)
+        )
+
+        self.norm = RMSNorm(self.hidden_size, eps=text_config.rms_norm_eps)
+
+        # After embedding sharing, embed_tokens is replaced with the
+        # target model's backbone-dim embedding.  Scale by
+        # sqrt(backbone_hidden_size) to match the target's convention.
+        self.register_buffer(
+            "normalizer",
+            torch.tensor(self.backbone_hidden_size**0.5),
+            persistent=False,
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids) * self.normalizer
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        params_dict.update(dict(self.named_buffers()))
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+        return loaded_params
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        spec_step_idx: int = 0,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Returns (draft_hidden_states, backbone_hidden_states).
+
+        draft_hidden_states: draft-dim, used by compute_logits via lm_head.
+        backbone_hidden_states: backbone-dim, stored in the proposer's
+            hidden-state buffer and fed back as input to the next step.
+        """
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_input_ids(input_ids)
+
+        combined = torch.cat([inputs_embeds, hidden_states], dim=-1)
+        hidden_states, _ = self.pre_projection(combined)
+
+        residual = None
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                positions=positions,
+                hidden_states=hidden_states,
+                residual=residual,
+            )
+
+        draft_hidden_states = self.norm(hidden_states)
+
+        backbone_hidden_states, _ = self.post_projection(draft_hidden_states)
+        return draft_hidden_states, backbone_hidden_states
+
+
+@support_torch_compile
+class Gemma4MTP(nn.Module):
+    """Gemma4 Multi-Token Prediction model for speculative decoding.
+
+    forward() returns (draft_hidden_states, backbone_hidden_states).
+    The proposer uses draft_hidden_states for compute_logits (via
+    the draft-dim lm_head) and backbone_hidden_states for the
+    hidden-state feedback buffer.
+    """
+
+    has_own_lm_head = True
+
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "pre_projection.": "model.pre_projection.",
+            "post_projection.": "model.post_projection.",
+        },
+    )
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.speculative_config.draft_model_config.hf_config
+        text_config = _get_text_config(config)
+        self.config = config
+
+        self.model = Gemma4MultiTokenPredictor(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "draft_model"),
+        )
+
+        # lm_head operates in draft-dim.  Tied to embed_tokens at init
+        # so load_weights populates both from a single checkpoint entry.
+        # After embedding sharing, lm_head.weight still references the
+        # original draft-dim tensor.
+        self.lm_head = ParallelLMHead(
+            text_config.vocab_size,
+            text_config.hidden_size,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+        if getattr(config, "tie_word_embeddings", True):
+            self.lm_head.weight = self.model.embed_tokens.weight
+
+        self.logits_processor = LogitsProcessor(
+            text_config.vocab_size,
+            soft_cap=getattr(text_config, "final_logit_softcapping", None),
+        )
+
+        if getattr(config, "use_ordered_embeddings", False):
+            num_centroids = getattr(config, "num_centroids", 2048)
+            top_k = getattr(config, "centroid_intermediate_top_k", 32)
+            self.masked_embedding = Gemma4MTPMaskedEmbedder(
+                hidden_size=text_config.hidden_size,
+                vocab_size=text_config.vocab_size,
+                num_centroids=num_centroids,
+                centroid_intermediate_top_k=top_k,
+            )
+            logger.info(
+                "Gemma4 MTP: centroids masking enabled "
+                "(num_centroids=%d, top_k=%d, active_tokens=%d/%d).",
+                num_centroids,
+                top_k,
+                top_k * (text_config.vocab_size // num_centroids),
+                text_config.vocab_size,
+            )
+        else:
+            self.masked_embedding = None
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        spec_step_idx: int = 0,
+        **kwargs: object,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.model(
+            input_ids,
+            positions,
+            hidden_states,
+            intermediate_tensors,
+            inputs_embeds,
+            spec_step_idx,
+        )
+
+    def _get_full_lm_head_weight(self) -> torch.Tensor:
+        lm_head_weight = self.lm_head.weight
+        tp_size = get_tensor_model_parallel_world_size()
+        if tp_size > 1:
+            lm_head_weight = tensor_model_parallel_all_gather(
+                lm_head_weight,
+                dim=0,
+            )
+        return lm_head_weight[: self.masked_embedding.vocab_size]
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor | None:
+        if self.masked_embedding is not None:
+            return self.masked_embedding(
+                hidden_states,
+                self._get_full_lm_head_weight(),
+            )
+        return self.logits_processor(self.lm_head, hidden_states)
+
+    def get_top_tokens(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        """Sparse argmax via centroids masking. Returns token IDs directly."""
+        return self.masked_embedding.get_top_tokens(
+            hidden_states,
+            self._get_full_lm_head_weight(),
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/granite4_vision.py b/vllm/model_executor/models/granite4_vision.py
index 147f02eced97..710fc94ee5f8 100644
--- a/vllm/model_executor/models/granite4_vision.py
+++ b/vllm/model_executor/models/granite4_vision.py
@@ -887,9 +887,10 @@ def forward(
             and get_pp_group().is_first_rank
             and self._ds_layer_indices
         ):
+            n = inputs_embeds.size(0)
             ds: IntermediateTensors | None = IntermediateTensors(
                 {
-                    f"ds_{llm_layer}": self._ds_buffers[lvl]
+                    f"ds_{llm_layer}": self._ds_buffers[lvl][:n]
                     for lvl, llm_layer in enumerate(self._ds_layer_indices)
                 }
             )
diff --git a/vllm/model_executor/models/laguna.py b/vllm/model_executor/models/laguna.py
new file mode 100644
index 000000000000..08f35d691817
--- /dev/null
+++ b/vllm/model_executor/models/laguna.py
@@ -0,0 +1,886 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Inference-only Laguna model compatible with HuggingFace weights."""
+
+import typing
+from collections.abc import Callable, Iterable
+from itertools import islice
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+from vllm.sequence import IntermediateTensors
+
+logger = init_logger(__name__)
+
+
+class LagunaMLP(nn.Module):
+    """Dense MLP for Laguna (used in mlp_only_layers)."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: QuantizationConfig | None = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        # gate_proj and up_proj are kept as separate ColumnParallelLinear
+        # rather than merged via MergedColumnParallelLinear. The merged form
+        # requires per-partition NVFP4 global scales (weight_global_scale,
+        # input_global_scale) to be packed into a length-2 PerTensorScaleParameter
+        # and then collapsed via .max() in process_weights_after_loading; this
+        # doesn't round-trip cleanly through Marlin's NVFP4 stacked-layer code
+        # path. Splitting yields one global scale per Linear, exactly matching
+        # the standard compressed-tensors per-Linear schema on disk.
+        self.gate_proj = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_proj",
+        )
+        self.up_proj = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported."
+            )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate, _ = self.gate_proj(x)
+        up, _ = self.up_proj(x)
+        x, _ = self.down_proj(F.silu(gate) * up)
+        return x
+
+
+class LagunaMoE(nn.Module):
+    """Sparse MoE block for Laguna with optional shared expert and sigmoid routing.
+
+    Key differences from other MoE implementations:
+    - Uses SIGMOID routing activation (not softmax)
+    - Shared expert runs in parallel with routed experts (when enabled)
+    - Matches HF reference: modular_laguna.py LagunaSparseMoeBlock
+    """
+
+    def __init__(
+        self,
+        config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ):
+        super().__init__()
+        self.config = config
+        self.num_experts = config.num_experts
+        self.top_k = config.num_experts_per_tok
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = self.ep_group.rank()
+        self.ep_size = self.ep_group.size()
+
+        self.n_routed_experts = config.num_experts
+        self.n_shared_experts = 1 if config.shared_expert_intermediate_size > 0 else 0
+        self.routed_scaling_factor = float(
+            getattr(config, "moe_routed_scaling_factor", 1.0)
+        )
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}."
+            )
+
+        # Load balancing settings.
+        vllm_config = get_current_vllm_config()
+        eplb_config = vllm_config.parallel_config.eplb_config
+        self.enable_eplb = enable_eplb
+        eplb_config.num_redundant_experts = (
+            eplb_config.num_redundant_experts
+            if eplb_config.num_redundant_experts is not None
+            else 0
+        )
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
+
+        # Router gate
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        # Shared expert (optional) - passed to FusedMoE for overlap optimization
+        self.shared_expert: LagunaMLP | None
+        if config.shared_expert_intermediate_size > 0:
+            self.shared_expert = LagunaMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.shared_expert_intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,  # Reduce after shared+routed combine
+                prefix=f"{prefix}.shared_expert",
+            )
+        else:
+            self.shared_expert = None
+
+        # Auxiliary-loss-free load-balancing bias (arXiv:2408.15664). The
+        # checkpoint stores one [num_experts] tensor per MoE layer at
+        # `mlp.experts.e_score_correction_bias`; registering it as a Parameter
+        # on the FusedMoE lets the weight loader pick it up and the router
+        # add it during top-k selection. The fused top-k bias router requires
+        # float32 regardless of model dtype.
+        e_score_correction_bias = torch.nn.Parameter(
+            torch.zeros(config.num_experts, dtype=torch.float32),
+            requires_grad=False,
+        )
+
+        # FusedMoE with SIGMOID routing. Passing `shared_experts=` lets the
+        # layer overlap the shared-expert compute with the all2all dispatch.
+        # `apply_routed_scale_to_output=True` makes FusedMoE handle the
+        # routed_scaling_factor, shared+routed combine, and TP all-reduce
+        # internally, so forward() just returns the final hidden states.
+        self.experts = FusedMoE(
+            shared_experts=self.shared_expert,
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            scoring_func="sigmoid",
+            use_grouped_topk=False,
+            apply_router_weight_on_input=bool(config.moe_apply_router_weight_on_input),
+            e_score_correction_bias=e_score_correction_bias,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            routed_scaling_factor=self.routed_scaling_factor,
+            apply_routed_scale_to_output=True,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        orig_shape = hidden_states.shape
+        hidden_dim = hidden_states.shape[-1]
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        router_logits, _ = self.gate(hidden_states)
+        router_logits = router_logits.float()
+        softcap = getattr(self.config, "moe_router_logit_softcapping", 0.0) or 0.0
+        if softcap > 0.0:
+            router_logits = torch.tanh(router_logits / softcap) * softcap
+
+        final_hidden_states = self.experts(hidden_states, router_logits)
+        return final_hidden_states.view(orig_shape)
+
+
+class LagunaAttention(nn.Module):
+    """Laguna attention with optional softplus output gating.
+
+    Supports per-layer sliding window attention when ``config.layer_types``
+    is present.  Layers whose type is ``"sliding_attention"`` use
+    ``config.sliding_window``; all other layers (typically labelled
+    ``"full_attention"``) use full attention.  When ``layer_types`` is
+    absent every layer defaults to full attention for backwards
+    compatibility.
+    """
+
+    def __init__(
+        self,
+        config,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        max_position_embeddings: int = 131072,
+        head_dim: int | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attention_sink: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim or (hidden_size // self.total_num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = max_position_embeddings
+
+        # Gating flag
+        self.gating = config.gating
+
+        # Per-layer sliding window (follows Gemma2/Cohere2 convention)
+        layer_types = getattr(config, "layer_types", None)
+        if layer_types is not None:
+            layer_idx = extract_layer_index(prefix)
+            is_sliding = layer_types[layer_idx] == "sliding_attention"
+            self.sliding_window = config.sliding_window if is_sliding else None
+        else:
+            self.sliding_window = None
+
+        # QKV projection (no bias for Laguna)
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        # Output projection
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        # Gating projection (Laguna-specific, optional)
+        # config.gating may be:
+        #   - True / "per-element": one gate per (head, head_dim) channel
+        #   - "per-head":           one gate per head, broadcast across head_dim
+        if self.gating:
+            # v5 LagunaConfig uses ``gating=True`` for per-head; older configs
+            # used ``"per-head"``. Accept both. ``"per-element"`` (or legacy
+            # ``True``) means per-element gating with output size num_heads ×
+            # head_dim.
+            gate_per_head = self.gating is True or self.gating == "per-head"
+            g_out = (
+                self.total_num_heads
+                if gate_per_head
+                else self.total_num_heads * self.head_dim
+            )
+            self.g_proj = ColumnParallelLinear(
+                hidden_size,
+                g_out,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.g_proj",
+            )
+            self.gate_per_head = gate_per_head
+        else:
+            self.g_proj = None
+            self.gate_per_head = False
+
+        # Attention sinks (learnable per-head bias for SWA layers)
+        sinks = None
+        if attention_sink:
+            self.sink = torch.nn.Parameter(
+                torch.empty(self.total_num_heads // tp_size, requires_grad=False)
+            )
+            sinks = self.sink
+
+        # Resolve rope params per-layer-type. ``config.rope_parameters`` is
+        # either a flat dict (legacy) or a nested ``{layer_type: rope_dict}``
+        # (v5 Laguna-XS schema). The v5 form is unhashable as-is and would
+        # crash `get_rope`'s cache lookup, so always pull out the layer's
+        # sub-dict before forwarding.
+        layer_type = (
+            layer_types[extract_layer_index(prefix)]
+            if layer_types is not None
+            else "full_attention"
+        )
+        is_sliding = layer_type == "sliding_attention"
+
+        top_rope = getattr(config, "rope_parameters", None) or {}
+        if any(isinstance(v, dict) for v in top_rope.values()):
+            # Nested per-layer-type form.
+            base_rope = top_rope.get(layer_type) or top_rope.get("full_attention") or {}
+        else:
+            base_rope = top_rope
+
+        # Older flat-rope ckpts can carry a separate `swa_rope_parameters`
+        # for SWA layers. Prefer it when present; otherwise the nested
+        # rope dict above already supplies the correct sub-config.
+        swa_rope = getattr(config, "swa_rope_parameters", None)
+        if (
+            is_sliding
+            and swa_rope is None
+            and not any(isinstance(v, dict) for v in top_rope.values())
+        ):
+            logger.warning_once(
+                "Laguna config has sliding_attention layers but neither "
+                "`swa_rope_parameters` nor a nested per-layer-type "
+                "`rope_parameters` — SWA layers will reuse the global rope. "
+                "If the checkpoint was trained with distinct SWA rope "
+                "(theta / partial_rotary_factor), regenerate its HF config "
+                "to include either form."
+            )
+        rope_params = swa_rope if (is_sliding and swa_rope is not None) else base_rope
+        # `partial_rotary_factor` may live on the top-level config (main attention)
+        # or on the per-layer rope dict itself (e.g. SWA can differ). Inject the
+        # top-level value into `rope_params` if the dict doesn't already set it.
+        top_partial = getattr(config, "partial_rotary_factor", None)
+        if top_partial is not None and "partial_rotary_factor" not in rope_params:
+            rope_params = {**rope_params, "partial_rotary_factor": top_partial}
+
+        # Rotary embeddings (YaRN)
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            max_position=max_position_embeddings,
+            is_neox_style=True,
+            rope_parameters=rope_params,
+        )
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            per_layer_sliding_window=self.sliding_window,
+            prefix=f"{prefix}.attn",
+            sinks=sinks,
+        )
+
+        # QK normalization (like Qwen3)
+        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
+        q_by_head = self.q_norm(q_by_head)
+        q = q_by_head.view(q.shape)
+
+        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
+        k_by_head = self.k_norm(k_by_head)
+        k = k_by_head.view(k.shape)
+
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+
+        # Apply gating if enabled (compute softplus in float32 for precision)
+        if self.gating and self.g_proj is not None:
+            gate, _ = self.g_proj(hidden_states)
+            gate = F.softplus(gate.float()).type_as(attn_output)
+            if self.gate_per_head:
+                # gate: [..., num_heads]; broadcast across head_dim
+                attn_shape = attn_output.shape
+                attn_output = (
+                    attn_output.view(*attn_shape[:-1], self.num_heads, self.head_dim)
+                    * gate.unsqueeze(-1)
+                ).view(attn_shape)
+            else:
+                attn_output = attn_output * gate
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class LagunaDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        layer_idx = extract_layer_index(prefix)
+
+        # Determine if this layer uses sliding window attention
+        layer_types = getattr(config, "layer_types", None)
+        is_sliding = (
+            layer_types is not None and layer_types[layer_idx] == "sliding_attention"
+        )
+
+        # Enable attention sinks on SWA layers when configured
+        attention_sink = is_sliding and getattr(
+            config, "swa_attention_sink_enabled", False
+        )
+
+        # Optional per-layer override of head count (Laguna-XS).
+        per_layer_heads = getattr(config, "num_attention_heads_per_layer", None)
+        layer_num_heads = (
+            per_layer_heads[layer_idx]
+            if per_layer_heads is not None
+            else config.num_attention_heads
+        )
+
+        self.self_attn = LagunaAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=layer_num_heads,
+            num_kv_heads=config.num_key_value_heads,
+            max_position_embeddings=config.max_position_embeddings,
+            head_dim=getattr(config, "head_dim", None),
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+            attention_sink=attention_sink,
+        )
+
+        # Check if this layer uses MoE or dense MLP (matches Qwen2/Qwen3 convention)
+        mlp_only_layers = (
+            [] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers
+        )
+        self.is_moe_layer = (
+            (layer_idx not in mlp_only_layers)
+            and (config.num_experts > 0)
+            and ((layer_idx + 1) % config.decoder_sparse_step == 0)
+        )
+
+        if self.is_moe_layer:
+            self.mlp = LagunaMoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+                enable_eplb=enable_eplb,
+            )
+        else:
+            self.mlp = LagunaMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class LagunaModel(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        enable_eplb = vllm_config.parallel_config.enable_eplb
+        eplb_config = vllm_config.parallel_config.eplb_config
+        self.num_redundant_experts = eplb_config.num_redundant_experts
+        self.config = config
+        self.quant_config = quant_config
+
+        # Disable the model-level sliding-window fallback in Attention.__init__.
+        # Laguna drives SWA per-layer via `layer_types`, passing
+        # `per_layer_sliding_window=self.sliding_window` (None for global
+        # layers). Without this, global layers whose `per_layer_sliding_window`
+        # is None would pick up `cache_config.sliding_window`
+        # (populated from `config.sliding_window`) as a fallback, silently
+        # applying a 512-token window to full-attention layers.
+        if cache_config is not None:
+            cache_config.sliding_window = None
+
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: LagunaDecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+                enable_eplb=enable_eplb,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_tokens(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        """Get expert parameter mapping for weight loading.
+
+        Returns mapping tuples of (param_name, weight_name, expert_id, shard_id)
+        that handle both weights and quantization scales.
+        """
+        return FusedMoE.make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+            num_redundant_experts=self.num_redundant_experts,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            # gate_proj and up_proj are loaded as separate Linears (see
+            # LagunaMLP) so no merge entry is needed here.
+        ]
+
+        # Suffixes to skip for GPTQ/modelopt models if param doesn't exist
+        ignore_suffixes = (
+            ".bias",
+            "_bias",
+            ".k_scale",
+            "_k_scale",
+            ".v_scale",
+            "_v_scale",
+            ".weight_scale",
+            "_weight_scale",
+            ".input_scale",
+            "_input_scale",
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        expert_params_mapping = self.get_expert_mapping()
+
+        tp_rank = get_tensor_model_parallel_rank()
+
+        for name, loaded_weight in weights:
+            # Handle attention sinks (distributed across ranks). Derive the
+            # per-rank slice from the parameter's own shape so per-layer
+            # variations in head count are handled correctly.
+            if "sink" in name:
+                param = params_dict.get(name)
+                if param is not None:
+                    layer_heads_per_rank = param.shape[0]
+                    layer_head_start = tp_rank * layer_heads_per_rank
+                    narrow_weight = loaded_weight.narrow(
+                        0, layer_head_start, layer_heads_per_rank
+                    )
+                    param.data.copy_(narrow_weight)
+                    loaded_params.add(name)
+                continue
+
+            # Handle KV cache quantization scales
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                assert loaded_weight.numel() == 1, (
+                    f"KV scale numel {loaded_weight.numel()} != 1"
+                )
+                loaded_weight = loaded_weight.squeeze()
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
+            # Handle stacked params (QKV, gate_up for
+            # non-expert layers and shared_expert)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # Skip expert weights - handled below via expert_params_mapping
+                if "mlp.experts" in name and "shared_expert" not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                if name.endswith(ignore_suffixes) and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                # Remap FP8 kv_scale names for backwards compatibility
+                if name.endswith("scale"):
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                if weight_loader == default_weight_loader:
+                    weight_loader(param, loaded_weight)
+                else:
+                    weight_loader(param, loaded_weight, shard_id)
+                loaded_params.add(name)
+                break
+            else:
+                # Try expert params mapping (handles weights + quantization scales)
+                is_expert_weight = False
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+
+                    # Mark as expert weight so we skip regular loading below
+                    is_expert_weight = True
+
+                    # Create mapped name without modifying original
+                    name_mapped = name.replace(weight_name, param_name)
+
+                    if is_pp_missing_parameter(name_mapped, self):
+                        continue
+                    if (
+                        name_mapped.endswith(ignore_suffixes)
+                        and name_mapped not in params_dict
+                    ):
+                        continue
+                    if name_mapped not in params_dict:
+                        continue
+
+                    param = params_dict[name_mapped]
+                    # Use return_success to handle expert parallelism correctly
+                    weight_loader = typing.cast(
+                        Callable[..., bool], param.weight_loader
+                    )
+                    success = weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                        return_success=True,
+                    )
+                    if success:
+                        loaded_params.add(name_mapped)
+                        break
+                else:
+                    # Expert weight not mapped to this rank - skip
+                    if is_expert_weight:
+                        continue
+
+                    # Remap kv_scale names before the ignore_suffixes filter:
+                    # the suffix list includes .k_scale/.v_scale, so filtering
+                    # first drops the checkpoint key before remap can rewrite
+                    # it to the .attn.* name that exists in params_dict.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+                    if name.endswith(ignore_suffixes) and name not in params_dict:
+                        continue
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    if name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                    loaded_params.add(name)
+
+        return loaded_params
+
+
+class LagunaForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
+    fall_back_to_pt_during_load = False
+
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = LagunaModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+            if self.config.tie_word_embeddings:
+                self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 2ecced3df8ba..3c797d05e932 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -337,20 +337,15 @@ def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None
         return vllm_config.quant_config
 
 
-def llama_model_invariants(
-    input_ids, positions, intermediate_tensors=None, inputs_embeds=None
-):
-    """Shape invariants for Llama model compilation, those are translated to
-    runtime assertions for unbacked dynamic shapes and are compiled away for
-    backed"""
-    if input_ids is not None:
-        torch._check(positions.size()[0] == input_ids.size()[0])
-
-
 @support_torch_compile(
     # TODO[#32068]: Investigate recompilation
     # mark_unbacked_dims={"input_ids": 0},
-    shape_invariants=llama_model_invariants
+    dynamic_arg_dims={
+        "input_ids": {0: "b"},
+        "positions": {0: "b"},
+        "intermediate_tensors": {0: "b"},
+        "inputs_embeds": {0: "b"},
+    },
 )
 class LlamaModel(nn.Module, EagleModelMixin):
     def __init__(
diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py
index d81df6f33737..3dd1118aa8a4 100644
--- a/vllm/model_executor/models/longcat_flash.py
+++ b/vllm/model_executor/models/longcat_flash.py
@@ -69,6 +69,7 @@
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
+    AutoWeightsLoader,
     PPMissingLayer,
     is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
@@ -485,6 +486,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
+        self.quant_config = quant_config
 
         self.vocab_size = config.vocab_size
 
@@ -551,77 +553,6 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-
-class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    """Flash model for causal language modeling."""
-
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = FlashConfig(**vllm_config.model_config.hf_config.__dict__)
-        quant_config = vllm_config.quant_config
-
-        self.config = config
-        config.intermediate_size = (
-            config.ffn_hidden_size
-            if hasattr(config, "ffn_hidden_size")
-            else config.intermediate_size
-        )
-
-        self.quant_config = quant_config
-
-        self.model = FlashModel(
-            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
-        )
-
-        if get_pp_group().is_last_rank:
-            self.lm_head = ParallelLMHead(
-                config.vocab_size,
-                config.hidden_size,
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "lm_head"),
-            )
-        else:
-            self.lm_head = PPMissingLayer()
-
-        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors
-        )
-
-    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.embed_input_ids(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor | None,
-        positions: torch.Tensor,
-        intermediate_tensors: IntermediateTensors | None = None,
-        inputs_embeds: torch.Tensor | None = None,
-    ) -> torch.Tensor | IntermediateTensors:
-        hidden_states = self.model(
-            input_ids, positions, intermediate_tensors, inputs_embeds
-        )
-        return hidden_states
-
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-    ) -> torch.Tensor | None:
-        logits = self.logits_processor(self.lm_head, hidden_states)
-        return logits
-
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
@@ -730,9 +661,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             loaded_params.add(name)
         for layer_id in range(self.config.num_hidden_layers):
             for i in range(2):
-                if isinstance(self.model.layers[layer_id], PPMissingLayer):
+                if isinstance(self.layers[layer_id], PPMissingLayer):
                     continue
-                self_attn = self.model.layers[layer_id].self_attn[i]
+                self_attn = self.layers[layer_id].self_attn[i]
                 if hasattr(
                     self.quant_config, "weight_block_size"
                 ) and self_attn.kv_b_proj.weight.dtype in (
@@ -765,3 +696,81 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                         self.config.hidden_size / self.config.kv_lora_rank
                     ) ** 0.5
         return loaded_params
+
+
+class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+    """Flash model for causal language modeling."""
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = FlashConfig(**vllm_config.model_config.hf_config.__dict__)
+        quant_config = vllm_config.quant_config
+
+        self.config = config
+        config.intermediate_size = (
+            config.ffn_hidden_size
+            if hasattr(config, "ffn_hidden_size")
+            else config.intermediate_size
+        )
+
+        self.quant_config = quant_config
+
+        self.model = FlashModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/mistral_eagle.py b/vllm/model_executor/models/mistral_eagle.py
new file mode 100644
index 000000000000..908b50f7ca00
--- /dev/null
+++ b/vllm/model_executor/models/mistral_eagle.py
@@ -0,0 +1,166 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Iterable
+
+import torch
+import torch.nn as nn
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import RowParallelLinear
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from vllm.model_executor.models.interfaces import MultiModalEmbeddings
+from vllm.model_executor.models.llama import LlamaConfig
+from vllm.model_executor.models.mistral import (
+    MistralDecoderLayer,
+    MistralForCausalLM,
+    MistralModel,
+)
+from vllm.model_executor.models.utils import (
+    _merge_multimodal_embeddings,
+    get_draft_quant_config,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+class EagleMistralDecoderLayer(MistralDecoderLayer):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        config: LlamaConfig | None = None,
+    ) -> None:
+        super().__init__(vllm_config, prefix=prefix, config=config)
+
+    def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None:
+        return get_draft_quant_config(vllm_config)
+
+
+@support_torch_compile
+class EagleMistralModel(MistralModel):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        start_layer_id: int = 0,
+    ) -> None:
+        # Bypass MistralModel.__init__ to avoid creating duplicate attention
+        # layer entries in the global context.
+        nn.Module.__init__(self)
+        self.config = vllm_config.speculative_config.draft_model_config.hf_config
+        self.vocab_size = self.config.vocab_size
+        # Get drafter's quantization config
+        self.quant_config = get_draft_quant_config(vllm_config)
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "embed_tokens"),
+            quant_config=self.quant_config,
+        )
+
+        self.layers = nn.ModuleList(
+            [
+                EagleMistralDecoderLayer(
+                    vllm_config,
+                    prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
+                    config=self.config,
+                )
+                for i in range(self.config.num_hidden_layers)
+            ]
+        )
+        self.fc = RowParallelLinear(
+            self.config.hidden_size * 2,
+            self.config.hidden_size,
+            bias=False,
+            input_is_parallel=False,
+            quant_config=self.quant_config,
+            prefix=maybe_prefix(prefix, "fc"),
+            return_bias=False,
+        )
+        self.norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_input_ids(input_ids)
+        hidden_states = self.fc(torch.cat((inputs_embeds, hidden_states), dim=-1))
+        residual = None
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states, hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        # Pretend embed_tokens is loaded; the actual weight is shared
+        # from the target model at runtime by `load_eagle_model`.
+        return super().load_weights(weights) | {"embed_tokens.weight"}
+
+
+class EagleMistralForCausalLM(MistralForCausalLM):
+    mistral_mapping = MistralForCausalLM.mistral_mapping | {
+        "eagle_linear": "model.fc",
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        # Bypass MistralForCausalLM.__init__ to use the draft model config
+        # and to avoid creating an lm_head.
+        nn.Module.__init__(self)
+        self.config = vllm_config.speculative_config.draft_model_config.hf_config
+        target_layer_num = vllm_config.model_config.get_num_layers(
+            vllm_config.parallel_config
+        )
+        self.model = EagleMistralModel(
+            vllm_config=vllm_config, prefix="model", start_layer_id=target_layer_num
+        )
+
+        logit_scale = getattr(self.config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(
+            self.config.vocab_size, scale=logit_scale
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.model(input_ids, positions, hidden_states, inputs_embeds)
+
+    def embed_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        inputs_embeds = super().embed_input_ids(input_ids)
+
+        if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
+            return inputs_embeds
+
+        assert is_multimodal is not None
+
+        return _merge_multimodal_embeddings(
+            inputs_embeds=inputs_embeds,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+        )
diff --git a/vllm/model_executor/models/moondream3.py b/vllm/model_executor/models/moondream3.py
new file mode 100644
index 000000000000..d5f3e6b195fb
--- /dev/null
+++ b/vllm/model_executor/models/moondream3.py
@@ -0,0 +1,1423 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Inference-only Moondream3 model implementation."""
+
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass
+from functools import cached_property
+from itertools import islice
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import BatchFeature
+
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+)
+from vllm.inputs import MultiModalDataDict
+from vllm.logger import init_logger
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
+from vllm.model_executor.layers.fused_moe import MoEActivation, fused_experts
+from vllm.model_executor.layers.fused_moe.config import biased_moe_quant_config
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (
+    BaseDummyInputsBuilder,
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs.moondream3 import (
+    Moondream3Config,
+    Moondream3TextConfig,
+    Moondream3VisionConfig,
+)
+from vllm.transformers_utils.processors.moondream3 import Moondream3Processor
+
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .utils import (
+    extract_layer_index,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+# ============================================================================
+# Image Processing Utilities
+# ============================================================================
+
+
+def reconstruct_from_crops(
+    crops: torch.Tensor,
+    tiling: tuple[int, int],
+    overlap_margin: int,
+    patch_size: int = 14,
+) -> torch.Tensor:
+    """Reconstruct features from overlapping crops."""
+    tiling_h, tiling_w = tiling
+    crop_height, crop_width = crops[0].shape[:2]
+    margin_pixels = overlap_margin * patch_size
+
+    output_h = (crop_height - 2 * margin_pixels) * tiling_h + 2 * margin_pixels
+    output_w = (crop_width - 2 * margin_pixels) * tiling_w + 2 * margin_pixels
+
+    reconstructed = torch.zeros(
+        (output_h, output_w, crops[0].shape[2]),
+        device=crops[0].device,
+        dtype=crops[0].dtype,
+    )
+
+    for i, crop in enumerate(crops):
+        tile_y = i // tiling_w
+        tile_x = i % tiling_w
+
+        x_start = 0 if tile_x == 0 else margin_pixels
+        x_end = crop_width if tile_x == tiling_w - 1 else crop_width - margin_pixels
+        y_start = 0 if tile_y == 0 else margin_pixels
+        y_end = crop_height if tile_y == tiling_h - 1 else crop_height - margin_pixels
+
+        out_x = tile_x * (crop_width - 2 * margin_pixels)
+        out_y = tile_y * (crop_height - 2 * margin_pixels)
+
+        reconstructed[
+            out_y + y_start : out_y + y_end, out_x + x_start : out_x + x_end
+        ] = crop[y_start:y_end, x_start:x_end]
+
+    return reconstructed
+
+
+# ============================================================================
+# Vision Encoder Components
+# ============================================================================
+
+
+class Moondream3VisionMLP(nn.Module):
+    """MLP for vision encoder blocks."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        self.act = get_act_fn("gelu_pytorch_tanh")
+        self.fc2 = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.fc1(x)
+        x = self.act(x)
+        x, _ = self.fc2(x)
+        return x
+
+
+class Moondream3VisionAttention(nn.Module):
+    """Self-attention for vision encoder (bidirectional)."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.head_dim = hidden_size // num_heads
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=num_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.out_proj = RowParallelLinear(
+            input_size=hidden_size,
+            output_size=hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+
+        tp_size = get_tensor_model_parallel_world_size()
+        self.num_heads_per_partition = num_heads // tp_size
+
+        self.attn = MMEncoderAttention(
+            num_heads=self.num_heads_per_partition,
+            head_size=self.head_dim,
+            scale=self.head_dim**-0.5,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(3, dim=-1)
+        out = self.attn(q, k, v)
+        out, _ = self.out_proj(out)
+        return out
+
+
+class Moondream3VisionBlock(nn.Module):
+    """Transformer block for vision encoder."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        num_heads: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.ln1 = nn.LayerNorm(hidden_size, eps=1e-5)
+        self.attn = Moondream3VisionAttention(
+            hidden_size=hidden_size,
+            num_heads=num_heads,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+        self.ln2 = nn.LayerNorm(hidden_size, eps=1e-5)
+        self.mlp = Moondream3VisionMLP(
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.attn(self.ln1(x))
+        x = x + self.mlp(self.ln2(x))
+        return x
+
+
+class Moondream3VisionEncoder(nn.Module):
+    """Vision encoder (SigLIP-style ViT)."""
+
+    def __init__(
+        self,
+        config: Moondream3VisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        # Patch embedding
+        self.patch_emb = nn.Linear(
+            config.enc_patch_size * config.enc_patch_size * 3,
+            config.enc_dim,
+            bias=True,
+        )
+
+        # Position embeddings (27x27 = 729 patches for 378x378 / 14)
+        num_patches = (config.crop_size // config.enc_patch_size) ** 2
+        self.pos_emb = nn.Parameter(torch.zeros(1, num_patches, config.enc_dim))
+
+        # Transformer blocks
+        self.blocks = nn.ModuleList(
+            [
+                Moondream3VisionBlock(
+                    hidden_size=config.enc_dim,
+                    intermediate_size=config.enc_ff_dim,
+                    num_heads=config.enc_n_heads,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.blocks.{i}",
+                )
+                for i in range(config.enc_n_layers)
+            ]
+        )
+
+        self.post_ln = nn.LayerNorm(config.enc_dim, eps=1e-5)
+
+    def create_patches(self, images: torch.Tensor) -> torch.Tensor:
+        """Convert images to patch embeddings.
+
+        Args:
+            images: (batch, channels, height, width)
+
+        Returns:
+            patches: (batch, num_patches, patch_dim)
+        """
+        patch_size = self.config.enc_patch_size
+        batch, channels, height, width = images.shape
+        patches_h = height // patch_size
+        patches_w = width // patch_size
+
+        # Unfold into patches
+        patches = images.unfold(2, patch_size, patch_size).unfold(
+            3, patch_size, patch_size
+        )
+        # (batch, channels, patches_h, patches_w, patch_size, patch_size)
+        patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous()
+        # (batch, patches_h, patches_w, channels, patch_size, patch_size)
+        patches = patches.view(batch, patches_h * patches_w, -1)
+        # (batch, num_patches, channels * patch_size * patch_size)
+
+        return patches
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        """Encode images.
+
+        Args:
+            pixel_values: (batch, channels, height, width)
+
+        Returns:
+            features: (batch, num_patches, hidden_size)
+        """
+        # Create patches and embed
+        patches = self.create_patches(pixel_values)
+        x = self.patch_emb(patches)
+
+        # Add position embeddings
+        x = x + self.pos_emb
+
+        # Apply transformer blocks
+        for block in self.blocks:
+            x = block(x)
+
+        # Final layer norm
+        x = self.post_ln(x)
+
+        return x
+
+
+class Moondream3VisionProjection(nn.Module):
+    """Projects vision features to text embedding dimension."""
+
+    def __init__(
+        self,
+        input_dim: int,
+        inner_dim: int,
+        output_dim: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        # Input is concatenated global and local features (2 * input_dim)
+        self.fc1 = ColumnParallelLinear(
+            input_dim * 2,
+            inner_dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        self.act = get_act_fn("gelu_pytorch_tanh")
+        self.fc2 = RowParallelLinear(
+            inner_dim,
+            output_dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.fc1(x)
+        x = self.act(x)
+        x, _ = self.fc2(x)
+        return x
+
+
+# ============================================================================
+# Text Decoder Components
+# ============================================================================
+
+
+class Moondream3TextMLP(nn.Module):
+    """Standard MLP for non-MoE layers (layers 0-3)."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        self.act = get_act_fn("gelu_pytorch_tanh")
+        self.fc2 = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.fc1(x)
+        x = self.act(x)
+        x, _ = self.fc2(x)
+        return x
+
+
+class Moondream3TextMoE(nn.Module):
+    """Mixture of Experts layer for layers 4+ with expert parallelism.
+
+    Moondream3 uses a custom GeGLU activation: gelu(h) * (g + 1)
+    where fc1 outputs [gate, up] and the activation is gelu(gate) * (up + 1).
+
+    Uses expert parallelism where each GPU stores num_experts/tp_size experts.
+    Routing and communication handled via all-to-all or replicated computation.
+
+    Checkpoint format:
+    - fc1.weight: [num_experts, expert_inner_dim * 2, hidden_size] (gate+up)
+    - fc2.weight: [num_experts, hidden_size, expert_inner_dim] (down)
+    - router.weight: [num_experts, hidden_size]
+    - router.bias: [num_experts]
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        expert_inner_dim: int,
+        num_experts: int,
+        experts_per_token: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.expert_inner_dim = expert_inner_dim
+        self.num_experts = num_experts
+        self.experts_per_token = experts_per_token
+
+        # Expert parallelism: each GPU stores a subset of experts
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.experts_per_rank = num_experts // self.tp_size
+        self.num_local_experts = self.experts_per_rank
+
+        # Router (gate) - use ReplicatedLinear for compatibility
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=True,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        # Local expert weights (only store experts_per_rank experts)
+        # fc1: [experts_per_rank, expert_inner_dim * 2, hidden_size]
+        # fc2: [experts_per_rank, hidden_size, expert_inner_dim]
+        self.fc1_weight = nn.Parameter(
+            torch.empty(self.num_local_experts, expert_inner_dim * 2, hidden_size)
+        )
+        self.fc2_weight = nn.Parameter(
+            torch.empty(self.num_local_experts, hidden_size, expert_inner_dim)
+        )
+        self._use_fused_moe = True
+
+        local_expert_start = get_tensor_model_parallel_rank() * self.experts_per_rank
+        expert_map = torch.full((num_experts,), -1, dtype=torch.int32)
+        expert_map[local_expert_start : local_expert_start + self.num_local_experts] = (
+            torch.arange(self.num_local_experts, dtype=torch.int32)
+        )
+        self.register_buffer("_expert_map", expert_map, persistent=False)
+
+        # Preserve Moondream3's exact GeGLU variant (gelu(h) * (g + 1)) by
+        # adding +1 bias to the second half of the fused fc1 activations.
+        fused_w1_bias = torch.zeros(
+            self.num_local_experts,
+            expert_inner_dim * 2,
+            dtype=torch.float32,
+        )
+        fused_w1_bias[:, expert_inner_dim:] = 1.0
+        self.register_buffer("_fused_w1_bias", fused_w1_bias, persistent=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass with expert parallelism and custom GeGLU activation."""
+
+        # Get router logits and compute top-k
+        router_logits, _ = self.gate(x)  # [num_tokens, num_experts]
+        topk_logits, topk_ids = torch.topk(
+            router_logits, self.experts_per_token, dim=-1
+        )
+        # Softmax over selected experts
+        topk_weights = F.softmax(topk_logits, dim=-1, dtype=torch.float32).to(x.dtype)
+
+        if self._use_fused_moe and x.is_cuda:
+            try:
+                out = fused_experts(
+                    hidden_states=x.contiguous(),
+                    w1=self.fc1_weight,
+                    w2=self.fc2_weight,
+                    topk_weights=topk_weights.contiguous(),
+                    topk_ids=topk_ids.contiguous(),
+                    activation=MoEActivation.GELU,
+                    global_num_experts=self.num_experts,
+                    expert_map=self._expert_map,
+                    quant_config=biased_moe_quant_config(self._fused_w1_bias, None),
+                )
+                out = tensor_model_parallel_all_reduce(out)
+                return out
+            except (NotImplementedError, RuntimeError) as exc:
+                self._use_fused_moe = False
+                logger.warning_once(
+                    "Disabling fused Moondream3 MoE path and falling back to "
+                    "the Python expert loop: %s",
+                    str(exc),
+                )
+
+        tp_rank = get_tensor_model_parallel_rank()
+        # Compute local expert range
+        local_expert_start = tp_rank * self.experts_per_rank
+
+        # Fallback path for environments where fused kernels are unavailable.
+        out = x.new_zeros(x.shape)
+
+        for local_expert_idx in range(self.num_local_experts):
+            global_expert_id = local_expert_start + local_expert_idx
+
+            # Find tokens assigned to this expert
+            token_pos, which_k = (topk_ids == global_expert_id).nonzero(as_tuple=True)
+            if token_pos.numel() == 0:
+                continue
+
+            # Get tokens and their routing weights
+            x_tok = x.index_select(0, token_pos)  # [n_tokens, hidden_size]
+            gate_tok = topk_weights[token_pos, which_k]  # [n_tokens]
+
+            # fc1: [expert_inner_dim * 2, hidden_size]
+            # h_full: [n_tokens, expert_inner_dim * 2]
+            h_full = F.linear(x_tok, self.fc1_weight[local_expert_idx])
+
+            # GeGLU with (g + 1): h, g = split; output = gelu(h) * (g + 1)
+            # HF MoE uses exact GELU (not tanh approximation).
+            h, g = h_full.chunk(2, dim=-1)  # Each [n_tokens, expert_inner_dim]
+            h = F.gelu(h) * (g + 1.0)
+
+            # fc2: [hidden_size, expert_inner_dim]
+            # y: [n_tokens, hidden_size]
+            y = F.linear(h, self.fc2_weight[local_expert_idx])
+
+            # Apply routing weight
+            y = y * gate_tok.unsqueeze(-1)
+
+            # Accumulate output
+            out.index_add_(0, token_pos, y)
+
+        # All-reduce to combine results from all experts across GPUs
+        out = tensor_model_parallel_all_reduce(out)
+
+        return out
+
+
+class Moondream3Attention(nn.Module):
+    """Decoder attention with RoPE and tau scaling.
+
+    Moondream3 uses a tau attention mechanism that scales Q and V
+    based on both token content and position.
+    """
+
+    def __init__(
+        self,
+        config: Moondream3TextConfig,
+        layer_idx: int,
+        cache_config=None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.dim
+        self.num_heads = config.n_heads
+        self.num_kv_heads = config.n_kv_heads
+        self.head_dim = config.dim // config.n_heads
+
+        tp_size = get_tensor_model_parallel_world_size()
+        self.num_heads_per_partition = self.num_heads // tp_size
+        self.num_kv_heads_per_partition = max(1, self.num_kv_heads // tp_size)
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=self.hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.num_heads,
+            total_num_kv_heads=self.num_kv_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.out_proj = RowParallelLinear(
+            input_size=self.hidden_size,
+            output_size=self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+
+        # Moondream uses 32-dim rotation out of 64-dim head (partial_rotary_factor=0.5)
+        # HF Moondream uses non-interleaved RoPE (split by half)
+        # In vLLM, is_neox_style=True means split by half (GPT-NeoX style)
+        rope_parameters = {
+            "rope_theta": config.rope_theta,
+            "partial_rotary_factor": 32 / self.head_dim,  # 32/64 = 0.5
+        }
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            max_position=config.max_context,
+            rope_parameters=rope_parameters,
+            is_neox_style=True,  # Moondream uses split-by-half (GPT-NeoX) style
+        )
+
+        self.scaling = self.head_dim**-0.5
+        self.attn = Attention(
+            num_heads=self.num_heads_per_partition,
+            head_size=self.head_dim,
+            scale=self.scaling,
+            num_kv_heads=self.num_kv_heads_per_partition,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+        # Tau scaling parameters for position-dependent attention
+        # These are learned during training to modulate attention based on position
+        # tau_wq and tau_wv need full qkv_dim for correct computation
+        # Only heads are partitioned, qkv dimension is kept full for all-gather
+        qkv_dim = self.hidden_size * 3  # Q + K + V dimension (full)
+        self.tau_alpha = nn.Parameter(torch.zeros(self.num_heads_per_partition))
+        self.tau_wq = nn.Parameter(torch.zeros(self.num_heads_per_partition, qkv_dim))
+        self.tau_wv = nn.Parameter(torch.zeros(self.num_heads_per_partition, qkv_dim))
+        self.tp_size = tp_size
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        q, k, v = qkv.split(
+            [
+                self.num_heads_per_partition * self.head_dim,
+                self.num_kv_heads_per_partition * self.head_dim,
+                self.num_kv_heads_per_partition * self.head_dim,
+            ],
+            dim=-1,
+        )
+
+        # Apply tau scaling to Q and V
+        # Tau scaling has two components:
+        # 1. Token-based: tok_q = tanh(gelu(qkv) @ tau_wq.T)
+        # 2. Position-based: tau_pos = 1 + (sigmoid(alpha * log(pos+1)) - 0.5)
+        # Final: tau = tok + tau_pos
+        #
+        # For TP, tau weights are sharded by head, but qkv_dim is kept full
+
+        # Get full qkv for tau computation
+        # With TP, reconstruct qkv in correct layout [q_full, k_full, v_full]
+        # (all-gather would produce [q_0, k_0, v_0, q_1, k_1, v_1] - wrong)
+        if self.tp_size > 1:
+            # All-gather once, then reconstruct [q_full, k_full, v_full].
+            qkv_full_sharded = tensor_model_parallel_all_gather(qkv.contiguous())
+            q_local_dim = q.shape[-1]
+            kv_local_dim = k.shape[-1]
+            qkv_full_sharded = qkv_full_sharded.view(
+                qkv.shape[0],
+                self.tp_size,
+                q_local_dim + 2 * kv_local_dim,
+            )
+            q_full = qkv_full_sharded[:, :, :q_local_dim].reshape(qkv.shape[0], -1)
+            k_full = qkv_full_sharded[
+                :, :, q_local_dim : q_local_dim + kv_local_dim
+            ].reshape(qkv.shape[0], -1)
+            v_full = qkv_full_sharded[:, :, q_local_dim + kv_local_dim :].reshape(
+                qkv.shape[0], -1
+            )
+            qkv_full = torch.cat([q_full, k_full, v_full], dim=-1).contiguous()
+        else:
+            qkv_full = qkv
+
+        # Compute tau scaling factors matching HF implementation exactly:
+        # tok_feat = gelu(qkv)
+        # tok_q = tanh(tok_feat @ tau_wq.T)  # [num_tokens, num_heads]
+        # tau_pos = 1 + (sigmoid(alpha * log(pos+1)) - 0.5)  # [num_heads, num_tokens]
+        # tau = (tok_q.T + tau_pos).T  # [num_tokens, num_heads]
+        num_tokens = qkv_full.shape[0]
+        orig_dtype = q.dtype
+
+        # Token-based component
+        tok_feat = F.gelu(qkv_full)  # Apply GELU activation
+        tok_q = torch.tanh(tok_feat @ self.tau_wq.t())  # [N, H_per_partition]
+        tok_v = torch.tanh(tok_feat @ self.tau_wv.t())  # [N, H_per_partition]
+
+        # Position-based component
+        # tau_pos = 1 + (sigmoid(alpha * log(pos+1)) - 0.5)
+        # positions is [num_tokens], need to compute for each head
+        # tau_alpha: [num_heads_per_partition]
+        pos_float = (positions.to(orig_dtype) + 1.0).clamp(min=1e-6)
+        pos_log = pos_float.log()  # [num_tokens]
+        # alpha[:, None] * pos_log[None, :] -> [num_heads, num_tokens]
+        tau_pos = 1.0 + (
+            torch.sigmoid(self.tau_alpha[:, None] * pos_log[None, :]) - 0.5
+        )  # [H_per_partition, N]
+
+        # Combine token and position components
+        tau_q = (tok_q + tau_pos.t()).to(orig_dtype)  # [N, H_per_partition]
+        tau_v = (tok_v + tau_pos.t()).to(orig_dtype)  # [N, H_per_partition]
+
+        # Reshape q and v to apply per-head tau scaling
+        q = q.view(num_tokens, self.num_heads_per_partition, self.head_dim)
+        v = v.view(num_tokens, self.num_kv_heads_per_partition, self.head_dim)
+
+        # Apply tau scaling
+        q = q * tau_q.unsqueeze(-1)
+        v = v * tau_v[:, : self.num_kv_heads_per_partition].unsqueeze(-1)
+
+        # Reshape back
+        q = q.view(num_tokens, -1)
+        v = v.view(num_tokens, -1)
+
+        q, k = self.rotary_emb(positions, q, k)
+
+        attn_output = self.attn(q, k, v)
+
+        output, _ = self.out_proj(attn_output)
+        return output
+
+
+class Moondream3DecoderLayer(nn.Module):
+    """Decoder layer with attention + MLP/MoE."""
+
+    def __init__(
+        self,
+        config: Moondream3TextConfig,
+        cache_config=None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        layer_idx = extract_layer_index(prefix)
+        self.layer_idx = layer_idx
+
+        self.ln = nn.LayerNorm(config.dim, eps=1e-5, bias=True)
+
+        self.attn = Moondream3Attention(
+            config=config,
+            layer_idx=layer_idx,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+        # Use MoE for layers >= moe_start_layer, standard MLP otherwise
+        if layer_idx >= config.moe_start_layer:
+            self.mlp = Moondream3TextMoE(
+                hidden_size=config.dim,
+                expert_inner_dim=config.moe_expert_inner_dim,
+                num_experts=config.moe_num_experts,
+                experts_per_token=config.moe_experts_per_token,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+        else:
+            self.mlp = Moondream3TextMLP(
+                hidden_size=config.dim,
+                intermediate_size=config.ff_dim,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        # Pre-norm architecture
+        normed = self.ln(hidden_states)
+        attn_out = self.attn(positions, normed)
+        mlp_out = self.mlp(normed)
+        hidden_states = hidden_states + attn_out + mlp_out
+        return hidden_states
+
+
+class Moondream3TextModel(nn.Module):
+    """Text decoder model."""
+
+    def __init__(
+        self,
+        config: Moondream3TextConfig,
+        cache_config=None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.wte = VocabParallelEmbedding(
+            config.vocab_size,
+            config.dim,
+            prefix=f"{prefix}.wte",
+        )
+
+        blocks_prefix = maybe_prefix(prefix, "blocks")
+        self.start_layer, self.end_layer, self.blocks = make_layers(
+            config.n_layers,
+            lambda prefix: Moondream3DecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=blocks_prefix,
+        )
+
+        self.post_ln = nn.LayerNorm(config.dim, eps=1e-5, bias=True)
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states"], config.dim
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.wte(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        pp_group = get_pp_group()
+        if pp_group.is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                assert input_ids is not None
+                hidden_states = self.embed_input_ids(input_ids)
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+
+        for i, layer in enumerate(
+            islice(self.blocks, self.start_layer, self.end_layer)
+        ):
+            hidden_states = layer(positions, hidden_states)
+
+        if not pp_group.is_last_rank:
+            return IntermediateTensors({"hidden_states": hidden_states})
+
+        hidden_states = self.post_ln(hidden_states)
+        return hidden_states
+
+
+@dataclass(frozen=True)
+class Moondream3ImageInput:
+    """Container holding per-image inputs for embedding."""
+
+    pixel_values: torch.Tensor
+    tiling: tuple[int, int] | None
+
+
+# ============================================================================
+# Multimodal Processing
+# ============================================================================
+
+
+class Moondream3ProcessingInfo(BaseProcessingInfo):
+    """Processing info for Moondream3."""
+
+    def get_hf_config(self):
+        return self.ctx.get_hf_config()
+
+    def get_hf_processor(self, **kwargs: object):
+        return self.ctx.get_hf_processor(Moondream3Processor, **kwargs)
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        return {"image": 1}
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        # HF pre-fills BOS together with the fixed 27x27 vision grid under
+        # the same bidirectional prefix mask: 1 BOS + 729 image embeddings.
+        return 730
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        return ImageSize(width=378, height=378)
+
+    def get_max_image_tokens(self) -> int:
+        return 730
+
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        return {"image": self.get_max_image_tokens()}
+
+
+class Moondream3DummyInputsBuilder(BaseDummyInputsBuilder[Moondream3ProcessingInfo]):
+    """Dummy inputs builder for profiling."""
+
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return (
+            "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>"
+            "What is this image?<|md_reserved_2|>"
+        )
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+        return {
+            "image": self._get_dummy_images(
+                width=378,
+                height=378,
+                num_images=num_images,
+            )
+        }
+
+
+class Moondream3MultiModalProcessor(BaseMultiModalProcessor[Moondream3ProcessingInfo]):
+    """Multimodal processor for Moondream3."""
+
+    image_placeholder: str = "<image>"
+    bos_image_placeholder: str = "<|endoftext|><image>"
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        # Moondream3's processor handles images directly rather than exposing a
+        # separate `image_processor`, so keep the cache path on text+MM calls.
+        return super()._call_hf_processor(prompt, mm_data, mm_kwargs, tok_kwargs)
+
+    @cached_property
+    def bos_image_placeholder_tokens(self) -> list[int]:
+        tokenizer = self.info.get_tokenizer()
+        token_ids = tokenizer.encode(
+            self.bos_image_placeholder,
+            add_special_tokens=False,
+        )
+        if len(token_ids) < 2:
+            raise ValueError(
+                "Tokenizer could not encode Moondream3 BOS/image placeholder "
+                f"{self.bos_image_placeholder!r}."
+            )
+        return token_ids
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return {
+            "pixel_values": MultiModalFieldConfig.batched("image"),
+            "tilings": MultiModalFieldConfig.batched("image", keep_on_cpu=True),
+        }
+
+    def _hf_processor_applies_updates(
+        self,
+        prompt_text: str,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Mapping[str, object],
+    ) -> bool:
+        # Moondream3 HF processor does NOT expand placeholder tokens.
+        # vLLM expands BOS + <image> so the whole HF image prefix is marked
+        # bidirectional by the multimodal prefix-LM mask.
+        return False
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> list[PromptUpdate]:
+        image_size = self.info.get_image_size_with_most_features()
+        num_image_tokens = self.info.get_num_image_tokens(
+            image_width=image_size.width,
+            image_height=image_size.height,
+        )
+        placeholder_tokens = self.bos_image_placeholder_tokens
+        bos_token = placeholder_tokens[0]
+        image_token = placeholder_tokens[-1]
+        return [
+            PromptReplacement(
+                modality="image",
+                target=placeholder_tokens,
+                replacement=PromptUpdateDetails(
+                    full=[bos_token] + [image_token] * (num_image_tokens - 1),
+                ),
+            ),
+        ]
+
+
+# ============================================================================
+# Main Model
+# ============================================================================
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    Moondream3MultiModalProcessor,
+    info=Moondream3ProcessingInfo,
+    dummy_inputs=Moondream3DummyInputsBuilder,
+)
+class Moondream3ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
+    """Moondream3 multimodal model for causal language modeling.
+
+    vLLM supports the standard autoregressive Moondream3 query and caption
+    prompt formats. The region-module point/detect skills require custom
+    coordinate decoding and are intentionally not exposed here.
+    """
+
+    supports_multimodal = True
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        hf_config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        cache_config = vllm_config.cache_config
+
+        # Reuse the transformers_utils config implementation.
+        if isinstance(hf_config, Moondream3Config):
+            self.config = hf_config
+        else:
+            config_dict = hf_config.config if hasattr(hf_config, "config") else {}
+            self.config = Moondream3Config(config=config_dict)
+
+        with self._mark_tower_model(vllm_config, "image"):
+            # Vision encoder
+            self.vision = Moondream3VisionEncoder(
+                config=self.config.vision_config,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "vision"),
+            )
+
+            # Vision projection
+            self.vision_proj = Moondream3VisionProjection(
+                input_dim=self.config.vision_config.enc_dim,
+                inner_dim=self.config.vision_config.proj_inner_dim,
+                output_dim=self.config.text_config.dim,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "vision_proj"),
+            )
+
+        with self._mark_language_model(vllm_config):
+            # Text decoder
+            self.text = Moondream3TextModel(
+                config=self.config.text_config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "text"),
+            )
+
+            # LM head (with bias - Moondream3 has lm_head bias)
+            self.lm_head = ParallelLMHead(
+                self.config.text_config.vocab_size,
+                self.config.text_config.dim,
+                bias=True,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+
+        self.logits_processor = LogitsProcessor(self.config.text_config.vocab_size)
+        self.make_empty_intermediate_tensors = self.text.make_empty_intermediate_tensors
+        self._answer_id = getattr(
+            self.config,
+            "answer_token_id",
+            getattr(hf_config, "answer_token_id", 3),
+        )
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality == "image":
+            return "<image>"
+        return None
+
+    def get_language_model(self) -> nn.Module:
+        return self.text
+
+    def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
+        return num_image_tokens
+
+    def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
+        return num_vision_tokens
+
+    def _split_pixel_values(
+        self,
+        pixel_values: object,
+    ) -> list[torch.Tensor]:
+        # The processor should standardize image inputs into:
+        # - torch.Tensor [num_images, num_crops, C, H, W], or
+        # - list[torch.Tensor[num_crops, C, H, W]] for ragged crops.
+        if isinstance(pixel_values, torch.Tensor):
+            if pixel_values.dim() != 5:
+                raise ValueError(
+                    "Expected `pixel_values` tensor with shape "
+                    "[num_images, num_crops, C, H, W], got "
+                    f"{tuple(pixel_values.shape)}."
+                )
+            return [pv.contiguous() for pv in pixel_values]
+
+        if isinstance(pixel_values, (list, tuple)):
+            tensors: list[torch.Tensor] = []
+            for value in pixel_values:
+                if not isinstance(value, torch.Tensor):
+                    raise TypeError(
+                        "Expected each `pixel_values` element to be a tensor, "
+                        f"got {type(value)!r}."
+                    )
+                if value.dim() != 4:
+                    raise ValueError(
+                        f"Unsupported pixel_values element shape {tuple(value.shape)}."
+                    )
+                tensors.append(value.contiguous())
+            return tensors
+
+        raise TypeError(
+            "pixel_values must be a tensor or a sequence of tensors, "
+            f"got {type(pixel_values)!r}."
+        )
+
+    def _split_tilings(
+        self,
+        tilings: object,
+        expected: int,
+    ) -> list[tuple[int, int] | None]:
+        if tilings is None:
+            return [None] * expected
+
+        if isinstance(tilings, torch.Tensor):
+            if tilings.dim() != 2 or tilings.shape[1] != 2:
+                raise ValueError(
+                    "Expected `tilings` tensor with shape [num_images, 2], got "
+                    f"{tuple(tilings.shape)}."
+                )
+            tiling_items = tilings.tolist()
+        elif isinstance(tilings, (list, tuple)):
+            tiling_items = list(tilings)
+        else:
+            raise TypeError(
+                "tilings must be None, a tensor or a sequence of tuples, "
+                f"got {type(tilings)!r}."
+            )
+
+        if len(tiling_items) != expected:
+            raise ValueError(
+                "Mismatch between the number of pixel_values entries "
+                f"({expected}) and tilings ({len(tiling_items)})."
+            )
+
+        normalized: list[tuple[int, int] | None] = []
+        for tiling in tiling_items:
+            if tiling is None:
+                normalized.append(None)
+                continue
+            if isinstance(tiling, torch.Tensor):
+                tiling = tiling.tolist()
+            if isinstance(tiling, (list, tuple)) and len(tiling) == 2:
+                normalized.append((int(tiling[0]), int(tiling[1])))
+            else:
+                raise ValueError(
+                    f"Each tiling entry must be a pair of integers, got {tiling!r}."
+                )
+        return normalized
+
+    def _parse_image_inputs(self, **kwargs: object) -> list[Moondream3ImageInput]:
+        pixel_values = kwargs.get("pixel_values")
+        if pixel_values is None:
+            return []
+
+        pixel_values_list = self._split_pixel_values(pixel_values)
+        tilings_list = self._split_tilings(
+            kwargs.get("tilings"), len(pixel_values_list)
+        )
+
+        image_inputs: list[Moondream3ImageInput] = []
+        for value, tiling in zip(pixel_values_list, tilings_list):
+            if value.dim() != 4:
+                raise ValueError(
+                    f"Expected 4D tensor for crops, got {tuple(value.shape)}."
+                )
+            image_inputs.append(Moondream3ImageInput(pixel_values=value, tiling=tiling))
+        return image_inputs
+
+    def _encode_image_input(self, image_input: Moondream3ImageInput) -> torch.Tensor:
+        pixel_values = image_input.pixel_values
+        if pixel_values.dim() != 4:
+            raise ValueError(
+                f"Expected 4D tensor for crops, got {tuple(pixel_values.shape)}."
+            )
+
+        device = self.vision.patch_emb.weight.device
+        dtype = self.vision.patch_emb.weight.dtype
+        pixel_values = pixel_values.to(device=device, dtype=dtype)
+
+        features = self.vision(pixel_values)
+
+        # Grid size = crop_size / patch_size (e.g., 378 / 14 = 27)
+        grid_size = (
+            self.config.vision_config.crop_size
+            // self.config.vision_config.enc_patch_size
+        )
+        enc_dim = self.config.vision_config.enc_dim
+        global_features = features[0]
+
+        if features.shape[0] > 1:
+            if image_input.tiling is None:
+                raise ValueError(
+                    "Missing tiling metadata for multi-crop Moondream image."
+                )
+            local = features[1:].contiguous().view(-1, grid_size, grid_size, enc_dim)
+            reconstructed = reconstruct_from_crops(
+                local,
+                image_input.tiling,
+                overlap_margin=self.config.vision_config.overlap_margin,
+                patch_size=1,
+            )
+        else:
+            reconstructed = global_features.view(grid_size, grid_size, enc_dim)
+
+        recon = reconstructed.permute(2, 0, 1).contiguous()
+        # Mirror HF reference behavior: reconstructed local features are pooled
+        # to enc_n_layers x enc_n_layers. For moondream3-preview this is 27x27.
+        pooled_size = self.config.vision_config.enc_n_layers
+        if pooled_size != grid_size:
+            logger.warning_once(
+                "Moondream3 pooled_size (%d) differs from crop grid (%d). "
+                "Using enc_n_layers to match HF reference behavior.",
+                pooled_size,
+                grid_size,
+            )
+        recon = F.adaptive_avg_pool2d(recon, output_size=(pooled_size, pooled_size))
+        recon = recon.permute(1, 2, 0).contiguous().view(-1, enc_dim)
+
+        combined = torch.cat([global_features, recon], dim=-1).unsqueeze(0)
+        projected = self.vision_proj(combined).squeeze(0)
+
+        # Note: Vision embeddings are already synchronized across TP ranks
+        # because the vision projection uses RowParallelLinear which performs
+        # all-reduce internally, ensuring identical outputs on all ranks.
+
+        return projected
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        """Generate the HF image prefix: BOS embedding + 729 image embeddings."""
+        image_inputs = self._parse_image_inputs(**kwargs)
+        if not image_inputs:
+            return []
+
+        device = self.vision.patch_emb.weight.device
+        bos_ids = torch.tensor([self.config.bos_token_id], device=device)
+        bos_embedding = self.text.embed_input_ids(bos_ids)
+
+        embeddings: list[torch.Tensor] = []
+        for image_input in image_inputs:
+            image_embeddings = self._encode_image_input(image_input)
+            embeddings.append(
+                torch.cat([bos_embedding.to(image_embeddings.dtype), image_embeddings])
+            )
+        return embeddings
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.text(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        if logits is not None:
+            logits[:, self._answer_id] = float("-inf")
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load weights with remapping from HuggingFace format."""
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        # Get expert intermediate size for fc1 splitting
+
+        for name, loaded_weight in weights:
+            # Map from HF naming to vLLM naming
+            # model.vision.* -> vision.*
+            # model.text.* -> text.*
+            if name.startswith("model."):
+                name = name[6:]  # Remove "model." prefix
+
+            # Specific name mappings
+            # Vision projection: vision.proj_mlp.fc1 -> vision_proj.fc1
+            name = name.replace("vision.proj_mlp.", "vision_proj.")
+
+            # Text embedding: text.wte (no suffix) -> text.wte.weight
+            if name == "text.wte":
+                name = "text.wte.weight"
+
+            # LM head: text.lm_head -> lm_head
+            name = name.replace("text.lm_head.", "lm_head.")
+
+            # Attention mapping
+            name = name.replace(".attn.qkv.", ".attn.qkv_proj.")
+            name = name.replace(".attn.proj.", ".attn.out_proj.")
+
+            # Tau attention scaling weights
+            # HF format: .attn.tau.alpha -> .attn.tau_alpha
+            name = name.replace(".attn.tau.alpha", ".attn.tau_alpha")
+            name = name.replace(".attn.tau.wq", ".attn.tau_wq")
+            name = name.replace(".attn.tau.wv", ".attn.tau_wv")
+
+            # MoE router mapping: mlp.router -> mlp.gate
+            name = name.replace(".mlp.router.", ".mlp.gate.")
+
+            # Handle MoE expert weights for layers 4+ with expert parallelism
+            # fc1.weight: [n_experts, expert_inner_dim * 2, hidden_size] (gate+up)
+            # fc2.weight: [n_experts, hidden_size, expert_inner_dim] (down)
+            # Each GPU stores n_experts/tp_size experts
+            # Note: Only 3D weights are MoE, 2D weights are standard MLP
+            if ".mlp.fc1.weight" in name and loaded_weight.dim() == 3:
+                from vllm.distributed import get_tensor_model_parallel_rank
+
+                tp_size = get_tensor_model_parallel_world_size()
+                tp_rank = get_tensor_model_parallel_rank()
+                num_experts = loaded_weight.shape[0]
+                experts_per_rank = num_experts // tp_size
+                expert_start = tp_rank * experts_per_rank
+                expert_end = expert_start + experts_per_rank
+                # Shard by expert dimension
+                loaded_weight = loaded_weight[expert_start:expert_end].contiguous()
+                # Map to our custom MoE format: mlp.fc1_weight
+                name = name.replace(".mlp.fc1.weight", ".mlp.fc1_weight")
+
+            if ".mlp.fc2.weight" in name and loaded_weight.dim() == 3:
+                from vllm.distributed import get_tensor_model_parallel_rank
+
+                tp_size = get_tensor_model_parallel_world_size()
+                tp_rank = get_tensor_model_parallel_rank()
+                num_experts = loaded_weight.shape[0]
+                experts_per_rank = num_experts // tp_size
+                expert_start = tp_rank * experts_per_rank
+                expert_end = expert_start + experts_per_rank
+                # Shard by expert dimension
+                loaded_weight = loaded_weight[expert_start:expert_end].contiguous()
+                # Map to our custom MoE format: mlp.fc2_weight
+                name = name.replace(".mlp.fc2.weight", ".mlp.fc2_weight")
+
+            # Handle tau weights with tensor parallelism
+            # tau_alpha: [num_heads] -> [num_heads/tp]
+            # tau_wq: [num_heads, qkv_dim] -> [num_heads/tp, qkv_dim/tp]
+            # tau_wv: [num_heads, qkv_dim] -> [num_heads/tp, qkv_dim/tp]
+            if ".tau_alpha" in name:
+                from vllm.distributed import get_tensor_model_parallel_rank
+
+                tp_size = get_tensor_model_parallel_world_size()
+                tp_rank = get_tensor_model_parallel_rank()
+                num_heads = loaded_weight.shape[0]
+                heads_per_partition = num_heads // tp_size
+                start = tp_rank * heads_per_partition
+                end = start + heads_per_partition
+                loaded_weight = loaded_weight[start:end].contiguous()
+
+            if ".tau_wq" in name or ".tau_wv" in name:
+                from vllm.distributed import get_tensor_model_parallel_rank
+
+                tp_size = get_tensor_model_parallel_world_size()
+                tp_rank = get_tensor_model_parallel_rank()
+                num_heads, qkv_dim = loaded_weight.shape
+                heads_per_partition = num_heads // tp_size
+                # Only shard by head dimension, keep full qkv_dim for all-gather
+                head_start = tp_rank * heads_per_partition
+                head_end = head_start + heads_per_partition
+                loaded_weight = loaded_weight[head_start:head_end, :].contiguous()
+
+            if name in params_dict:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                loaded_params.add(name)
+
+        return loaded_params
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 684ced0a6abd..994b52606b18 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -1499,6 +1499,11 @@ def compute_logits(
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        mm_config = self.model_config.multimodal_config
+        load_multimodal_weights = not all(
+            mm_config.get_limit_per_prompt(modality) == 0
+            for modality in ("image", "video", "audio")
+        )
         adapter_dict = dict(self.mlp1.named_parameters())
 
         def is_llm(name: str) -> bool:
@@ -1523,23 +1528,30 @@ def is_sound_weights(name: str) -> bool:
                 # Strip 'language_model.' prefix for LLM weights
                 llm_weights.append((".".join(name.split(".")[1:]), w))
             elif is_adapter_weights((name, w)):
+                if not load_multimodal_weights:
+                    continue
                 # Load vision-language adapter weights directly
                 trimmed_name = ".".join(name.split(".")[1:])
                 param = adapter_dict[trimmed_name]
                 with torch.no_grad():
                     default_weight_loader(param, w)
             elif is_vision_weights(name):
+                if not load_multimodal_weights:
+                    continue
                 # Convert: vision_model.radio_model.* → radio_model.*
                 hf_key = name[len("vision_model.") :]  # Remove "vision_model." prefix
                 vision_weights.append((hf_key, w))
             elif is_sound_weights(name):
+                if not load_multimodal_weights:
+                    continue
                 assert self.sound_encoder is not None
                 sound_weights.append((name, w))
 
         self.language_model.load_weights(llm_weights)
-        self.vision_model.load_weights(vision_weights)
-        if self.sound_encoder is not None and len(sound_weights) > 0:
-            self.sound_encoder.load_weights(sound_weights)
+        if load_multimodal_weights:
+            self.vision_model.load_weights(vision_weights)
+            if self.sound_encoder is not None and len(sound_weights) > 0:
+                self.sound_encoder.load_weights(sound_weights)
 
     def get_vit_model_from_radio_config(self, hf_config):
         hf_config_vision = hf_config.vision_config
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index e81541b29aec..c6d369fffa1d 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -797,6 +797,83 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            # Update the weight names to be compatible with the vllm version
+            # of the model.
+            # Do not change the order of the replacements.
+            replacements = {
+                # Rename incompatible weight names.
+                ".A_log": ".A",
+                ".B_norm_weight": ".B_norm.weight",
+                ".C_norm_weight": ".C_norm.weight",
+                ".dt_norm_weight": ".dt_norm.weight",
+                ".q_weight": ".q_norm.weight",
+                ".k_weight": ".k_norm.weight",
+            }
+            # Apply replacements based on the defined mappings
+            for old, new in replacements.items():
+                if old in name:
+                    name = name.replace(old, new)
+
+            # Reshape the in_proj weights to match the shape expected
+            # by MergedColumnParallelLinear.
+            # This works both for unquantized weights and
+            # for quantized weights.
+            # In the quantized case, the weights are already transposed.
+            # Also, in addition to the quantized weights,
+            # the zero points and scales have to be reshaped as well.
+            # Packing should not be affected by this.
+            if (
+                ".mixer.in_proj.weight" in name
+                or "mixer.in_proj.qweight" in name
+                or "mixer.in_proj.scales" in name
+                or "mixer.in_proj.qzeros" in name
+            ):
+                if "mixer.in_proj.weight" in name:
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                # for weight:
+                # loaded_weight.shape[0] == self.config.hidden_size
+                # for qweight:
+                # loaded_weight.shape[0] == self.config.hidden_size // param.pack_factor  # noqa
+                # for scales and qzeros:
+                # loaded_weight.shape[0] == self.config.hidden_size // self.vllm_config.quant_config.group_size  # noqa
+                loaded_weight = loaded_weight.reshape(
+                    loaded_weight.shape[0], self.config.mamba_num_heads, -1
+                )
+                gate_weight, hidden_states_weight = loaded_weight.chunk(2, dim=-1)
+                gate_weight = gate_weight.reshape(loaded_weight.shape[0], -1)
+                hidden_states_weight = hidden_states_weight.reshape(
+                    loaded_weight.shape[0], -1
+                )
+                loaded_weight = torch.cat([gate_weight, hidden_states_weight], dim=-1)
+                if "mixer.in_proj.weight" in name:
+                    loaded_weight = loaded_weight.transpose(0, 1)
+
+            # Offset parameter with vllm's RMSNorm haven't been supported yet.
+            if ".pre_mixer_norm" in name:
+                loaded_weight += 1.0
+            elif ".post_mixer_norm" in name:
+                loaded_weight += 1.0 / 5
+            elif ".pre_mlp_norm" in name:
+                loaded_weight += 1.0
+            elif ".post_mlp_norm" in name:
+                loaded_weight += 1.0 / (5**1.5)
+            elif name == "norm.weight":
+                loaded_weight += 1.0
+
+            # Skip layers on other devices.
+            if is_pp_missing_parameter(name, self):
+                continue
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
 
 class Plamo2ForCausalLM(
     torch.nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHybrid
@@ -906,88 +983,9 @@ def compute_logits(
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-        params_dict = dict(self.named_parameters())
-        for name, loaded_weight in weights:
-            # Both tie_word_embeddings=True and lm_head.weight in the safetensor
-            # at the same time causes dict key access error.
-            if name == "lm_head.weight" and self.config.tie_word_embeddings:
-                assert "lm_head.weight" not in params_dict
-                continue
-            # Same workaround as AutoWeightsLoader for GPTQModel
-            if any(
-                substr in name
-                for substr in AutoWeightsLoader.ROTARY_EMBEDS_UNUSED_WEIGHTS
-            ):
-                continue
-
-            # Update the weight names to be compatible with the vllm version
-            # of the model.
-            # Do not change the order of the replacements.
-            replacements = {
-                # Rename incompatible weight names.
-                ".A_log": ".A",
-                ".B_norm_weight": ".B_norm.weight",
-                ".C_norm_weight": ".C_norm.weight",
-                ".dt_norm_weight": ".dt_norm.weight",
-                ".q_weight": ".q_norm.weight",
-                ".k_weight": ".k_norm.weight",
-            }
-            # Apply replacements based on the defined mappings
-            for old, new in replacements.items():
-                if old in name:
-                    name = name.replace(old, new)
-
-            # Reshape the in_proj weights to match the shape expected
-            # by MergedColumnParallelLinear.
-            # This works both for unquantized weights and
-            # for quantized weights.
-            # In the quantized case, the weights are already transposed.
-            # Also, in addition to the quantized weights,
-            # the zero points and scales have to be reshaped as well.
-            # Packing should not be affected by this.
-            if (
-                ".mixer.in_proj.weight" in name
-                or "mixer.in_proj.qweight" in name
-                or "mixer.in_proj.scales" in name
-                or "mixer.in_proj.qzeros" in name
-            ):
-                if "mixer.in_proj.weight" in name:
-                    loaded_weight = loaded_weight.transpose(0, 1)
-                # for weight:
-                # loaded_weight.shape[0] == self.config.hidden_size
-                # for qweight:
-                # loaded_weight.shape[0] == self.config.hidden_size // param.pack_factor  # noqa
-                # for scales and qzeros:
-                # loaded_weight.shape[0] == self.config.hidden_size // self.vllm_config.quant_config.group_size  # noqa
-                loaded_weight = loaded_weight.reshape(
-                    loaded_weight.shape[0], self.config.mamba_num_heads, -1
-                )
-                gate_weight, hidden_states_weight = loaded_weight.chunk(2, dim=-1)
-                gate_weight = gate_weight.reshape(loaded_weight.shape[0], -1)
-                hidden_states_weight = hidden_states_weight.reshape(
-                    loaded_weight.shape[0], -1
-                )
-                loaded_weight = torch.cat([gate_weight, hidden_states_weight], dim=-1)
-                if "mixer.in_proj.weight" in name:
-                    loaded_weight = loaded_weight.transpose(0, 1)
-
-            # Offset parameter with vllm's RMSNorm haven't been supported yet.
-            if ".pre_mixer_norm" in name:
-                loaded_weight += 1.0
-            elif ".post_mixer_norm" in name:
-                loaded_weight += 1.0 / 5
-            elif ".pre_mlp_norm" in name:
-                loaded_weight += 1.0
-            elif ".post_mlp_norm" in name:
-                loaded_weight += 1.0 / (5**1.5)
-            elif "model.norm.weight" in name:
-                loaded_weight += 1.0
-
-            # Skip layers on other devices.
-            if is_pp_missing_parameter(name, self):
-                continue
-
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader", default_weight_loader)
-            weight_loader(param, loaded_weight)
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/qianfan_ocr.py b/vllm/model_executor/models/qianfan_ocr.py
new file mode 100644
index 000000000000..ef2bec1e2900
--- /dev/null
+++ b/vllm/model_executor/models/qianfan_ocr.py
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# QianfanOCR is built on InternVL with a Qwen3 language backbone.
+# The model architecture and weights are fully compatible with InternVLChatModel,
+# only the config model_type / architectures strings differ.
+
+from transformers import PretrainedConfig
+
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.quantization.fp8 import Fp8Config
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.transformers_utils.processors.internvl import (
+    InternVLImageProcessor,
+    InternVLProcessor,
+)
+
+from .internvl import (
+    BaseInternVLDummyInputsBuilder,
+    BaseInternVLMultiModalProcessor,
+    BaseInternVLProcessingInfo,
+    InternVLChatModel,
+)
+
+
+class QianfanOCRProcessingInfo(BaseInternVLProcessingInfo):
+    """Image-only ProcessingInfo for QianfanOCR (no video support)."""
+
+    def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
+        config = self.get_hf_config()
+        vision_config = config.vision_config
+
+        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+        kwargs.setdefault("image_size", vision_config.image_size)
+        kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
+        kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
+        kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
+        kwargs.setdefault("use_thumbnail", config.use_thumbnail)
+
+        image_processor = InternVLImageProcessor(**kwargs)
+        image_size = image_processor.image_size
+        patch_size = vision_config.patch_size
+        downsample_ratio = config.downsample_ratio
+        image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
+
+        return InternVLProcessor(
+            tokenizer=self.get_tokenizer(),
+            image_processor=image_processor,
+            video_processor=None,
+            image_seq_length=image_seq_length,
+            ctx_video_token=None,
+        )
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    BaseInternVLMultiModalProcessor,
+    info=QianfanOCRProcessingInfo,
+    dummy_inputs=BaseInternVLDummyInputsBuilder,
+)
+class QianfanOCRForConditionalGeneration(InternVLChatModel):
+    """QianfanOCR multimodal model.
+
+    Identical in structure to InternVLChatModel (InternViT vision encoder +
+    pixel-shuffle MLP connector + Qwen3 language model).  This class exists
+    solely to register the ``QianfanOCRForConditionalGeneration`` architecture
+    name that appears in the model's config.json.
+    """
+
+    def _patch_quant_config(
+        self, config: PretrainedConfig, quant_config: QuantizationConfig
+    ) -> None:
+        super()._patch_quant_config(config, quant_config)
+        # ignore vit layers to preserve model performance
+        if isinstance(quant_config, Fp8Config):
+            _FP8_IGNORED_LAYERS = [
+                *(
+                    layer
+                    for i in range(config.vision_config.num_hidden_layers)
+                    for layer in [
+                        f"vision_model.encoder.layers.{i}.attn.qkv",
+                        f"vision_model.encoder.layers.{i}.attn.proj",
+                        f"vision_model.encoder.layers.{i}.mlp.fc1",
+                        f"vision_model.encoder.layers.{i}.mlp.fc2",
+                    ]
+                ),
+                "language_model.lm_head",
+                "mlp1.1",
+                "mlp1.3",
+            ]
+            for layer in _FP8_IGNORED_LAYERS:
+                if layer not in quant_config.ignored_layers:
+                    quant_config.ignored_layers.append(layer)
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 27aa6175b9bc..b83fedc70db0 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -312,48 +312,15 @@ def forward(
         return hidden_states, residual
 
 
-def qwen_2_model_invariants(
-    input_ids: torch.Tensor,
-    positions: torch.Tensor,
-    intermediate_tensors: IntermediateTensors | None = None,
-    inputs_embeds: torch.Tensor | None = None,
-):
-    """Shape invariants for Qwen2Model Model, those are translated to
-    runtime assertions for unbacked dynamic shapes and are compiled away for
-    backed"""
-    # All these should be equal.
-    # input_ids.size()[0]
-    # positions.size()[-1]
-    # intermediate_tensors["hidden_states"].size()[0]
-    # inputs_embeds.size()[0]
-    torch._check(input_ids.size()[0] == positions.size()[-1])
-    if intermediate_tensors is not None:
-        torch._check(
-            input_ids.size()[0] == intermediate_tensors["hidden_states"].size()[0]
-        )
-
-    if inputs_embeds is not None:
-        torch._check(input_ids.size()[0] == inputs_embeds.size()[0])
-
-    # Hidden dimensions should match (hidden_size)
-    # intermediate_tensors["hidden_states"].size()[1]
-    # inputs_embeds.size()[1]
-    if inputs_embeds is not None and intermediate_tensors is not None:
-        torch._check(
-            inputs_embeds.size()[1] == intermediate_tensors["hidden_states"].size()[1]
-        )
-
-
 @support_torch_compile(
     dynamic_arg_dims={
-        "input_ids": 0,
+        "input_ids": {0: "b"},
         # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl,
         # otherwise (seq_len, ).
-        "positions": -1,
-        "intermediate_tensors": 0,
-        "inputs_embeds": 0,
-    },
-    shape_invariants=qwen_2_model_invariants,
+        "positions": {-1: "b"},
+        "intermediate_tensors": {0: "b"},
+        "inputs_embeds": {0: "b"},
+    }
 )
 class Qwen2Model(nn.Module, EagleModelMixin):
     def __init__(
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index c11684b4b89b..54334c91bfa6 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -85,11 +85,13 @@
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.worker.encoder_cudagraph_defs import EncoderCudaGraphReplayBuffers
 
 from .interfaces import (
     MultiModalEmbeddings,
     SupportsEagle,
     SupportsEagle3,
+    SupportsEncoderCudaGraph,
     SupportsLoRA,
     SupportsMRoPE,
     SupportsMultiModal,
@@ -771,22 +773,54 @@ def invert_permutation(perm: torch.Tensor) -> torch.Tensor:
         inv[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
         return inv
 
-    def forward(
+    def prepare_encoder_metadata(
         self,
-        x: torch.Tensor,
         grid_thw: list[list[int]],
-    ) -> torch.Tensor:
+        *,
+        max_batch_size: int | None = None,
+        max_frames_per_batch: int | None = None,
+        max_window_seqs_per_batch: int | None = None,
+        max_seqlen_override: int | None = None,
+        max_seqlen_window_override: int | None = None,
+        device: torch.device | None = None,
+    ) -> dict[str, torch.Tensor]:
+        """Compute encoder metadata from grid_thw.
+
+        Shared by the eager forward path, CUDA graph capture, and
+        CUDA graph replay to avoid duplicated implementation.
+
+        Args:
+            grid_thw: Grid configurations as list of [t, h, w].
+            max_batch_size: If set, pad cu_seqlens to this size
+                (needed for CUDA graph capture/replay).
+            max_frames_per_batch: If set, overrides max_batch_size for
+                cu_seqlens padding. For video inputs each item contributes
+                T attention sequences (frames); this sizes the buffer to
+                the total frame budget so video replays never overflow.
+            max_window_seqs_per_batch: If set, pad cu_window_seqlens to this
+                number of window sequences. This keeps cu_window_seqlens shape
+                stable across capture/replay for CUDA graph safety.
+            max_seqlen_override: If set, use this value for max_seqlen
+                instead of computing from cu_seqlens (needed for CUDA
+                graph capture to cover worst-case replay scenarios).
+            max_seqlen_window_override: If set, use this value for
+                window-attention max_seqlen instead of computing from
+                cu_window_seqlens (needed for CUDA graph capture to
+                cover worst-case replay scenarios).
+            device: Device to place tensors on. Defaults to self.device.
+        """
+
+        if device is None:
+            device = self.device
+        metadata: dict[str, torch.Tensor] = {}
+
         # patchify
-        seq_len, _ = x.size()
         rotary_pos_emb_cos = []
         rotary_pos_emb_sin = []
         window_index: list = []
         cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)]
         cu_seqlens: list = []
 
-        hidden_states = x.to(device=self.device, dtype=self.dtype)
-        hidden_states = self.patch_embed(hidden_states)
-
         window_index_id = 0
         cu_window_seqlens_last = 0
         for t, h, w in grid_thw:
@@ -825,23 +859,99 @@ def forward(
         cu_seqlens = torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32)
         cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
 
+        # Pad cu_seqlens to the required number of sequences.
+        # For videos each item contributes T frames = T attention sequences,
+        # so the total can exceed max_batch_size. max_frames_per_batch
+        # overrides the pad target when set.
+        pad_to = (
+            max_frames_per_batch if max_frames_per_batch is not None else max_batch_size
+        )
+        if pad_to is not None:
+            num_seqs = len(cu_seqlens) - 1
+            if num_seqs < pad_to:
+                cu_seqlens = torch.cat(
+                    (
+                        cu_seqlens,
+                        torch.full(
+                            (pad_to - num_seqs,),
+                            cu_seqlens[-1],
+                            dtype=cu_seqlens.dtype,
+                            device=cu_seqlens.device,
+                        ),
+                    )
+                )
+
+        # Pad cu_window_seqlens to a stable number of window sequences.
+        # Like cu_seqlens, we repeat the last cumulative offset so padded
+        # entries represent empty sequences.
+        if max_window_seqs_per_batch is not None:
+            num_window_seqs = len(cu_window_seqlens) - 1
+            if num_window_seqs < max_window_seqs_per_batch:
+                cu_window_seqlens = torch.cat(
+                    (
+                        cu_window_seqlens,
+                        torch.full(
+                            (max_window_seqs_per_batch - num_window_seqs,),
+                            cu_window_seqlens[-1],
+                            dtype=cu_window_seqlens.dtype,
+                            device=cu_window_seqlens.device,
+                        ),
+                    )
+                )
+
         # transformers
         # pre-compute seqlens for window/full attn to reduce cuMemcpy operations
-        max_seqlen_full = self.compute_attn_mask_seqlen(cu_seqlens)
-        max_seqlen_window = self.compute_attn_mask_seqlen(cu_window_seqlens)
+        if max_seqlen_override is None:
+            max_seqlen_full = self.compute_attn_mask_seqlen(cu_seqlens)
+        else:
+            max_seqlen_full = torch.tensor(max_seqlen_override, dtype=torch.int32)
+        if max_seqlen_window_override is None:
+            max_seqlen_window = self.compute_attn_mask_seqlen(cu_window_seqlens)
+        else:
+            max_seqlen_window = torch.tensor(
+                max_seqlen_window_override, dtype=torch.int32
+            )
 
-        cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True)
-        cu_window_seqlens = cu_window_seqlens.to(device=self.device, non_blocking=True)
-        rotary_pos_emb_cos = rotary_pos_emb_cos.to(
-            device=self.device, non_blocking=True
-        )
-        rotary_pos_emb_sin = rotary_pos_emb_sin.to(
-            device=self.device, non_blocking=True
-        )
-        window_index = window_index.to(device=hidden_states.device, non_blocking=True)
-        reverse_indices = reverse_indices.to(
-            device=hidden_states.device, non_blocking=True
-        )
+        cu_seqlens = cu_seqlens.to(device=device, non_blocking=True)
+        cu_window_seqlens = cu_window_seqlens.to(device=device, non_blocking=True)
+        rotary_pos_emb_cos = rotary_pos_emb_cos.to(device=device, non_blocking=True)
+        rotary_pos_emb_sin = rotary_pos_emb_sin.to(device=device, non_blocking=True)
+        window_index = window_index.to(device=device, non_blocking=True)
+        reverse_indices = reverse_indices.to(device=device, non_blocking=True)
+
+        metadata["rotary_pos_emb_cos"] = rotary_pos_emb_cos
+        metadata["rotary_pos_emb_sin"] = rotary_pos_emb_sin
+        metadata["window_index"] = window_index
+        metadata["reverse_indices"] = reverse_indices
+        metadata["cu_seqlens"] = cu_seqlens
+        metadata["cu_window_seqlens"] = cu_window_seqlens
+        metadata["max_seqlen_full"] = max_seqlen_full
+        metadata["max_seqlen_window"] = max_seqlen_window
+
+        return metadata
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: list[list[int]],
+        *,
+        encoder_metadata: dict[str, torch.Tensor] | None = None,
+    ) -> torch.Tensor:
+        hidden_states = x.to(device=self.device, dtype=self.dtype)
+        hidden_states = self.patch_embed(hidden_states)
+
+        seq_len = hidden_states.shape[0]
+        if encoder_metadata is None:
+            encoder_metadata = self.prepare_encoder_metadata(grid_thw)
+
+        rotary_pos_emb_cos = encoder_metadata["rotary_pos_emb_cos"]
+        rotary_pos_emb_sin = encoder_metadata["rotary_pos_emb_sin"]
+        window_index = encoder_metadata["window_index"]
+        reverse_indices = encoder_metadata["reverse_indices"]
+        cu_seqlens = encoder_metadata["cu_seqlens"]
+        cu_window_seqlens = encoder_metadata["cu_window_seqlens"]
+        max_seqlen_full = encoder_metadata["max_seqlen_full"]
+        max_seqlen_window = encoder_metadata["max_seqlen_window"]
 
         hidden_states = hidden_states.reshape(
             seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
@@ -1003,6 +1113,7 @@ def get_replacement_qwen2vl(item_idx: int, modality: str):
 class Qwen2_5_VLForConditionalGeneration(
     nn.Module,
     SupportsMultiModal,
+    SupportsEncoderCudaGraph,
     SupportsLoRA,
     SupportsPP,
     SupportsQuant,
@@ -1124,6 +1235,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
         self.config = config
+        self.model_config = vllm_config.model_config
         self.vllm_config = vllm_config
         self.multimodal_config = multimodal_config
         self.video_pruning_rate = multimodal_config.video_pruning_rate
@@ -1447,6 +1559,302 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
                 multimodal_embeddings += tuple(video_embeddings)
         return multimodal_embeddings
 
+    # -- SupportsEncoderCudaGraph protocol methods --
+
+    def get_encoder_cudagraph_config(self):
+        from vllm.v1.worker.encoder_cudagraph_defs import (
+            EncoderCudaGraphConfig,
+        )
+
+        # NOTE: With EVS pruning enabled, multimodal embeddings are post-processed
+        # (append positions for image and prune+append positions for video) in
+        # embed_multimodal(). The encoder CUDA graph path bypasses that postprocess
+        # hook, so disable CUDA graph for all modalities to avoid inconsistent
+        # embedding formats between eager and cudagraph paths.
+        modalities = [] if self.is_multimodal_pruning_enabled else ["image", "video"]
+
+        return EncoderCudaGraphConfig(
+            modalities=modalities,
+            input_key_by_modality={
+                "image": "pixel_values",
+                "video": "pixel_values_videos",
+            },
+            buffer_keys=[
+                "rotary_pos_emb_cos",
+                "rotary_pos_emb_sin",
+                "window_index",
+                "reverse_indices",
+                "cu_seqlens",
+                "cu_window_seqlens",
+                "max_seqlen_full",
+                "max_seqlen_window",
+            ],
+            out_hidden_size=self.visual.out_hidden_size,
+        )
+
+    def get_input_modality(
+        self,
+        mm_kwargs: dict[str, Any],
+    ) -> str:
+        if "image_grid_thw" in mm_kwargs:
+            return "image"
+        return "video"
+
+    def get_max_frames_per_video(self) -> int:
+        mm_registry = MULTIMODAL_REGISTRY
+        info = mm_registry.get_processing_info(self.model_config)
+        max_frames_per_video = info.get_num_frames_with_most_features(
+            seq_len=self.model_config.max_model_len,
+            mm_counts={"video": self.multimodal_config.get_limit_per_prompt("video")},
+        )
+        return max_frames_per_video
+
+    def get_encoder_cudagraph_budget_range(
+        self,
+        vllm_config: VllmConfig,
+    ) -> tuple[int, int]:
+        # Min: estimated smallest possible encoder input.
+        # 224x224 image → 16x16 patches (patch_size=14)
+        #                 spatial_merge_size=2 → 8x8 = 64 tokens
+        min_budget = 64
+        # Max: capped by max_num_batched_tokens
+        max_budget = min(
+            vllm_config.scheduler_config.max_num_batched_tokens,
+            self.model_config.max_model_len,
+        )
+        return (min_budget, max_budget)
+
+    def _get_pixel_values_by_modality(
+        self,
+        mm_kwargs: dict[str, Any],
+    ) -> torch.Tensor:
+        if self.get_input_modality(mm_kwargs) == "image":
+            pixel_values = mm_kwargs["pixel_values"]
+        else:
+            pixel_values = mm_kwargs["pixel_values_videos"]
+        return pixel_values
+
+    def _get_grid_thw_by_modality(
+        self,
+        mm_kwargs: dict[str, Any],
+    ) -> list[tuple[int, int, int]]:
+        grid_thw_key = f"{self.get_input_modality(mm_kwargs)}_grid_thw"
+        grid_thw = mm_kwargs[grid_thw_key]
+        if not isinstance(grid_thw, list):
+            grid_thw = grid_thw.tolist()
+        return grid_thw
+
+    def get_encoder_cudagraph_num_items(
+        self,
+        mm_kwargs: dict[str, Any],
+    ) -> int:
+        return len(self._get_grid_thw_by_modality(mm_kwargs))
+
+    def get_encoder_cudagraph_per_item_output_tokens(
+        self,
+        mm_kwargs: dict[str, Any],
+    ) -> list[int]:
+        m = self.visual.spatial_merge_size
+        grid_thw = self._get_grid_thw_by_modality(mm_kwargs)
+        return [t * (h // m) * (w // m) for t, h, w in grid_thw]
+
+    def get_encoder_cudagraph_per_item_input_sizes(
+        self,
+        mm_kwargs: dict[str, Any],
+    ) -> list[int]:
+        grid_thw = self._get_grid_thw_by_modality(mm_kwargs)
+        return [t * h * w for t, h, w in grid_thw]
+
+    def select_encoder_cudagraph_items(
+        self,
+        mm_kwargs: dict[str, Any],
+        indices: list[int],
+    ) -> dict[str, Any]:
+        grid_thw = self._get_grid_thw_by_modality(mm_kwargs)
+        pixel_values = self._get_pixel_values_by_modality(mm_kwargs)
+
+        if len(indices) == 0:
+            if self.get_input_modality(mm_kwargs) == "image":
+                return {
+                    "pixel_values": pixel_values[:0],
+                    "image_grid_thw": [],
+                }
+            else:
+                return {
+                    "pixel_values_videos": pixel_values[:0],
+                    "video_grid_thw": [],
+                }
+
+        # Compute cumulative patch offsets for slicing pixel_values
+        patches_per_item = [t * h * w for t, h, w in grid_thw]
+        cum_patches = [0]
+        for p in patches_per_item:
+            cum_patches.append(cum_patches[-1] + p)
+
+        selected_pv = torch.cat(
+            [pixel_values[cum_patches[i] : cum_patches[i + 1]] for i in indices]
+        )
+        selected_grid = [grid_thw[i] for i in indices]
+
+        if self.get_input_modality(mm_kwargs) == "image":
+            return {
+                "pixel_values": selected_pv,
+                "image_grid_thw": selected_grid,
+            }
+        else:
+            return {
+                "pixel_values_videos": selected_pv,
+                "video_grid_thw": selected_grid,
+            }
+
+    def prepare_encoder_cudagraph_capture_inputs(
+        self,
+        token_budget: int,
+        max_batch_size: int,
+        max_frames_per_batch: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ):
+        from vllm.v1.worker.encoder_cudagraph_defs import (
+            EncoderCudaGraphCaptureInputs,
+        )
+
+        spatial_merge_size = self.visual.spatial_merge_size
+        max_window_seqs_per_batch = min(
+            self.vllm_config.scheduler_config.max_num_batched_tokens,
+            self.model_config.max_model_len,
+        )
+        # Use ceil here (not floor) so total captured capacity is never smaller
+        # than token_budget when token_budget is not divisible by max_batch_size
+        # (e.g., 324 budget with max_batch_size=8). Floor under-allocates
+        # input_buffer and can fail replay copy for valid single-item batches.
+        per_mm_item_output = (token_budget + max_batch_size - 1) // max_batch_size
+
+        frames_per_item = max_frames_per_batch // max_batch_size
+        if frames_per_item > 1:
+            # Build the capture grid using a video-format layout so that
+            # cu_seqlens is sized for video replays from the start.
+            # cu_seqlens has one entry per attention sequence (one per frame),
+            # so using T > 1 per item makes the buffer large enough without
+            # relying solely on padding.
+            # Ceiling ensures frames_per_item * tokens_per_frame >= per_mm_item_output
+            # so the pixel_values buffer covers any valid single-item replay.
+            tokens_per_frame = (
+                per_mm_item_output + frames_per_item - 1
+            ) // frames_per_item
+            # Video-format grid_config (T=frames_per_item).
+            grid_config = [
+                [
+                    frames_per_item,
+                    spatial_merge_size,
+                    tokens_per_frame * spatial_merge_size,
+                ]
+                for _ in range(max_batch_size)
+            ]
+        else:
+            # Image-format grid_config (T=1).
+            grid_config = [
+                [1, spatial_merge_size, per_mm_item_output * spatial_merge_size]
+                for _ in range(max_batch_size)
+            ]
+
+        # Create dummy pixel_values
+        patch_embed = self.visual.patch_embed
+        in_channels = patch_embed.proj.in_channels
+        patch_size = patch_embed.patch_size
+        temporal_patch_size = patch_embed.temporal_patch_size
+        total_patches = sum(t * h * w for t, h, w in grid_config)
+        flattened_patch_size = (
+            in_channels * temporal_patch_size * patch_size * patch_size
+        )
+        dummy_pixel_values = torch.randn(
+            total_patches, flattened_patch_size, device=device, dtype=dtype
+        )
+
+        # Override max_seqlen with a safe upper bound for capture.
+        # max_seqlen.item() gets baked into the CUDA graph (not replayed),
+        # so the capture value must cover any replay scenario.
+        # Worst case: 1 item consuming the full budget ->
+        # seq_len = token_budget * spatial_merge_size^2.
+        # For window-attention, each local window is bounded by fixed geometry:
+        # (window_size / patch_size / spatial_merge_size)^2 windows in merged
+        # token space, multiplied by spatial_merge_size^2 to map back to the
+        # unmerged sequence length used by attention kernels.
+        vit_merger_window_size = (
+            self.visual.window_size
+            // self.visual.spatial_merge_size
+            // self.visual.patch_size
+        )
+        max_seqlen_window_override = vit_merger_window_size**2 * (spatial_merge_size**2)
+        buffers = self.visual.prepare_encoder_metadata(
+            grid_config,
+            max_batch_size=max_batch_size,
+            max_frames_per_batch=max_frames_per_batch,
+            max_window_seqs_per_batch=max_window_seqs_per_batch,
+            max_seqlen_override=token_budget * (spatial_merge_size**2),
+            max_seqlen_window_override=max_seqlen_window_override,
+            device=device,
+        )
+
+        # Just use image-modality dummy input_buffer for capturing, since it's also
+        # compatible for video inputs (has the same shape: [num_patches, C*T*P*P]).
+        mm_kwargs = {
+            "pixel_values": dummy_pixel_values,
+            "image_grid_thw": grid_config,
+        }
+
+        return EncoderCudaGraphCaptureInputs(
+            mm_kwargs=mm_kwargs,
+            buffers=buffers,
+        )
+
+    def prepare_encoder_cudagraph_replay_buffers(
+        self,
+        mm_kwargs: dict[str, Any],
+        max_batch_size: int,
+        max_frames_per_batch: int,
+    ):
+        modality = self.get_input_modality(mm_kwargs)
+        grid_thw_list = self._get_grid_thw_by_modality(mm_kwargs)
+
+        if modality == "image":
+            buffers = self.visual.prepare_encoder_metadata(
+                grid_thw_list,
+                max_batch_size=max_batch_size,
+                max_window_seqs_per_batch=min(
+                    self.vllm_config.scheduler_config.max_num_batched_tokens,
+                    self.model_config.max_model_len,
+                ),
+            )
+        else:
+            buffers = self.visual.prepare_encoder_metadata(
+                grid_thw_list,
+                max_frames_per_batch=max_frames_per_batch,
+                max_window_seqs_per_batch=min(
+                    self.vllm_config.scheduler_config.max_num_batched_tokens,
+                    self.model_config.max_model_len,
+                ),
+            )
+
+        return EncoderCudaGraphReplayBuffers(buffers=buffers)
+
+    def encoder_cudagraph_forward(
+        self,
+        mm_kwargs: dict[str, Any],
+        buffers: dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        pixel_values = self._get_pixel_values_by_modality(mm_kwargs)
+        grid_thw = self._get_grid_thw_by_modality(mm_kwargs)
+        return self.visual(pixel_values, grid_thw, encoder_metadata=buffers)
+
+    def encoder_eager_forward(
+        self,
+        mm_kwargs: dict[str, Any],
+    ) -> torch.Tensor:
+        pixel_values = self._get_pixel_values_by_modality(mm_kwargs)
+        grid_thw = self._get_grid_thw_by_modality(mm_kwargs)
+        return self.visual(pixel_values, grid_thw)
+
     def forward(
         self,
         input_ids: torch.Tensor | None,
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 80cc8b895345..d38cd63b90b1 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -89,6 +89,7 @@
     "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
     "CohereForCausalLM": ("commandr", "CohereForCausalLM"),
     "Cohere2ForCausalLM": ("commandr", "CohereForCausalLM"),
+    "CohereMoeForCausalLM": ("cohere_moe", "CohereMoeForCausalLM"),
     "CwmForCausalLM": ("llama", "LlamaForCausalLM"),
     "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
     "DeciLMForCausalLM": ("nemotron_nas", "DeciLMForCausalLM"),
@@ -150,6 +151,7 @@
     "KimiLinearForCausalLM": ("kimi_linear", "KimiLinearForCausalLM"),
     "Lfm2ForCausalLM": ("lfm2", "Lfm2ForCausalLM"),
     "Lfm2MoeForCausalLM": ("lfm2_moe", "Lfm2MoeForCausalLM"),
+    "LagunaForCausalLM": ("laguna", "LagunaForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
     "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),
     # For decapoda-research/llama-*
@@ -482,6 +484,8 @@
     ),
     "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
     "Molmo2ForConditionalGeneration": ("molmo2", "Molmo2ForConditionalGeneration"),
+    "Moondream3ForCausalLM": ("moondream3", "Moondream3ForCausalLM"),
+    "HfMoondream": ("moondream3", "Moondream3ForCausalLM"),
     "NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
     "NemotronH_Nano_Omni_Reasoning_V3": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
     "NemotronH_Super_Omni_Reasoning_V3": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
@@ -507,6 +511,10 @@
     "Phi4ForCausalLMV": ("phi4siglip", "Phi4ForCausalLMV"),
     "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
     "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),
+    "QianfanOCRForConditionalGeneration": (
+        "qianfan_ocr",
+        "QianfanOCRForConditionalGeneration",
+    ),
     "QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"),
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),
     "Qwen2_5_VLForConditionalGeneration": (
@@ -583,6 +591,7 @@
     "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
     "Eagle3Qwen2_5vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
     "Eagle3Qwen3vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
+    "EagleMistralForCausalLM": ("mistral_eagle", "EagleMistralForCausalLM"),
     "EagleMistralLarge3ForCausalLM": (
         "mistral_large_3_eagle",
         "EagleMistralLarge3ForCausalLM",
@@ -592,6 +601,7 @@
     "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"),
     "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"),
     "DeepSeekV4MTPModel": ("deepseek_v4_mtp", "DeepSeekV4MTP"),
+    "Gemma4MTPModel": ("gemma4_mtp", "Gemma4MTP"),
     "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"),
     "ExaoneMoeMTP": ("exaone_moe_mtp", "ExaoneMoeMTP"),
     "Exaone4_5_MTP": ("exaone4_5_mtp", "Exaone4_5_MTP"),
diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py
index 8f08f6c60713..a0e7e16a9bbf 100644
--- a/vllm/model_executor/models/step3_text.py
+++ b/vllm/model_executor/models/step3_text.py
@@ -41,6 +41,7 @@
 
 from .interfaces import SupportsPP
 from .utils import (
+    AutoWeightsLoader,
     PPMissingLayer,
     is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
@@ -382,55 +383,6 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-
-class Step3TextForCausalLM(nn.Module, SupportsPP):
-    def __init__(
-        self,
-        *,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-
-        self.config = config
-        self.vllm_config = vllm_config
-
-        self.model = Step3TextModel(vllm_config=vllm_config, prefix=prefix)
-
-        if get_pp_group().is_last_rank:
-            self.lm_head = ParallelLMHead(
-                config.vocab_size,
-                config.hidden_size,
-                prefix=maybe_prefix(prefix, "lm_head"),
-            )
-            self.logits_processor = LogitsProcessor(config.vocab_size)
-        else:
-            self.lm_head = PPMissingLayer()
-
-        self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors
-        )
-
-    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.embed_input_ids(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor | None,
-        positions: torch.Tensor,
-        intermediate_tensors: IntermediateTensors | None = None,
-        inputs_embeds: torch.Tensor | None = None,
-    ):
-        hidden_states = self.model(
-            input_ids, positions, intermediate_tensors, inputs_embeds
-        )
-        return hidden_states
-
-    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head, hidden_states)
-        return logits
-
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         qkv_params_mapping = [
             # (param_name, shard_name, relative_start_idx, relative_end_idx)
@@ -553,3 +505,56 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                         weight_loader(param, loaded_weight)
                         loaded_params.add(name)
         return loaded_params
+
+
+class Step3TextForCausalLM(nn.Module, SupportsPP):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+    ):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+
+        self.config = config
+        self.vllm_config = vllm_config
+
+        self.model = Step3TextModel(vllm_config=vllm_config, prefix=prefix)
+
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+            self.logits_processor = LogitsProcessor(config.vocab_size)
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ):
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index 0b844d1493d9..a0269be855a9 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -45,7 +45,10 @@ def set_weight_attrs(
 
 
 def replace_parameter(
-    layer: torch.nn.Module, param_name: str, new_data: torch.Tensor | None
+    layer: torch.nn.Module,
+    param_name: str,
+    new_data: torch.Tensor | None,
+    prefer_copy: bool = False,
 ):
     """
     Replace a parameter of a layer while maintaining the ability to reload the weight.
@@ -57,6 +60,12 @@ def replace_parameter(
         layer: Layer containing parameter to replace
         param_name: Name of parameter to replace
         new_data: New data of the new parameter, or None to set the parameter to None
+        prefer_copy: If True and the existing parameter is compatible with
+            ``new_data`` (same shape, dtype, and device), copy ``new_data``
+            into the existing parameter in place rather than re-registering
+            a new parameter. This preserves the parameter's storage address
+            (``data_ptr``), which is required for captured CUDA graphs to
+            remain valid across weight updates (e.g. in RL training loops).
     """
     # should not be used on a tied/shared param
 
@@ -67,9 +76,21 @@ def replace_parameter(
 
     if isinstance(new_data, torch.nn.Parameter):
         new_data = new_data.data
-    new_param = torch.nn.Parameter(new_data, requires_grad=False)
 
     old_param: torch.nn.Parameter | None = getattr(layer, param_name, None)
+
+    if (
+        prefer_copy
+        and old_param is not None
+        and old_param.shape == new_data.shape
+        and old_param.dtype == new_data.dtype
+        and old_param.device == new_data.device
+    ):
+        old_param.copy_(new_data)
+        return
+
+    new_param = torch.nn.Parameter(new_data, requires_grad=False)
+
     if old_param is not None and hasattr(old_param, "weight_loader"):
         weight_loader = old_param.weight_loader
         set_weight_attrs(new_param, {"weight_loader": weight_loader})
diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py
index c0df19d4f483..833b405ff651 100644
--- a/vllm/multimodal/cache.py
+++ b/vllm/multimodal/cache.py
@@ -511,11 +511,27 @@ def get_and_update_item(
 
             self._p0_cache[mm_hash] = prompt_updates
             return self.address_as_item(address, monotonic_id), prompt_updates
-        except (ValueError, MemoryError) as e:
-            # put may fail if the object is too large or
-            # the cache is full.
-            # In this case we log the error and keep the original mm_input.
-            logger.debug("Failed to cache mm_input with hash %s: %s", mm_hash, e)
+        except ValueError as e:
+            # `put` raises ValueError either for an oversize item or for a
+            # duplicate key (concurrent insert); the latter is benign so we
+            # only warn on the oversize case. Subsequent UUID-only requests
+            # for an oversize item will fail with a cache miss.
+            if "already exists" not in str(e):
+                logger.warning_once(
+                    "mm_input %s too large to cache; "
+                    "raise --mm-shm-cache-max-object-size-mb. (%s)",
+                    mm_hash,
+                    str(e),
+                )
+            return mm_item
+        except MemoryError as e:
+            # Cache full and protected items prevent eviction.
+            logger.debug(
+                "mm_input %s not cached; shm cache full, "
+                "consider raising --mm-processor-cache-gb. (%s)",
+                mm_hash,
+                str(e),
+            )
             return mm_item
 
     @override
diff --git a/vllm/multimodal/processing/context.py b/vllm/multimodal/processing/context.py
index ef9710374d81..bed66d0a4e9d 100644
--- a/vllm/multimodal/processing/context.py
+++ b/vllm/multimodal/processing/context.py
@@ -268,28 +268,6 @@ def call_hf_processor(
         try:
             output = hf_processor(**data, **allowed_kwargs)
         except Exception as exc:
-            # See https://github.com/huggingface/tokenizers/issues/537
-            if (
-                isinstance(exc, RuntimeError)
-                and exc
-                and exc.args[0] == "Already borrowed"
-                and num_tries < max_tries
-            ):
-                logger.warning(
-                    "Failed to acquire tokenizer in current thread. "
-                    "Retrying (%d/%d)...",
-                    num_tries,
-                    max_tries,
-                )
-                time.sleep(0.5)
-                return self.call_hf_processor(
-                    hf_processor,
-                    data,
-                    kwargs,
-                    num_tries=num_tries + 1,
-                    max_tries=max_tries,
-                )
-
             msg = (
                 f"Failed to apply {type(hf_processor).__name__} "
                 f"on data={data} with kwargs={allowed_kwargs}"
diff --git a/vllm/parser/abstract_parser.py b/vllm/parser/abstract_parser.py
index 5943e5aafbc4..1b861dcade87 100644
--- a/vllm/parser/abstract_parser.py
+++ b/vllm/parser/abstract_parser.py
@@ -38,7 +38,10 @@
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.abstract_tool_parser import ToolParser
-from vllm.tool_parsers.streaming import extract_required_tool_call_streaming
+from vllm.tool_parsers.streaming import (
+    extract_named_tool_call_streaming,
+    extract_required_tool_call_streaming,
+)
 from vllm.tool_parsers.utils import Tool
 from vllm.utils import random_uuid
 
@@ -423,6 +426,17 @@ def extract_response_outputs(
 
         return outputs
 
+    def _get_function_name(
+        self, request: ChatCompletionRequest | ResponsesRequest
+    ) -> str:
+        if request.tool_choice and isinstance(request.tool_choice, ToolChoiceFunction):
+            return request.tool_choice.name
+        if request.tool_choice and isinstance(
+            request.tool_choice, ChatCompletionNamedToolChoiceParam
+        ):
+            return request.tool_choice.function.name
+        raise ValueError("Invalid tool_choice for function name extraction.")
+
     def _parse_tool_calls(
         self,
         request: ResponsesRequest,
@@ -440,21 +454,15 @@ def _parse_tool_calls(
         """
         function_calls: list[FunctionCall] = []
 
-        if request.tool_choice and isinstance(request.tool_choice, ToolChoiceFunction):
-            # Forced Function Call (Responses API style)
-            assert content is not None
-            function_calls.append(
-                FunctionCall(name=request.tool_choice.name, arguments=content)
-            )
-            return function_calls, None  # Clear content since tool is called.
-
         if request.tool_choice and isinstance(
-            request.tool_choice, ChatCompletionNamedToolChoiceParam
+            request.tool_choice,
+            (ToolChoiceFunction, ChatCompletionNamedToolChoiceParam),
         ):
-            # Forced Function Call (Chat Completion API style)
-            assert content is not None
+            # Forced Function Call
+            if content is None:
+                return [], None
             function_calls.append(
-                FunctionCall(name=request.tool_choice.function.name, arguments=content)
+                FunctionCall(name=self._get_function_name(request), arguments=content)
             )
             return function_calls, None  # Clear content since tool is called.
 
@@ -572,7 +580,7 @@ def _extract_tool_calls_streaming(
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-        request: ChatCompletionRequest,
+        request: ChatCompletionRequest | ResponsesRequest,
         # The following parameters are used for "required" tool choice parsing and are
         # tracked in StreamState for streaming parsing.
         tool_call_idx: int | None = None,
@@ -580,9 +588,19 @@ def _extract_tool_calls_streaming(
         function_name_returned: bool = False,
     ) -> tuple[DeltaMessage | None, bool]:
         if request.tool_choice and isinstance(
-            request.tool_choice, ChatCompletionNamedToolChoiceParam
+            request.tool_choice,
+            (ToolChoiceFunction, ChatCompletionNamedToolChoiceParam),
         ):
-            return None, False
+            delta_message, function_name_returned = extract_named_tool_call_streaming(
+                delta_text=delta_text,
+                function_name=self._get_function_name(request),
+                function_name_returned=function_name_returned,
+                tool_call_idx=tool_call_idx,
+                tool_call_id_type=tool_call_id_type,
+                tokenizer=self.model_tokenizer,
+            )
+            return delta_message, function_name_returned
+
         if request.tool_choice == "required":
             delta_message, function_name_returned = (
                 extract_required_tool_call_streaming(
@@ -602,7 +620,7 @@ def _extract_tool_calls_streaming(
             previous_token_ids,
             current_token_ids,
             delta_token_ids,
-            request,
+            request,  # type: ignore[arg-type]
         ), False
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
@@ -618,15 +636,11 @@ def extract_content_ids(self, input_ids: list[int]) -> list[int]:
     def _in_reasoning_phase(self, state: StreamState) -> bool:
         if self._reasoning_parser is None:
             return False
-        if self._tool_parser is None:
-            return True
         return not state.reasoning_ended
 
     def _in_tool_call_phase(self, state: StreamState) -> bool:
         if self._tool_parser is None:
             return False
-        if self._reasoning_parser is None:
-            return True
         return state.reasoning_ended
 
     def parse_delta(
@@ -640,7 +654,9 @@ def parse_delta(
 
         if not state.prompt_reasoning_checked and prompt_token_ids is not None:
             state.prompt_reasoning_checked = True
-            if self.is_reasoning_end(prompt_token_ids):
+            if self._reasoning_parser is None or self.is_reasoning_end(
+                prompt_token_ids
+            ):
                 state.reasoning_ended = True
 
         current_text = state.previous_text + delta_text
@@ -691,8 +707,12 @@ def parse_delta(
                 )
             )
 
-        # No parsers: pass through as content
-        if self._reasoning_parser is None and self._tool_parser is None:
+        # No phase active: pass through as content
+        if (
+            delta_message is None
+            and not self._in_reasoning_phase(state)
+            and not self._in_tool_call_phase(state)
+        ):
             delta_message = DeltaMessage(content=delta_text)
 
         state.previous_text = current_text
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 0b41c381bd4e..750b7f2f4b9a 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -16,7 +16,6 @@
     get_memory_node_info,
 )
 from vllm.utils.mem_constants import GiB_bytes
-from vllm.utils.torch_utils import is_quantized_kv_cache
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .interface import CpuArchEnum, Platform, PlatformEnum
@@ -134,20 +133,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         scheduler_config = vllm_config.scheduler_config
         # async scheduling is not required on CPU
         scheduler_config.async_scheduling = False
-        if (
-            scheduler_config.enable_chunked_prefill
-            or cache_config.enable_prefix_caching
-        ) and is_quantized_kv_cache(cache_config.cache_dtype):
-            raise RuntimeError(
-                "Chunked-prefill and prefix-cache on the CPU "
-                "backend is not compatible with FP8 KV cache."
-            )
-
-        if is_quantized_kv_cache(cache_config.cache_dtype):
-            logger.warning(
-                "CPU backend doesn't support KV cache quantization fallback to auto."
-            )
-            cache_config.cache_dtype = "auto"
 
         parallel_config = vllm_config.parallel_config
         # OMP requires the MP executor to function correctly, UniProc is not
@@ -307,9 +292,16 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
 
     @classmethod
     def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
-        # TODO: CPU still sets block_size in check_and_update_config.
-        # Move that logic here so block_size is chosen by the backend.
-        pass
+        model_config = vllm_config.model_config
+        if model_config is None or not model_config.is_hybrid:
+            return
+
+        # reconcile attention and mamba page sizes
+        backend_cls = cls._find_non_ssm_backend(vllm_config)
+        if backend_cls is None:
+            return
+
+        cls._align_hybrid_block_size(vllm_config, backend_cls)
 
     @classmethod
     def discover_numa_topology(cls) -> list[list[int]]:
@@ -458,6 +450,10 @@ def pack_kv_cache(
             block_offsets.reshape(1, block_size)
             + indices.reshape(num_blocks, 1) * block_size
         ).flatten()
+        if key_cache.dtype == torch.uint8:
+            raise NotImplementedError(
+                "FP8 KV cache is not yet supported with KV transfer on CPU"
+            )
         cpu_attn_reshape_and_cache(
             key,
             value,
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 4f9b9d7bf234..9f04bf11660a 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -578,7 +578,9 @@ def get_default_ir_op_priority(cls, vllm_config: VllmConfig) -> IrOpPriorityConf
         if envs.VLLM_USE_OINK_OPS:
             rms_norm = ["oink"] + default
 
-        return IrOpPriorityConfig.with_default(default, rms_norm=rms_norm)
+        return IrOpPriorityConfig.with_default(
+            default, rms_norm=rms_norm, fused_add_rms_norm=rms_norm
+        )
 
 
 # NVML utils
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index c0d52620c086..80952ced73d1 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -86,6 +86,9 @@ def __gt__(self, other: Any) -> bool:
             return NotImplemented
         return (self.major, self.minor) > (other.major, other.minor)
 
+    def __hash__(self) -> int:
+        return hash((self.major, self.minor))
+
     def as_version_str(self) -> str:
         return f"{self.major}.{self.minor}"
 
@@ -542,6 +545,42 @@ def _align_hybrid_block_size(
                 dtype=kv_cache_dtype,
                 kv_quant_mode=kv_quant_mode,
             ).page_size_bytes
+        elif cache_config.cache_dtype.startswith("turboquant_"):
+            # TQ has a packed K|V layout; the standard FullAttentionSpec
+            # formula over-sizes it and trips unify_kv_cache_spec_page_size
+            # when all attention layers are TQ. With mixed skip+TQ the skip
+            # layers still use the standard layout — take max so mamba
+            # padding covers the largest actual page.
+            from vllm.model_executor.layers.quantization.turboquant.config import (
+                TurboQuantConfig,
+            )
+            from vllm.v1.kv_cache_interface import TQFullAttentionSpec
+
+            tq_cfg = TurboQuantConfig.from_cache_dtype(
+                cache_config.cache_dtype, model_config.get_head_size()
+            )
+            tq_page = TQFullAttentionSpec(
+                block_size=1,
+                num_kv_heads=model_config.get_num_kv_heads(parallel_config),
+                head_size=model_config.get_head_size(),
+                head_size_v=model_config.get_head_size(),
+                dtype=kv_cache_dtype,
+                kv_quant_mode=kv_quant_mode,
+                tq_slot_size=tq_cfg.slot_size_aligned,
+            ).page_size_bytes
+            if cache_config.kv_cache_dtype_skip_layers:
+                skip_page = FullAttentionSpec(
+                    block_size=1,
+                    num_kv_heads=model_config.get_num_kv_heads(parallel_config),
+                    head_size=model_config.get_head_size(),
+                    dtype=model_config.dtype,
+                ).page_size_bytes
+                # lcm, not max: skip_page is often not a multiple of
+                # tq_page, so max would leave per-layer page sizes
+                # un-unifiable downstream.
+                attn_page_size_1_token = lcm(tq_page, skip_page)
+            else:
+                attn_page_size_1_token = tq_page
         else:
             attn_page_size_1_token = FullAttentionSpec(
                 block_size=1,
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index e4d17eeaa969..82a5f0b95e7a 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -409,6 +409,7 @@ class RocmPlatform(Platform):
         "gptq",
         "gptq_marlin",  # will be overwritten with gptq
         "fp8",
+        "deepseek_v4_fp8",
         "compressed-tensors",
         "fbgemm_fp8",
         "gguf",
@@ -694,21 +695,11 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
     @classmethod
     def apply_config_platform_defaults(cls, vllm_config: "VllmConfig") -> None:
         from vllm._aiter_ops import rocm_aiter_ops
-        from vllm.config.compilation import CUDAGraphMode
 
         compilation_config = vllm_config.compilation_config
-        is_eager_execution = compilation_config.cudagraph_mode == CUDAGraphMode.NONE
         use_aiter_fused_moe = rocm_aiter_ops.is_fused_moe_enabled()
-        use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
         use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enabled()
         use_aiter_fused_se = rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
-        #  Aiter rms norm perform best when CUDA Graph capture is enabled.
-        if (
-            use_aiter_rms_norm
-            and not is_eager_execution
-            and "-rms_norm" not in compilation_config.custom_ops
-        ):
-            compilation_config.custom_ops.append("+rms_norm")
 
         if use_aiter_fp8_linear and "-quant_fp8" not in compilation_config.custom_ops:
             compilation_config.custom_ops.append("+quant_fp8")
@@ -940,7 +931,7 @@ def use_custom_op_collectives(cls) -> bool:
     def get_default_ir_op_priority(
         cls, vllm_config: "VllmConfig"
     ) -> "IrOpPriorityConfig":
-        from vllm.config.compilation import CompilationMode
+        from vllm.config.compilation import CompilationMode, CUDAGraphMode
         from vllm.config.kernel import IrOpPriorityConfig
 
         # Native used by default when compiling,
@@ -950,12 +941,10 @@ def get_default_ir_op_priority(
         using_inductor = cc.backend == "inductor" and cc.mode != CompilationMode.NONE
         default = ["native"] if using_inductor else ["vllm_c", "native"]
 
-        # This (mostly) preserves previous CustomOp behavior
-        # Necessary on ROCm because it's common that users
-        # enable rms_norm to use the aiter kernel.
+        #  Aiter rms norm perform best when CUDA Graph capture is enabled.
         # TODO(luka/TJ) remove env vars completely
         if (
-            cc.is_custom_op_enabled("rms_norm")
+            cc.cudagraph_mode != CUDAGraphMode.NONE
             and envs.VLLM_ROCM_USE_AITER
             and envs.VLLM_ROCM_USE_AITER_RMSNORM
         ):
@@ -963,7 +952,9 @@ def get_default_ir_op_priority(
         else:
             rms_norm = default
 
-        return IrOpPriorityConfig.with_default(default, rms_norm=rms_norm)
+        return IrOpPriorityConfig.with_default(
+            default, rms_norm=rms_norm, fused_add_rms_norm=rms_norm
+        )
 
     @classmethod
     @with_amdsmi_context
diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py
index 755fa56d294c..cd51f106503a 100644
--- a/vllm/reasoning/__init__.py
+++ b/vllm/reasoning/__init__.py
@@ -32,6 +32,18 @@
         "deepseek_v3_reasoning_parser",
         "DeepSeekV3ReasoningParser",
     ),
+    "poolside_v1": (
+        "poolside_v1_reasoning_parser",
+        "PoolsideV1ReasoningParser",
+    ),
+    "cohere_command3": (
+        "cohere_command_reasoning_parser",
+        "CohereCommand3ReasoningParser",
+    ),
+    "cohere_command4": (
+        "cohere_command_reasoning_parser",
+        "CohereCommand4ReasoningParser",
+    ),
     "ernie45": (
         "ernie45_reasoning_parser",
         "Ernie45ReasoningParser",
diff --git a/vllm/reasoning/cohere_command_reasoning_parser.py b/vllm/reasoning/cohere_command_reasoning_parser.py
new file mode 100644
index 000000000000..c96b21d4e8fb
--- /dev/null
+++ b/vllm/reasoning/cohere_command_reasoning_parser.py
@@ -0,0 +1,546 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import json
+from collections.abc import Mapping, Sequence
+from typing import Any, NamedTuple, TypedDict, TypeGuard
+
+import regex as re
+import xgrammar as xgr
+
+try:
+    from cohere_melody import PyFilter, PyFilterOptions
+except ImportError as e:
+    raise ImportError(
+        "The Cohere reasoning parser requires the `cohere_melody` "
+        "package, which is not installed. Install it with:\n"
+        "    pip install cohere_melody"
+    ) from e
+
+
+from vllm.entrypoints.mcp.tool_server import ToolServer
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
+    AnyResponseFormat,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+)
+from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+from vllm.reasoning import ReasoningParser
+from vllm.sampling_params import StructuredOutputsParams
+from vllm.tokenizers import TokenizerLike
+
+REPLACEMENT_CHAR = "\ufffd"
+
+
+class CohereTagRegistry(NamedTuple):
+    """A single ``structural_tag`` begin("trigger")/end pair."""
+
+    trigger: str
+    end: str
+
+
+class CohereTagStyle(NamedTuple):
+    """The structural tags style for a given model architecture."""
+
+    json: CohereTagRegistry
+    tools: CohereTagRegistry
+
+
+class CohereNormalizedTool(TypedDict):
+    """A tool definition normalized to the shape ``collect_tool_schema`` expects.
+
+    ``parameters`` is a JSON Schema object (possibly empty) describing the tool's
+    call signature.
+    """
+
+    name: str
+    parameters: dict[str, Any]
+
+
+COMMAND_A_TOOLS_TAG = CohereTagRegistry(
+    trigger="<|START_ACTION|>", end="<|END_ACTION|>"
+)
+COMMAND_A_JSON_TAG = CohereTagRegistry(
+    trigger="<|START_RESPONSE|>", end="<|END_RESPONSE|>"
+)
+
+MODEL_TO_TAG_STYLE: dict[str, CohereTagStyle] = {
+    "Cohere2ForCausalLM": CohereTagStyle(
+        json=COMMAND_A_JSON_TAG, tools=COMMAND_A_TOOLS_TAG
+    ),
+    "Cohere2VisionForConditionalGeneration": CohereTagStyle(
+        json=COMMAND_A_JSON_TAG, tools=COMMAND_A_TOOLS_TAG
+    ),
+}
+
+
+def collect_tool_schema(tool_schema: list[CohereNormalizedTool]) -> str:
+    """Build an xgrammar EBNF grammar that matches a JSON array of tool calls.
+
+    The grammar shape is architecture-independent; callers are responsible for
+    wrapping it in the correct structural tag (see ``CohereTagStyle.tools``).
+    """
+    tool_dictionary: dict[str, str] = {}
+    for tool in tool_schema:
+        tool_name = tool["name"]
+        tool_parameters = json.dumps(tool["parameters"])
+        json_schema = f"""{{
+                        "type": "object",
+                        "properties": {{
+                            "tool_call_id": {{
+                                "type": "string",
+                                "pattern": "^[0-9]+$"
+                            }},
+                            "tool_name": {{
+                                "type": "string",
+                                "const": "{tool_name}"
+                            }},
+                            "parameters": {tool_parameters}
+                            }}
+                            }}"""
+        tool_grammar = str(xgr.Grammar.from_json_schema(json_schema))
+        for match in re.findall(r"\b(\w+)\s*::=", tool_grammar):
+            tool_grammar = re.sub(
+                rf"\b{re.escape(match)}\b", tool_name + match, tool_grammar
+            )
+        tool_dictionary[tool_name] = f"{tool_name} ::= {tool_name}root\n{tool_grammar}"
+    # Emitted grammar shape:
+    #   root  ::= tools
+    #   tools ::= ws "[" ws tool ws ("," ws tool)* ws "]" ws
+    #   ws    ::= (" " | "\t" | "\n")*
+    #   tool  ::= <tool_a> | <tool_b> | ...         (one alternative per input)
+    #   <tool_x>     ::= <tool_x>root               (per-tool xgrammar rules)
+    #   <tool_x>root ::= ...                        (from xgr.Grammar.from_json_schema)
+    tool_alternatives = "tool ::= " + " | ".join(tool_dictionary.keys())
+    tool_rules = "\n    ".join(tool_dictionary.values())
+    grammar = f"""root ::= tools
+    tools ::= ws "[" ws tool ws ("," ws tool)*  ws "]" ws
+    ws    ::= (" " | "\\t" | "\\n")*
+    {tool_alternatives}
+    {tool_rules}
+    """
+    return grammar
+
+
+def _tool_definitions_to_schema_list(
+    tools: str | list[Any],
+) -> list[CohereNormalizedTool]:
+    """
+    Build the list of ``CohereNormalizedTool`` dicts expected by
+    ``collect_tool_schema``.
+
+    Accepts:
+    - JSON string
+    - list of dicts with top-level ``name`` / ``parameters``
+    - list of Chat Completions-style ``{"type": "function", "function": {...}}``
+    - list of Pydantic models with ``model_dump()``
+    """
+    if isinstance(tools, str):
+        try:
+            parsed = json.loads(tools)
+        except json.JSONDecodeError:
+            return []
+        if not isinstance(parsed, list):
+            return []
+    else:
+        parsed = list(tools)
+
+    out: list[CohereNormalizedTool] = []
+    for raw in parsed:
+        t = raw.model_dump() if hasattr(raw, "model_dump") else raw
+        if not isinstance(t, dict):
+            continue
+        # Unwrap Chat Completions' ``{"type": "function", "function": {...}}``
+        # shape; otherwise take the dict as-is.
+        if t.get("type") == "function" and isinstance(t.get("function"), dict):
+            t = t["function"]
+        name = t.get("name")
+        if not isinstance(name, str):
+            continue
+        params = t.get("parameters")
+        out.append(
+            CohereNormalizedTool(
+                name=name,
+                parameters=params if isinstance(params, dict) else {},
+            )
+        )
+    return out
+
+
+def _has_effective_tools(
+    tools: str | list[Any] | None,
+) -> TypeGuard[str | list[Any]]:
+    """
+    True when ``tools`` contains at least one tool definition to convert.
+
+    ``ResponsesRequest`` defaults ``tools`` to ``[]``; ``ChatCompletionRequest``
+    uses ``None``. Both mean "no tools" here. Strings (e.g. a JSON blob) are
+    treated as effective only when non-blank.
+    """
+    if tools is None:
+        return False
+    if isinstance(tools, str):
+        return bool(tools.strip())
+    return len(tools) > 0
+
+
+# Builder: produces vLLM response_format in xgrammar's canonical format.
+# See xgrammar docs: type "structural_tag" with "format" = triggered_tags
+# and tag content type = json_schema | grammar.
+def convert_schema_to_structural_tags(
+    schema: dict | None = None,
+    tools: str | list[Any] | None = None,
+    model_architecture: str | None = None,
+) -> str | None:
+    """
+    Returns a response_format string accepted by xgrammar's structural tag format.
+    Uses the canonical shape: {"type": "structural_tag", "format": {...}} with
+    format.type "triggered_tags" and tag content type "json_schema" or "grammar".
+
+    Callers that are not on an engine path (e.g. the reasoning parser) must pass
+    ``model_architecture`` explicitly.
+    """
+    if model_architecture is None or model_architecture not in MODEL_TO_TAG_STYLE:
+        return None
+    style = MODEL_TO_TAG_STYLE[model_architecture]
+
+    tags: list[dict] = []
+
+    def _add_tag(tag: CohereTagRegistry, content: dict) -> None:
+        tags.append({"begin": tag.trigger, "content": content, "end": tag.end})
+
+    if schema is not None:
+        # Add the JSON-schema tag both for schema-only requests and for the
+        # "tools plus JSON mode" case (North use case: follow the schema when
+        # the model decides not to call any tool).
+        _add_tag(style.json, {"type": "json_schema", "json_schema": schema})
+
+    if _has_effective_tools(tools):
+        # ``tools`` may be a JSON string (poseidon / RESPONSE_FORMAT_TOOL_DEFINITIONS)
+        # or a list (Chat Completions ``request.tools`` as Pydantic models or dicts).
+        tool_schema_list = _tool_definitions_to_schema_list(tools)
+        if not tool_schema_list:
+            raise ValueError(
+                "No valid tool definitions could be parsed from the request for "
+                "structural tag conversion."
+            )
+        tool_grammar = collect_tool_schema(tool_schema_list)
+        _add_tag(style.tools, {"type": "grammar", "grammar": tool_grammar})
+
+    if not tags:
+        return None
+    return json.dumps(
+        {
+            "type": "structural_tag",
+            "format": {
+                "type": "triggered_tags",
+                "triggers": [t["begin"] for t in tags],
+                "tags": tags,
+            },
+        }
+    )
+
+
+def _response_format_type(
+    response_format: AnyResponseFormat | dict | None,
+) -> str | None:
+    if response_format is None:
+        return None
+    if isinstance(response_format, dict):
+        t = response_format.get("type")
+        return t if isinstance(t, str) else None
+    return response_format.type
+
+
+def _maybe_parse_json_dict(value: Any) -> dict | None:
+    """If value is a JSON string, parse to dict; otherwise require dict."""
+    if isinstance(value, dict):
+        return value
+    if isinstance(value, str):
+        try:
+            parsed = json.loads(value)
+        except (TypeError, json.JSONDecodeError):
+            return None
+        return parsed if isinstance(parsed, dict) else None
+    return None
+
+
+def _unwrap_nested_schema(candidate: Any) -> dict | None:
+    """Return ``candidate`` as a dict, unwrapping a nested ``schema`` if present.
+
+    Returns ``None`` if ``candidate`` is not (and cannot be parsed into) a dict.
+    """
+    cand = _maybe_parse_json_dict(candidate)
+    if not isinstance(cand, dict):
+        return None
+    nested = cand.get("schema")
+    return nested if isinstance(nested, dict) else cand
+
+
+def _schema_from_json_schema_field(js_wr: Any) -> dict | None:
+    """
+    Extract the JSON Schema object from Chat Completions ``json_schema`` payload.
+
+    Accepts:
+    - ``JsonSchemaResponseFormat`` (Pydantic) with ``schema`` / ``json_schema`` field
+    - dict in OpenAI shape ``{"name": ..., "schema": {...}}``
+    - dict with ``json_schema`` key holding either the schema or a nested wrapper
+    - dict that is already a JSON Schema document (some clients omit the wrapper)
+    - JSON strings for any of the above
+    """
+    if js_wr is None:
+        return None
+
+    parsed_wr = _maybe_parse_json_dict(js_wr)
+    if parsed_wr is not None:
+        js_wr = parsed_wr
+
+    if hasattr(js_wr, "model_dump"):
+        for by_alias in (True, False):
+            try:
+                data = js_wr.model_dump(by_alias=by_alias, exclude_none=False)
+            except TypeError:
+                data = js_wr.model_dump(by_alias=by_alias)
+            out = _unwrap_nested_schema(data.get("schema") or data.get("json_schema"))
+            if out is not None:
+                return out
+        inner_attr = getattr(js_wr, "json_schema", None)
+        return inner_attr if isinstance(inner_attr, dict) else None
+
+    if isinstance(js_wr, dict):
+        for key in ("schema", "json_schema"):
+            out = _unwrap_nested_schema(js_wr.get(key))
+            if out is not None:
+                return out
+        return js_wr
+
+    return None
+
+
+def _schema_dict_from_chat_response_format(
+    rf: AnyResponseFormat | dict | None,
+) -> dict | None:
+    """JSON schema dict from Chat Completions ``request.response_format`` only."""
+    if rf is None:
+        return None
+    rf_type = _response_format_type(rf)
+    if rf_type == "json_object":
+        return {"type": "object"}
+    if rf_type != "json_schema":
+        return None
+    js_wr = (
+        rf.get("json_schema")
+        if isinstance(rf, dict)
+        else getattr(rf, "json_schema", None)
+    )
+    return _schema_from_json_schema_field(js_wr)
+
+
+def _schema_dict_from_structured_outputs(
+    so: StructuredOutputsParams | None,
+) -> dict | None:
+    """Schema dict from ``structured_outputs`` (``json`` / ``json_object``).
+
+    Same unwrapping as ``json_schema``. ``json`` is expected to be ``str`` or
+    ``dict`` (enforced by ``StructuredOutputsParams`` / request models); other
+    types raise ``ValueError`` only if a caller bypasses that validation.
+    """
+    if so is None:
+        return None
+    if so.json_object:
+        return {"type": "object"}
+    raw: Any = so.json
+    if raw is None:
+        return None
+
+    if hasattr(raw, "model_dump"):
+        out = _schema_from_json_schema_field(raw)
+        if out is None:
+            raise ValueError(
+                "structured_outputs.json model has no extractable JSON Schema."
+            )
+        return out
+
+    if isinstance(raw, str):
+        if not raw.strip():
+            raise ValueError("structured_outputs.json cannot be empty.")
+        try:
+            raw = json.loads(raw)
+        except json.JSONDecodeError as e:
+            raise ValueError("structured_outputs.json must be valid JSON.") from e
+        if not isinstance(raw, dict):
+            raise ValueError("structured_outputs.json must decode to a JSON object.")
+
+    if isinstance(raw, Mapping):
+        body = raw if isinstance(raw, dict) else dict(raw)
+        return _schema_from_json_schema_field(body) or body
+
+    raise ValueError(
+        f"structured_outputs.json has unsupported type {type(raw).__name__}."
+    )
+
+
+class BaseCohereCommandReasoningParser(ReasoningParser):
+    def __init__(
+        self,
+        tokenizer: TokenizerLike,
+        *args,
+        streaming_opts: PyFilterOptions,
+        unary_opts: PyFilterOptions,
+        **kwargs,
+    ):
+        super().__init__(tokenizer, *args, **kwargs)
+        self.end_token_id = tokenizer.convert_tokens_to_ids("<|END_THINKING|>")
+        self.unary_opts = unary_opts
+        self.melody_unary = PyFilter(unary_opts)
+        self.melody_streaming = PyFilter(streaming_opts)
+
+    @property
+    def reasoning_start_str(self) -> str | None:
+        return "<|START_THINKING|>"
+
+    @property
+    def reasoning_end_str(self) -> str | None:
+        return "<|END_THINKING|>"
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        r = self.melody_streaming.write_decoded(delta_text)
+        if r.content is None and r.reasoning is None and not r.tool_calls:
+            return None
+        msg = DeltaMessage()
+        if r.content is not None:
+            msg.content = r.content
+        if r.reasoning is not None:
+            msg.reasoning = r.reasoning
+        if r.tool_calls:
+            msg.tool_calls = [
+                DeltaToolCall(
+                    id=tc.id,
+                    index=tc.index,
+                    type="function",
+                    function=DeltaFunctionCall(name=tc.name, arguments=tc.arguments),
+                )
+                for tc in r.tool_calls
+            ]
+        return msg
+
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
+    ) -> tuple[str | None, str | None]:
+        result = self.melody_unary.process_full_text(model_output)
+        return result.reasoning, result.content
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        token_buf: list[int] = []
+        content_ids: list[int] = []
+        content_filter = PyFilter(self.unary_opts)
+        for t in input_ids:
+            token_buf.append(t)
+            s = self.model_tokenizer.decode(token_buf, skip_special_tokens=False)
+            if s.endswith(REPLACEMENT_CHAR):
+                continue
+            r = content_filter.write_decoded(s)
+            if r.content is not None:
+                content_ids.extend(token_buf)
+            token_buf = []
+        return content_ids
+
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+        return any(tid == self.end_token_id for tid in reversed(input_ids))
+
+    def prepare_structured_tag(
+        self, original_tag: str | None, tool_server: ToolServer | None
+    ) -> str | None:
+        # Responses API replaces ``structural_tag`` via the reasoning parser.
+        # Default ``ReasoningParser.prepare_structured_tag`` returns None, which
+        # would clear a Cohere tag produced in ``adjust_request`` and break
+        # ``StructuredOutputsParams`` validation. Preserve the existing tag.
+        return original_tag
+
+    def adjust_request(
+        self, request: ChatCompletionRequest | ResponsesRequest
+    ) -> ChatCompletionRequest | ResponsesRequest:
+        so = request.structured_outputs
+        if so is not None and so.structural_tag:
+            return request
+        # Schema: prefer ``response_format`` (OpenAI Chat Completions), then
+        # ``structured_outputs.json`` / ``json_object`` (vLLM direct). Tools stay
+        # on ``request.tools``.
+        rf = (
+            request.response_format
+            if isinstance(request, ChatCompletionRequest)
+            else None
+        )
+        if rf is not None and _response_format_type(rf) == "structural_tag":
+            return request
+        model_architecture = (
+            self._model_config.architecture if self._model_config is not None else None
+        )
+        tools = request.tools
+        # ``response_format`` wins if both it and ``structured_outputs`` supply JSON.
+        schema = _schema_dict_from_chat_response_format(rf)
+        if schema is None:
+            schema = _schema_dict_from_structured_outputs(so)
+        if schema is None and not _has_effective_tools(tools):
+            return request
+        if model_architecture is None:
+            return request
+        result = convert_schema_to_structural_tags(
+            schema=schema,
+            tools=tools,
+            model_architecture=model_architecture,
+        )
+        if result is None:
+            # Unsupported architectures are not in ``MODEL_TO_TAG_STYLE``; conversion
+            raise ValueError(
+                "Failed to build structural_tag guided decoding constraints from "
+                "this request's JSON schema and/or tools. The configured model "
+                f"architecture ({model_architecture!r}) does not support Cohere "
+                "command structural tags, or the schema cannot be expressed in "
+                "that format."
+            )
+        request.structured_outputs = StructuredOutputsParams(structural_tag=result)
+        # Folded JSON constraints into ``structural_tag``; drop ``response_format``
+        # when it was the source so ``to_sampling_params`` does not also set ``json`` /
+        # ``json_object`` (mutually exclusive in ``StructuredOutputsParams``).
+        if isinstance(request, ChatCompletionRequest) and rf is not None:
+            rf_type = _response_format_type(rf)
+            if rf_type in ("json_schema", "json_object"):
+                request.response_format = None
+        return request
+
+
+class CohereCommand3ReasoningParser(BaseCohereCommandReasoningParser):
+    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+        super().__init__(
+            tokenizer,
+            *args,
+            streaming_opts=PyFilterOptions().cmd3(),
+            unary_opts=PyFilterOptions().cmd3().no_tools(),
+            **kwargs,
+        )
+
+
+class CohereCommand4ReasoningParser(BaseCohereCommandReasoningParser):
+    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+        super().__init__(
+            tokenizer,
+            *args,
+            streaming_opts=PyFilterOptions().cmd4(),
+            unary_opts=PyFilterOptions().cmd4().no_tools(),
+            **kwargs,
+        )
diff --git a/vllm/reasoning/kimi_k2_reasoning_parser.py b/vllm/reasoning/kimi_k2_reasoning_parser.py
index 7a92703426fc..0b64c5c62ea1 100644
--- a/vllm/reasoning/kimi_k2_reasoning_parser.py
+++ b/vllm/reasoning/kimi_k2_reasoning_parser.py
@@ -221,6 +221,10 @@ def extract_reasoning_streaming(
             return None
 
         if self._end_token_id in delta_token_ids:
+            if self._end_token not in delta_text:
+                # Token ID arrived before text was flushed (stop-sequence buffering).
+                # Wait for the next delta when the text becomes visible.
+                return None
             end_index = delta_text.find(self._end_token)
             reasoning = delta_text[:end_index]
             content = delta_text[end_index + len(self._end_token) :]
@@ -229,6 +233,9 @@ def extract_reasoning_streaming(
             )
 
         if self._tool_section_start_token_id in delta_token_ids:
+            if self._tool_section_start_token not in delta_text:
+                # Token ID arrived before text was flushed (stop-sequence buffering).
+                return None
             tool_index = delta_text.find(self._tool_section_start_token)
             reasoning = delta_text[:tool_index]
             content = delta_text[tool_index:]
diff --git a/vllm/reasoning/poolside_v1_reasoning_parser.py b/vllm/reasoning/poolside_v1_reasoning_parser.py
new file mode 100644
index 000000000000..30031d8513a9
--- /dev/null
+++ b/vllm/reasoning/poolside_v1_reasoning_parser.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Laguna reasoning parser.
+
+``DeepSeekV3ReasoningParser.is_reasoning_end`` walks the entire
+token sequence backwards and returns ``True`` on the first ``</think>`` it
+sees. When called on ``prompt_token_ids`` that mistakes any stray
+``</think>`` in conversation history, few-shot examples or tool descriptions
+for a template-injected "thinking already ended" marker. In the streaming
+path (see ``vllm/entrypoints/openai/chat_completion/serving.py``,
+``prompt_is_reasoning_end_arr``) that false positive short-circuits the
+reasoning parser for the whole response, so any ``<think>...</think>`` the
+model emits itself ends up in the content field instead of the reasoning
+field.
+
+As we have more flexible templates, we instead scope
+the backward search to the current assistant turn: the
+walk terminates as soon as we hit the ``<assistant>`` start-of-message
+token. A ``</think>`` in a prior user turn or few-shot example is no longer
+visible.
+"""
+
+from collections.abc import Sequence
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from vllm.reasoning.deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser
+from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
+
+
+class PoolsideV1ReasoningParser(DeepSeekV3ReasoningParser):
+    """Drop-in replacement for ``deepseek_v3`` that tolerates ``</think>``
+    tokens appearing anywhere in the prompt other than the generation prefix.
+    """
+
+    _start_of_assistant_message = "<assistant>"
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+
+        if self._start_of_assistant_message not in self.vocab:
+            raise ValueError(
+                f"Tokenizer must contain {self._start_of_assistant_message!r} token"
+            )
+        self._start_of_assistant_message_token_id = self.vocab[
+            self._start_of_assistant_message
+        ]
+
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+        # IdentityReasoningParser always returns True: no reasoning to parse.
+        if isinstance(self._parser, IdentityReasoningParser):
+            return True
+
+        assert isinstance(self._parser, DeepSeekR1ReasoningParser)
+        for tok_id in reversed(input_ids):
+            # <think>: reasoning is not yet ended.
+            if tok_id == self._parser.start_token_id:
+                return False
+            # </think>: reasoning has ended.
+            if tok_id == self._parser.end_token_id:
+                return True
+            # <assistant>: reached the start of the current assistant turn
+            # without seeing either marker. Anything further back belongs to
+            # the prior conversation and should be ignored.
+            if tok_id == self._start_of_assistant_message_token_id:
+                return False
+        return False
+
+
+__all__ = ["PoolsideV1ReasoningParser"]
diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py
index 2f10302c0268..41d8c0075fb1 100644
--- a/vllm/renderers/base.py
+++ b/vllm/renderers/base.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
-import copy
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Mapping, Sequence
@@ -105,20 +104,13 @@ def __init__(self, config: "VllmConfig", tokenizer: _T | None) -> None:
         self._process_multimodal_async = make_async(
             self._process_multimodal, executor=self._mm_executor
         )
-        if config.model_config.is_multimodal_model:
+        if mm_registry.supports_multimodal_inputs(config.model_config):
             mm_processor_cache = mm_registry.processor_cache_from_config(config)
 
-            # Deep-copy the tokenizer so the multimodal processor gets its
-            # own Rust tokenizer backend.  Without this, concurrent access
-            # from AsyncMicrobatchTokenizer and call_hf_processor causes
-            # "RuntimeError: Already borrowed" from the Rust RefCell.
-            # See: https://github.com/huggingface/tokenizers/issues/537
-            mm_tokenizer = copy.deepcopy(tokenizer)
-
             with set_default_torch_num_threads():
                 self.mm_processor = mm_registry.create_processor(
                     config.model_config,
-                    tokenizer=mm_tokenizer,
+                    tokenizer=self.tokenizer,
                     cache=mm_processor_cache,
                 )
 
@@ -130,11 +122,10 @@ def __init__(self, config: "VllmConfig", tokenizer: _T | None) -> None:
             # requests don't pollute the sender cache.
             ro_cache = mm_registry.processor_only_cache_from_config(config)
             if ro_cache is not None:
-                ro_tokenizer = copy.deepcopy(tokenizer)
                 with set_default_torch_num_threads():
                     self._readonly_mm_processor = mm_registry.create_processor(
                         config.model_config,
-                        tokenizer=ro_tokenizer,
+                        tokenizer=self.tokenizer,
                         cache=ro_cache,
                     )
 
@@ -769,6 +760,8 @@ def _process_embeds(self, prompt: EmbedsPrompt) -> EmbedsInput:
         return embeds_input(
             prompt_embeds=prompt_embeds,
             cache_salt=prompt.get("cache_salt"),
+            prompt_token_ids=prompt.get("prompt_token_ids"),
+            is_token_ids=prompt.get("prompt_is_token_ids"),
         )
 
     async def _process_tokens_async(
diff --git a/vllm/renderers/embed_utils.py b/vllm/renderers/embed_utils.py
index a51fc53a24ad..84c28dcf7e0f 100644
--- a/vllm/renderers/embed_utils.py
+++ b/vllm/renderers/embed_utils.py
@@ -7,6 +7,7 @@
 import torch
 
 from vllm.exceptions import VLLMValidationError
+from vllm.utils.async_utils import make_async
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -30,15 +31,53 @@ def safe_load_prompt_embeds(
             weights_only=True,
             map_location=torch.device("cpu"),
         )
-        assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
-            torch.float32,
-            torch.bfloat16,
-            torch.float16,
-        )
+        if not isinstance(tensor, torch.Tensor):
+            raise VLLMValidationError(
+                "`prompt_embeds` payload did not deserialize to a torch.Tensor.",
+                parameter="prompt_embeds",
+            )
         tensor = tensor.to_dense()
 
     if tensor.dim() > 2:
         tensor = tensor.squeeze(0)
-        assert tensor.dim() == 2
+    if tensor.dim() != 2:
+        raise VLLMValidationError(
+            "`prompt_embeds` must be a 2D tensor of shape "
+            f"(num_tokens, hidden_size); got shape {tuple(tensor.shape)}.",
+            parameter="prompt_embeds",
+        )
+
+    # Pin each tensor to the model's hidden_size. Validating here
+    # also transitively guarantees cross-tensor consistency for requests that
+    # include multiple `prompt_embeds` parts, which is required by downstream
+    # concatenation in `_build_mixed_prompt_embeds`.
+    expected_hidden_size = model_config.get_hidden_size()
+    if tensor.shape[1] != expected_hidden_size:
+        raise VLLMValidationError(
+            f"`prompt_embeds` hidden_size {tensor.shape[1]} does not match "
+            f"the model's hidden_size {expected_hidden_size}.",
+            parameter="prompt_embeds",
+        )
+
+    # Cast to the model's dtype so API clients don't need to know the server's
+    # `--dtype` setting ahead of time. Only floating-point source dtypes are
+    # allowed. integer / bool / complex inputs almost certainly indicate caller
+    # error (e.g. quantized payloads, wrong tensor), and a silent `.to()`
+    # could hide a real mistake.
+    expected_dtype = model_config.dtype
+    if tensor.dtype != expected_dtype:
+        if not tensor.is_floating_point():
+            raise VLLMValidationError(
+                f"`prompt_embeds` dtype {tensor.dtype} is not a floating-point "
+                f"type, cannot safely cast to the model's dtype {expected_dtype}.",
+                parameter="prompt_embeds",
+            )
+        tensor = tensor.to(expected_dtype)
 
     return tensor
+
+
+safe_load_prompt_embeds_async = make_async(safe_load_prompt_embeds)
+"""Async variant of `safe_load_prompt_embeds` that defers the decode to a
+thread-pool executor, so the asyncio event loop is not blocked by the base64
+decode + `torch.load` work."""
diff --git a/vllm/renderers/hf.py b/vllm/renderers/hf.py
index 690ffb2a8954..6425bc647a1c 100644
--- a/vllm/renderers/hf.py
+++ b/vllm/renderers/hf.py
@@ -1,11 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import copy
 import inspect
 import itertools
+import weakref
 from collections import defaultdict, deque
-from collections.abc import Set
+from collections.abc import Sequence
 from functools import lru_cache
-from typing import Any, Literal, cast, overload
+from typing import TYPE_CHECKING, Any, Final, Literal, cast, overload
 
 import jinja2
 import jinja2.ext
@@ -13,34 +17,199 @@
 import jinja2.nodes
 import jinja2.parser
 import jinja2.sandbox
+import torch
+from typing_extensions import override
 
-from vllm.config import ModelConfig, VllmConfig
 from vllm.entrypoints.chat_utils import (
-    ChatCompletionMessageParam,
-    ChatTemplateContentFormat,
-    ChatTemplateContentFormatOption,
+    PROMPT_EMBEDS_PLACEHOLDER_TOKEN,
     ChatTemplateResolutionError,
-    ConversationMessage,
     load_chat_template,
     parse_chat_messages,
     parse_chat_messages_async,
 )
-from vllm.inputs import MultiModalDataDict, MultiModalUUIDDict
+from vllm.inputs import EmbedsPrompt
+from vllm.inputs.engine import MultiModalInput
 from vllm.logger import init_logger
-from vllm.tokenizers.hf import HfTokenizer
+from vllm.multimodal.hasher import MultiModalHasher
+from vllm.multimodal.inputs import (
+    MultiModalFieldElem,
+    MultiModalKwargsItem,
+    MultiModalKwargsItems,
+    MultiModalSharedField,
+    PlaceholderRange,
+)
+from vllm.multimodal.processing.processor import (
+    PromptReplacement,
+    apply_token_matches,
+    find_mm_placeholders,
+)
+from vllm.tokenizers.hf import HfTokenizer, maybe_make_thread_pool
 from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.utils.async_utils import make_async
 from vllm.utils.func_utils import supports_kw
 
 from .base import BaseRenderer
-from .inputs import DictPrompt
 from .inputs.preprocess import parse_dec_only_prompt
-from .params import ChatParams
+
+if TYPE_CHECKING:
+    from collections.abc import Set
+
+    from vllm.config import ModelConfig, VllmConfig
+    from vllm.entrypoints.chat_utils import (
+        ChatCompletionMessageParam,
+        ChatTemplateContentFormat,
+        ChatTemplateContentFormatOption,
+        ConversationMessage,
+    )
+    from vllm.inputs import MultiModalDataDict, MultiModalUUIDDict, TokensPrompt
+    from vllm.inputs.engine import TokensInput
+    from vllm.multimodal.processing.processor import (
+        MultiModalPromptUpdates,
+        ResolvedPromptUpdate,
+    )
+
+    from .inputs import DictPrompt
+    from .params import ChatParams
 
 logger = init_logger(__name__)
 
 
+# Cache of `tokenizer -> prompt_embeds placeholder token ID`. Keyed by the
+# tokenizer object (not `id(tokenizer)`) so a fresh tokenizer landing at a
+# recycled memory address can't pick up a stale tid. Entries evict atomically
+# with the tokenizer's garbage-collection.
+_PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_CACHE: Final[
+    weakref.WeakKeyDictionary[HfTokenizer, int]
+] = weakref.WeakKeyDictionary()
+_PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_ERROR: Final[str] = (
+    "Expected {token!r} to tokenize to exactly 1 token, got {num_ids} ({ids!r})."
+)
+_PROMPT_EMBEDS_PLACEHOLDER_SPAN_MISMATCH_ERROR: Final[str] = (
+    "Expected {expected} prompt_embeds placeholder spans in the "
+    "tokenized prompt, found {actual}."
+)
+_MISSING_PROMPT_TOKEN_IDS_ERROR: Final[str] = (
+    "Expected prompt_token_ids in rendered prompt when prompt_embeds "
+    "are present. This indicates the chat template was invoked with "
+    "tokenize=False."
+)
+_TOKENIZE_OVERRIDE_WARNING: Final[str] = (
+    "Overriding `tokenize=False` to `True` because `prompt_embeds` "
+    "post-processing requires tokenized IDs."
+)
+
+
+def _ensure_prompt_embeds_placeholder_token(tokenizer: HfTokenizer) -> int:
+    """Register `PROMPT_EMBEDS_PLACEHOLDER_TOKEN` as a special token and return
+    its token ID."""
+    cached = _PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_CACHE.get(tokenizer)
+    if cached is not None:
+        return cached
+
+    tokenizer.add_special_tokens(
+        {"additional_special_tokens": [PROMPT_EMBEDS_PLACEHOLDER_TOKEN]}
+    )
+
+    ids = tokenizer.encode(PROMPT_EMBEDS_PLACEHOLDER_TOKEN, add_special_tokens=False)
+    if len(ids) != 1:
+        raise RuntimeError(
+            _PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_ERROR.format(
+                token=PROMPT_EMBEDS_PLACEHOLDER_TOKEN,
+                num_ids=len(ids),
+                ids=ids,
+            )
+        )
+
+    token_id = ids[0]
+    _PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_CACHE[tokenizer] = token_id
+    return token_id
+
+
+def _build_prompt_embeds_updates(
+    prompt_embeds_tensors: Sequence[torch.Tensor],
+    placeholder_token_id: int,
+) -> MultiModalPromptUpdates:
+    """Build `MultiModalPromptUpdates` for `prompt_embeds` expansion.
+
+    Each tensor produces a `PromptReplacement` that maps
+    `[placeholder_token_id]` -> `[placeholder_token_id] x N`
+    (where `N = tensor.shape[0]`).
+    """
+    updates: list[Sequence[ResolvedPromptUpdate]] = []
+    for i, tensor in enumerate(prompt_embeds_tensors):
+        update = PromptReplacement(
+            modality="prompt_embeds",
+            target=[placeholder_token_id],
+            replacement=[placeholder_token_id] * tensor.shape[0],
+        )
+        updates.append([update.resolve(item_idx=i)])
+    return {"prompt_embeds": updates}
+
+
+def _expand_prompt_embeds_placeholders(
+    token_ids: list[int],
+    mm_prompt_updates: MultiModalPromptUpdates,
+) -> list[int]:
+    """Expand each 1-token `prompt_embeds` sentinel into an N-token span.
+
+    Uses `apply_token_matches`.  Each single placeholder token in
+    `token_ids` is replaced with a consecutive span of
+    `tensor.shape[0]` copies, following tensors in order.
+    """
+    expanded, _ = apply_token_matches(token_ids, mm_prompt_updates, tokenizer=None)
+    return expanded
+
+
+def _build_prompt_embeds_positions(
+    token_ids: list[int],
+    num_tensors: int,
+    mm_prompt_updates: MultiModalPromptUpdates,
+) -> list[tuple[int, int]]:
+    """Locate each prompt_embeds placeholder span in `token_ids`.
+
+    Expects `token_ids` to already contain expanded N-token spans.
+    Returns `[(start_idx, length), ...]` aligned with the tensors.
+    """
+    placeholders = find_mm_placeholders(
+        prompt=token_ids,
+        mm_prompt_updates=mm_prompt_updates,
+        tokenizer=None,
+    )
+    features = placeholders.get("prompt_embeds", [])
+
+    if len(features) != num_tensors:
+        raise ValueError(
+            _PROMPT_EMBEDS_PLACEHOLDER_SPAN_MISMATCH_ERROR.format(
+                expected=num_tensors,
+                actual=len(features),
+            )
+        )
+
+    return [(f.start_idx, f.length) for f in features]
+
+
+def _build_mixed_prompt_embeds(
+    token_ids: list[int],
+    prompt_embeds_tensors: Sequence[torch.Tensor],
+    positions: list[tuple[int, int]],
+) -> tuple[torch.Tensor, list[bool]]:
+    """Build the full-length `prompt_embeds` tensor and the `is_token_ids`
+    mask aligned to `token_ids`."""
+    total_len = len(token_ids)
+    hidden_size = prompt_embeds_tensors[0].shape[1]
+    dtype = prompt_embeds_tensors[0].dtype
+
+    full_embeds = torch.zeros(total_len, hidden_size, dtype=dtype)
+    is_token_ids = torch.ones(total_len, dtype=torch.bool)
+
+    for (start, length), tensor in zip(positions, prompt_embeds_tensors, strict=True):
+        full_embeds[start : start + length] = tensor
+        is_token_ids[start : start + length] = False
+
+    return full_embeds, is_token_ids.tolist()
+
+
 _PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], str | None]()
 """
 Used in `_try_get_processor_chat_template` to avoid calling
@@ -98,7 +267,7 @@ def resolve_chat_template(
     chat_template: str | None,
     tools: list[dict[str, Any]] | None,
     *,
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
 ) -> str | None:
     # 1st priority: The given chat template
     if chat_template is not None:
@@ -281,7 +450,7 @@ def _resolve_chat_template_content_format(
     tools: list[dict[str, Any]] | None,
     tokenizer: HfTokenizer,
     *,
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
 ) -> ChatTemplateContentFormat:
     resolved_chat_template = resolve_chat_template(
         tokenizer,
@@ -335,7 +504,7 @@ def resolve_chat_template_content_format(
     given_format: ChatTemplateContentFormatOption,
     tokenizer: HfTokenizer,
     *,
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
 ) -> ChatTemplateContentFormat:
     if given_format != "auto":
         return given_format
@@ -437,7 +606,7 @@ def resolve_chat_template_kwargs(
 
 @overload
 def safe_apply_chat_template(
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
     tokenizer: HfTokenizer,
     conversation: list[ConversationMessage],
     *,
@@ -448,7 +617,7 @@ def safe_apply_chat_template(
 ) -> list[int]: ...
 @overload
 def safe_apply_chat_template(
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
     tokenizer: HfTokenizer,
     conversation: list[ConversationMessage],
     *,
@@ -458,7 +627,7 @@ def safe_apply_chat_template(
     **kwargs,
 ) -> str: ...
 def safe_apply_chat_template(
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
     tokenizer: HfTokenizer,
     conversation: list[ConversationMessage],
     *,
@@ -486,6 +655,14 @@ def safe_apply_chat_template(
         chat_template_kwargs=kwargs,
     )
 
+    # transformers v5 changed the default of `return_dict` to True, which
+    # makes `apply_chat_template(tokenize=True)` return a `BatchEncoding`
+    # instead of `list[int]`. Force `return_dict=False` so downstream code
+    # that expects a flat token list (e.g. `parse_dec_only_prompt`) works
+    # consistently across v4 and v5.
+    if tokenize and "return_dict" not in resolved_kwargs:
+        resolved_kwargs["return_dict"] = False
+
     try:
         return tokenizer.apply_chat_template(
             conversation=conversation,  # type: ignore[arg-type]
@@ -609,6 +786,14 @@ def __init__(
         config: VllmConfig,
         tokenizer: HfTokenizer | None,
     ) -> None:
+        # Ensure the og tokenizer is never modified by maybe_make_thread_pool
+        tokenizer = copy.copy(tokenizer)
+        if (
+            # Skip for mock configs and tokenizers
+            getattr(config.model_config, "enable_prompt_embeds", False)
+            and isinstance(tokenizer, HfTokenizer)
+        ):
+            _ensure_prompt_embeds_placeholder_token(tokenizer)
         super().__init__(config, tokenizer)
 
         self.use_unified_vision_chunk = getattr(
@@ -619,6 +804,11 @@ def __init__(
             safe_apply_chat_template, executor=self._executor
         )
 
+        if self.tokenizer is not None:
+            maybe_make_thread_pool(
+                self.tokenizer, config.model_config.renderer_num_workers + 1
+            )
+
     def render_messages(
         self,
         messages: list[ChatCompletionMessageParam],
@@ -627,6 +817,12 @@ def render_messages(
         model_config = self.model_config
         tokenizer = self.get_tokenizer()
 
+        prompt_embeds_placeholder_token_id: int | None = None
+        if model_config.enable_prompt_embeds:
+            prompt_embeds_placeholder_token_id = (
+                _ensure_prompt_embeds_placeholder_token(tokenizer)
+            )
+
         conversation, mm_data, mm_uuids = parse_chat_messages(
             messages,
             model_config,
@@ -641,11 +837,30 @@ def render_messages(
             mm_processor_kwargs=params.mm_processor_kwargs,
         )
 
+        # prompt_embeds tensors are carried by the tracker through mm_data,
+        # but they must NOT be fed to the MM processor (which would reject
+        # the unknown key). Extract them here.
+        prompt_embeds_tensors: list[torch.Tensor] | None = None
+        if mm_data is not None and "prompt_embeds" in mm_data:
+            prompt_embeds_tensors = list(
+                cast(Sequence[torch.Tensor], mm_data["prompt_embeds"])
+            )
+            mm_data = {k: v for k, v in mm_data.items() if k != "prompt_embeds"}
+            if not mm_data:
+                mm_data = None
+
+        chat_template_kwargs = params.get_apply_chat_template_kwargs()
+        if prompt_embeds_tensors:
+            # prompt_embeds post-processing requires prompt_token_ids.
+            if chat_template_kwargs.get("tokenize") is False:
+                logger.warning_once(_TOKENIZE_OVERRIDE_WARNING)
+            chat_template_kwargs["tokenize"] = True
+
         prompt_raw = safe_apply_chat_template(
             model_config,
             tokenizer,
             conversation,
-            **params.get_apply_chat_template_kwargs(),
+            **chat_template_kwargs,
         )
 
         # NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
@@ -671,6 +886,29 @@ def render_messages(
             )
 
         prompt = parse_dec_only_prompt(prompt_raw)
+
+        # When `prompt_embeds` is mixed with other modality data,
+        # `_process_tokens` runs `_process_multimodal` first (expanding
+        # `<|AUDIO|>` / `<|IMAGE|>` placeholders) and then
+        # `_apply_prompt_embeds_to_engine_input` augments the result.
+        # Stash the tensors and placeholder ID for that override to consume.
+        if prompt_embeds_tensors and mm_data:
+            assert prompt_embeds_placeholder_token_id is not None
+            cast(dict, prompt)["_prompt_embeds"] = (
+                prompt_embeds_tensors,
+                prompt_embeds_placeholder_token_id,
+            )
+            if params.mm_processor_kwargs:
+                cast(dict, prompt)["mm_processor_kwargs"] = params.mm_processor_kwargs
+        elif prompt_embeds_tensors:
+            # Pure mode: no other MM data, mutate prompt to EmbedsPrompt shape.
+            assert prompt_embeds_placeholder_token_id is not None
+            self._apply_prompt_embeds_to_prompt(
+                prompt,
+                prompt_embeds_tensors,
+                prompt_embeds_placeholder_token_id,
+            )
+
         if mm_data is not None:
             prompt["multi_modal_data"] = mm_data
         if mm_uuids is not None:
@@ -686,6 +924,12 @@ async def render_messages_async(
         model_config = self.model_config
         tokenizer = self.get_tokenizer()
 
+        prompt_embeds_placeholder_token_id: int | None = None
+        if model_config.enable_prompt_embeds:
+            prompt_embeds_placeholder_token_id = (
+                _ensure_prompt_embeds_placeholder_token(tokenizer)
+            )
+
         conversation, mm_data, mm_uuids = await parse_chat_messages_async(
             messages,
             model_config,
@@ -700,11 +944,27 @@ async def render_messages_async(
             mm_processor_kwargs=params.mm_processor_kwargs,
         )
 
+        prompt_embeds_tensors: list[torch.Tensor] | None = None
+        if mm_data is not None and "prompt_embeds" in mm_data:
+            prompt_embeds_tensors = list(
+                cast(Sequence[torch.Tensor], mm_data["prompt_embeds"])
+            )
+            mm_data = {k: v for k, v in mm_data.items() if k != "prompt_embeds"}
+            if not mm_data:
+                mm_data = None
+
+        chat_template_kwargs = params.get_apply_chat_template_kwargs()
+        if prompt_embeds_tensors:
+            # prompt_embeds post-processing requires prompt_token_ids.
+            if chat_template_kwargs.get("tokenize") is False:
+                logger.warning_once(_TOKENIZE_OVERRIDE_WARNING)
+            chat_template_kwargs["tokenize"] = True
+
         prompt_raw = await self._apply_chat_template_async(
             model_config,
             tokenizer,
             conversation,
-            **params.get_apply_chat_template_kwargs(),
+            **chat_template_kwargs,
         )
 
         # NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
@@ -728,9 +988,185 @@ async def render_messages_async(
             )
 
         prompt = parse_dec_only_prompt(prompt_raw)
+
+        # See `render_messages` for the rationale.
+        if prompt_embeds_tensors and mm_data:
+            assert prompt_embeds_placeholder_token_id is not None
+            cast(dict, prompt)["_prompt_embeds"] = (
+                prompt_embeds_tensors,
+                prompt_embeds_placeholder_token_id,
+            )
+            if params.mm_processor_kwargs:
+                cast(dict, prompt)["mm_processor_kwargs"] = params.mm_processor_kwargs
+        elif prompt_embeds_tensors:
+            assert prompt_embeds_placeholder_token_id is not None
+            self._apply_prompt_embeds_to_prompt(
+                prompt,
+                prompt_embeds_tensors,
+                prompt_embeds_placeholder_token_id,
+            )
+
         if mm_data is not None:
             prompt["multi_modal_data"] = mm_data
         if mm_uuids is not None:
             prompt["multi_modal_uuids"] = mm_uuids
 
         return conversation, prompt
+
+    @override
+    def _process_tokens(
+        self,
+        prompt: TokensPrompt,
+        *,
+        skip_mm_cache: bool = False,
+    ) -> TokensInput | MultiModalInput:
+        """Pre-expand `prompt_embeds` sentinels before delegating to the MM
+        processor, then attach `prompt_embeds` modality data to the result.
+
+        Mixed mode only: the `_prompt_embeds` stash is set by
+        `render_messages` when `prompt_embeds` co-exist with other MM data
+        (images, audio, …).  We expand each 1-token sentinel to an N-token
+        span *before* calling `super()._process_tokens()` so the MM
+        processor records all placeholder offsets in the final (post-expansion)
+        coordinate space, no offset shifting needed afterwards.
+        """
+        prompt_embeds_info = cast(dict, prompt).pop("_prompt_embeds", None)
+        if prompt_embeds_info is not None:
+            tensors, placeholder_token_id = prompt_embeds_info
+            mm_updates = _build_prompt_embeds_updates(tensors, placeholder_token_id)
+            cast(dict, prompt)["prompt_token_ids"] = _expand_prompt_embeds_placeholders(
+                list(prompt["prompt_token_ids"]), mm_updates
+            )
+        engine_input = super()._process_tokens(prompt, skip_mm_cache=skip_mm_cache)
+        if prompt_embeds_info is not None:
+            tensors, _ = prompt_embeds_info
+            self._apply_prompt_embeds_to_engine_input(
+                cast(MultiModalInput, engine_input),
+                tensors,
+                mm_updates,
+            )
+        return engine_input
+
+    @override
+    async def _process_tokens_async(
+        self,
+        prompt: TokensPrompt,
+        *,
+        skip_mm_cache: bool = False,
+    ) -> TokensInput | MultiModalInput:
+        """Async equivalent of `_process_tokens`."""
+        prompt_embeds_info = cast(dict, prompt).pop("_prompt_embeds", None)
+        if prompt_embeds_info is not None:
+            tensors, placeholder_token_id = prompt_embeds_info
+            mm_updates = _build_prompt_embeds_updates(tensors, placeholder_token_id)
+            cast(dict, prompt)["prompt_token_ids"] = _expand_prompt_embeds_placeholders(
+                list(prompt["prompt_token_ids"]), mm_updates
+            )
+        engine_input = await super()._process_tokens_async(
+            prompt, skip_mm_cache=skip_mm_cache
+        )
+        if prompt_embeds_info is not None:
+            tensors, _ = prompt_embeds_info
+            self._apply_prompt_embeds_to_engine_input(
+                cast(MultiModalInput, engine_input),
+                tensors,
+                mm_updates,
+            )
+        return engine_input
+
+    @staticmethod
+    def _apply_prompt_embeds_to_prompt(
+        prompt: DictPrompt,
+        prompt_embeds_tensors: list[torch.Tensor],
+        placeholder_token_id: int,
+    ) -> None:
+        """Mutate `prompt` from `TokensPrompt` to `EmbedsPrompt` shape.
+
+        Pure `prompt_embeds` path only (no other MM modalities).  Expands
+        each `<prompt_embeds>` sentinel token into an N-token span and builds
+        the full-length `prompt_embeds` tensor + `prompt_is_token_ids` mask
+        that the engine's `enable_prompt_embeds` worker branch consumes.
+        """
+        token_ids = cast(list[int] | None, prompt.get("prompt_token_ids"))
+        if token_ids is None:
+            raise RuntimeError(_MISSING_PROMPT_TOKEN_IDS_ERROR)
+
+        embeds_orig_positions: list[int] = [
+            i for i, tok in enumerate(token_ids) if tok == placeholder_token_id
+        ]
+        if len(embeds_orig_positions) != len(prompt_embeds_tensors):
+            raise ValueError(
+                f"Expected {len(prompt_embeds_tensors)} prompt_embeds "
+                f"placeholder tokens in the rendered prompt, found "
+                f"{len(embeds_orig_positions)}."
+            )
+
+        mm_updates = _build_prompt_embeds_updates(
+            prompt_embeds_tensors, placeholder_token_id
+        )
+        expanded = _expand_prompt_embeds_placeholders(token_ids, mm_updates)
+        positions = _build_prompt_embeds_positions(
+            expanded, len(prompt_embeds_tensors), mm_updates
+        )
+
+        embeds_prompt = cast(EmbedsPrompt, prompt)
+        embeds_prompt["prompt_token_ids"] = expanded
+        full_embeds, is_token_ids_mask = _build_mixed_prompt_embeds(
+            expanded, prompt_embeds_tensors, positions
+        )
+        embeds_prompt["prompt_embeds"] = full_embeds
+        embeds_prompt["prompt_is_token_ids"] = is_token_ids_mask
+
+    @staticmethod
+    def _apply_prompt_embeds_to_engine_input(
+        engine_input: MultiModalInput,
+        prompt_embeds_tensors: list[torch.Tensor],
+        mm_updates: MultiModalPromptUpdates,
+    ) -> None:
+        """Augment `engine_input` in-place with a `prompt_embeds` modality.
+
+        Mixed mode: called after `_process_multimodal` has already run on the
+        pre-expanded token IDs (expansion was done in `_process_tokens` before
+        calling `super()`).  Locates the already-expanded `prompt_embeds` spans
+        and adds `prompt_embeds` entries to `mm_kwargs`, `mm_hashes`, and
+        `mm_placeholders`.
+        """
+        # token_ids already contain the pre-expanded N-token spans.
+        token_ids = list(engine_input["prompt_token_ids"])
+
+        positions = _build_prompt_embeds_positions(
+            token_ids, len(prompt_embeds_tensors), mm_updates
+        )
+
+        pe_kwargs_items: list[MultiModalKwargsItem] = []
+        pe_hashes: list[str] = []
+        pe_placeholders: list[PlaceholderRange] = []
+        for tensor, (start, length) in zip(
+            prompt_embeds_tensors, positions, strict=True
+        ):
+            pe_kwargs_items.append(
+                MultiModalKwargsItem(
+                    {
+                        "embedding": MultiModalFieldElem(
+                            data=tensor,
+                            field=MultiModalSharedField(batch_size=1),
+                        )
+                    }
+                )
+            )
+            pe_hashes.append(MultiModalHasher.hash_kwargs(prompt_embeds=tensor))
+            # `is_embed=None` matches the existing image_embeds-style
+            # "no encoder, just splice the tensor directly" semantics.
+            pe_placeholders.append(
+                PlaceholderRange(offset=start, length=length, is_embed=None)
+            )
+
+        cast(
+            MultiModalKwargsItems[MultiModalKwargsItem | None],
+            engine_input["mm_kwargs"],
+        )["prompt_embeds"] = pe_kwargs_items
+        engine_input["mm_hashes"] = {
+            **engine_input["mm_hashes"],
+            "prompt_embeds": pe_hashes,
+        }
+        cast(dict, engine_input["mm_placeholders"])["prompt_embeds"] = pe_placeholders
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 77fa6402180e..88b1b0b8e8e9 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -25,6 +25,10 @@
 _SAMPLING_EPS = 1e-5
 _MAX_TEMP = 1e-2
 
+MAX_LOGPROB_TOKEN_IDS = 128
+"""Upper bound on `SamplingParams.logprob_token_ids` list length. Must match
+the per-request row width allocated by the sampler's `LogprobTokenIdsState`."""
+
 
 class SamplingType(IntEnum):
     GREEDY = 0
@@ -628,6 +632,16 @@ def bad_words_token_ids(self) -> list[list[int]] | None:
         # For internal use only. Backward compatibility not guaranteed
         return self._bad_words_token_ids
 
+    @property
+    def num_logprobs(self) -> int | None:
+        """Number of sample logprobs to return per output token, or `None` if
+        no sample logprobs were requested. Takes `logprob_token_ids` into
+        account: when `logprobs` is unset but `logprob_token_ids` is set,
+        returns `len(logprob_token_ids)`."""
+        if self.logprobs is not None:
+            return self.logprobs
+        return len(self.logprob_token_ids) if self.logprob_token_ids else None
+
     def clone(self) -> "SamplingParams":
         """If skip_clone is True, uses shallow copy instead of deep copy."""
         if self.skip_clone:
@@ -666,6 +680,17 @@ def _validate_logprobs(self, model_config: ModelConfig) -> None:
                     value=num_logprobs,
                 )
 
+        # Validate logprob_token_ids.
+        if self.logprob_token_ids is not None:
+            n = len(self.logprob_token_ids)
+            if n > MAX_LOGPROB_TOKEN_IDS:
+                raise VLLMValidationError(
+                    f"Requested logprob_token_ids of length {n}, "
+                    f"which is greater than max allowed: {MAX_LOGPROB_TOKEN_IDS}",
+                    parameter="logprob_token_ids",
+                    value=n,
+                )
+
         # Validate prompt logprobs.
         if num_prompt_logprobs := self.prompt_logprobs:
             if num_prompt_logprobs == -1:
diff --git a/vllm/tokenizers/__init__.py b/vllm/tokenizers/__init__.py
index 2daba409881f..6531989a9f35 100644
--- a/vllm/tokenizers/__init__.py
+++ b/vllm/tokenizers/__init__.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from .hf import maybe_make_thread_pool
 from .protocol import TokenizerLike
 from .registry import (
     TokenizerRegistry,
@@ -15,4 +16,5 @@
     "cached_get_tokenizer",
     "get_tokenizer",
     "cached_tokenizer_from_config",
+    "maybe_make_thread_pool",
 ]
diff --git a/vllm/tokenizers/deepseek_v4.py b/vllm/tokenizers/deepseek_v4.py
index 76725dab16a1..2a6aaaf73975 100644
--- a/vllm/tokenizers/deepseek_v4.py
+++ b/vllm/tokenizers/deepseek_v4.py
@@ -40,10 +40,16 @@ def apply_chat_template(
                 messages.insert(0, {"role": "system"})
                 messages[0]["tools"] = tools  # type: ignore[typeddict-unknown-key]
 
-            # The V4 reference currently accepts only "max", "high", or None.
             reasoning_effort = kwargs.get("reasoning_effort")
-            if reasoning_effort not in ("max", "high"):
+            if not isinstance(reasoning_effort, str):
                 reasoning_effort = None
+            elif reasoning_effort == "none":
+                thinking_mode = "chat"
+                reasoning_effort = None
+            elif reasoning_effort in ("max", "xhigh"):
+                reasoning_effort = "max"
+            else:
+                reasoning_effort = "high"
 
             encode_config = dict(
                 thinking_mode=thinking_mode,
diff --git a/vllm/tokenizers/hf.py b/vllm/tokenizers/hf.py
index 85c812398529..b4248e229a68 100644
--- a/vllm/tokenizers/hf.py
+++ b/vllm/tokenizers/hf.py
@@ -2,8 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
 import copy
+import queue
 from pathlib import Path
-from typing import TypeAlias
+from typing import TypeAlias, TypeVar
 
 from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
 
@@ -12,6 +13,92 @@
 from .protocol import TokenizerLike
 
 HfTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast
+_T = TypeVar("_T", bound=TokenizerLike)
+
+
+class ThreadSafeHFTokenizerMixin:
+    """Mixin class for thread-safe HF fast tokenizers."""
+
+    pass
+
+
+def maybe_make_thread_pool(tokenizer: _T, copies: int = 1):
+    """
+    If `tokenizer` is a `PreTrainedTokenizerFast`, modify the tokenizer
+    in-place to make the public interface thread-safe by routing calls
+    through a deep-copied tokenizer pool.
+
+    Note that:
+    - Only ``TokenizerLike``'s public interface is thread-safe.
+      This doesn't include ``_tokenizer`` property nor any mutation
+      methods like ``add_special_tokens`` or ``add_tokens``.
+    - Adjacent method calls could happen on different deep copies.
+    """
+    if not isinstance(tokenizer, PreTrainedTokenizerFast) or isinstance(
+        tokenizer, ThreadSafeHFTokenizerMixin
+    ):
+        return tokenizer
+
+    og_tokenizer = copy.copy(tokenizer)
+
+    tokenizer_pool: queue.Queue[PreTrainedTokenizerFast] = queue.Queue()
+    for _ in range(copies):
+        tokenizer_pool.put(copy.deepcopy(og_tokenizer))
+
+    @contextlib.contextmanager
+    def _borrow_from_pool():
+        try:
+            tok = tokenizer_pool.get_nowait()
+            yield tok
+        except queue.Empty:
+            tok = copy.deepcopy(og_tokenizer)
+            yield tok
+        finally:
+            tokenizer_pool.put(tok)
+
+    class TokenizerPool(tokenizer.__class__, ThreadSafeHFTokenizerMixin):  # type: ignore
+        def apply_chat_template(self, *args, **kwargs):
+            with _borrow_from_pool() as tok:
+                return tok.apply_chat_template(*args, **kwargs)
+
+        def batch_decode(self, *args, **kwargs):
+            with _borrow_from_pool() as tok:
+                return tok.batch_decode(*args, **kwargs)
+
+        def batch_encode(self, *args, **kwargs):
+            with _borrow_from_pool() as tok:
+                return tok.batch_encode(*args, **kwargs)
+
+        def convert_tokens_to_ids(self, *args, **kwargs):
+            with _borrow_from_pool() as tok:
+                return tok.convert_tokens_to_ids(*args, **kwargs)
+
+        def convert_ids_to_tokens(self, *args, **kwargs):
+            with _borrow_from_pool() as tok:
+                return tok.convert_ids_to_tokens(*args, **kwargs)
+
+        def convert_tokens_to_string(self, *args, **kwargs):
+            with _borrow_from_pool() as tok:
+                return tok.convert_tokens_to_string(*args, **kwargs)
+
+        def decode(self, *args, **kwargs):
+            with _borrow_from_pool() as tok:
+                return tok.decode(*args, **kwargs)
+
+        def encode(self, *args, **kwargs):
+            with _borrow_from_pool() as tok:
+                return tok.encode(*args, **kwargs)
+
+        def __call__(self, *args, **kwargs):
+            with _borrow_from_pool() as tok:
+                return tok(*args, **kwargs)
+
+        def __reduce__(self):
+            return maybe_make_thread_pool, (og_tokenizer, copies)
+
+    TokenizerPool.__name__ = f"TokenizerPool{og_tokenizer.__class__.__name__}"
+
+    tokenizer.__class__ = TokenizerPool
 
 
 def get_cached_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
@@ -103,7 +190,10 @@ def from_pretrained(
                     "is a custom tokenizer not yet available in the "
                     "HuggingFace transformers library, consider "
                     "setting `trust_remote_code=True` in LLM or using "
-                    "the `--trust-remote-code` flag in the CLI."
+                    "the `--trust-remote-code` flag in the CLI. If the "
+                    "model was created with a newer version of "
+                    "transformers, consider upgrading: "
+                    "`uv pip install --upgrade transformers`"
                 )
                 raise RuntimeError(err_msg) from e
             else:
diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py
index ef58b1b75d68..8fce690433ef 100644
--- a/vllm/tokenizers/mistral.py
+++ b/vllm/tokenizers/mistral.py
@@ -13,7 +13,6 @@
 from mistral_common.protocol.instruct.request import (
     ReasoningEffort,
 )
-from mistral_common.protocol.instruct.tool_calls import Function, Tool
 from mistral_common.protocol.instruct.validator import ValidationMode
 from mistral_common.tokens.tokenizers.base import (
     SpecialTokenPolicy,
@@ -68,36 +67,6 @@ def _pop_unallowed_keys_and_warn(
             )
 
 
-# TODO(juliendenize): remove this once OpenAI API is better supported by
-# `mistral-common`.
-def adapt_inplace_to_mistral_tool(
-    tool: dict[str, Any],
-) -> dict[str, Any]:
-    tools_fields = set(Tool.model_fields.keys())
-    function_fields = set(Function.model_fields.keys())
-
-    # The Mistral client, in comparison to the OpenAI client, requires the
-    # "parameters" dict and the "description" string to be present
-    # even if they are empty.
-    if function := tool.get("function"):
-        if function.get("parameters") is None:
-            function["parameters"] = {}
-        if function.get("description") is None:
-            function["description"] = ""
-
-        _pop_unallowed_keys_and_warn(
-            dictionary=function,
-            allowed_keys=function_fields,
-            err_dict_name="function",
-        )
-
-    _pop_unallowed_keys_and_warn(
-        dictionary=tool, allowed_keys=tools_fields, err_dict_name="tools"
-    )
-
-    return tool
-
-
 def maybe_serialize_tool_calls(request: "MistralChatCompletionRequest"):
     # SEE: https://github.com/vllm-project/vllm/pull/9951
     # Credits go to: @gcalmettes
@@ -167,12 +136,11 @@ def truncate_tool_call_ids(request: "MistralChatCompletionRequest"):
                 request.messages[i]["tool_call_id"] = tool_call_id
 
 
-def _prepare_apply_chat_template_tools_and_messages(
+def _validate_apply_chat_template_args(
     messages: list["ChatCompletionMessageParam"],
-    tools: list[dict[str, Any]] | None = None,
     continue_final_message: bool = False,
     add_generation_prompt: bool = False,
-) -> tuple[list["ChatCompletionMessageParam"], list[dict[str, Any]] | None]:
+) -> None:
     if add_generation_prompt and continue_final_message:
         raise ValueError(
             "Cannot set both `add_generation_prompt` and "
@@ -196,21 +164,6 @@ def _prepare_apply_chat_template_tools_and_messages(
             "the last message is not from the assistant."
         )
 
-    # mistral-common requires AssistantMessage content to be string [1].
-    #
-    # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
-    for message in messages:
-        # Remove reasoning as unsupported by Mistral
-        _ = message.pop("reasoning", None)  # type: ignore
-
-    tools = (
-        [adapt_inplace_to_mistral_tool(tool=tool) for tool in tools]
-        if tools is not None
-        else None
-    )
-
-    return messages, tools
-
 
 def validate_request_params(request: "ChatCompletionRequest"):
     if request.chat_template is not None or request.chat_template_kwargs is not None:
@@ -449,8 +402,8 @@ def apply_chat_template(
         if self.version >= 15:
             version_kwargs["reasoning_effort"] = kwargs.get("reasoning_effort")
 
-        messages, tools = _prepare_apply_chat_template_tools_and_messages(
-            messages, tools, continue_final_message, add_generation_prompt
+        _validate_apply_chat_template_args(
+            messages, continue_final_message, add_generation_prompt
         )
 
         return self.transformers_tokenizer.apply_chat_template(
diff --git a/vllm/tool_parsers/__init__.py b/vllm/tool_parsers/__init__.py
index 8a39ca825d5f..f64209e535b7 100644
--- a/vllm/tool_parsers/__init__.py
+++ b/vllm/tool_parsers/__init__.py
@@ -38,6 +38,14 @@
         "deepseekv4_tool_parser",
         "DeepSeekV4ToolParser",
     ),
+    "cohere_command3": (
+        "cohere_command_tool_parser",
+        "CohereCommand3ToolParser",
+    ),
+    "cohere_command4": (
+        "cohere_command_tool_parser",
+        "CohereCommand4ToolParser",
+    ),
     "ernie45": (
         "ernie45_tool_parser",
         "Ernie45ToolParser",
@@ -66,6 +74,10 @@
         "hermes_tool_parser",
         "Hermes2ProToolParser",
     ),
+    "poolside_v1": (
+        "poolside_v1_tool_parser",
+        "PoolsideV1ToolParser",
+    ),
     "hunyuan_a13b": (
         "hunyuan_a13b_tool_parser",
         "HunyuanA13BToolParser",
diff --git a/vllm/tool_parsers/abstract_tool_parser.py b/vllm/tool_parsers/abstract_tool_parser.py
index 75181d8dfac6..c3438082a72d 100644
--- a/vllm/tool_parsers/abstract_tool_parser.py
+++ b/vllm/tool_parsers/abstract_tool_parser.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import importlib
+import json
 import os
 from collections.abc import Callable, Sequence
 from functools import cached_property
@@ -13,6 +14,7 @@
 from openai.types.responses.function_tool import FunctionTool
 
 from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionNamedToolChoiceParam,
     ChatCompletionRequest,
     ChatCompletionToolsParam,
 )
@@ -23,6 +25,7 @@
 from vllm.entrypoints.openai.responses.protocol import (
     ResponsesRequest,
 )
+from vllm.envs import VLLM_ENFORCE_STRICT_TOOL_CALLING
 from vllm.logger import init_logger
 from vllm.sampling_params import (
     StructuredOutputsParams,
@@ -83,13 +86,39 @@ def vocab(self) -> dict[str, int]:
         return self.model_tokenizer.get_vocab()
 
     def adjust_request(
-        self, request: ChatCompletionRequest | ResponsesRequest
+        self,
+        request: ChatCompletionRequest | ResponsesRequest,
     ) -> ChatCompletionRequest | ResponsesRequest:
-        """
-        Static method that used to adjust the request parameters.
-        """
+        # If there are no tools, return the request as is.
         if not request.tools:
             return request
+
+        # Step 1 (highest priority for ChatCompletionRequest): apply
+        # vLLM-owned structural tag support for model-specific tool formats.
+        if (
+            isinstance(request, ChatCompletionRequest)
+            and VLLM_ENFORCE_STRICT_TOOL_CALLING
+        ):
+            need_tool_calling = (
+                request.tool_choice == "auto"
+                or request.tool_choice == "required"
+                or isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam)
+            )
+            if need_tool_calling:
+                structure_tag = self.get_structural_tag(request)
+                if structure_tag is not None:
+                    if request.structured_outputs is None:
+                        request.structured_outputs = StructuredOutputsParams(
+                            structural_tag=json.dumps(structure_tag.model_dump()),
+                        )
+                    else:
+                        request.structured_outputs.structural_tag = json.dumps(
+                            structure_tag.model_dump()
+                        )
+                    return request
+
+        # Step 2: set structured output params when tool constraints are
+        # derived from the tool schema.
         json_schema_from_tool = get_json_schema_from_tools(
             tool_choice=request.tool_choice, tools=request.tools
         )
@@ -121,6 +150,9 @@ def adjust_request(
 
         return request
 
+    def get_structural_tag(self, request: ChatCompletionRequest):
+        return None
+
     def extract_tool_calls(
         self, model_output: str, request: ChatCompletionRequest
     ) -> ExtractedToolCallInformation:
diff --git a/vllm/tool_parsers/cohere_command_tool_parser.py b/vllm/tool_parsers/cohere_command_tool_parser.py
new file mode 100644
index 000000000000..0b252ce3177a
--- /dev/null
+++ b/vllm/tool_parsers/cohere_command_tool_parser.py
@@ -0,0 +1,127 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+try:
+    from cohere_melody import PyFilter, PyFilterOptions
+except ImportError as e:
+    raise ImportError(
+        "The Cohere tool parser requires the `cohere_melody` "
+        "package, which is not installed. Install it with:\n"
+        "    pip install cohere_melody"
+    ) from e
+
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.responses.protocol import (
+    ResponsesRequest,
+)
+from vllm.tokenizers import TokenizerLike
+from vllm.tool_parsers import ToolParser
+from vllm.tool_parsers.utils import Tool
+
+
+class BaseCohereCommandToolParser(ToolParser):
+    def __init__(
+        self,
+        tokenizer: TokenizerLike,
+        streaming_opts: PyFilterOptions,
+        unary_opts: PyFilterOptions,
+    ):
+        super().__init__(tokenizer)
+        self.melody_streaming = PyFilter(streaming_opts)
+        self.melody_unary = PyFilter(unary_opts)
+
+    def adjust_request(
+        self, request: ChatCompletionRequest | ResponsesRequest
+    ) -> ChatCompletionRequest | ResponsesRequest:
+        request = super().adjust_request(request)
+        request.skip_special_tokens = False
+        return request
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        r = self.melody_streaming.write_decoded(delta_text)
+        if r.content is not None:
+            return DeltaMessage(content=r.content)
+        if r.reasoning is not None:
+            return DeltaMessage(reasoning=r.reasoning)
+        if r.tool_calls:
+            return DeltaMessage(
+                tool_calls=[
+                    DeltaToolCall(
+                        id=tc.id,
+                        index=tc.index,
+                        type="function",
+                        function=DeltaFunctionCall(
+                            name=tc.name, arguments=tc.arguments
+                        ),
+                    )
+                    for tc in r.tool_calls
+                ]
+            )
+        return None
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        result = self.melody_unary.process_full_text(model_output)
+        tool_calls = [
+            ToolCall(
+                id=tc.id,
+                type="function",
+                function=FunctionCall(name=tc.name, arguments=tc.arguments),
+            )
+            for tc in result.tool_calls
+        ]
+        return ExtractedToolCallInformation(
+            tools_called=len(tool_calls) > 0,
+            tool_calls=tool_calls,
+            content=result.content,
+        )
+
+
+class CohereCommand3ToolParser(BaseCohereCommandToolParser):
+    def __init__(
+        self,
+        tokenizer: TokenizerLike,
+        tools: list[Tool] | None = None,
+    ):
+        super().__init__(
+            tokenizer,
+            streaming_opts=PyFilterOptions().cmd3(),
+            unary_opts=PyFilterOptions().cmd3(),
+        )
+
+
+class CohereCommand4ToolParser(BaseCohereCommandToolParser):
+    def __init__(
+        self,
+        tokenizer: TokenizerLike,
+        tools: list[Tool] | None = None,
+    ):
+        super().__init__(
+            tokenizer,
+            streaming_opts=PyFilterOptions().cmd4(),
+            unary_opts=PyFilterOptions().cmd4(),
+        )
diff --git a/vllm/tool_parsers/deepseekv32_tool_parser.py b/vllm/tool_parsers/deepseekv32_tool_parser.py
index b8623592365c..f01f7f929426 100644
--- a/vllm/tool_parsers/deepseekv32_tool_parser.py
+++ b/vllm/tool_parsers/deepseekv32_tool_parser.py
@@ -69,7 +69,7 @@ def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
             r'<｜DSML｜invoke\s+name="([^"]+)"\s*>(.*?)</｜DSML｜invoke>', re.DOTALL
         )
         self.parameter_complete_regex = re.compile(
-            r'<｜DSML｜parameter\s+name="([^"]+)"\s+string="(?:true|false)"\s*>(.*?)</｜DSML｜parameter>',
+            r'<｜DSML｜parameter\s+name="([^"]+)"\s+string="(true|false)"\s*>(.*?)</｜DSML｜parameter>',
             re.DOTALL,
         )
 
@@ -101,10 +101,12 @@ def _generate_tool_call_id(self) -> str:
         """Generate a unique tool call ID."""
         return f"call_{uuid.uuid4().hex[:24]}"
 
-    def _parse_invoke_params(self, invoke_str: str) -> dict:
-        param_dict = dict()
-        for param_name, param_val in self.parameter_complete_regex.findall(invoke_str):
-            param_dict[param_name] = param_val
+    def _parse_invoke_params(self, invoke_str: str) -> dict[str, tuple[str, str]]:
+        param_dict: dict[str, tuple[str, str]] = {}
+        for param_name, string_attr, param_val in self.parameter_complete_regex.findall(
+            invoke_str
+        ):
+            param_dict[param_name] = (param_val, string_attr)
         return param_dict
 
     def _convert_param_value_checked(self, value: str, param_type: str) -> Any:
@@ -142,10 +144,32 @@ def _convert_param_value(self, value: str, param_type: str | list[str]) -> Any:
         # return value as fallback
         return value
 
+    @staticmethod
+    def _repair_param_dict(
+        param_dict: dict[str, Any],
+        param_config: dict[str, dict],
+    ) -> dict[str, Any]:
+        """Unwrap single 'arguments' / 'input' wrappers when the wrapper
+        is not part of the requested tool schema and the wrapped object
+        matches the schema fields."""
+        allowed = set(param_config.keys())
+        for wrapper in ("arguments", "input"):
+            if set(param_dict.keys()) != {wrapper} or wrapper in allowed:
+                continue
+            inner = param_dict[wrapper]
+            if isinstance(inner, str):
+                try:
+                    inner = json.loads(inner)
+                except json.JSONDecodeError:
+                    return param_dict
+            if isinstance(inner, dict) and set(inner.keys()).issubset(allowed):
+                return inner
+        return param_dict
+
     def _convert_params_with_schema(
         self,
         function_name: str,
-        param_dict: dict[str, str],
+        param_dict: dict[str, tuple[str, str]],
     ) -> dict[str, Any]:
         """Convert raw string param values using the tool schema types."""
         param_config: dict = {}
@@ -162,12 +186,16 @@ def _convert_params_with_schema(
                     break
 
         converted: dict[str, Any] = {}
-        for name, value in param_dict.items():
+        for name, (value, string_attr) in param_dict.items():
+            if string_attr == "true":
+                converted[name] = value
+                continue
+
             param_type = "string"
             if name in param_config and isinstance(param_config[name], dict):
                 param_type = param_config[name].get("type", "string")
             converted[name] = self._convert_param_value(value, param_type)
-        return converted
+        return self._repair_param_dict(converted, param_config)
 
     def extract_tool_calls(
         self,
@@ -191,12 +219,13 @@ def extract_tool_calls(
                     tool_call_match
                 ):
                     param_dict = self._parse_invoke_params(invoke_content)
+                    params = self._convert_params_with_schema(invoke_name, param_dict)
                     tool_calls.append(
                         ToolCall(
                             type="function",
                             function=FunctionCall(
                                 name=invoke_name,
-                                arguments=json.dumps(param_dict, ensure_ascii=False),
+                                arguments=json.dumps(params, ensure_ascii=False),
                             ),
                         )
                     )
diff --git a/vllm/tool_parsers/deepseekv4_tool_parser.py b/vllm/tool_parsers/deepseekv4_tool_parser.py
index 45a9c1302578..e32451cd8bbd 100644
--- a/vllm/tool_parsers/deepseekv4_tool_parser.py
+++ b/vllm/tool_parsers/deepseekv4_tool_parser.py
@@ -1,7 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
 from vllm.tool_parsers.deepseekv32_tool_parser import DeepSeekV32ToolParser
+from vllm.tool_parsers.structural_tag_registry import (
+    get_enable_structured_outputs_in_reasoning,
+    get_model_structural_tag,
+)
 
 
 class DeepSeekV4ToolParser(DeepSeekV32ToolParser):
@@ -14,3 +21,11 @@ class DeepSeekV4ToolParser(DeepSeekV32ToolParser):
 
     tool_call_start_token: str = "<｜DSML｜tool_calls>"
     tool_call_end_token: str = "</｜DSML｜tool_calls>"
+
+    def get_structural_tag(self, request: ChatCompletionRequest):
+        return get_model_structural_tag(
+            model="deepseek_v4",
+            tools=request.tools,
+            tool_choice=request.tool_choice,
+            reasoning=get_enable_structured_outputs_in_reasoning(),
+        )
diff --git a/vllm/tool_parsers/mistral_tool_parser.py b/vllm/tool_parsers/mistral_tool_parser.py
index 94b7b678979c..0a057a3af468 100644
--- a/vllm/tool_parsers/mistral_tool_parser.py
+++ b/vllm/tool_parsers/mistral_tool_parser.py
@@ -43,7 +43,7 @@
 from vllm.reasoning.mistral_reasoning_parser import MistralReasoningParser
 from vllm.sampling_params import StructuredOutputsParams
 from vllm.tokenizers import TokenizerLike
-from vllm.tokenizers.mistral import MistralTokenizer, adapt_inplace_to_mistral_tool
+from vllm.tokenizers.mistral import MistralTokenizer
 from vllm.tool_parsers.abstract_tool_parser import (
     Tool,
     ToolParser,
@@ -241,12 +241,7 @@ def adjust_request(
         )
 
         mistral_tools = (
-            [
-                MistralTool.model_validate(
-                    adapt_inplace_to_mistral_tool(tool.model_dump())
-                )
-                for tool in request.tools
-            ]
+            [MistralTool.from_openai(tool.model_dump()) for tool in request.tools]
             if request.tools is not None
             else None
         )
@@ -623,13 +618,6 @@ def _extract_tool_calls_streaming(
         if len(delta_tool_calls) > 0:
             delta.tool_calls = delta_tool_calls
 
-        # HACK: serving_chat.py inspects the internal state of tool parsers
-        # when determining its final streaming delta, automatically
-        # adding autocompleted JSON.
-        # These two lines avoid that nonsense while ensuring finish_reason
-        # is set to tool_calls when at least one tool is called.
-        if delta_tool_calls and not self.prev_tool_call_arr:
-            self.prev_tool_call_arr = [{"arguments": {}}]
         return delta
 
     def _generate_delta_tool_call(self, delta_text: str) -> list[DeltaToolCall]:
@@ -642,6 +630,8 @@ def _generate_delta_tool_call(self, delta_text: str) -> list[DeltaToolCall]:
             StreamingState.PARSING_ARGUMENTS,
         ] and delta_text.startswith(self.bot_token):
             self.current_tool_id += 1
+            self.streamed_args_for_tool.append("")
+            self.prev_tool_call_arr.append({})
             self.streaming_state = StreamingState.PARSING_NAME
             delta_text = delta_text.replace(self.bot_token, "", 1)
         if self.streaming_state == StreamingState.PARSING_NAME:
@@ -655,6 +645,9 @@ def _generate_delta_tool_call(self, delta_text: str) -> list[DeltaToolCall]:
                 self.current_tool_name += delta_function_name
                 # HF tokenizers may include [ARGS] in the text
                 self.current_tool_name = self.current_tool_name.replace("[ARGS]", "")
+                self.prev_tool_call_arr[self.current_tool_id]["name"] = (
+                    self.current_tool_name
+                )
                 delta_text = delta_text[len(delta_function_name) :]
                 self.streaming_state = StreamingState.PARSING_ARGUMENTS
             else:
@@ -671,6 +664,10 @@ def _generate_delta_tool_call(self, delta_text: str) -> list[DeltaToolCall]:
                 self.streaming_state = StreamingState.TOOL_COMPLETE
             else:
                 delta_arguments = delta_text
+            self.streamed_args_for_tool[self.current_tool_id] += delta_arguments
+            self.prev_tool_call_arr[self.current_tool_id]["arguments"] = (
+                self.streamed_args_for_tool[self.current_tool_id]
+            )
             ret = []
             if self.current_tool_name or delta_arguments:
                 ret += [
@@ -820,9 +817,12 @@ def _extract_tool_calls_streaming_pre_v11_tokenizer(
                     if self.current_tool_mistral_id is not None:
                         current_tool_call.id = self.current_tool_mistral_id
                         self.current_tool_mistral_id = None
+                    self._track_streamed_args_pre_v11(current_tool_call)
                     delta_tool_calls.append(current_tool_call)
                 current_tool_call_modified = False
                 self.current_tool_id += 1
+                self.streamed_args_for_tool.append("")
+                self.prev_tool_call_arr.append({})
                 self.current_tool_mistral_id = MistralToolCall.generate_random_id()
                 current_tool_call = DeltaToolCall(
                     index=self.current_tool_id,
@@ -835,6 +835,9 @@ def _extract_tool_calls_streaming_pre_v11_tokenizer(
                 # we have the complete tool name
                 current_tool_call_modified = True
                 current_tool_call.function.name = self.current_tool_name
+                self.prev_tool_call_arr[self.current_tool_id]["name"] = (
+                    self.current_tool_name
+                )
                 self.current_tool_name = None
             if self.streaming_state == StreamingState.PARSING_NAME_COMPLETED:
                 self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
@@ -860,16 +863,9 @@ def _extract_tool_calls_streaming_pre_v11_tokenizer(
             if self.current_tool_mistral_id is not None:
                 current_tool_call.id = self.current_tool_mistral_id
                 self.current_tool_mistral_id = None
+            self._track_streamed_args_pre_v11(current_tool_call)
             delta_tool_calls.append(current_tool_call)
 
-        # HACK: serving_chat.py inspects the internal state of tool parsers
-        # when determining it's final streaming delta, automatically
-        # adding autocompleted JSON.
-        # These two lines avoid that nonsense while ensuring finish_reason
-        # is set to tool_calls when at least one tool is called.
-        if delta_tool_calls and not self.prev_tool_call_arr:
-            self.prev_tool_call_arr = [{"arguments": {}}]
-
         if content or len(delta_tool_calls) > 0:
             delta_message = DeltaMessage()
             if content:
@@ -883,6 +879,16 @@ def _extract_tool_calls_streaming_pre_v11_tokenizer(
             else:
                 return None
 
+    def _track_streamed_args_pre_v11(self, tool_call: DeltaToolCall) -> None:
+        r"""Accumulate `tool_call` arguments into the streaming state."""
+        if tool_call.function is not None and tool_call.function.arguments is not None:
+            self.streamed_args_for_tool[self.current_tool_id] += (
+                tool_call.function.arguments
+            )
+            self.prev_tool_call_arr[self.current_tool_id]["arguments"] = (
+                self.streamed_args_for_tool[self.current_tool_id]
+            )
+
     def _split_delta(
         self,
         delta_text: str,
diff --git a/vllm/tool_parsers/poolside_v1_tool_parser.py b/vllm/tool_parsers/poolside_v1_tool_parser.py
new file mode 100644
index 000000000000..f14b47362917
--- /dev/null
+++ b/vllm/tool_parsers/poolside_v1_tool_parser.py
@@ -0,0 +1,583 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+GLM-4 Tool Call Parser with incremental string streaming support.
+
+This parser fixes the streaming issue reported in Issue #32829 where long string
+parameters (e.g., file content with 4000+ characters of code) are buffered until
+complete, causing multi-second delays before the user sees any content.
+
+The fix streams string values incrementally as they arrive, providing a true
+streaming experience for long content.
+"""
+
+import ast
+import json
+from collections.abc import Sequence
+from typing import Any
+
+import partial_json_parser.core.complete
+import regex as re
+from partial_json_parser.core.options import Allow
+
+from vllm.entrypoints.chat_utils import make_tool_call_id
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.responses.protocol import (
+    ResponsesRequest,
+)
+from vllm.logger import init_logger
+from vllm.tokenizers import TokenizerLike
+from vllm.tool_parsers.abstract_tool_parser import (
+    Tool,
+    ToolParser,
+)
+
+logger = init_logger(__name__)
+
+
+class PoolsideV1ToolParser(ToolParser):
+    """Tool parser for GLM-4 models with incremental string streaming.
+
+    This parser emits tool-call deltas incrementally as arguments arrive.
+    For string-type parameters, content is streamed character-by-character
+    rather than waiting for the complete </arg_value> tag.
+    """
+
+    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
+        super().__init__(tokenizer, tools)
+        # Stateful streaming fields
+        self.current_tool_name_sent: bool = False
+        self.prev_tool_call_arr: list[dict[str, Any]] = []
+        self.current_tool_id: int = -1
+        self.streamed_args_for_tool: list[str] = []
+
+        self.tool_call_start_token: str = "<tool_call>"
+        self.tool_call_end_token: str = "</tool_call>"
+        self.arg_key_start: str = "<arg_key>"
+        self.arg_key_end: str = "</arg_key>"
+        self.arg_val_start: str = "<arg_value>"
+        self.arg_val_end: str = "</arg_value>"
+
+        self.tool_calls_start_token = self.tool_call_start_token
+
+        self.func_call_regex = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL)
+        self.func_detail_regex = re.compile(
+            r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL
+        )
+        self.func_arg_regex = re.compile(
+            r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL
+        )
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ToolParser "
+                "constructor during construction."
+            )
+
+        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
+        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
+        self._buffer: str = ""
+
+        # Streaming state for incremental tool-call streaming
+        self._in_tool_call: bool = False
+        self._current_tool_name: str | None = None
+        self._pending_key: str | None = None
+        self._streaming_string_value: bool = False
+        self._tool_call_ids: list[str] = []
+        self._args_started: list[bool] = []
+        self._args_closed: list[bool] = []
+        self._seen_keys: list[set[str]] = []
+
+    @staticmethod
+    def _deserialize(value: str) -> Any:
+        try:
+            return json.loads(value)
+        except json.JSONDecodeError:
+            pass
+
+        try:
+            return ast.literal_eval(value)
+        except (ValueError, SyntaxError):
+            pass
+
+        return value
+
+    @staticmethod
+    def _json_escape_string_content(s: str) -> str:
+        """JSON-escape string content for incremental streaming.
+
+        This escapes the content that goes INSIDE a JSON string (between quotes),
+        not including the surrounding quotes themselves.
+        """
+        if not s:
+            return ""
+        return json.dumps(s, ensure_ascii=False)[1:-1]
+
+    @staticmethod
+    def _is_string_type(
+        tool_name: str,
+        arg_name: str,
+        tools: list[Tool] | None,
+    ) -> bool:
+        if tools is None:
+            return False
+        for tool in tools:
+            if tool.function.name != tool_name:
+                continue
+            if tool.function.parameters is None:
+                return False
+            arg_type = (
+                tool.function.parameters.get("properties", {})
+                .get(arg_name, {})
+                .get("type", None)
+            )
+            return arg_type == "string"
+        logger.debug("No tool named '%s'.", tool_name)
+        return False
+
+    @staticmethod
+    def _tools_enabled(request: ChatCompletionRequest) -> bool:
+        """Return whether tool parsing should be applied for this request."""
+        try:
+            tools = getattr(request, "tools", None)
+            tool_choice = getattr(request, "tool_choice", None)
+            return bool(tools) and tool_choice != "none"
+        except Exception:
+            logger.exception("Failed to determine if tools are enabled.")
+            return False
+
+    def adjust_request(
+        self, request: ChatCompletionRequest | ResponsesRequest
+    ) -> ChatCompletionRequest | ResponsesRequest:
+        """Adjust request parameters for tool call token handling."""
+        request = super().adjust_request(request)
+        if request.tools and request.tool_choice != "none":
+            # Ensure tool call tokens (<tool_call>, </tool_call>) are not skipped
+            # during decoding. Even though they are not marked as special tokens,
+            # setting skip_special_tokens=False ensures proper handling in
+            # transformers 5.x where decoding behavior may have changed.
+            request.skip_special_tokens = False
+        return request
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        matched_tool_calls = self.func_call_regex.findall(model_output)
+        logger.debug("model_output: %s", model_output)
+        try:
+            tool_calls: list[ToolCall] = []
+            for match in matched_tool_calls:
+                tc_detail = self.func_detail_regex.search(match)
+                if not tc_detail:
+                    logger.warning(
+                        "Failed to parse tool call details from: %s",
+                        match,
+                    )
+                    continue
+                tc_name = tc_detail.group(1).strip()
+                tc_args = tc_detail.group(2)
+                pairs = self.func_arg_regex.findall(tc_args) if tc_args else []
+                arg_dct: dict[str, Any] = {}
+                for key, value in pairs:
+                    arg_key = key.strip()
+                    arg_val = value.strip()
+                    if not self._is_string_type(tc_name, arg_key, request.tools):
+                        arg_val = self._deserialize(arg_val)
+                    logger.debug("arg_key = %s, arg_val = %s", arg_key, arg_val)
+                    arg_dct[arg_key] = arg_val
+                tool_calls.append(
+                    ToolCall(
+                        type="function",
+                        function=FunctionCall(
+                            name=tc_name,
+                            arguments=json.dumps(arg_dct, ensure_ascii=False),
+                        ),
+                    )
+                )
+        except Exception:
+            logger.exception("Failed to extract tool call spec")
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
+        else:
+            if len(tool_calls) > 0:
+                content: str | None = model_output[
+                    : model_output.find(self.tool_calls_start_token)
+                ]
+                # Normalize empty/whitespace-only content to None
+                if not content or not content.strip():
+                    content = None
+                return ExtractedToolCallInformation(
+                    tools_called=True, tool_calls=tool_calls, content=content
+                )
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        if not self._tools_enabled(request):
+            return DeltaMessage(content=delta_text) if delta_text else None
+
+        self._buffer += delta_text
+
+        pending_deltas: dict[int, DeltaToolCall] = {}
+        content: str | None = None
+
+        while True:
+            if not self._in_tool_call:
+                start_idx = self._buffer.find(self.tool_call_start_token)
+                if start_idx == -1:
+                    # Check for partial start token at end of buffer
+                    for i in range(1, len(self.tool_call_start_token)):
+                        if self._buffer.endswith(self.tool_call_start_token[:i]):
+                            out = self._buffer[:-i]
+                            self._buffer = self._buffer[-i:]
+                            if out:
+                                content = (content or "") + out
+                            break
+                    else:
+                        out = self._buffer
+                        self._buffer = ""
+                        if out:
+                            content = (content or "") + out
+                    break
+
+                if start_idx > 0:
+                    content = (content or "") + self._buffer[:start_idx]
+                    self._buffer = self._buffer[start_idx:]
+
+                self._buffer = self._buffer[len(self.tool_call_start_token) :]
+                self._begin_tool_call()
+                continue
+
+            # Parse tool name first
+            if not self.current_tool_name_sent:
+                nl = self._buffer.find("\n")
+                ak = self._buffer.find(self.arg_key_start)
+                end = self._buffer.find(self.tool_call_end_token)
+                candidates = [i for i in [nl, ak, end] if i != -1]
+                if not candidates:
+                    break
+                cut = min(candidates)
+                tool_name = self._buffer[:cut].strip()
+                if tool_name == "" and cut == end:
+                    # Handle empty tool call like `<tool_call></tool_call>`.
+                    # Consume the tokens and reset state to avoid infinite loop.
+                    self._buffer = self._buffer[end + len(self.tool_call_end_token) :]
+                    self._finish_tool_call()
+                    self._revert_last_tool_call_state()
+                    continue
+
+                if cut == nl:
+                    self._buffer = self._buffer[nl + 1 :]
+                else:
+                    self._buffer = self._buffer[cut:]
+
+                self._current_tool_name = tool_name
+                self.current_tool_name_sent = True
+                self._update_tool_name(pending_deltas, tool_name)
+                continue
+
+            assert self._current_tool_name is not None
+
+            # Handle incremental string value streaming
+            if self._streaming_string_value:
+                val_end = self._buffer.find(self.arg_val_end)
+                if val_end != -1:
+                    raw_content = self._buffer[:val_end]
+                    self._buffer = self._buffer[val_end + len(self.arg_val_end) :]
+                    self._streaming_string_value = False
+                    self._pending_key = None
+
+                    escaped = self._json_escape_string_content(raw_content)
+                    frag = escaped + '"'
+                    self.streamed_args_for_tool[self.current_tool_id] += frag
+                    self._update_tool_args(pending_deltas, frag)
+                    continue
+
+                # Check for partial </arg_value> at end
+                safe_len = len(self._buffer)
+                for i in range(1, len(self.arg_val_end)):
+                    if self._buffer.endswith(self.arg_val_end[:i]):
+                        safe_len = len(self._buffer) - i
+                        break
+
+                if safe_len > 0:
+                    to_emit = self._buffer[:safe_len]
+                    self._buffer = self._buffer[safe_len:]
+                    escaped = self._json_escape_string_content(to_emit)
+                    if escaped:
+                        self.streamed_args_for_tool[self.current_tool_id] += escaped
+                        self._update_tool_args(pending_deltas, escaped)
+                break
+
+            # If we have a pending key, parse its value
+            if self._pending_key is not None:
+                val_pos = self._buffer.find(self.arg_val_start)
+                if val_pos == -1:
+                    break
+                if val_pos > 0:
+                    self._buffer = self._buffer[val_pos:]
+
+                key = (self._pending_key or "").strip()
+
+                is_string = self._is_string_type(
+                    self._current_tool_name, key, request.tools
+                )
+
+                if is_string:
+                    # String type: stream incrementally
+                    self._buffer = self._buffer[len(self.arg_val_start) :]
+
+                    if key in self._seen_keys[self.current_tool_id]:
+                        self._pending_key = None
+                        continue
+
+                    self._seen_keys[self.current_tool_id].add(key)
+                    key_json = json.dumps(key, ensure_ascii=False)
+
+                    if not self._args_started[self.current_tool_id]:
+                        frag = "{" + key_json + ': "'
+                        self._args_started[self.current_tool_id] = True
+                    else:
+                        frag = ", " + key_json + ': "'
+
+                    self.streamed_args_for_tool[self.current_tool_id] += frag
+                    self._streaming_string_value = True
+                    self._update_tool_args(pending_deltas, frag)
+                    continue
+
+                # Non-string type: wait for complete value
+                val_end = self._buffer.find(self.arg_val_end)
+                if val_end == -1:
+                    break
+
+                raw_val = self._buffer[len(self.arg_val_start) : val_end].strip()
+                self._buffer = self._buffer[val_end + len(self.arg_val_end) :]
+                self._pending_key = None
+
+                frag_or_none = self._append_arg_fragment(key=key, raw_val=raw_val)
+                if frag_or_none:
+                    self._update_tool_args(pending_deltas, frag_or_none)
+                continue
+
+            # Parse next arg or close
+            end_pos = self._buffer.find(self.tool_call_end_token)
+            key_pos = self._buffer.find(self.arg_key_start)
+            if end_pos != -1 and (key_pos == -1 or end_pos < key_pos):
+                self._buffer = self._buffer[end_pos + len(self.tool_call_end_token) :]
+                frag_or_none = self._close_args_if_needed()
+                # Finalize prev_tool_call_arr with complete parsed arguments
+                if self._current_tool_name:
+                    try:
+                        full_args_str = self.streamed_args_for_tool[
+                            self.current_tool_id
+                        ]
+                        args_dict = json.loads(full_args_str)
+                        self.prev_tool_call_arr[self.current_tool_id] = {
+                            "name": self._current_tool_name,
+                            "arguments": args_dict,
+                        }
+                    except (json.JSONDecodeError, IndexError) as e:
+                        logger.warning(
+                            "Failed to finalize tool call state for tool %d: %s",
+                            self.current_tool_id,
+                            e,
+                        )
+                self._finish_tool_call()
+                if frag_or_none:
+                    self._update_tool_args(pending_deltas, frag_or_none)
+                continue
+
+            if key_pos == -1:
+                break
+            if key_pos > 0:
+                self._buffer = self._buffer[key_pos:]
+            key_end = self._buffer.find(self.arg_key_end)
+            if key_end == -1:
+                break
+            key = self._buffer[len(self.arg_key_start) : key_end]
+            self._buffer = self._buffer[key_end + len(self.arg_key_end) :]
+            self._pending_key = key
+            continue
+
+        tool_calls = list(pending_deltas.values())
+        if content is None and len(tool_calls) == 0:
+            if request.logprobs:
+                return DeltaMessage(content="")
+            return None
+        return DeltaMessage(content=content, tool_calls=tool_calls)
+
+    def _ensure_tool_state(self) -> None:
+        while len(self._tool_call_ids) <= self.current_tool_id:
+            self._tool_call_ids.append(
+                make_tool_call_id(id_type="random", func_name=None, idx=None)
+            )
+        while len(self.streamed_args_for_tool) <= self.current_tool_id:
+            self.streamed_args_for_tool.append("")
+        while len(self.prev_tool_call_arr) <= self.current_tool_id:
+            self.prev_tool_call_arr.append({})
+        while len(self._args_started) <= self.current_tool_id:
+            self._args_started.append(False)
+        while len(self._args_closed) <= self.current_tool_id:
+            self._args_closed.append(False)
+        while len(self._seen_keys) <= self.current_tool_id:
+            self._seen_keys.append(set())
+
+    def _begin_tool_call(self) -> None:
+        if self.current_tool_id == -1:
+            self.current_tool_id = 0
+        else:
+            self.current_tool_id += 1
+        self._ensure_tool_state()
+        self.current_tool_name_sent = False
+        self._current_tool_name = None
+        self._pending_key = None
+        self._streaming_string_value = False
+        self._in_tool_call = True
+
+    def _finish_tool_call(self) -> None:
+        self._in_tool_call = False
+        self._current_tool_name = None
+        self._pending_key = None
+        self._streaming_string_value = False
+
+    def _revert_last_tool_call_state(self) -> None:
+        """Revert the state allocation for the last tool call."""
+        if self.current_tool_id < 0:
+            return
+        self._tool_call_ids.pop()
+        self.streamed_args_for_tool.pop()
+        self.prev_tool_call_arr.pop()
+        self._args_started.pop()
+        self._args_closed.pop()
+        self._seen_keys.pop()
+        self.current_tool_id -= 1
+
+    def _get_or_create_delta(self, pending: dict[int, DeltaToolCall]) -> DeltaToolCall:
+        idx = self.current_tool_id
+        if idx not in pending:
+            pending[idx] = DeltaToolCall(
+                index=idx,
+                function=DeltaFunctionCall(),
+            )
+        delta = pending[idx]
+        assert delta.function is not None
+        return delta
+
+    def _update_tool_name(
+        self, pending: dict[int, DeltaToolCall], tool_name: str
+    ) -> None:
+        self.prev_tool_call_arr[self.current_tool_id] = {
+            "name": self._current_tool_name,
+            "arguments": {},
+        }
+        delta = self._get_or_create_delta(pending)
+        delta.id = self._tool_call_ids[self.current_tool_id]
+        delta.type = "function"
+        assert delta.function is not None
+        delta.function.name = tool_name
+        if delta.function.arguments is None:
+            delta.function.arguments = ""
+
+    @staticmethod
+    def _complete_json_prefix(
+        json_prefix: str,
+        allowed_partial_types: Allow,
+    ) -> dict | None:
+        """Complete a partial JSON prefix into a valid JSON object.
+
+        Returns (formatted_prefix, parsed_dict) or None on failure.
+
+        Note: ``partial_json_parser`` strips trailing whitespace before
+        parsing (``complete.py:20``), which means the returned slice is
+        shorter than ``json_prefix`` when it has trailing whitespace.
+        Since the parser controls the construction of the json_prefix value,
+        this code relies on it being a valid prefix and we only use the fix for
+        the completion of the JSON object.
+        """
+        try:
+            _, partial_str_completion = partial_json_parser.core.complete.fix(
+                json_prefix,
+                allowed_partial_types,
+            )
+            return json.loads(json_prefix + partial_str_completion)
+        except Exception:
+            return None
+
+    def _update_tool_args(
+        self, pending: dict[int, DeltaToolCall], fragment: str
+    ) -> None:
+        result = self._complete_json_prefix(
+            self.streamed_args_for_tool[self.current_tool_id],
+            Allow.ALL,
+        )
+        if result is not None:
+            self.prev_tool_call_arr[self.current_tool_id]["arguments"] = result
+        delta = self._get_or_create_delta(pending)
+        assert delta.function is not None
+        if delta.function.arguments is None:
+            delta.function.arguments = ""
+        delta.function.arguments += fragment
+
+    def _append_arg_fragment(
+        self,
+        *,
+        key: str,
+        raw_val: str,
+    ) -> str | None:
+        key = key.strip()
+        if not key:
+            return None
+        if key in self._seen_keys[self.current_tool_id]:
+            return None
+
+        # This function is only called for non-string types (already checked
+        # by _is_string_type in the caller), so we always deserialize.
+        val_obj: Any = self._deserialize(raw_val)
+
+        key_json = json.dumps(key, ensure_ascii=False)
+        val_json = json.dumps(val_obj, ensure_ascii=False)
+
+        if not self._args_started[self.current_tool_id]:
+            fragment = "{" + key_json + ": " + val_json
+            self._args_started[self.current_tool_id] = True
+        else:
+            fragment = ", " + key_json + ": " + val_json
+
+        self._seen_keys[self.current_tool_id].add(key)
+        self.streamed_args_for_tool[self.current_tool_id] += fragment
+        return fragment
+
+    def _close_args_if_needed(self) -> str | None:
+        if self._args_closed[self.current_tool_id]:
+            return None
+        self._args_closed[self.current_tool_id] = True
+        if not self._args_started[self.current_tool_id]:
+            fragment = "{}"
+            self.streamed_args_for_tool[self.current_tool_id] = fragment
+        else:
+            fragment = "}"
+            self.streamed_args_for_tool[self.current_tool_id] += fragment
+        return fragment
diff --git a/vllm/tool_parsers/qwen3coder_tool_parser.py b/vllm/tool_parsers/qwen3coder_tool_parser.py
index 7b089ceffbc0..73850b2ab0c5 100644
--- a/vllm/tool_parsers/qwen3coder_tool_parser.py
+++ b/vllm/tool_parsers/qwen3coder_tool_parser.py
@@ -25,12 +25,18 @@
     Tool,
     ToolParser,
 )
+from vllm.tool_parsers.structural_tag_registry import (
+    get_enable_structured_outputs_in_reasoning,
+    get_model_structural_tag,
+)
 from vllm.tool_parsers.utils import find_tool_properties
 
 logger = init_logger(__name__)
 
 
 class Qwen3CoderToolParser(ToolParser):
+    supports_required_and_named: bool = False
+
     def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
         super().__init__(tokenizer, tools)
 
@@ -681,3 +687,11 @@ def extract_tool_calls_streaming(
                 return result
 
         return None
+
+    def get_structural_tag(self, request: ChatCompletionRequest):
+        return get_model_structural_tag(
+            model="qwen_3_5",
+            tools=request.tools,
+            tool_choice=request.tool_choice,
+            reasoning=get_enable_structured_outputs_in_reasoning(),
+        )
diff --git a/vllm/tool_parsers/streaming.py b/vllm/tool_parsers/streaming.py
index fc903328e334..7f6638dcb94e 100644
--- a/vllm/tool_parsers/streaming.py
+++ b/vllm/tool_parsers/streaming.py
@@ -67,10 +67,9 @@ def extract_named_tool_call_streaming(
     tool_call_idx: int | None,
     tool_call_id_type: str,
     tokenizer: "TokenizerLike",
-    tool_call_array_index: int,
-) -> tuple[DeltaMessage, bool, bool]:
+    tool_call_array_index: int = 0,
+) -> tuple[DeltaMessage | None, bool]:
     """Build a streaming tool-call delta for forced named tool choice."""
-    created_new_tool_call = False
     if function_name_returned:
         delta_tool_call = DeltaToolCall(
             function=DeltaFunctionCall(arguments=delta_text),
@@ -95,12 +94,9 @@ def extract_named_tool_call_streaming(
             index=tool_call_array_index,
         )
         function_name_returned = True
-        created_new_tool_call = True
-
     return (
         DeltaMessage(tool_calls=[delta_tool_call]),
         function_name_returned,
-        created_new_tool_call,
     )
 
 
diff --git a/vllm/tool_parsers/structural_tag_registry.py b/vllm/tool_parsers/structural_tag_registry.py
new file mode 100644
index 000000000000..754cc52361c5
--- /dev/null
+++ b/vllm/tool_parsers/structural_tag_registry.py
@@ -0,0 +1,330 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Model-specific structural tag builders adapted from XGrammar's
+# builtin structural tag implementations:
+# https://github.com/mlc-ai/xgrammar/blob/main/python/xgrammar/builtin_structural_tag.py
+
+from collections.abc import Callable
+from typing import Any, Literal
+
+from xgrammar import StructuralTag
+from xgrammar.structural_tag import (
+    AnyTextFormat,
+    ConstStringFormat,
+    JSONSchemaFormat,
+    SequenceFormat,
+    TagFormat,
+    TagsWithSeparatorFormat,
+    TriggeredTagsFormat,
+)
+
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionNamedToolChoiceParam,
+    ChatCompletionToolsParam,
+)
+
+SimplifiedToolChoice = Literal["auto", "required", "forced"]
+ToolChoice = (
+    Literal["none", "auto", "required"] | ChatCompletionNamedToolChoiceParam | None
+)
+StructuralTagBuilder = Callable[
+    [list[ChatCompletionToolsParam], SimplifiedToolChoice, bool],
+    StructuralTag,
+]
+
+_structural_tag_registry: dict[str, StructuralTagBuilder] = {}
+
+
+def register_model_structural_tag(name: str):
+    """Register a vLLM-owned model-specific structural tag builder."""
+
+    def decorator(func: StructuralTagBuilder) -> StructuralTagBuilder:
+        _structural_tag_registry[name] = func
+        return func
+
+    return decorator
+
+
+def get_model_structural_tag(
+    model: str,
+    tools: list[ChatCompletionToolsParam] | None,
+    tool_choice: ToolChoice,
+    reasoning: bool,
+) -> StructuralTag | None:
+    """Build a structural tag from vLLM-owned model-specific builders."""
+
+    builder = _structural_tag_registry.get(model)
+    if builder is None:
+        supported = list(_structural_tag_registry.keys())
+        raise ValueError(f"Unknown format type: {model}, supported types: {supported}")
+
+    normalized_tools, simplified_tool_choice = _normalize_tool_choice(
+        tools=tools,
+        tool_choice=tool_choice,
+    )
+    if not normalized_tools:
+        return None
+
+    return builder(normalized_tools, simplified_tool_choice, reasoning)
+
+
+def _normalize_tool_choice(
+    tools: list[ChatCompletionToolsParam] | None,
+    tool_choice: ToolChoice,
+) -> tuple[list[ChatCompletionToolsParam], SimplifiedToolChoice]:
+    """Normalize vLLM ChatCompletion tool_choice for structural tag builders."""
+
+    if not tools:
+        return [], "auto"
+
+    if tool_choice is None or tool_choice == "none":
+        return [], "auto"
+
+    if tool_choice == "auto":
+        return tools, "auto"
+
+    if tool_choice == "required":
+        return tools, "required"
+
+    if isinstance(tool_choice, ChatCompletionNamedToolChoiceParam):
+        tool_name = tool_choice.function.name
+        filtered_tools = [tool for tool in tools if tool.function.name == tool_name]
+        if not filtered_tools:
+            raise ValueError(
+                f"The tool with name '{tool_name}' is not found in the tools list."
+            )
+        return filtered_tools, "forced"
+
+    raise ValueError(f"Unsupported tool_choice for structural tag: {tool_choice}")
+
+
+def _get_function_parameters(function: Any) -> dict[str, Any] | bool:
+    """Return the JSON schema used for constrained tool arguments."""
+
+    if getattr(function, "strict", None) is False:
+        return True
+    if function.parameters is None:
+        return True
+    return function.parameters
+
+
+_enable_structured_outputs_in_reasoning: bool = False
+
+
+def set_enable_structured_outputs_in_reasoning(enabled: bool) -> None:
+    """Publish the engine's ``enable_in_reasoning`` flag to tool parsers.
+
+    Called once during APIServer startup so request-time parsers can read
+    it without going through the EngineCore-only contextvar.
+    """
+
+    global _enable_structured_outputs_in_reasoning
+    _enable_structured_outputs_in_reasoning = bool(enabled)
+
+
+def get_enable_structured_outputs_in_reasoning() -> bool:
+    """Whether structured outputs are active during the reasoning phase.
+
+    When ``True``, the structural tag will cover the reasoning part:
+    ``<think>...</think>`` prefix (if available); when ``False`` (default), the tag only
+    constrains the post-reasoning suffix.
+    """
+
+    return _enable_structured_outputs_in_reasoning
+
+
+@register_model_structural_tag("deepseek_v4")
+def get_deepseek_v4_structural_tag(
+    tools: list[ChatCompletionToolsParam],
+    tool_choice: SimplifiedToolChoice,
+    reasoning: bool,
+) -> StructuralTag:
+    """Build DeepSeek V4 structural tags."""
+
+    invoke_begin_prefix = '<｜DSML｜invoke name="'
+    invoke_begin_suffix = '">\n'
+    invoke_end = "</｜DSML｜invoke>\n"
+    tool_calls_prefix = "\n\n"
+    function_calls_begin = "<｜DSML｜tool_calls>\n"
+    function_calls_end = "</｜DSML｜tool_calls>"
+    function_calls_trigger = "<｜DSML｜tool_calls>"
+    think_tag_end = "</think>"
+    think_exclude_tokens = ["<think>", "</think>"]
+    xml_style = "deepseek_xml"
+
+    if tool_choice == "auto":
+        tags = []
+        for tool in tools:
+            function = tool.function
+            parameters = _get_function_parameters(function)
+            tags.append(
+                TagFormat(
+                    begin=invoke_begin_prefix + function.name + invoke_begin_suffix,
+                    content=JSONSchemaFormat(
+                        json_schema=parameters,
+                        style=xml_style,
+                    ),
+                    end=invoke_end,
+                )
+            )
+
+        if tags:
+            function_calling_tags = TagsWithSeparatorFormat(
+                tags=tags,
+                separator="\n",
+                at_least_one=True,
+            )
+            suffix_tag = TriggeredTagsFormat(
+                triggers=[function_calls_trigger],
+                tags=[
+                    TagFormat(
+                        begin=function_calls_begin,
+                        content=function_calling_tags,
+                        end=function_calls_end,
+                    )
+                ],
+                excludes=think_exclude_tokens,
+            )
+        else:
+            suffix_tag = AnyTextFormat(excludes=think_exclude_tokens)
+
+    elif tool_choice == "forced":
+        if not tools:
+            raise ValueError("Forced tool choice must resolve to exactly one tool.")
+        function = tools[0].function
+        suffix_tag = SequenceFormat(
+            elements=[
+                ConstStringFormat(value=tool_calls_prefix + function_calls_begin),
+                TagFormat(
+                    begin=invoke_begin_prefix + function.name + invoke_begin_suffix,
+                    content=JSONSchemaFormat(
+                        json_schema=_get_function_parameters(function),
+                        style=xml_style,
+                    ),
+                    end=invoke_end,
+                ),
+                ConstStringFormat(value=function_calls_end),
+            ]
+        )
+
+    elif tool_choice == "required":
+        tags = []
+        for tool in tools:
+            function = tool.function
+            parameters = _get_function_parameters(function)
+            tags.append(
+                TagFormat(
+                    begin=invoke_begin_prefix + function.name + invoke_begin_suffix,
+                    content=JSONSchemaFormat(
+                        json_schema=parameters,
+                        style=xml_style,
+                    ),
+                    end=invoke_end,
+                )
+            )
+        assert len(tags) > 0
+        suffix_tag = SequenceFormat(
+            elements=[
+                ConstStringFormat(value=tool_calls_prefix + function_calls_begin),
+                TagsWithSeparatorFormat(
+                    tags=tags,
+                    separator="\n",
+                    at_least_one=True,
+                ),
+                ConstStringFormat(value=function_calls_end),
+            ]
+        )
+
+    if not reasoning:
+        return StructuralTag(format=suffix_tag)
+
+    prefix_tag = TagFormat(begin="", content=AnyTextFormat(), end=think_tag_end)
+    return StructuralTag(format=SequenceFormat(elements=[prefix_tag, suffix_tag]))
+
+
+@register_model_structural_tag("qwen_3_5")
+def get_qwen_3_5_structural_tag(
+    tools: list[ChatCompletionToolsParam],
+    tool_choice: SimplifiedToolChoice,
+    reasoning: bool,
+) -> StructuralTag:
+    """Build Qwen XML structural tags.
+
+    This format is used for Qwen3-Coder/Qwen3.5/Qwen3.6 and is compatible with
+    Qwen variants that use the same XML tool-call format.
+    """
+    tool_call_begin_prefix = "<tool_call>\n<function="
+    tool_call_begin_suffix = ">\n"
+    tool_call_end = "\n</function>\n</tool_call>"
+    tool_call_trigger = "<tool_call>\n<function="
+    think_tag_end = "</think>"
+    think_suffix = "\n\n"
+    think_exclude_tokens = ["<think>", "</think>"]
+
+    if tool_choice == "auto":
+        tags = []
+        for tool in tools:
+            function = tool.function
+            parameters = _get_function_parameters(function)
+            tags.append(
+                TagFormat(
+                    begin=f"{tool_call_begin_prefix}{function.name}{tool_call_begin_suffix}",
+                    content=JSONSchemaFormat(json_schema=parameters, style="qwen_xml"),
+                    end=tool_call_end,
+                )
+            )
+
+        if tags:
+            suffix_tag = TriggeredTagsFormat(
+                triggers=[tool_call_trigger],
+                tags=tags,
+                excludes=think_exclude_tokens,
+            )
+        else:
+            suffix_tag = AnyTextFormat(excludes=think_exclude_tokens)
+
+    elif tool_choice == "forced":
+        if not tools:
+            raise ValueError("Forced tool choice must resolve to exactly one tool.")
+        function = tools[0].function
+        suffix_tag = TagFormat(
+            begin=f"{tool_call_begin_prefix}{function.name}{tool_call_begin_suffix}",
+            content=JSONSchemaFormat(
+                json_schema=_get_function_parameters(function),
+                style="qwen_xml",
+            ),
+            end=tool_call_end,
+        )
+
+    elif tool_choice == "required":
+        tags = []
+        for tool in tools:
+            function = tool.function
+            parameters = _get_function_parameters(function)
+            tags.append(
+                TagFormat(
+                    begin=f"{tool_call_begin_prefix}{function.name}{tool_call_begin_suffix}",
+                    content=JSONSchemaFormat(json_schema=parameters, style="qwen_xml"),
+                    end=tool_call_end,
+                )
+            )
+        assert len(tags) > 0
+        suffix_tag = TagsWithSeparatorFormat(
+            tags=tags,
+            separator="",
+            at_least_one=True,
+        )
+
+    if not reasoning:
+        result = StructuralTag(format=suffix_tag)
+    else:
+        prefix_tag = SequenceFormat(
+            elements=[
+                TagFormat(begin="", content=AnyTextFormat(), end=think_tag_end),
+                ConstStringFormat(value=think_suffix),
+            ]
+        )
+        result = StructuralTag(format=SequenceFormat(elements=[prefix_tag, suffix_tag]))
+
+    return result
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index bb6ad1056b7b..e6c497c0b450 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -101,6 +101,7 @@ def __getitem__(self, key):
     fireredlid="FireRedLIDConfig",
     funaudiochat="FunAudioChatConfig",
     granite4_vision="Granite4VisionConfig",
+    hyperclovax_vlm="HCXVisionConfig",
     hunyuan_vl="HunYuanVLConfig",
     hy_v3="HYV3Config",
     isaac="IsaacConfig",
@@ -114,6 +115,7 @@ def __getitem__(self, key):
     mlp_speculator="MLPSpeculatorConfig",
     medusa="MedusaConfig",
     midashenglm="MiDashengLMConfig",
+    moondream3="Moondream3Config",
     eagle="EAGLEConfig",
     speculators="SpeculatorsConfig",
     nemotron="NemotronConfig",
@@ -123,10 +125,12 @@ def __getitem__(self, key):
     step3_vl="Step3VLConfig",
     step3_text="Step3TextConfig",
     step3p5="Step3p5Config",
+    qianfan_ocr="QianfanOCRConfig",
     qwen3_asr="Qwen3ASRConfig",
     qwen3_next="Qwen3NextConfig",
     qwen3_5="Qwen3_5Config",
     qwen3_5_moe="Qwen3_5MoeConfig",
+    laguna="LagunaConfig",
     lfm2_moe="Lfm2MoeConfig",
     tarsier2="Tarsier2Config",
 )
@@ -215,8 +219,9 @@ def parse(
             )
         else:
             if model_type in _CONFIG_REGISTRY:
-                # Register the config class to AutoConfig to ensure it's used in future
-                # calls to `from_pretrained`
+                # Register the config class to AutoConfig to ensure it's used
+                # in future calls to `from_pretrained` (e.g. from
+                # AutoTokenizer or AutoProcessor).
                 config_class = _CONFIG_REGISTRY[model_type]
                 config_class.model_type = model_type
                 AutoConfig.register(model_type, config_class, exist_ok=True)
@@ -396,6 +401,57 @@ def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> No
         config.rope_parameters["rope_theta"] = default_theta
 
 
+def patch_legacy_rope_type(rope_parameters: dict[str, Any] | None) -> None:
+    """Patch legacy RoPE type fields for backwards compatibility with
+    older custom models which would otherwise fail to load."""
+
+    # No RoPE parameters to patch
+    if rope_parameters is None:
+        return
+
+    def _patch_legacy_rope_type(rope_parameters: dict[str, Any]) -> None:
+        # Case 1: Both legacy and modern fields present - check for conflicts
+        if "rope_type" in rope_parameters and "type" in rope_parameters:
+            rope_type = rope_parameters["rope_type"]
+            rope_type_legacy = rope_parameters["type"]
+            if (rope_type_legacy == "su" and rope_type == "longrope") or (
+                rope_type_legacy == "mrope" and rope_type == "default"
+            ):
+                pass  # No action needed
+            elif rope_type != rope_type_legacy:
+                raise ValueError(
+                    f"Found conflicts between 'rope_type={rope_type}' (modern "
+                    f"field) and 'type={rope_type_legacy}' (legacy field). "
+                    "You should only specify one of them."
+                )
+        # Case 2: Only legacy field present - patch to modern format with warning
+        if "rope_type" not in rope_parameters and "type" in rope_parameters:
+            rope_parameters["rope_type"] = rope_parameters["type"]
+            logger.info("Replacing legacy 'type' key with 'rope_type'")
+        # Case 3: No rope_type field at all - cannot determine RoPE type, raise error
+        if "rope_type" not in rope_parameters:
+            raise ValueError("rope_parameters should have a 'rope_type' key")
+        # Patch legacy rope_type values with warning
+        if rope_parameters["rope_type"] == "su":
+            rope_parameters["rope_type"] = "longrope"
+            logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
+        elif rope_parameters["rope_type"] == "mrope":
+            if "mrope_section" not in rope_parameters:
+                raise ValueError(
+                    "Legacy rope_type 'mrope' requires "
+                    "'mrope_section' in rope_parameters"
+                )
+            rope_parameters["rope_type"] = "default"
+            logger.warning("Replacing legacy rope_type 'mrope' with 'default'")
+
+    # Handle nested rope_parameters in interleaved sliding attention models
+    if is_rope_parameters_nested(rope_parameters):
+        for rope_parameters_layer_type in rope_parameters.values():
+            _patch_legacy_rope_type(rope_parameters_layer_type)
+    else:
+        _patch_legacy_rope_type(rope_parameters)
+
+
 def patch_rope_parameters(config: PretrainedConfig) -> None:
     """Provide backwards compatibility for RoPE."""
     from vllm.config.utils import getattr_iter
@@ -409,22 +465,28 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
     ompe = getattr(config, "original_max_position_embeddings", None)
 
     if Version(version("transformers")) < Version("5.0.0"):
-        # Transformers v4 installed, legacy config fields may be present
-        if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
-            config.rope_parameters = rope_scaling
-        if (
-            rope_theta is not None
-            or partial_rotary_factor is not None
-            or ompe is not None
-        ) and not getattr(config, "rope_parameters", None):
-            config.rope_parameters = {"rope_type": "default"}
-        # Patch legacy fields into rope_parameters
-        if rope_theta is not None:
-            config.rope_parameters["rope_theta"] = rope_theta
-        if partial_rotary_factor is not None:
-            config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
-        if ompe is not None:
-            config.rope_parameters["original_max_position_embeddings"] = ompe
+        # Transformers v4 installed, legacy config fields may be present.
+        if is_rope_parameters_nested(getattr(config, "rope_parameters", {})):
+            # Loading nested rope_parameters (from Transformers v5) in Transformers v4.
+            # Skip legacy patching since it should already be in the correct format.
+            pass
+        else:
+            if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
+                config.rope_parameters = rope_scaling
+            if (
+                rope_theta is not None
+                or partial_rotary_factor is not None
+                or ompe is not None
+            ) and not getattr(config, "rope_parameters", None):
+                config.rope_parameters = {"rope_type": "default"}
+            # Patch legacy fields into rope_parameters
+            if rope_theta is not None:
+                config.rope_parameters["rope_theta"] = rope_theta
+            if partial_rotary_factor is not None:
+                config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+            if ompe is not None:
+                config.rope_parameters["original_max_position_embeddings"] = ompe
+            patch_legacy_rope_type(getattr(config, "rope_parameters", None))
     elif rope_theta is not None or getattr(config, "rope_parameters", None):
         # Transformers v5 installed
         # Patch these fields in case they used non-standard names
@@ -433,54 +495,10 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
         if partial_rotary_factor is not None:
             config.partial_rotary_factor = partial_rotary_factor
         # Standardize and validate RoPE parameters
+        patch_legacy_rope_type(getattr(config, "rope_parameters", None))
         config.standardize_rope_params()
         config.validate_rope()
 
-    # No RoPE parameters to patch
-    if getattr(config, "rope_parameters", None) is None:
-        return
-
-    # Handle nested rope_parameters in interleaved sliding attention models
-    if is_rope_parameters_nested(config.rope_parameters):
-        for rope_parameters_layer_type in config.rope_parameters.values():
-            patch_rope_parameters_dict(rope_parameters_layer_type)
-    else:
-        patch_rope_parameters_dict(config.rope_parameters)
-
-
-def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None:
-    if "rope_type" in rope_parameters and "type" in rope_parameters:
-        rope_type = rope_parameters["rope_type"]
-        rope_type_legacy = rope_parameters["type"]
-        if (rope_type_legacy == "su" and rope_type == "longrope") or (
-            rope_type_legacy == "mrope" and rope_type == "default"
-        ):
-            pass  # No action needed
-        elif rope_type != rope_type_legacy:
-            raise ValueError(
-                f"Found conflicts between 'rope_type={rope_type}' (modern "
-                f"field) and 'type={rope_type_legacy}' (legacy field). "
-                "You should only specify one of them."
-            )
-
-    if "rope_type" not in rope_parameters and "type" in rope_parameters:
-        rope_parameters["rope_type"] = rope_parameters["type"]
-        logger.info("Replacing legacy 'type' key with 'rope_type'")
-
-    if "rope_type" not in rope_parameters:
-        raise ValueError("rope_parameters should have a 'rope_type' key")
-
-    if rope_parameters["rope_type"] == "su":
-        rope_parameters["rope_type"] = "longrope"
-        logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
-    elif rope_parameters["rope_type"] == "mrope":
-        if "mrope_section" not in rope_parameters:
-            raise ValueError(
-                "Legacy rope_type 'mrope' requires 'mrope_section' in rope_parameters"
-            )
-        rope_parameters["rope_type"] = "default"
-        logger.warning("Replacing legacy rope_type 'mrope' with 'default'")
-
 
 def _uses_mrope(config: PretrainedConfig) -> bool:
     rope_parameters = getattr(config, "rope_parameters", None)
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 667ed5a2596c..c3466fddd65a 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -37,6 +37,7 @@
     "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
     "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
     "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
+    "HCXVisionConfig": "vllm.transformers_utils.configs.hyperclovax",
     "HYV3Config": "vllm.transformers_utils.configs.hy_v3",
     "HyperCLOVAXConfig": "vllm.transformers_utils.configs.hyperclovax",
     "IsaacConfig": "vllm.transformers_utils.configs.isaac",
@@ -45,10 +46,14 @@
     # `FalconConfig` class from the official HuggingFace transformers library.
     "RWConfig": "vllm.transformers_utils.configs.falcon",
     "JAISConfig": "vllm.transformers_utils.configs.jais",
+    "LagunaConfig": "vllm.transformers_utils.configs.laguna",
     "Lfm2MoeConfig": "vllm.transformers_utils.configs.lfm2_moe",
     "MedusaConfig": "vllm.transformers_utils.configs.medusa",
     "MiDashengLMConfig": "vllm.transformers_utils.configs.midashenglm",
     "MLPSpeculatorConfig": "vllm.transformers_utils.configs.mlp_speculator",
+    "Moondream3Config": "vllm.transformers_utils.configs.moondream3",
+    "Moondream3TextConfig": "vllm.transformers_utils.configs.moondream3",
+    "Moondream3VisionConfig": "vllm.transformers_utils.configs.moondream3",
     "MoonViTConfig": "vllm.transformers_utils.configs.moonvit",
     "KimiLinearConfig": "vllm.transformers_utils.configs.kimi_linear",
     "KimiVLConfig": "vllm.transformers_utils.configs.kimi_vl",
@@ -65,6 +70,8 @@
     "Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl",
     "Step3TextConfig": "vllm.transformers_utils.configs.step3_vl",
     "Step3p5Config": "vllm.transformers_utils.configs.step3p5",
+    "QianfanOCRConfig": "vllm.transformers_utils.configs.qianfan_ocr",
+    "QianfanOCRVisionConfig": "vllm.transformers_utils.configs.qianfan_ocr",
     "Qwen3ASRConfig": "vllm.transformers_utils.configs.qwen3_asr",
     "Qwen3NextConfig": "vllm.transformers_utils.configs.qwen3_next",
     "Qwen3_5Config": "vllm.transformers_utils.configs.qwen3_5",
@@ -100,15 +107,20 @@
     "HunYuanVLConfig",
     "HunYuanVLTextConfig",
     "HunYuanVLVisionConfig",
+    "HCXVisionConfig",
     "HYV3Config",
     "HyperCLOVAXConfig",
     "IsaacConfig",
     "RWConfig",
     "JAISConfig",
+    "LagunaConfig",
     "Lfm2MoeConfig",
     "MedusaConfig",
     "MiDashengLMConfig",
     "MLPSpeculatorConfig",
+    "Moondream3Config",
+    "Moondream3TextConfig",
+    "Moondream3VisionConfig",
     "MoonViTConfig",
     "KimiLinearConfig",
     "KimiVLConfig",
@@ -125,6 +137,8 @@
     "Step3VisionEncoderConfig",
     "Step3TextConfig",
     "Step3p5Config",
+    "QianfanOCRConfig",
+    "QianfanOCRVisionConfig",
     "Qwen3ASRConfig",
     "Qwen3NextConfig",
     "Qwen3_5Config",
diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py
index 03f24319e287..3d3e20fea856 100644
--- a/vllm/transformers_utils/configs/deepseek_vl2.py
+++ b/vllm/transformers_utils/configs/deepseek_vl2.py
@@ -101,7 +101,6 @@ class DeepseekVLV2TextConfig(DeepseekV2Config):
 
 class DeepseekVLV2Config(PretrainedConfig):
     model_type = "deepseek_vl_v2"
-    architectures: list[str] | None = None
 
     tile_tag: str = "2D"
     global_view_pos: str = "head"
@@ -114,17 +113,11 @@ def __init__(
         candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),),
         **kwargs,
     ):
-        if "architectures" not in kwargs:
-            kwargs["architectures"] = ["DeepseekVLV2ForCausalLM"]
+        architectures = kwargs.setdefault("architectures", ["DeepseekVLV2ForCausalLM"])
 
-        vision_config = kwargs.pop("vision_config", {})
-        self.vision_config = VisionEncoderConfig(**vision_config)
-
-        projector_config = kwargs.pop("projector_config", {})
-        self.projector_config = MlpProjectorConfig(**projector_config)
-
-        language_config = kwargs.pop("language_config", {})
-        self.text_config = DeepseekVLV2TextConfig(**language_config)
+        self.vision_config = VisionEncoderConfig(**kwargs.pop("vision_config", {}))
+        self.projector_config = MlpProjectorConfig(**kwargs.pop("projector_config", {}))
+        self.text_config = DeepseekVLV2TextConfig(**kwargs.pop("language_config", {}))
 
         self.tile_tag = tile_tag
         self.global_view_pos = global_view_pos
@@ -132,8 +125,8 @@ def __init__(
         self.vocab_size = self.text_config.vocab_size
 
         # update model_type for OCR models
-        if "DeepseekOCRForCausalLM" in kwargs["architectures"]:
-            self.model_type = "deepseek_ocr"
-        elif "DeepseekOCR2ForCausalLM" in kwargs["architectures"]:
-            self.model_type = "deepseek_ocr2"
+        if "DeepseekOCRForCausalLM" in architectures:
+            kwargs["model_type"] = "deepseek_ocr"
+        elif "DeepseekOCR2ForCausalLM" in architectures:
+            kwargs["model_type"] = "deepseek_ocr2"
         super().__init__(**kwargs)
diff --git a/vllm/transformers_utils/configs/hyperclovax.py b/vllm/transformers_utils/configs/hyperclovax.py
index 9fa823743d66..d1a3218fe4dd 100644
--- a/vllm/transformers_utils/configs/hyperclovax.py
+++ b/vllm/transformers_utils/configs/hyperclovax.py
@@ -17,6 +17,7 @@
 # limitations under the License.
 """HyperCLOVA X model configuration."""
 
+from transformers import AutoConfig
 from transformers.configuration_utils import PretrainedConfig
 
 
@@ -275,3 +276,74 @@ def __init__(
             auto_map=auto_map,
             **kwargs,
         )
+
+
+class HCXVisionConfig(PretrainedConfig):
+    """Vendored HyperCLOVAX Vision config with transformers v5 fix.
+
+    The original remote code config does not handle empty initialization
+    (text_config=None), which breaks transformers v5's @strict validation.
+
+    TODO: Remove this class once HyperCLOVAX is upstreamed to transformers.
+    Tracking PR: https://github.com/huggingface/transformers/pull/44956
+    """
+
+    model_type = "hyperclovax_vlm"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    text_config_attribute_map = {
+        "n_embd": "hidden_size",
+        "n_positions": "max_position_embeddings",
+        "n_head": "num_attention_heads",
+        "n_layer": "num_hidden_layers",
+    }
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        use_nth_layer=-2,
+        img_start_id=100009,
+        decoder_max_length=4096,
+        anyres=False,
+        unpad=False,
+        max_num_grids=-1,
+        num_queries_vis_abstractor=-1,
+        ignore_index=-100,
+        proj_pos_emb=True,
+        proj_prenorm=False,
+        use_1x1_grid=False,
+        **kwargs,
+    ):
+        for key, val in self.text_config_attribute_map.items():
+            if text_config is not None and key in text_config:
+                text_config[val] = text_config.pop(key)
+
+        self.text_config = None
+        if text_config is not None:
+            _text_config = AutoConfig.for_model(text_config["model_type"])
+            self.text_config = _text_config.from_dict(text_config)
+            self.hidden_size = self.text_config.hidden_size
+
+        self.vision_config = None
+        if vision_config is not None:
+            _vision_config = AutoConfig.for_model(vision_config["model_type"])
+            self.vision_config = _vision_config.from_dict(vision_config)
+
+        self.use_nth_layer = use_nth_layer
+        self.decoder_max_length = decoder_max_length
+        self.anyres = anyres
+        self.unpad = unpad
+        self.max_num_grids = max_num_grids
+        self.num_queries_vis_abstractor = num_queries_vis_abstractor
+        self.img_start_id = img_start_id
+        self.ignore_index = ignore_index
+        self.proj_pos_emb = proj_pos_emb
+        self.proj_prenorm = proj_prenorm
+        self.use_1x1_grid = use_1x1_grid
+        super().__init__(**kwargs)
+
+    def get_text_config(self, decoder=False):
+        if self.text_config is not None:
+            return self.text_config
+        return self
diff --git a/vllm/transformers_utils/configs/laguna.py b/vllm/transformers_utils/configs/laguna.py
new file mode 100644
index 000000000000..2702d3af5aa1
--- /dev/null
+++ b/vllm/transformers_utils/configs/laguna.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class LagunaConfig(PretrainedConfig):
+    model_type = "laguna"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.g_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size: int = 100352,
+        hidden_size: int = 2048,
+        intermediate_size: int = 8192,
+        num_hidden_layers: int = 40,
+        num_attention_heads: int = 48,
+        num_key_value_heads: int = 8,
+        head_dim: int = 128,
+        qkv_bias: bool = False,
+        attention_bias: bool = False,
+        gating: bool | str = True,
+        hidden_act: str = "silu",
+        max_position_embeddings: int = 131072,
+        initializer_range: float = 0.02,
+        rms_norm_eps: float = 1e-6,
+        use_cache: bool = True,
+        tie_word_embeddings: bool = False,
+        rope_theta: float = 500000.0,
+        rope_scaling: dict | None = None,
+        rope_parameters: dict | None = None,
+        partial_rotary_factor: float = 1.0,
+        attention_dropout: float = 0.0,
+        sliding_window: int | None = None,
+        layer_types: list[str] | None = None,
+        swa_attention_sink_enabled: bool = False,
+        swa_rope_parameters: dict | None = None,
+        num_attention_heads_per_layer: list[int] | None = None,
+        num_experts: int = 256,
+        num_experts_per_tok: int = 8,
+        moe_intermediate_size: int = 512,
+        shared_expert_intermediate_size: int = 512,
+        norm_topk_prob: bool = True,
+        decoder_sparse_step: int = 1,
+        mlp_only_layers: list[int] | None = None,
+        router_aux_loss_coef: float = 0.001,
+        output_router_logits: bool = False,
+        moe_routed_scaling_factor: float = 1.0,
+        moe_apply_router_weight_on_input: bool = False,
+        **kwargs,
+    ):
+        if mlp_only_layers is None:
+            mlp_only_layers = [0]
+
+        # Accept either v4-style (rope_theta + rope_scaling) or v5-style
+        # (rope_parameters). Translate v5 → v4 so downstream code has one path.
+        if rope_parameters is not None:
+            rp = dict(rope_parameters)
+            rope_theta = float(rp.pop("rope_theta", rope_theta))
+            rt = rp.pop("rope_type", None)
+            if rt is not None and rt != "default":
+                rope_scaling = {"rope_type": rt, **rp}
+            elif rp and rope_scaling is None:
+                rope_scaling = {"rope_type": "default", **rp}
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.qkv_bias = qkv_bias
+        self.attention_bias = attention_bias
+        self.gating = gating
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.partial_rotary_factor = partial_rotary_factor
+        self.attention_dropout = attention_dropout
+        self.sliding_window = sliding_window
+        self.layer_types = layer_types
+        self.swa_attention_sink_enabled = swa_attention_sink_enabled
+        self.swa_rope_parameters = swa_rope_parameters
+        self.num_attention_heads_per_layer = num_attention_heads_per_layer
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_intermediate_size = moe_intermediate_size
+        self.shared_expert_intermediate_size = shared_expert_intermediate_size
+        self.norm_topk_prob = norm_topk_prob
+        self.decoder_sparse_step = decoder_sparse_step
+        self.mlp_only_layers = mlp_only_layers
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.output_router_logits = output_router_logits
+        self.moe_routed_scaling_factor = moe_routed_scaling_factor
+        self.moe_apply_router_weight_on_input = moe_apply_router_weight_on_input
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+__all__ = ["LagunaConfig"]
diff --git a/vllm/transformers_utils/configs/moondream3.py b/vllm/transformers_utils/configs/moondream3.py
new file mode 100644
index 000000000000..307bb2977206
--- /dev/null
+++ b/vllm/transformers_utils/configs/moondream3.py
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Configuration for Moondream3 model."""
+
+from transformers import PretrainedConfig
+
+
+class Moondream3VisionConfig(PretrainedConfig):
+    """Vision encoder configuration for Moondream3."""
+
+    model_type = "moondream3_vision"
+
+    def __init__(
+        self,
+        enc_dim: int = 1152,
+        enc_patch_size: int = 14,
+        enc_n_layers: int = 27,
+        enc_ff_dim: int = 4304,
+        enc_n_heads: int = 16,
+        proj_inner_dim: int = 8192,
+        crop_size: int = 378,
+        max_crops: int = 12,
+        overlap_margin: int = 4,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.enc_dim = enc_dim
+        self.enc_patch_size = enc_patch_size
+        self.enc_n_layers = enc_n_layers
+        self.enc_ff_dim = enc_ff_dim
+        self.enc_n_heads = enc_n_heads
+        self.proj_inner_dim = proj_inner_dim
+        self.crop_size = crop_size
+        self.max_crops = max_crops
+        self.overlap_margin = overlap_margin
+
+        # Standard HuggingFace attributes for vision config
+        self.hidden_size = enc_dim
+        self.num_attention_heads = enc_n_heads
+        self.num_hidden_layers = enc_n_layers
+        self.intermediate_size = enc_ff_dim
+        self.patch_size = enc_patch_size
+        self.image_size = crop_size
+
+
+class Moondream3TextConfig(PretrainedConfig):
+    """Text decoder configuration for Moondream3."""
+
+    model_type = "moondream3_text"
+
+    def __init__(
+        self,
+        dim: int = 2048,
+        ff_dim: int = 8192,
+        n_layers: int = 24,
+        vocab_size: int = 51200,
+        max_context: int = 4096,
+        n_heads: int = 32,
+        n_kv_heads: int = 32,
+        prefix_attn: int = 730,
+        rope_theta: float = 1500000.0,
+        moe: dict | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Store original moondream3 config names
+        self.dim = dim
+        self.ff_dim = ff_dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.n_kv_heads = n_kv_heads
+        self.prefix_attn = prefix_attn
+        self.max_context = max_context
+        self.rope_theta = rope_theta
+
+        # MoE config
+        moe = moe or {}
+        self.moe_start_layer = moe.get("start_layer", 4)
+        self.moe_num_experts = moe.get("n_experts", 64)
+        self.moe_experts_per_token = moe.get("n_experts_per_tok", 8)
+        self.moe_expert_inner_dim = moe.get("expert_inner_dim", 1024)
+
+        # Standard HuggingFace attributes (required by vLLM)
+        self.hidden_size = dim
+        self.num_attention_heads = n_heads
+        self.num_key_value_heads = n_kv_heads
+        self.num_hidden_layers = n_layers
+        self.intermediate_size = ff_dim
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_context
+
+        # Moondream3 uses token 0 (<|endoftext|>) as both BOS and EOS.
+        # Token 3 (<|md_reserved_2|>) is an answer delimiter that the model
+        # implementation suppresses during generation.
+        self.bos_token_id = 0
+        self.eos_token_id = 0
+
+        # MoE standard attributes
+        self.num_local_experts = self.moe_num_experts
+        self.num_experts_per_tok = self.moe_experts_per_token
+
+
+class Moondream3Config(PretrainedConfig):
+    """Combined configuration for Moondream3 multimodal model."""
+
+    model_type = "moondream3"
+    is_composition = True
+
+    def __init__(
+        self,
+        config: dict | None = None,
+        **kwargs,
+    ):
+        config = config or {}
+
+        # Parse text config
+        text_config = config.get("text", {})
+        self.text_config: Moondream3TextConfig = Moondream3TextConfig(**text_config)
+
+        # Parse vision config
+        vision_config = config.get("vision", {})
+        self.vision_config = Moondream3VisionConfig(**vision_config)
+
+        # Store the original config dict for model access
+        self.config = config
+        tokenizer_config = config.get("tokenizer", {})
+        self.answer_token_id = tokenizer_config.get("answer_id", 3)
+
+        super().__init__(**kwargs)
+
+        # Expose key attributes at top level for vLLM compatibility
+        self.hidden_size = self.text_config.hidden_size
+        self.num_attention_heads = self.text_config.num_attention_heads
+        self.num_key_value_heads = self.text_config.num_key_value_heads
+        self.num_hidden_layers = self.text_config.num_hidden_layers
+        self.vocab_size = self.text_config.vocab_size
+        self.intermediate_size = self.text_config.intermediate_size
+
+        # Moondream3 uses token 0 (<|endoftext|>) as both BOS and EOS.
+        # Token 3 (<|md_reserved_2|>) is an answer delimiter that the model
+        # implementation suppresses during generation.
+        self.bos_token_id = 0
+        self.eos_token_id = 0
+
+    def get_text_config(self, decoder: bool = False) -> "Moondream3TextConfig":
+        """Return the text config for vLLM's text_config detection.
+
+        Args:
+            decoder: Ignored. Only used for encoder-decoder models.
+        """
+        return self.text_config
diff --git a/vllm/transformers_utils/configs/qianfan_ocr.py b/vllm/transformers_utils/configs/qianfan_ocr.py
new file mode 100644
index 000000000000..da004bb90f4f
--- /dev/null
+++ b/vllm/transformers_utils/configs/qianfan_ocr.py
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Any
+
+from transformers import PretrainedConfig
+from transformers.models.auto import CONFIG_MAPPING
+
+
+class QianfanOCRVisionConfig(PretrainedConfig):
+    model_type = "qianfan_ocr_vision"
+
+    def __init__(
+        self,
+        hidden_size: int = 1024,
+        intermediate_size: int = 4096,
+        num_hidden_layers: int = 24,
+        num_attention_heads: int = 16,
+        num_channels: int = 3,
+        image_size: int = 448,
+        patch_size: int = 14,
+        hidden_act: str = "gelu",
+        layer_norm_eps: float = 1e-6,
+        attention_dropout: float = 0.0,
+        drop_path_rate: float = 0.1,
+        qkv_bias: bool = True,
+        qk_normalization: bool = False,
+        norm_type: str = "layer_norm",
+        initializer_range: float = 0.02,
+        initializer_factor: float = 0.1,
+        use_mask_token: bool = False,
+        use_mean_pooling: bool = True,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.attention_dropout = attention_dropout
+        self.drop_path_rate = drop_path_rate
+        self.qkv_bias = qkv_bias
+        self.qk_normalization = qk_normalization
+        self.norm_type = norm_type
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.use_mask_token = use_mask_token
+        self.use_mean_pooling = use_mean_pooling
+
+
+class QianfanOCRConfig(PretrainedConfig):
+    model_type = "qianfan_ocr"
+
+    def __init__(
+        self,
+        vision_config: dict | None = None,
+        text_config: dict | None = None,
+        downsample_ratio: float = 0.5,
+        dynamic_image_size: bool = True,
+        force_image_size: int = 448,
+        image_token_id: int = 151671,
+        max_dynamic_patch: int = 12,
+        min_dynamic_patch: int = 1,
+        pad2square: bool = False,
+        ps_version: str = "v2",
+        select_layer: int = -1,
+        template: str = "internvl2_5",
+        use_thumbnail: bool = True,
+        tie_word_embeddings: bool = False,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+
+        if isinstance(vision_config, dict):
+            self.vision_config = QianfanOCRVisionConfig(**vision_config)
+        elif vision_config is None:
+            self.vision_config = QianfanOCRVisionConfig()
+        else:
+            self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            model_type = text_config.get("model_type", "qwen3")
+            self.text_config = CONFIG_MAPPING[model_type](**text_config)
+        elif text_config is None:
+            self.text_config = CONFIG_MAPPING["qwen3"]()
+        else:
+            self.text_config = text_config
+
+        self.downsample_ratio = downsample_ratio
+        self.dynamic_image_size = dynamic_image_size
+        self.force_image_size = force_image_size
+        self.image_token_id = image_token_id
+        self.max_dynamic_patch = max_dynamic_patch
+        self.min_dynamic_patch = min_dynamic_patch
+        self.pad2square = pad2square
+        self.ps_version = ps_version
+        self.select_layer = select_layer
+        self.template = template
+        self.use_thumbnail = use_thumbnail
+        self.tie_word_embeddings = tie_word_embeddings
diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py
index 2823e2cc4832..35fa1313d1e7 100644
--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -267,6 +267,7 @@ def is_mm_prefix_lm(self) -> bool:
             "bagel",
             "gemma3",
             "molmo2",
+            "moondream3",
             "paligemma",
             "umm",
         )
@@ -351,6 +352,9 @@ def get_total_num_kv_heads(self) -> int:
         )
         return enc_num_kv_heads
 
+    def is_mm_prefix_lm(self) -> bool:
+        return False
+
 
 class MambaModelArchConfigConvertor(ModelArchConfigConvertorBase):
     def get_head_size(self) -> int:
@@ -508,6 +512,18 @@ def get_num_hidden_layers(self) -> int:
         return getattr(self.hf_text_config, "num_nextn_predict_layers", 1)
 
 
+class Gemma4MTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_hidden_size(self) -> int:
+        # The speculator buffer must match the backbone (target) model's
+        # hidden dimension, not the draft model's smaller dimension.
+        return getattr(
+            self.hf_config, "backbone_hidden_size", super().get_hidden_size()
+        )
+
+    def get_num_hidden_layers(self) -> int:
+        return getattr(self.hf_text_config, "num_hidden_layers", 0)
+
+
 class Gemma4ModelArchConfigConvertor(ModelArchConfigConvertorBase):
     def is_mm_prefix_lm(self) -> bool:
         return (
@@ -537,6 +553,7 @@ def get_head_size(self) -> int:
     "falcon": FalconModelArchConfigConvertor,
     "gemma4": Gemma4ModelArchConfigConvertor,
     "gemma4_text": Gemma4ModelArchConfigConvertor,
+    "gemma4_mtp": Gemma4MTPModelArchConfigConvertor,
     "RefinedWeb": FalconModelArchConfigConvertor,
     "RefinedWebModel": FalconModelArchConfigConvertor,
     "nemotron-nas": NemotronNasModelArchConfigConvertor,
diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py
index 546a5c453292..4f44591b16d1 100644
--- a/vllm/transformers_utils/processors/__init__.py
+++ b/vllm/transformers_utils/processors/__init__.py
@@ -23,6 +23,7 @@
     "H2OVLProcessor",
     "HunYuanVLProcessor",
     "HunYuanVLImageProcessor",
+    "Moondream3Processor",
     "InternVLProcessor",
     "IsaacProcessor",
     "KimiAudioProcessor",
@@ -61,6 +62,7 @@
     "MiMoOmniProcessor": "vllm.transformers_utils.processors.mimo_v2_omni",
     "MistralCommonPixtralProcessor": "vllm.transformers_utils.processors.pixtral",
     "MistralCommonVoxtralProcessor": "vllm.transformers_utils.processors.voxtral",
+    "Moondream3Processor": "vllm.transformers_utils.processors.moondream3",
     "NanoNemotronVLProcessor": "vllm.transformers_utils.processors.nano_nemotron_vl",
     "NemotronVLProcessor": "vllm.transformers_utils.processors.nemotron_vl",
     "LlamaNemotronVLEmbedProcessor": "vllm.transformers_utils.processors.nemotron_vl",
diff --git a/vllm/transformers_utils/processors/moondream3.py b/vllm/transformers_utils/processors/moondream3.py
new file mode 100644
index 000000000000..289c40dd175e
--- /dev/null
+++ b/vllm/transformers_utils/processors/moondream3.py
@@ -0,0 +1,541 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Custom processor for Moondream3 model."""
+
+import math
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import AutoProcessor, BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.tokenization_utils_base import (
+    PreTokenizedInput,
+    PreTrainedTokenizerBase,
+    TextInput,
+)
+
+from vllm.multimodal.image import convert_image_mode
+
+__all__ = ["Moondream3Processor"]
+
+
+class Moondream3ProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+        },
+        "images_kwargs": {
+            "max_crops": 12,
+            "overlap_margin": 4,
+            "crop_size": 378,
+            "patch_size": 14,
+            "convert_to_rgb": True,
+            "return_tensors": "pt",
+        },
+    }
+
+
+def select_tiling(
+    height: int, width: int, crop_size: int, max_crops: int
+) -> tuple[int, int]:
+    """Determine the optimal number of tiles to cover an image."""
+    if height <= crop_size or width <= crop_size:
+        return (1, 1)
+
+    min_h = math.ceil(height / crop_size)
+    min_w = math.ceil(width / crop_size)
+
+    if min_h * min_w > max_crops:
+        ratio = math.sqrt(max_crops / (min_h * min_w))
+        return (max(1, math.floor(min_h * ratio)), max(1, math.floor(min_w * ratio)))
+
+    h_tiles = math.floor(math.sqrt(max_crops * height / width))
+    w_tiles = math.floor(math.sqrt(max_crops * width / height))
+
+    h_tiles = max(h_tiles, min_h)
+    w_tiles = max(w_tiles, min_w)
+
+    if h_tiles * w_tiles > max_crops:
+        if w_tiles > h_tiles:
+            w_tiles = math.floor(max_crops / h_tiles)
+        else:
+            h_tiles = math.floor(max_crops / w_tiles)
+
+    return (max(1, h_tiles), max(1, w_tiles))
+
+
+class Moondream3Processor(ProcessorMixin):
+    """
+    Constructs a Moondream3 processor which handles image preprocessing
+    and tokenization for the Moondream3 multimodal model.
+
+    Args:
+        tokenizer: The tokenizer to use for text processing.
+        chat_template: Optional chat template string.
+        crop_size: Size of each image crop.
+        max_crops: Maximum number of crops per image.
+        overlap_margin: Margin for overlapping crops in patches.
+        patch_size: Size of each patch.
+    """
+
+    attributes = ["tokenizer"]
+    valid_kwargs = [
+        "chat_template",
+        "crop_size",
+        "max_crops",
+        "overlap_margin",
+        "patch_size",
+    ]
+
+    tokenizer_class = "AutoTokenizer"
+    # Use separate tokenizer repo
+    _tokenizer_repo = "moondream/starmie-v1"
+
+    # Default chat template for Moondream3
+    # Moondream uses special tokens for prompting:
+    # - Token 0 (<|endoftext|>): BOS token (ALWAYS present at position 0)
+    # - Token 1 (<|md_reserved_0|>): Start of instruction
+    # - Token 2 (<|md_reserved_1|>): Separator before question
+    # - Token 3 (<|md_reserved_2|>): End of question / start of answer
+    #
+    # Task routing based on text prefix:
+    #   "caption [short|normal|long]" → describe<|md_reserved_1|>{length}
+    #   "describe [short|normal|long]" → describe<|md_reserved_1|>{length}
+    #   otherwise                      → query<|md_reserved_1|><text>
+    #
+    # Format with image:
+    #   <|endoftext|><image><|md_reserved_0|>{task}<|md_reserved_1|>{q}<|md_reserved_2|>
+    # Format without image:
+    #   <|endoftext|><|md_reserved_0|>{task}<|md_reserved_1|>{q}<|md_reserved_2|>
+    _default_chat_template = (
+        "{% for message in messages %}"
+        "{% if message['role'] == 'user' %}"
+        "{% if message['content'] is string %}"
+        # Simple string content (with image assumed) - route by prefix
+        "<|endoftext|><image><|md_reserved_0|>"
+        "{% if message['content'] == 'caption' %}"
+        "describe<|md_reserved_1|>normal<|md_reserved_2|>"
+        "{% elif message['content'].startswith('caption ') %}"
+        "describe<|md_reserved_1|>{{ message['content'][8:] }}<|md_reserved_2|>"
+        "{% elif message['content'] == 'describe' %}"
+        "describe<|md_reserved_1|>normal<|md_reserved_2|>"
+        "{% elif message['content'].startswith('describe ') %}"
+        "describe<|md_reserved_1|>{{ message['content'][9:] }}<|md_reserved_2|>"
+        "{% else %}"
+        "query<|md_reserved_1|>{{ message['content'] }}<|md_reserved_2|>"
+        "{% endif %}"
+        "{% else %}"
+        # List content - build Moondream's image prefix independently of
+        # OpenAI-style content part order, then render the text task.
+        "<|endoftext|>"
+        "{% for content in message['content'] %}"
+        "{% if content['type'] in ['image', 'image_url', 'input_image', 'image_pil'] %}"  # noqa: E501
+        "<image>"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% for content in message['content'] %}"
+        "{% if content['type'] == 'text' %}"
+        "<|md_reserved_0|>"
+        "{% if content['text'] == 'caption' %}"
+        "describe<|md_reserved_1|>normal<|md_reserved_2|>"
+        "{% elif content['text'].startswith('caption ') %}"
+        "describe<|md_reserved_1|>{{ content['text'][8:] }}<|md_reserved_2|>"
+        "{% elif content['text'] == 'describe' %}"
+        "describe<|md_reserved_1|>normal<|md_reserved_2|>"
+        "{% elif content['text'].startswith('describe ') %}"
+        "describe<|md_reserved_1|>{{ content['text'][9:] }}<|md_reserved_2|>"
+        "{% else %}"
+        "query<|md_reserved_1|>{{ content['text'] }}<|md_reserved_2|>"
+        "{% endif %}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% endif %}"
+        "{% elif message['role'] == 'assistant' %}"
+        "{{ message['content'] }}"
+        "{% endif %}"
+        "{% endfor %}"
+    )
+
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase | None = None,
+        chat_template: str | None = None,
+        crop_size: int = 378,
+        max_crops: int = 12,
+        overlap_margin: int = 4,
+        patch_size: int = 14,
+        **kwargs,
+    ):
+        self.image_token = "<image>"
+        self.crop_size = crop_size
+        self.max_crops = max_crops
+        self.overlap_margin = overlap_margin
+        self.patch_size = patch_size
+
+        # Number of patches per crop (27x27 = 729 for 378/14)
+        self.patches_per_crop = (crop_size // patch_size) ** 2
+
+        # Use default chat template if none provided
+        if chat_template is None:
+            chat_template = self._default_chat_template
+
+        super().__init__(tokenizer, chat_template=chat_template)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path,
+        **kwargs,
+    ):
+        """
+        Load the processor, using a separate tokenizer repo.
+
+        The moondream3 model uses a custom tokenizer from 'moondream/starmie-v1'
+        instead of having tokenizer files in the model repo.
+        """
+        from transformers import AutoTokenizer, PreTrainedTokenizerFast
+        from transformers.utils import cached_file
+
+        tokenizer = kwargs.pop("tokenizer", None)
+
+        tokenizer_kwargs = {
+            "trust_remote_code": kwargs.get("trust_remote_code", False),
+        }
+        for key in (
+            "cache_dir",
+            "force_download",
+            "local_files_only",
+            "revision",
+            "subfolder",
+            "token",
+            "use_fast",
+        ):
+            if key in kwargs:
+                tokenizer_kwargs[key] = kwargs[key]
+
+        cached_file_kwargs = {
+            key: tokenizer_kwargs[key]
+            for key in (
+                "cache_dir",
+                "force_download",
+                "local_files_only",
+                "revision",
+                "subfolder",
+                "token",
+            )
+            if key in tokenizer_kwargs
+        }
+
+        def load_tokenizer(repo_or_path):
+            try:
+                return AutoTokenizer.from_pretrained(repo_or_path, **tokenizer_kwargs)
+            except Exception:
+                tokenizer_file = cached_file(
+                    repo_or_path,
+                    "tokenizer.json",
+                    **cached_file_kwargs,
+                )
+                return PreTrainedTokenizerFast(
+                    tokenizer_file=tokenizer_file,
+                    clean_up_tokenization_spaces=False,
+                )
+
+        if isinstance(tokenizer, str):
+            tokenizer = load_tokenizer(tokenizer)
+
+        if tokenizer is None:
+            # Prefer model-local tokenizer files first. If unavailable, fall
+            # back to moondream's dedicated tokenizer repository.
+            try:
+                tokenizer = load_tokenizer(pretrained_model_name_or_path)
+            except Exception:
+                tokenizer = load_tokenizer(cls._tokenizer_repo)
+
+        # Configure special tokens for Moondream3
+        # BOS and EOS are both token 0 (<|endoftext|>), matching the native
+        # config (TokenizerConfig.bos_id=0, eos_id=0). This is standard for
+        # GPT-2 style models where <|endoftext|> signals both start and end.
+        # Token 1 (<|md_reserved_0|>) is a template delimiter, NOT the EOS.
+        tokenizer.bos_token = "<|endoftext|>"
+        tokenizer.bos_token_id = 0
+        tokenizer.eos_token = "<|endoftext|>"
+        tokenizer.eos_token_id = 0
+
+        # Extract processor-specific kwargs
+        crop_size = kwargs.pop("crop_size", 378)
+        max_crops = kwargs.pop("max_crops", 12)
+        overlap_margin = kwargs.pop("overlap_margin", 4)
+        patch_size = kwargs.pop("patch_size", 14)
+        chat_template = kwargs.pop("chat_template", None)
+
+        # Set default chat template on tokenizer if not already set
+        if chat_template is None:
+            chat_template = cls._default_chat_template
+        if tokenizer.chat_template is None:
+            tokenizer.chat_template = chat_template
+
+        return cls(
+            tokenizer=tokenizer,
+            chat_template=chat_template,
+            crop_size=crop_size,
+            max_crops=max_crops,
+            overlap_margin=overlap_margin,
+            patch_size=patch_size,
+        )
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: TextInput
+        | PreTokenizedInput
+        | list[TextInput]
+        | list[PreTokenizedInput] = None,
+        **kwargs: Unpack[Moondream3ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Process images and text for Moondream3 model.
+
+        Args:
+            images: Input images (PIL Image, numpy array, or list thereof).
+            text: Input text or list of texts.
+            **kwargs: Additional processing arguments.
+
+        Returns:
+            BatchFeature with processed inputs.
+        """
+        output_kwargs = self._merge_kwargs(
+            Moondream3ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        # Process images
+        image_features = {}
+        if images is not None:
+            processed_images = []
+            tilings = []
+
+            images_list = images if isinstance(images, list) else [images]
+            for image in images_list:
+                pixel_values, tiling = self.preprocess_image(
+                    image, **output_kwargs["images_kwargs"]
+                )
+                processed_images.append(pixel_values)
+                tilings.append(tiling)
+
+            if processed_images:
+                image_features["pixel_values"] = processed_images
+                image_features["tilings"] = tilings
+
+        # Process text
+        if text is not None:
+            if not isinstance(text, list):
+                text = [text]
+
+            # Get text kwargs, remove keys we set ourselves
+            text_kwargs = output_kwargs.get("text_kwargs", {}).copy()
+            text_kwargs.pop("return_tensors", None)
+            text_kwargs.pop("add_special_tokens", None)
+
+            # Tokenize text
+            tokenized = self.tokenizer(
+                text,
+                add_special_tokens=True,
+                return_tensors="pt",
+                **text_kwargs,
+            )
+
+            output = BatchFeature(data=dict(tokenized))
+
+            # Add image features
+            if image_features:
+                output["pixel_values"] = image_features["pixel_values"]
+                output["tilings"] = image_features["tilings"]
+
+            return output
+
+        # If only images were provided
+        return BatchFeature(data=image_features)
+
+    @staticmethod
+    def _image_array_to_uint8(array: np.ndarray) -> np.ndarray:
+        if array.dtype == np.uint8:
+            return np.ascontiguousarray(array)
+
+        if array.dtype == np.bool_:
+            return np.ascontiguousarray(array.astype(np.uint8) * 255)
+
+        if np.issubdtype(array.dtype, np.floating):
+            array = np.nan_to_num(array, nan=0.0, posinf=255.0, neginf=0.0)
+            if array.size > 0 and array.max() <= 1.0:
+                array = array * 255.0
+            array = np.rint(array)
+
+        return np.ascontiguousarray(np.clip(array, 0, 255).astype(np.uint8))
+
+    @staticmethod
+    def _to_pil_image(image: ImageInput) -> Image.Image:
+        if isinstance(image, Image.Image):
+            return image
+
+        if isinstance(image, torch.Tensor):
+            tensor = image.detach().cpu()
+            if tensor.dtype == torch.bfloat16:
+                tensor = tensor.to(torch.float32)
+            image_array = tensor.numpy()
+        elif isinstance(image, np.ndarray):
+            image_array = image
+        else:
+            raise TypeError(
+                "Moondream3 images must be PIL images, numpy arrays, "
+                f"or torch tensors, got {type(image)!r}."
+            )
+
+        if image_array.ndim == 2:
+            image_array = Moondream3Processor._image_array_to_uint8(image_array)
+            return Image.fromarray(image_array)
+
+        if image_array.ndim != 3:
+            raise ValueError(
+                "Moondream3 image arrays must have 2 or 3 dimensions, "
+                f"got shape {image_array.shape}."
+            )
+
+        channel_dims = (1, 3, 4)
+        if image_array.shape[-1] not in channel_dims:
+            if image_array.shape[0] not in channel_dims:
+                raise ValueError(
+                    "Moondream3 image arrays must be HWC or CHW with 1, 3, "
+                    f"or 4 channels, got shape {image_array.shape}."
+                )
+            image_array = np.transpose(image_array, (1, 2, 0))
+
+        image_array = Moondream3Processor._image_array_to_uint8(image_array)
+        if image_array.shape[-1] == 1:
+            image_array = image_array[..., 0]
+
+        return Image.fromarray(image_array)
+
+    def preprocess_image(
+        self,
+        image: ImageInput,
+        max_crops: int = 12,
+        overlap_margin: int = 4,
+        crop_size: int = 378,
+        patch_size: int = 14,
+        convert_to_rgb: bool = True,
+        return_tensors: str = "pt",
+    ) -> tuple[torch.Tensor, tuple[int, int]]:
+        """
+        Preprocess an image using overlap-and-resize cropping strategy.
+
+        Args:
+            image: Input PIL image, numpy array, or torch tensor.
+            max_crops: Maximum number of crops.
+            overlap_margin: Margin for overlapping in patches.
+            crop_size: Size of each crop.
+            patch_size: Size of each patch.
+            convert_to_rgb: Whether to convert to RGB.
+            return_tensors: Return type ("pt" for PyTorch).
+
+        Returns:
+            Tuple of (pixel_values tensor, tiling tuple).
+        """
+        image = self._to_pil_image(image)
+        if convert_to_rgb:
+            image = convert_image_mode(image, "RGB")
+
+        # Convert to numpy array
+        image_array = np.array(image)
+        original_h, original_w = image_array.shape[:2]
+
+        margin_pixels = patch_size * overlap_margin
+        total_margin_pixels = margin_pixels * 2
+
+        crop_patches = crop_size // patch_size
+        crop_window_patches = crop_patches - (2 * overlap_margin)
+        crop_window_size = crop_window_patches * patch_size
+
+        tiling = select_tiling(
+            original_h - total_margin_pixels,
+            original_w - total_margin_pixels,
+            crop_window_size,
+            max_crops,
+        )
+
+        n_crops = tiling[0] * tiling[1] + 1
+        crops = np.zeros((n_crops, crop_size, crop_size, 3), dtype=np.uint8)
+
+        target_size = (
+            tiling[0] * crop_window_size + total_margin_pixels,
+            tiling[1] * crop_window_size + total_margin_pixels,
+        )
+
+        # Resize image
+        pil_img = Image.fromarray(image_array)
+        resized = pil_img.resize(
+            (int(target_size[1]), int(target_size[0])),
+            resample=Image.Resampling.LANCZOS,
+        )
+        resized_array = np.asarray(resized)
+
+        # Create global crop
+        global_pil = pil_img.resize(
+            (crop_size, crop_size), resample=Image.Resampling.LANCZOS
+        )
+        crops[0] = np.asarray(global_pil)
+
+        # Create local crops
+        for i in range(tiling[0]):
+            for j in range(tiling[1]):
+                y0 = i * crop_window_size
+                x0 = j * crop_window_size
+                y_end = min(y0 + crop_size, resized_array.shape[0])
+                x_end = min(x0 + crop_size, resized_array.shape[1])
+
+                crop_region = resized_array[y0:y_end, x0:x_end]
+                crop_idx = 1 + i * tiling[1] + j
+                h_slice = slice(None, crop_region.shape[0])
+                w_slice = slice(None, crop_region.shape[1])
+                crops[crop_idx, h_slice, w_slice] = crop_region
+
+        # Convert to tensor: (n_crops, H, W, C) -> (n_crops, C, H, W)
+        pixel_values = np.transpose(crops, (0, 3, 1, 2))
+
+        if return_tensors == "pt":
+            # Match HF reference preprocessing exactly: convert uint8 crops to
+            # bfloat16 before in-place normalization.
+            pixel_values = (
+                torch.from_numpy(pixel_values)
+                .to(dtype=torch.bfloat16)
+                .div_(255.0)
+                .sub_(0.5)
+                .div_(0.5)
+            )
+        else:
+            pixel_values = pixel_values.astype(np.float32) / 255.0
+            pixel_values = (pixel_values - 0.5) / 0.5
+
+        return pixel_values, tiling
+
+    def get_num_image_tokens(self) -> int:
+        """Return the number of image tokens (729 = 27x27 patches)."""
+        return self.patches_per_crop
+
+    def batch_decode(self, *args, **kwargs):
+        """Forward to tokenizer's batch_decode."""
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """Forward to tokenizer's decode."""
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        return tokenizer_input_names + ["pixel_values", "tilings"]
+
+
+AutoProcessor.register("Moondream3Processor", Moondream3Processor)
diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py
index 4bfa2bf76996..669a6c3c37ad 100644
--- a/vllm/triton_utils/__init__.py
+++ b/vllm/triton_utils/__init__.py
@@ -21,29 +21,6 @@
 LOGE2 = 0.6931471805599453
 
 
-def maybe_launch_pdl(value: bool = False) -> dict:
-    """Return launch metadata for Triton kernel calls that may use PDL.
-
-    The ``launch_pdl`` launch attribute (Programmatic Dependent Launch) is
-    a NVIDIA Hopper SM90+ feature exposed by NVIDIA's Triton runtime.
-    Other Triton backends (notably ROCm/HIP) do not recognize this kwarg
-    and raise ``KeyError`` from ``JITKernel._pack_args``. Use this helper
-    in the kernel call site:
-
-        kernel[grid](..., **maybe_launch_pdl())
-
-    so the attribute is only forwarded on platforms whose Triton runtime
-    supports it.
-    """
-    # Lazy import to avoid pulling in the full platform stack at module
-    # import time of vllm.triton_utils.
-    from vllm.platforms import current_platform
-
-    if current_platform.is_cuda():
-        return {"launch_pdl": value}
-    return {}
-
-
 __all__ = [
     "HAS_TRITON",
     "triton",
@@ -51,5 +28,4 @@ def maybe_launch_pdl(value: bool = False) -> dict:
     "tldevice",
     "LOG2E",
     "LOGE2",
-    "maybe_launch_pdl",
 ]
diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py
index f05bc555bfdc..9b2634c90982 100644
--- a/vllm/triton_utils/importing.py
+++ b/vllm/triton_utils/importing.py
@@ -6,6 +6,7 @@
 from importlib.util import find_spec
 
 from vllm.logger import init_logger
+from vllm.utils.math_utils import cdiv
 
 logger = init_logger(__name__)
 
@@ -79,6 +80,7 @@ def __init__(self):
         self.autotune = self._dummy_decorator("autotune")
         self.heuristics = self._dummy_decorator("heuristics")
         self.Config = self._dummy_decorator("Config")
+        self.cdiv = cdiv
         self.language = TritonLanguagePlaceholder()
 
     def _dummy_decorator(self, name):
diff --git a/vllm/triton_utils/jit_monitor.py b/vllm/triton_utils/jit_monitor.py
new file mode 100644
index 000000000000..5ee33fc51dc4
--- /dev/null
+++ b/vllm/triton_utils/jit_monitor.py
@@ -0,0 +1,113 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Monitor unexpected Triton kernel JIT compilation during inference.
+
+After server warmup completes, any Triton JIT compilation or autotuning
+event indicates a cache miss or unexpected input shape that causes a
+latency spike. This module registers hooks in the Triton runtime to
+detect and log such events so they can be investigated.
+
+Currently monitors:
+- Triton ``@triton.autotune`` cache misses (via ``knobs.autotuning.print``)
+- Triton ``@triton.jit`` first-time compilations
+  (via ``knobs.runtime.jit_post_compile_hook``)
+"""
+
+import os
+
+from vllm.logger import init_logger
+from vllm.triton_utils.importing import HAS_TRITON
+
+logger = init_logger(__name__)
+
+_active: bool = False
+
+
+def is_active() -> bool:
+    """Return whether the JIT compilation monitor is currently active."""
+    return _active
+
+
+def activate() -> None:
+    """Enable JIT compilation monitoring after warmup.
+
+    Call once per worker process at the end of
+    :func:`compile_or_warm_up_model`.  After activation every Triton
+    kernel compilation or autotuning benchmark that happens during
+    inference will be logged as a warning.
+
+    Safe to call multiple times — subsequent calls are no-ops.
+
+    If the user has explicitly set ``TRITON_PRINT_AUTOTUNING=0`` in
+    their environment, autotuning printing is left disabled; the JIT
+    compilation hook is still registered regardless.
+    """
+    global _active
+    if _active:
+        return
+    _active = True
+
+    _setup_triton_autotuning_print()
+    _setup_triton_jit_hook()
+
+    logger.info(
+        "Kernel JIT monitor activated — Triton JIT compilations "
+        "during inference will be logged as warnings."
+    )
+
+
+# ------------------------------------------------------------------
+# Triton autotuning print
+# ------------------------------------------------------------------
+
+
+def _setup_triton_autotuning_print() -> None:
+    """Enable ``TRITON_PRINT_AUTOTUNING`` unless the user opted out."""
+    if not HAS_TRITON:
+        return
+    from triton import knobs  # type: ignore[import-untyped]
+
+    user_val = os.environ.get("TRITON_PRINT_AUTOTUNING")
+    if user_val == "0":
+        logger.debug(
+            "TRITON_PRINT_AUTOTUNING=0 set by user — "
+            "autotuning messages will stay suppressed."
+        )
+        return
+
+    knobs.autotuning.print = True
+
+
+# ------------------------------------------------------------------
+# Triton JIT compilation hook
+# ------------------------------------------------------------------
+
+
+def _setup_triton_jit_hook() -> None:
+    """Register a ``jit_post_compile_hook`` that warns on compilation."""
+    if not HAS_TRITON:
+        return
+    from triton import knobs  # type: ignore[import-untyped]
+
+    existing_hook = knobs.runtime.jit_post_compile_hook
+
+    def _on_jit_compile(**kwargs):
+        # `jit_post_compile_hook` is Triton internal API and its
+        # signature has changed across releases (kwargs added/renamed).
+        # Accept **kwargs so an upstream change cannot crash this hook
+        # with TypeError, and forward the full kwarg set to any
+        # pre-existing hook unchanged.
+        fn = kwargs.get("fn")
+        fn_name = getattr(fn, "name", "<unknown>")
+        logger.warning_once(
+            "Triton kernel JIT compilation during inference: %s. "
+            "This causes a latency spike; consider extending warmup "
+            "to cover this shape/config.",
+            fn_name,
+        )
+        if existing_hook is not None:
+            return existing_hook(**kwargs)
+        return None
+
+    knobs.runtime.jit_post_compile_hook = _on_jit_compile
diff --git a/vllm/utils/cpu_resource_utils.py b/vllm/utils/cpu_resource_utils.py
index bbf554d0ccdd..6baf84266195 100644
--- a/vllm/utils/cpu_resource_utils.py
+++ b/vllm/utils/cpu_resource_utils.py
@@ -3,8 +3,8 @@
 
 import json
 import os
-import platform
 import subprocess
+import sys
 from dataclasses import dataclass
 from functools import cache
 
@@ -78,7 +78,7 @@ def parse_id_list(raw_str: str) -> list[int]:
 
 
 def get_memory_node_info(node_id: int = 0) -> MemoryNodeInfo:
-    if platform.system() == "Darwin":
+    if sys.platform == "darwin":
         # MacOS has no memory node
         return MemoryNodeInfo(
             total_memory=psutil.virtual_memory().total,
@@ -122,17 +122,14 @@ def get_memory_node_info(node_id: int = 0) -> MemoryNodeInfo:
 
 def get_allowed_cpu_list() -> list[LogicalCPUInfo]:
     cpu_list = _get_cpu_list()
-    if platform.system() == "Darwin":
-        return cpu_list
-
-    global_allowed_cpu_id_list = os.sched_getaffinity(0)
-    logical_cpu_list = [x for x in cpu_list if x.id in global_allowed_cpu_id_list]
-
-    return logical_cpu_list
+    if sys.platform == "linux":
+        allowed = os.sched_getaffinity(0)
+        return [x for x in cpu_list if x.id in allowed]
+    return cpu_list
 
 
 def get_visible_memory_node() -> list[int]:
-    if platform.system() == "Darwin":
+    if sys.platform == "darwin":
         return [0]
 
     allowed_memory_node_list = get_memory_affinity()
@@ -163,7 +160,7 @@ def _synthesize_cpu_list() -> list[LogicalCPUInfo]:
 
 
 def _get_cpu_list() -> list[LogicalCPUInfo]:
-    if platform.system() == "Darwin":
+    if sys.platform == "darwin":
         # For MacOS, no user-level CPU affinity and SMT, return all CPUs
         return _synthesize_cpu_list()
 
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 5672aef301ea..828ff08a067d 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -715,6 +715,38 @@ def flashinfer_scaled_fp8_mm(
     return output
 
 
+def flashinfer_scaled_fp8_mm_out(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out: torch.Tensor,
+    out_dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    assert a.ndim == 2 and b.ndim == 2 and out.ndim == 2
+    assert a.shape[1] == b.shape[0]
+    assert out.shape == (a.shape[0], b.shape[1])
+    assert scale_a.numel() == 1 and scale_b.numel() == 1
+    assert a.dtype == torch.float8_e4m3fn and b.dtype == torch.float8_e4m3fn
+    assert out.device.type == "cuda"
+    assert a.is_contiguous()
+
+    from flashinfer import bmm_fp8 as bmm_fp8_
+
+    bmm_fp8_(
+        a.unsqueeze(0),
+        # FlashInfer expects the weight in the same column-major view layout
+        # consumed by flashinfer_scaled_fp8_mm, so keep the transposed view.
+        b.unsqueeze(0),
+        scale_a,
+        scale_b,
+        out_dtype or out.dtype,
+        out.unsqueeze(0),
+        "auto",
+    )
+    return out
+
+
 def flashinfer_quant_nvfp4_8x4_sf_layout(
     a: torch.Tensor, a_global_sf: torch.Tensor
 ) -> tuple[torch.Tensor, torch.Tensor]:
@@ -833,6 +865,7 @@ def is_flashinfer_cudnn_fp8_prefill_attn_supported() -> bool:
     "use_trtllm_attention",
     "flashinfer_scaled_fp4_mm",
     "flashinfer_scaled_fp8_mm",
+    "flashinfer_scaled_fp8_mm_out",
     "flashinfer_quant_nvfp4_8x4_sf_layout",
     "flashinfer_fp8_blockscale_gemm",
     "should_use_flashinfer_for_blockscale_fp8_gemm",
diff --git a/vllm/utils/multi_stream_utils.py b/vllm/utils/multi_stream_utils.py
index cc6bc6462449..2203221c5a14 100644
--- a/vllm/utils/multi_stream_utils.py
+++ b/vllm/utils/multi_stream_utils.py
@@ -56,3 +56,73 @@ def maybe_execute_in_parallel(
         result0 = fn0()
         result1 = fn1()
     return (result0, result1)
+
+
+def execute_in_parallel(
+    default_fn: Callable[[], Any],
+    aux_fns: list[Callable[[], Any] | None],
+    start_event: torch.cuda.Event,
+    done_events: list[torch.cuda.Event],
+    aux_streams: list[torch.cuda.Stream] | None = None,
+    enable: bool = False,
+) -> tuple[Any, list[Any]]:
+    """Run default_fn on the current stream and aux_fns concurrently on
+    aux_streams.
+
+    Generalizes maybe_execute_in_parallel to N aux callables. Slots where
+    aux_fns[i] is None are skipped (no stream switch, no event record); their
+    corresponding entry in the returned aux_results list is None.
+
+    start_event fans out from the current stream to every launched aux stream;
+    done_events[i] is recorded after aux_fns[i] so the current stream joins
+    before returning. Falls back to sequential execution on the current stream
+    when aux_streams is None or enable is False; in that case default_fn runs
+    first, then aux_fns in order.
+
+    Args:
+        default_fn: Callable for the default (current) stream.
+        aux_fns: Per-aux callables; entries may be None to skip.
+        start_event: CUDA event recorded on the current stream before
+            default_fn so each launched aux stream can wait on it.
+        done_events: One CUDA event per aux slot, recorded after the
+            corresponding aux_fn. Length must match aux_fns.
+        aux_streams: Per-aux CUDA streams. Length must match aux_fns.
+            Multi-stream is disabled when None.
+        enable: Opt-in switch for the multi-stream path. Defaults to False,
+            so callers that pass aux_streams must also pass enable=True
+            (typically gated by an env var) to actually overlap. When False,
+            execution falls back to sequential on the current stream.
+
+    Returns:
+        Tuple of (default_result, aux_results) where aux_results[i] is the
+        result of aux_fns[i] (or None when skipped).
+    """
+    aux_results: list[Any]
+    if aux_streams is None or not enable:
+        default_result = default_fn()
+        aux_results = [fn() if fn is not None else None for fn in aux_fns]
+        return default_result, aux_results
+
+    assert len(aux_fns) == len(aux_streams) == len(done_events), (
+        "aux_fns, aux_streams, and done_events must be the same length"
+    )
+
+    aux_results = [None] * len(aux_fns)
+    pending: list[torch.cuda.Event] = []
+
+    start_event.record()
+    for i, fn in enumerate(aux_fns):
+        if fn is None:
+            continue
+        with torch.cuda.stream(aux_streams[i]):
+            start_event.wait()
+            aux_results[i] = fn()
+            done_events[i].record()
+        pending.append(done_events[i])
+
+    default_result = default_fn()
+
+    for ev in pending:
+        ev.wait()
+
+    return default_result, aux_results
diff --git a/vllm/utils/ompmultiprocessing.py b/vllm/utils/ompmultiprocessing.py
index c3c607ea90b4..6e7b8c3496c9 100644
--- a/vllm/utils/ompmultiprocessing.py
+++ b/vllm/utils/ompmultiprocessing.py
@@ -40,12 +40,13 @@ def __init__(self, config: "VllmConfig"):
 
         assert not (self.use_iomp and self.use_gomp)
 
-        # at least reserve 1/local_world_size(for ARM) core for scheduler
+        # at least reserve 1/local_world_size(for ARM/RISC-V) core for scheduler
         # proc as always use MP executor
         # TODO: make scheduler proc sleep when idle
         self.reserve_cpu_num = (
             self.local_world_size
-            if current_platform.get_cpu_architecture() == CpuArchEnum.ARM
+            if current_platform.get_cpu_architecture()
+            in (CpuArchEnum.ARM, CpuArchEnum.RISCV)
             else 1
         )
         # reserve at one more core for nixl_connector under p/d case
@@ -140,8 +141,8 @@ def _parse_omp_threads_bind_env(self):
                 cpu_list, reserve_list = self._get_autobind_cpu_ids(
                     lambda cpus: cpus[-1:]
                 )
-            elif cpu_arch == CpuArchEnum.ARM:
-                # For AArch64, no SMT, use all logical CPU
+            elif cpu_arch in (CpuArchEnum.ARM, CpuArchEnum.RISCV):
+                # For AArch64 / RISC-V, no SMT, use all logical CPUs
                 cpu_list, reserve_list = self._get_autobind_cpu_ids(lambda cpus: cpus)
             else:
                 cpu_list, reserve_list = [], []
@@ -173,9 +174,15 @@ def _parse_omp_threads_bind_env(self):
             # skip
             self.cpu_lists = []
 
-        msg = "OpenMP thread binding info: \n"
-        for i in range(self.local_world_size):
-            msg += f"\tlocal_rank={i}, core ids={self.cpu_lists[i]}\n"
+        msg = (
+            "OpenMP thread binding info: \n"
+            f"\tVLLM_CPU_OMP_THREADS_BIND={vllm_mask!r}, "
+            f"auto_setup={self.auto_setup}, skip_setup={self.skip_setup}\n"
+            f"\tlocal_world_size={self.local_world_size}, "
+            f"reserve_cpu_num={self.reserve_cpu_num}\n"
+        )
+        for i, cpus in enumerate(self.cpu_lists):
+            msg += f"\tlocal_rank={i}, core ids={cpus}\n"
         msg += f"\treserved_cpus={self.reserved_cpu_list}"
         logger.info(msg)
 
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index 1eb9306ed4b1..798c136fc239 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -110,6 +110,32 @@ def is_strictly_contiguous(t: torch.Tensor) -> bool:
     return True
 
 
+def canonicalize_singleton_dim_strides(t: torch.Tensor) -> torch.Tensor:
+    """Fix degenerate strides on size=1 dimensions for CUDA TMA compatibility.
+
+    PyTorch allows any stride on a size=1 dim (is_contiguous() is always True
+    there), so a size=1 dim may have stride=1 (2 bytes for bf16) instead of
+    the canonical product(shape[i+1:]).  CUDA TMA on H100+ requires all
+    non-outermost strides to be ≥16-byte aligned; stride=1 triggers
+    cudaErrorIllegalInstruction.  Zero-copy: patches stride metadata only via
+    as_strided; returns t unchanged if all size=1 strides are already canonical.
+    """
+    if 1 not in t.shape:
+        return t
+    strides = list(t.stride())
+    shape = t.shape
+    prev_stride = 1
+    changed = False
+    for i in range(len(shape) - 1, -1, -1):
+        if shape[i] == 1 and strides[i] != prev_stride:
+            strides[i] = prev_stride
+            changed = True
+        prev_stride = strides[i] * shape[i]
+    if not changed:
+        return t
+    return t.as_strided(t.shape, strides)
+
+
 @contextlib.contextmanager
 def set_default_torch_dtype(dtype: torch.dtype):
     """Sets the default torch dtype to the given dtype."""
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py
index 76c78fed46a3..51e3e865d52f 100644
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -1,7 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import ClassVar
+from typing import TYPE_CHECKING, ClassVar
+
+if TYPE_CHECKING:
+    from vllm.config.cache import CacheDType
 
 import torch
 
@@ -18,6 +21,7 @@
     AttentionMetadataBuilder,
     AttentionType,
     CommonAttentionMetadata,
+    MultipleOf,
 )
 from vllm.v1.attention.backends.utils import (
     split_decodes_and_prefills,
@@ -26,7 +30,12 @@
 
 logger = init_logger(__name__)
 
-_CPU_ARCH_PREFER_MIXED_BATCH = (CpuArchEnum.X86, CpuArchEnum.ARM, CpuArchEnum.S390X)
+_CPU_ARCH_PREFER_MIXED_BATCH = (
+    CpuArchEnum.X86,
+    CpuArchEnum.ARM,
+    CpuArchEnum.S390X,
+    CpuArchEnum.POWERPC,
+)
 
 
 class CPUAttentionBackend(AttentionBackend):
@@ -35,6 +44,16 @@ class CPUAttentionBackend(AttentionBackend):
         torch.bfloat16,
         torch.float32,
     ]
+    supported_kv_cache_dtypes: ClassVar[list["CacheDType"]] = [
+        "auto",
+        "fp8",
+        "fp8_e4m3",
+        "fp8_e5m2",
+    ]
+
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
+        return [MultipleOf(16)]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
@@ -133,7 +152,13 @@ def __init__(
         if self.window_size is None:
             self.window_size = -1
         self.block_size = vllm_config.cache_config.block_size
-        self.isa = _get_attn_isa(self.dtype, self.block_size, self.head_dim)
+        kv_cache_dtype_str = vllm_config.cache_config.cache_dtype
+        self.isa = _get_attn_isa(
+            self.dtype,
+            self.block_size,
+            self.head_dim,
+            kv_cache_dtype_str,
+        )
         self.is_cross_attention = isinstance(kv_cache_spec, CrossAttentionSpec)
 
     def build(
@@ -247,8 +272,7 @@ def __init__(
         self.kv_cache_dtype = kv_cache_dtype
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
-        if is_quantized_kv_cache(kv_cache_dtype):
-            raise NotImplementedError("FP8 KV cache is unsupported in CPU_ATTN")
+        self.is_fp8_kv_cache = is_quantized_kv_cache(kv_cache_dtype)
         self.attn_type = attn_type
 
         self.sinks = sinks
@@ -325,6 +349,9 @@ def forward(
                 value_cache,
                 attn_metadata.slot_mapping,
                 attn_metadata.isa,
+                k_scale=layer._k_scale_float,
+                v_scale=layer._v_scale_float,
+                kv_cache_dtype=self.kv_cache_dtype,
             )
 
         if attn_metadata.use_sdpa_prefill:
@@ -356,6 +383,9 @@ def forward(
                 softcap=self.logits_soft_cap,
                 scheduler_metadata=attn_metadata.scheduler_metadata,
                 s_aux=self.sinks,
+                k_scale=layer._k_scale_float,
+                v_scale=layer._v_scale_float,
+                kv_cache_dtype=self.kv_cache_dtype,
             )
 
         return output
@@ -477,13 +507,28 @@ def _make_sliding_window_bias(
 
 
 def _get_attn_isa(
-    dtype: torch.dtype, block_size: int, head_size: int | None = None
+    dtype: torch.dtype,
+    block_size: int,
+    head_size: int | None = None,
+    kv_cache_dtype: str | None = None,
 ) -> str:
+    fp8_kv = is_quantized_kv_cache(kv_cache_dtype) if kv_cache_dtype else False
     if head_size is not None and head_size % 32 != 0 and head_size % 16 == 0:
+        if fp8_kv:
+            raise NotImplementedError(
+                "FP8 KV cache requires head_size divisible by 32 on CPU."
+            )
         return "vec16"
     supports_amx = torch.cpu._is_amx_tile_supported()
-    supports_arm = current_platform.get_cpu_architecture() == CpuArchEnum.ARM
-    supports_vxe = current_platform.get_cpu_architecture() == CpuArchEnum.S390X
+    arch = current_platform.get_cpu_architecture()
+    supports_arm = arch == CpuArchEnum.ARM
+    supports_vxe = arch == CpuArchEnum.S390X
+    supports_vsx = arch == CpuArchEnum.POWERPC
+    supports_avx512 = torch.cpu._is_avx512_supported()
+    if fp8_kv and not supports_amx and not supports_avx512:
+        raise NotImplementedError(
+            "FP8 KV cache on CPU requires x86 with AVX-512 or AMX."
+        )
     if supports_amx and dtype in (torch.bfloat16,) and block_size % 32 == 0:
         return "amx"
     elif block_size % 32 == 0:
@@ -492,6 +537,8 @@ def _get_attn_isa(
             return "neon"
         elif supports_vxe:
             return "vxe"
+        elif supports_vsx:
+            return "vsx"
         else:
             return "vec"
     else:
diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py
index b3a4a0c76ebd..1e74e4c48eda 100644
--- a/vllm/v1/attention/backends/fa_utils.py
+++ b/vllm/v1/attention/backends/fa_utils.py
@@ -115,22 +115,29 @@ def get_flash_attn_version(
             )
             fa_version = 2
 
-        # The FA3 kernel rejects s_aux (sinks) when hdim != hdim_v; upgrade to
-        # FA4 on SM90 when available.
+        # Some FA3 unsupported SM90 cases can use FA4 when available.
         if (
             fa_version == 3
-            and has_sinks
-            and head_size is not None
-            and head_size_v is not None
-            and head_size != head_size_v
             and device_capability.major == 9
             and is_fa_version_supported(4)
         ):
-            logger.info_once(
-                "Diff-KV with sinks: upgrading FlashAttention 3 -> 4",
-                scope="local",
-            )
-            fa_version = 4
+            upgrade_reason = None
+            if head_size is not None and head_size > 256:
+                upgrade_reason = f"FA3 does not support head_size={head_size} on SM90"
+            elif (
+                has_sinks
+                and head_size is not None
+                and head_size_v is not None
+                and head_size != head_size_v
+            ):
+                upgrade_reason = "Diff-KV with sinks"
+            if upgrade_reason:
+                logger.info_once(
+                    "%s: upgrading FlashAttention 3 -> 4",
+                    upgrade_reason,
+                    scope="local",
+                )
+                fa_version = 4
 
         # FA4 currently uses batch-shape-dependent scheduling
         # heuristics on SM100+, which breaks batch invariance.
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 1c9ff3f79e43..4b8b86d864be 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -11,7 +11,10 @@
 
 from vllm.model_executor.layers.attention import Attention
 from vllm.platforms import current_platform
-from vllm.utils.torch_utils import is_quantized_kv_cache
+from vllm.utils.torch_utils import (
+    canonicalize_singleton_dim_strides,
+    is_quantized_kv_cache,
+)
 from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionImpl,
@@ -635,14 +638,6 @@ def __init__(
             requires_alibi=alibi_slopes is not None,
             head_size=head_size,
         )
-        # head_size > 256 requires FA4 on SM90+; force upgrade from FA3
-        if (
-            head_size > 256
-            and self.vllm_flash_attn_version == 3
-            and current_platform.is_cuda()
-            and current_platform.is_device_capability_family(90)
-        ):
-            self.vllm_flash_attn_version = 4
         logger.info_once(
             "Using FlashAttention version %s",
             self.vllm_flash_attn_version,
@@ -747,6 +742,23 @@ def forward(
 
         # For decoder and cross-attention, use KV cache as before
         key_cache, value_cache = kv_cache.unbind(0)
+        # Fix degenerate strides on size-1 dims (e.g. num_kv_heads=1 with TP).
+        # FA3/4 on H100+ uses TMA, which requires ≥16-byte stride alignment.
+        # See vllm.utils.torch_utils.canonicalize_singleton_dim_strides.
+        fixed_k = canonicalize_singleton_dim_strides(key_cache)
+        fixed_v = canonicalize_singleton_dim_strides(value_cache)
+        if fixed_k is not key_cache or fixed_v is not value_cache:
+            logger.debug(
+                "Canonicalized degenerate KV cache strides (FlashAttention): "
+                "shape=%s, key strides before=%s after=%s, "
+                "value strides before=%s after=%s",
+                key_cache.shape,
+                key_cache.stride(),
+                fixed_k.stride(),
+                value_cache.stride(),
+                fixed_v.stride(),
+            )
+        key_cache, value_cache = fixed_k, fixed_v
 
         if is_quantized_kv_cache(self.kv_cache_dtype):
             # queries are quantized in the attention layer
@@ -861,6 +873,8 @@ def do_kv_cache_update(
             # we use direct Q, K, V tensors without caching
             return
 
+        # Scatter write into the KV cache using slot_mapping indices.
+        # No TMA kernel is invoked here, so stride canonicalization is not needed.
         key_cache, value_cache = kv_cache.unbind(0)
 
         # Reshape the input keys and values and store them in the cache.
diff --git a/vllm/v1/attention/backends/flash_attn_diffkv.py b/vllm/v1/attention/backends/flash_attn_diffkv.py
index d18054769711..82a9f07a4e59 100644
--- a/vllm/v1/attention/backends/flash_attn_diffkv.py
+++ b/vllm/v1/attention/backends/flash_attn_diffkv.py
@@ -4,7 +4,11 @@
 
 import torch
 
-from vllm.utils.torch_utils import is_quantized_kv_cache
+from vllm.logger import init_logger
+from vllm.utils.torch_utils import (
+    canonicalize_singleton_dim_strides,
+    is_quantized_kv_cache,
+)
 from vllm.v1.attention.backend import AttentionType
 from vllm.v1.attention.backends.fa_utils import (
     get_flash_attn_version,
@@ -25,6 +29,8 @@
     cascade_attention,
 )
 
+logger = init_logger(__name__)
+
 
 class FlashAttentionDiffKVBackend(FlashAttentionBackend):
     # Default to 128 for this backend
@@ -204,6 +210,23 @@ def forward(
         # Different head_size for K and V
         key_cache = kv_cache[..., : self.head_size]
         value_cache = kv_cache[..., self.head_size :]
+        # Fix degenerate strides on size-1 dims (e.g. num_kv_heads=1 with TP).
+        # FA3/4 on H100+ uses TMA, which requires ≥16-byte stride alignment.
+        # See vllm.utils.torch_utils.canonicalize_singleton_dim_strides.
+        fixed_k = canonicalize_singleton_dim_strides(key_cache)
+        fixed_v = canonicalize_singleton_dim_strides(value_cache)
+        if fixed_k is not key_cache or fixed_v is not value_cache:
+            logger.debug(
+                "Canonicalized degenerate KV cache strides (FlashAttentionDiffKV): "
+                "shape=%s, key strides before=%s after=%s, "
+                "value strides before=%s after=%s",
+                key_cache.shape,
+                key_cache.stride(),
+                fixed_k.stride(),
+                value_cache.stride(),
+                fixed_v.stride(),
+            )
+        key_cache, value_cache = fixed_k, fixed_v
 
         if is_quantized_kv_cache(self.kv_cache_dtype):
             # queries are quantized in the attention layer
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 662ead1d1d01..2de61a2b1f28 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -43,6 +43,7 @@
 from vllm.utils.math_utils import cdiv
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import (
+    canonicalize_singleton_dim_strides,
     is_quantized_kv_cache,
     is_strictly_contiguous,
     nvfp4_kv_cache_full_dim,
@@ -332,6 +333,7 @@ class FlashInferBackend(AttentionBackend):
         "fp8",
         "fp8_e4m3",
         "fp8_e5m2",
+        "nvfp4",
     ]
 
     @staticmethod
@@ -388,13 +390,15 @@ def get_kv_cache_stride_order(
         return stride_order
 
     @staticmethod
-    def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype:
+    def get_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype:
         if kv_cache_dtype in ("fp8", "fp8_e4m3"):
             return torch.float8_e4m3fn
         elif kv_cache_dtype == "fp8_e5m2":
             return torch.float8_e5m2
+        elif kv_cache_dtype == "nvfp4":
+            return torch.uint8
         else:
-            raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}")
+            raise ValueError(f"Unrecognized dtype: {kv_cache_dtype}")
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
@@ -622,9 +626,8 @@ def __init__(
                 # For NVFP4, kv_cache_dtype stays as the string "nvfp4"
                 # which is passed to FlashInferImpl
                 self.kv_cache_dtype = self.cache_dtype
-                raise NotImplementedError("nvfp4 KV cache is not yet supported")
             else:
-                self.kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(
+                self.kv_cache_dtype = FlashInferBackend.get_dtype_for_flashinfer(
                     self.cache_dtype
                 )
         else:
@@ -645,7 +648,7 @@ def __init__(
         ):
             if self.is_kvcache_nvfp4:
                 # NVFP4 KV cache uses FP8 quantized queries
-                self.q_data_type = FlashInferBackend.get_fp8_dtype_for_flashinfer(
+                self.q_data_type = FlashInferBackend.get_dtype_for_flashinfer(
                     "fp8_e4m3"
                 )
             else:
@@ -765,8 +768,13 @@ def _get_prefill_wrapper(
                     dcp_a2a=self.dcp_a2a,
                 )
             else:
+                # NVFP4 KV cache requires the trtllm-gen backend inside
+                # the wrapper; fa2/fa3 do not support nvfp4.
+                backend = "trtllm-gen" if self.is_kvcache_nvfp4 else "auto"
                 self._prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(
-                    self._get_workspace_buffer(), get_kv_cache_layout()
+                    self._get_workspace_buffer(),
+                    get_kv_cache_layout(),
+                    backend=backend,
                 )
         assert self._prefill_wrapper is not None
         return self._prefill_wrapper
@@ -786,6 +794,9 @@ def _get_decode_wrapper(self, batch_size: int, use_cudagraph: bool = False):
                 paged_kv_indptr = None
                 paged_kv_indices = None
                 paged_kv_last_page_len = None
+            # NVFP4 KV cache requires the trtllm-gen backend inside
+            # the wrapper; fa2/fa3 do not support nvfp4.
+            backend = "trtllm-gen" if self.is_kvcache_nvfp4 else "auto"
             decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
                 self._get_workspace_buffer(),
                 get_kv_cache_layout(),
@@ -797,6 +808,7 @@ def _get_decode_wrapper(self, batch_size: int, use_cudagraph: bool = False):
                 # at least as good as cuda cores for all attention ops in latest
                 # gpus.
                 use_tensor_cores=True,
+                backend=backend,
             )
 
             # save the decode wrapper
@@ -1148,6 +1160,12 @@ def build(
                         prefill_wrapper,
                         BatchPrefillWithPagedKVCacheWrapper,
                     )
+                    # NVFP4 trtllm kernel only supports FP8 output;
+                    # use FP8 o_data_type so the wrapper matches the
+                    # FP8 output buffer allocated in forward().
+                    o_dtype = (
+                        FP8_DTYPE if self.is_kvcache_nvfp4 else self.model_config.dtype
+                    )
                     prefill_wrapper.plan(
                         qo_indptr=qo_indptr_prefill_cpu,
                         paged_kv_indptr=paged_kv_indptr_prefill_cpu,
@@ -1163,7 +1181,7 @@ def build(
                         logits_soft_cap=self.logits_soft_cap,
                         q_data_type=self.q_data_type,
                         kv_data_type=self.kv_cache_dtype,
-                        o_data_type=self.model_config.dtype,
+                        o_data_type=o_dtype,
                         fixed_split_size=self.prefill_fixed_split_size,
                         disable_split_kv=self.disable_split_kv,
                     )
@@ -1197,6 +1215,12 @@ def build(
                 # Use the persistent buffer with padding length,
                 # instead of the same address but chunked version
                 # in atten_metadata when using cudagraph.
+                # NVFP4 trtllm kernel only supports FP8 output;
+                # use FP8 o_data_type so the wrapper matches the
+                # FP8 output buffer allocated in forward().
+                o_dtype = (
+                    FP8_DTYPE if self.is_kvcache_nvfp4 else self.model_config.dtype
+                )
                 fast_plan_decode(
                     decode_wrapper,
                     indptr_cpu=self.paged_kv_indptr.cpu[: num_input_tokens + 1],
@@ -1215,7 +1239,7 @@ def build(
                     logits_soft_cap=self.logits_soft_cap,
                     q_data_type=self.q_data_type,
                     kv_data_type=self.kv_cache_dtype,
-                    o_data_type=self.model_config.dtype,
+                    o_data_type=o_dtype,
                     fixed_split_size=self.decode_fixed_split_size,
                     disable_split_kv=self.disable_split_kv,
                 )
@@ -1300,6 +1324,17 @@ def __init__(
         self.bmm2_scale: float | None = None
         self.o_sf_scale: float | None = None
 
+        # Pre-allocated FP8 output buffer for NVFP4 without fused output quant.
+        if self.is_kvcache_nvfp4 and vllm_config is not None:
+            max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens
+            self._nvfp4_fp8_out = torch.empty(
+                (max_num_tokens, num_heads, head_size),
+                dtype=FP8_DTYPE,
+                device="cuda",
+            )
+        else:
+            self._nvfp4_fp8_out = None
+
         dcp_a2a = (
             vllm_config is not None
             and vllm_config.parallel_config.decode_context_parallel_size > 1
@@ -1420,7 +1455,7 @@ def forward(
         if self.kv_sharing_target_layer_name is None and is_quantized_kv_cache(
             self.kv_cache_dtype
         ):
-            torch_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(
+            torch_dtype = FlashInferBackend.get_dtype_for_flashinfer(
                 self.kv_cache_dtype
             )
             kv_cache = kv_cache.view(torch_dtype)
@@ -1445,6 +1480,21 @@ def forward(
 
         stride_order = FlashInferBackend.get_kv_cache_stride_order()
         kv_cache_permute = kv_cache.permute(*stride_order)  # HND and contiguous
+        # Fix degenerate strides on any size-1 dimension (e.g. num_kv_heads=1
+        # with TP=8).  PyTorch permits non-canonical strides on size-1 dims;
+        # CUDA TMA requires ≥16-byte alignment on all non-outermost strides.
+        # canonicalize_singleton_dim_strides patches metadata via as_strided —
+        # zero-copy.  See vllm.utils.torch_utils.
+        fixed = canonicalize_singleton_dim_strides(kv_cache_permute)
+        if fixed is not kv_cache_permute:
+            logger.debug(
+                "Canonicalized degenerate KV cache strides (FlashInfer): "
+                "shape=%s, strides before=%s, strides after=%s",
+                kv_cache_permute.shape,
+                kv_cache_permute.stride(),
+                fixed.stride(),
+            )
+        kv_cache_permute = fixed
 
         # For NVFP4, the kv_cache last dim is full_dim (data + scale packed).
         # Split into correctly-strided data and scale views.
@@ -1500,20 +1550,45 @@ def forward(
                     )
                     assert prefill_wrapper._sm_scale == self.scale
                     assert prefill_wrapper._causal
+
+                    if self.is_kvcache_nvfp4:
+                        kv_cache_permute = nvfp4_kv_data
+                    kv_cache_sf = (
+                        nvfp4_kv_block_scales if self.is_kvcache_nvfp4 else None
+                    )
+
+                    # NVFP4 trtllm kernel only supports FP8 output.
+                    # Use a pre-allocated FP8 buffer and dequantize
+                    # afterwards.
+                    needs_fp8_out_prefill = (
+                        self.is_kvcache_nvfp4 and output.dtype != FP8_DTYPE
+                    )
+                    if needs_fp8_out_prefill:
+                        out_prefill = self._nvfp4_fp8_out[:num_prefill_tokens]
+                    else:
+                        out_prefill = output[num_decode_tokens:]
+
                     prefill_wrapper.run(
                         prefill_query,
                         kv_cache_permute,
                         k_scale=layer._k_scale_float,
                         v_scale=layer._v_scale_float,
-                        out=output[num_decode_tokens:],
+                        out=out_prefill,
+                        kv_cache_sf=kv_cache_sf,
                     )
+
+                    if needs_fp8_out_prefill:
+                        output[
+                            num_decode_tokens : num_decode_tokens + num_prefill_tokens
+                        ].copy_(out_prefill.to(output.dtype))
             else:
                 assert isinstance(attn_metadata.prefill, TRTLLMPrefill)
                 # prefill_query may be non-contiguous or have degenerate strides
-                # First ensure memory contiguity, then fix degenerate strides
-                # with reshape. contiguous() alone doesn't fix degenerate
-                # strides when a dimension has size 1.
-                prefill_query = prefill_query.contiguous().reshape(prefill_query.shape)
+                # on size=1 dims. contiguous() ensures memory layout; then
+                # canonicalize_singleton_dim_strides fixes any remaining
+                # degenerate strides on size=1 dims for TMA alignment.
+                prefill_query = prefill_query.contiguous()
+                prefill_query = canonicalize_singleton_dim_strides(prefill_query)
                 workspace_buffer = _get_trtllm_gen_workspace_buffer()
                 block_tables_prefill = attn_metadata.prefill.block_tables
                 seq_lens_prefill = attn_metadata.prefill.seq_lens
@@ -1537,6 +1612,12 @@ def forward(
                     assert self.o_sf_scale is None
                     out = output[num_decode_tokens:]
 
+                # NVFP4 trtllm kernel only supports FP8 output.
+                # Use a pre-allocated FP8 buffer and dequantize afterwards.
+                needs_fp8_out = self.is_kvcache_nvfp4 and output.dtype != FP8_DTYPE
+                if needs_fp8_out:
+                    out = self._nvfp4_fp8_out[:num_prefill_tokens]
+
                 prefill_kv_block_scales = None
                 if self.is_kvcache_nvfp4:
                     # NVFP4 trtllm-gen kernel requires FP8 query.
@@ -1547,7 +1628,7 @@ def forward(
                     )
                     mock_kv_cache = nvfp4_kv_data
                     mock_block_table = block_tables_prefill
-                    prefill_kv_block_scales = nvfp4_kv_block_scales  # noqa: F841
+                    prefill_kv_block_scales = nvfp4_kv_block_scales
                 elif (
                     attn_metadata.q_data_type != FP8_DTYPE
                     and self.kv_cache_dtype.startswith("fp8")
@@ -1557,11 +1638,9 @@ def forward(
                     # with fp8 kv cache, we can construct a mock block
                     # and mock kv cache with BF16 KV involved in the prefill
                     #
-                    # The inner (block_size, head_size) dims must be
-                    # contiguous; outer dims may have non-canonical strides
-                    # (e.g. cross-layer unified allocation).
-                    # Degenerate strides on outer dims break TMA descriptors
-                    # (see flashinfer-ai/flashinfer#2232).
+                    kv_cache_permute = canonicalize_singleton_dim_strides(
+                        kv_cache_permute
+                    )
                     kv_strides = kv_cache_permute.stride()
                     assert (
                         kv_strides[-1] == 1
@@ -1598,8 +1677,14 @@ def forward(
                     sinks=self.sinks,
                     o_sf_scale=self.o_sf_scale,
                     out=out,
+                    kv_cache_sf=prefill_kv_block_scales,
                 )
 
+                if needs_fp8_out:
+                    output[
+                        num_decode_tokens : num_decode_tokens + num_prefill_tokens
+                    ].copy_(out[:num_prefill_tokens].to(output.dtype))
+
         if num_decode_tokens > 0:
             decode_query = query[:num_decode_tokens]
             assert decode_query.shape[0] == num_decode_tokens
@@ -1612,6 +1697,18 @@ def forward(
                 assert decode_wrapper._logits_soft_cap == (self.logits_soft_cap or 0.0)
                 assert decode_wrapper._sm_scale == self.scale
 
+                if self.is_kvcache_nvfp4:
+                    kv_cache_permute = nvfp4_kv_data
+                kv_cache_sf = nvfp4_kv_block_scales if self.is_kvcache_nvfp4 else None
+
+                # NVFP4 kernel only supports FP8 output.
+                # Use a pre-allocated FP8 buffer and dequantize afterwards.
+                needs_fp8_out = self.is_kvcache_nvfp4 and output.dtype != FP8_DTYPE
+                if needs_fp8_out:
+                    out_decode = self._nvfp4_fp8_out[:num_decode_tokens]
+                else:
+                    out_decode = output[:num_decode_tokens]
+
                 if use_dcp:
                     decode_query = get_dcp_group().all_gather(
                         decode_query.contiguous(), dim=-2
@@ -1630,6 +1727,7 @@ def forward(
                         out=output_tmp,
                         lse=lse,
                         return_lse=True,
+                        kv_cache_sf=kv_cache_sf,
                     )
                     output[:num_decode_tokens] = self.dcp_combine(
                         output_tmp,
@@ -1642,15 +1740,20 @@ def forward(
                         kv_cache_permute,
                         k_scale=layer._k_scale_float,
                         v_scale=layer._v_scale_float,
-                        out=output[:num_decode_tokens],
+                        out=out_decode,
+                        kv_cache_sf=kv_cache_sf,
                     )
+
+                if needs_fp8_out:
+                    output[:num_decode_tokens].copy_(out_decode.to(output.dtype))
             else:
-                # decode_query may be non-contiguous or have degenerate strides
                 assert isinstance(attn_metadata.decode, TRTLLMDecode)
-                # First ensure memory contiguity, then fix degenerate strides
-                # with reshape. contiguous() alone doesn't fix degenerate
-                # strides when a dimension has size 1.
-                decode_query = decode_query.contiguous().reshape(decode_query.shape)
+                # decode_query may be non-contiguous or have degenerate strides
+                # on size=1 dims. contiguous() ensures memory layout; then
+                # canonicalize_singleton_dim_strides fixes any remaining
+                # degenerate strides on size=1 dims for TMA alignment.
+                decode_query = decode_query.contiguous()
+                decode_query = canonicalize_singleton_dim_strides(decode_query)
                 workspace_buffer = _get_trtllm_gen_workspace_buffer()
                 block_tables_decode = attn_metadata.decode.block_tables
                 seq_lens_decode = attn_metadata.decode.seq_lens
@@ -1661,11 +1764,7 @@ def forward(
                 assert is_strictly_contiguous(workspace_buffer)
                 assert is_strictly_contiguous(block_tables_decode)
                 assert is_strictly_contiguous(seq_lens_decode)
-                # kv_cache outer dims may be non-contiguous (e.g.
-                # cross-layer unified allocation), but inner dims
-                # (block_size, head_size) must be contiguous and
-                # strides must be canonical to avoid TMA descriptor
-                # failures (see flashinfer-ai/flashinfer#2232).
+                kv_cache_permute = canonicalize_singleton_dim_strides(kv_cache_permute)
                 kv_strides = kv_cache_permute.stride()
                 assert (
                     kv_strides[-1] == 1 and kv_strides[-2] == kv_cache_permute.shape[-1]
@@ -1686,6 +1785,12 @@ def forward(
                     assert self.o_sf_scale is None
                     out = output[:num_decode_tokens]
 
+                # NVFP4 trtllm kernel only supports FP8 output.
+                # Use a pre-allocated FP8 buffer and dequantize afterwards.
+                needs_fp8_out = self.is_kvcache_nvfp4 and output.dtype != FP8_DTYPE
+                if needs_fp8_out:
+                    out = self._nvfp4_fp8_out[:num_decode_tokens]
+
                 if num_decode_tokens % attn_metadata.num_decodes != 0:
                     # This gets triggered when the dummy_run forces
                     # attention to be initialized with q_len = 0
@@ -1695,9 +1800,9 @@ def forward(
 
                 trtllm_batch_decode_with_kv_cache(
                     query=decode_query,
-                    kv_cache=nvfp4_kv_data
-                    if self.is_kvcache_nvfp4
-                    else kv_cache_permute,
+                    kv_cache=(
+                        nvfp4_kv_data if self.is_kvcache_nvfp4 else kv_cache_permute
+                    ),
                     workspace_buffer=workspace_buffer,
                     block_tables=block_tables_decode,
                     seq_lens=seq_lens_decode,
@@ -1709,7 +1814,13 @@ def forward(
                     o_sf_scale=self.o_sf_scale,
                     out=out,
                     q_len_per_req=q_len_per_req,
+                    kv_cache_sf=(
+                        nvfp4_kv_block_scales if self.is_kvcache_nvfp4 else None
+                    ),
                 )
+
+                if needs_fp8_out:
+                    output[:num_decode_tokens].copy_(out.to(output.dtype))
         return output_padded
 
     def do_kv_cache_update(
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index a917235ed8cb..1de6eb408ae2 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -35,6 +35,7 @@
     AttentionMetadataBuilder,
     AttentionType,
     CommonAttentionMetadata,
+    MultipleOf,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec, EncoderOnlyAttentionSpec
 
@@ -99,6 +100,10 @@ def supports_attn_type(cls, attn_type: str) -> bool:
         """FlexAttention supports both decoder and encoder-only attention."""
         return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY)
 
+    @classmethod
+    def supports_batch_invariance(cls) -> bool:
+        return True
+
     @classmethod
     def supports_mm_prefix(cls) -> bool:
         """FlexAttention supports full attention for image tokens."""
@@ -130,6 +135,10 @@ def use_cascade_attention(*args, **kwargs) -> bool:
     def get_supported_head_sizes(cls) -> list[int]:
         return []
 
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
+        return [MultipleOf(16)]
+
 
 # @torch.compile(fullgraph=True, mode="reduce-overhead")
 def physical_to_logical_mapping(
@@ -326,15 +335,9 @@ class BlockSparsityHint(NamedTuple):
 
 
 def copy_to_persistent(dst, src):
-    try:
-        dst = dst.as_strided(src.shape, src.stride())
-    except RuntimeError as e:
-        raise RuntimeError(
-            f"Fail to re-stride a persistent tensor of shape {dst.shape} "
-            f"for a tensor of shape {src.shape}"
-        ) from e
-    dst.copy_(src)
-    return dst
+    sliced = dst[tuple(slice(0, s) for s in src.shape)]
+    sliced.copy_(src)
+    return sliced
 
 
 @dataclass
diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py
index ded321834607..7c0715a9e8b6 100644
--- a/vllm/v1/attention/backends/mla/indexer.py
+++ b/vllm/v1/attention/backends/mla/indexer.py
@@ -122,7 +122,7 @@ def get_name() -> str:
 
     @staticmethod
     def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
-        return [1 if current_platform.is_rocm() else 64]
+        return [1, 64] if current_platform.is_rocm() else [64]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
@@ -303,7 +303,10 @@ def __init__(self, *args, **kwargs):
                 device=self.device,
             )
         self.arange_buffer = torch.arange(
-            scheduler_config.max_num_seqs * next_n,
+            max(
+                scheduler_config.max_num_seqs * next_n,
+                scheduler_config.max_num_batched_tokens,
+            ),
             dtype=torch.int32,
             device=self.device,
         )
diff --git a/vllm/v1/attention/backends/mla/prefill/__init__.py b/vllm/v1/attention/backends/mla/prefill/__init__.py
new file mode 100644
index 000000000000..ae5b7ae82598
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/prefill/__init__.py
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
+from vllm.v1.attention.backends.mla.prefill.registry import MLAPrefillBackendEnum
+from vllm.v1.attention.backends.mla.prefill.selector import get_mla_prefill_backend
+
+__all__ = [
+    "MLAPrefillBackend",
+    "MLAPrefillBackendEnum",
+    "get_mla_prefill_backend",
+]
diff --git a/vllm/v1/attention/backends/mla/prefill/base.py b/vllm/v1/attention/backends/mla/prefill/base.py
new file mode 100644
index 000000000000..91d668826fd9
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/prefill/base.py
@@ -0,0 +1,121 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Abstract base class for MLA prefill backends."""
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, ClassVar
+
+import torch
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+    from vllm.model_executor.layers.attention.mla_attention import (
+        MLACommonPrefillMetadata,
+    )
+    from vllm.platforms.interface import DeviceCapability
+    from vllm.v1.attention.backends.mla.prefill.selector import (
+        MLAPrefillSelectorConfig,
+    )
+
+
+class MLAPrefillBackend(ABC):
+    """Abstract base class for MLA prefill backends."""
+
+    supported_dtypes: ClassVar[list[torch.dtype]] = [
+        torch.float16,
+        torch.bfloat16,
+    ]
+    requires_r1_mla_dimensions: ClassVar[bool] = False
+
+    @staticmethod
+    @abstractmethod
+    def get_name() -> str:
+        raise NotImplementedError
+
+    @classmethod
+    def supports_compute_capability(cls, device_capability: "DeviceCapability") -> bool:
+        return True
+
+    @classmethod
+    def supports_dtype(cls, dtype: torch.dtype) -> bool:
+        return dtype in cls.supported_dtypes
+
+    @classmethod
+    def is_available(cls) -> bool:
+        return True
+
+    @classmethod
+    def validate_configuration(
+        cls,
+        device_capability: "DeviceCapability",
+        selector_config: "MLAPrefillSelectorConfig",
+    ) -> list[str]:
+        invalid_reasons: list[str] = []
+
+        if not cls.supports_compute_capability(device_capability):
+            invalid_reasons.append(
+                f"compute capability {device_capability.major}."
+                f"{device_capability.minor} not supported"
+            )
+
+        if not cls.supports_dtype(selector_config.dtype):
+            invalid_reasons.append(f"dtype {selector_config.dtype} not supported")
+
+        if not cls.is_available():
+            invalid_reasons.append("required dependencies not available")
+
+        if cls.requires_r1_mla_dimensions and not selector_config.is_r1_compatible:
+            invalid_reasons.append(
+                "model does not have DeepSeek R1 MLA dimensions "
+                "(qk_nope_head_dim=128, qk_rope_head_dim=64, v_head_dim=128)"
+            )
+
+        return invalid_reasons
+
+    def __init__(
+        self,
+        num_heads: int,
+        scale: float,
+        kv_lora_rank: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        vllm_config: "VllmConfig",
+    ) -> None:
+        self.num_heads = num_heads
+        self.scale = scale
+        self.kv_lora_rank = kv_lora_rank
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.vllm_config = vllm_config
+
+    def prepare_metadata(  # noqa: B027
+        self,
+        prefill_metadata: "MLACommonPrefillMetadata",
+    ) -> None:
+        """Prepare backend-specific metadata before the forward pass.
+
+        Called by the metadata builder after constructing the prefill metadata.
+        """
+        self._prefill_metadata = prefill_metadata
+
+    @abstractmethod
+    def run_prefill_new_tokens(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        return_softmax_lse: bool,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def run_prefill_context_chunk(
+        self,
+        chunk_idx: int,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError
diff --git a/vllm/v1/attention/backends/mla/prefill/flash_attn.py b/vllm/v1/attention/backends/mla/prefill/flash_attn.py
new file mode 100644
index 000000000000..029bd8ec9560
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/prefill/flash_attn.py
@@ -0,0 +1,176 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""FlashAttention backend for MLA prefill."""
+
+import functools
+from typing import TYPE_CHECKING
+
+import torch
+
+import vllm.envs as envs
+from vllm.platforms import current_platform
+from vllm.v1.attention.backends.fa_utils import (
+    get_flash_attn_version,
+    is_flash_attn_varlen_func_available,
+)
+from vllm.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+
+if is_flash_attn_varlen_func_available():
+    from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func
+else:
+    flash_attn_varlen_func = None  # type: ignore[assignment]
+
+
+class FlashAttnPrefillBackend(MLAPrefillBackend):
+    """FlashAttention backend for MLA prefill."""
+
+    @staticmethod
+    def get_name() -> str:
+        return "FLASH_ATTN"
+
+    @classmethod
+    def is_available(cls) -> bool:
+        return is_flash_attn_varlen_func_available()
+
+    def __init__(
+        self,
+        num_heads: int,
+        scale: float,
+        kv_lora_rank: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        vllm_config: "VllmConfig",
+    ) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            scale=scale,
+            kv_lora_rank=kv_lora_rank,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            vllm_config=vllm_config,
+        )
+
+        # Handle the differences between the flash_attn_varlen from
+        # flash_attn and the one from vllm_flash_attn
+        assert flash_attn_varlen_func is not None, (
+            "FlashAttnPrefillBackend requires flash_attn_varlen_func. "
+            "Ensure FlashAttnPrefillBackend.is_available() is checked first."
+        )
+        qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.flash_attn_varlen_func = flash_attn_varlen_func
+        self.vllm_flash_attn_version = get_flash_attn_version(head_size=qk_head_dim)
+        if self.vllm_flash_attn_version is not None:
+            self.flash_attn_varlen_func = functools.partial(
+                flash_attn_varlen_func, fa_version=self.vllm_flash_attn_version
+            )
+
+        # Determine if we need to pad V
+        # For MLA the v head dim is smaller than qk head dim so we pad out
+        # v with 0s to match the qk head dim for attention backends that do
+        # not support different headdims.
+        # FA3 on Hopper (SM90) and FA4 natively handle diff headdims.
+        device_capability = current_platform.get_device_capability()
+        self.requires_v_padding = self.vllm_flash_attn_version is None or not (
+            (
+                self.vllm_flash_attn_version == 3
+                and device_capability is not None
+                and device_capability[0] == 9
+            )
+            or self.vllm_flash_attn_version == 4
+        )
+
+        # Track whether we're using vllm's FA or upstream (for ROCm)
+        self._is_vllm_fa = current_platform.is_cuda() or current_platform.is_xpu()
+
+    def _flash_attn_varlen_diff_headdims(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        return_softmax_lse: bool = False,
+        softmax_scale: float | None = None,
+        **kwargs,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        maybe_padded_v = v
+        if self.requires_v_padding:
+            maybe_padded_v = torch.nn.functional.pad(
+                v, [0, q.shape[-1] - v.shape[-1]], value=0
+            )
+
+        if self._is_vllm_fa:
+            kwargs["return_softmax_lse"] = return_softmax_lse
+        else:
+            # ROCm leverages the upstream flash_attn, which takes a parameter
+            # called "return_attn_probs" instead of return_softmax_lse
+            kwargs["return_attn_probs"] = return_softmax_lse
+        if envs.VLLM_BATCH_INVARIANT:
+            kwargs["num_splits"] = 1
+
+        attn_out = self.flash_attn_varlen_func(
+            q=q,
+            k=k,
+            v=maybe_padded_v,
+            softmax_scale=softmax_scale,
+            **kwargs,
+        )
+
+        # Unpack the output if there are multiple results
+        lse = None
+        if isinstance(attn_out, tuple):
+            attn_out, lse = attn_out[0], attn_out[1]
+
+        # Unpad output back to v_head_dim if we padded V
+        if self.requires_v_padding:
+            attn_out = attn_out[..., : v.shape[-1]]
+
+        # Remain consistent with old `flash_attn_varlen_func` where there
+        # is only one output tensor if `return_softmax_lse` is False.
+        if return_softmax_lse:
+            return attn_out, lse
+        return attn_out
+
+    def run_prefill_new_tokens(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        return_softmax_lse: bool,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        return self._flash_attn_varlen_diff_headdims(
+            q=q,
+            k=k,
+            v=v,
+            cu_seqlens_q=self._prefill_metadata.query_start_loc,
+            cu_seqlens_k=self._prefill_metadata.query_start_loc,
+            max_seqlen_q=self._prefill_metadata.max_query_len,
+            max_seqlen_k=self._prefill_metadata.max_query_len,
+            softmax_scale=self.scale,
+            causal=True,
+            return_softmax_lse=return_softmax_lse,
+        )
+
+    def run_prefill_context_chunk(
+        self,
+        chunk_idx: int,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        assert self._prefill_metadata.chunked_context is not None
+        return self._flash_attn_varlen_diff_headdims(
+            q=q,
+            k=k,
+            v=v,
+            cu_seqlens_q=self._prefill_metadata.query_start_loc,
+            cu_seqlens_k=self._prefill_metadata.chunked_context.cu_seq_lens[chunk_idx],
+            max_seqlen_q=self._prefill_metadata.max_query_len,
+            max_seqlen_k=self._prefill_metadata.chunked_context.max_seq_lens[chunk_idx],
+            softmax_scale=self.scale,
+            causal=False,  # Context is unmasked
+            return_softmax_lse=True,
+        )
diff --git a/vllm/v1/attention/backends/mla/prefill/flashinfer.py b/vllm/v1/attention/backends/mla/prefill/flashinfer.py
new file mode 100644
index 000000000000..77199fb5238a
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/prefill/flashinfer.py
@@ -0,0 +1,222 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""FlashInfer backend for MLA prefill."""
+
+from typing import TYPE_CHECKING
+
+import torch
+
+import vllm.envs as envs
+from vllm.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
+from vllm.v1.attention.backends.utils import (
+    PerLayerParameters,
+    get_per_layer_parameters,
+    infer_global_hyperparameters,
+)
+from vllm.v1.worker.workspace import current_workspace_manager
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+    from vllm.model_executor.layers.attention.mla_attention import (
+        MLACommonPrefillMetadata,
+    )
+    from vllm.platforms.interface import DeviceCapability
+
+try:
+    from flashinfer import BatchPrefillWithRaggedKVCacheWrapper
+except ImportError:
+    BatchPrefillWithRaggedKVCacheWrapper = object  # type: ignore[misc,assignment]
+
+_DEFAULT_NUM_CHUNKS = 32
+
+
+class FlashInferPrefillBackend(MLAPrefillBackend):
+    """FlashInfer backend for MLA prefill."""
+
+    requires_r1_mla_dimensions = True
+
+    @staticmethod
+    def get_name() -> str:
+        return "FLASHINFER"
+
+    @classmethod
+    def supports_compute_capability(cls, device_capability: "DeviceCapability") -> bool:
+        return device_capability.major == 10
+
+    @classmethod
+    def is_available(cls) -> bool:
+        try:
+            from flashinfer import (
+                BatchPrefillWithRaggedKVCacheWrapper,  # noqa: F401
+            )
+
+            return True
+        except ImportError:
+            return False
+
+    def __init__(
+        self,
+        num_heads: int,
+        scale: float,
+        kv_lora_rank: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        vllm_config: "VllmConfig",
+    ) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            scale=scale,
+            kv_lora_rank=kv_lora_rank,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            vllm_config=vllm_config,
+        )
+
+        self._prefill_main: BatchPrefillWithRaggedKVCacheWrapper | None = None
+        self._prefill_chunks: list[BatchPrefillWithRaggedKVCacheWrapper] = []
+        self._global_hyperparameters: PerLayerParameters | None = None
+
+    def _ensure_chunks(
+        self,
+        num_chunks: int,
+        workspace_buffer: torch.Tensor,
+    ) -> None:
+        if len(self._prefill_chunks) < num_chunks:
+            for _ in range(len(self._prefill_chunks), num_chunks):
+                self._prefill_chunks.append(
+                    BatchPrefillWithRaggedKVCacheWrapper(
+                        workspace_buffer, "NHD", backend="cutlass"
+                    )
+                )
+
+    def _resolve_global_hyperparameters(self) -> PerLayerParameters:
+        if self._global_hyperparameters is not None:
+            return self._global_hyperparameters
+
+        from vllm.model_executor.layers.attention.mla_attention import (
+            MLAAttention,
+            MLACommonImpl,
+        )
+
+        forward_context = self.vllm_config.compilation_config.static_forward_context
+        layer_names = [
+            name
+            for name, layer in forward_context.items()
+            if isinstance(layer, MLAAttention)
+        ]
+
+        self._global_hyperparameters = infer_global_hyperparameters(
+            get_per_layer_parameters(
+                self.vllm_config,
+                layer_names,
+                MLACommonImpl,  # type: ignore[type-abstract]
+            )
+        )
+        return self._global_hyperparameters
+
+    def prepare_metadata(
+        self,
+        prefill_metadata: "MLACommonPrefillMetadata",
+    ) -> None:
+        global_hyperparameters = self._resolve_global_hyperparameters()
+        qo_indptr = prefill_metadata.query_start_loc
+        has_context = prefill_metadata.chunked_context is not None
+        (workspace_buffer,) = current_workspace_manager().get_simultaneous(
+            ((envs.VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE,), torch.uint8),
+        )
+
+        if self._prefill_main is None:
+            self._prefill_main = BatchPrefillWithRaggedKVCacheWrapper(
+                workspace_buffer, "NHD", backend="cutlass"
+            )
+            self._ensure_chunks(_DEFAULT_NUM_CHUNKS, workspace_buffer)
+
+        if has_context:
+            chunked_context = prefill_metadata.chunked_context
+            assert chunked_context is not None
+            num_chunks = chunked_context.cu_seq_lens.shape[0]
+            self._ensure_chunks(num_chunks, workspace_buffer)
+
+        num_qo_heads = self.num_heads
+        num_kv_heads = num_qo_heads
+
+        head_dim_qk = self.qk_nope_head_dim + self.qk_rope_head_dim
+        head_dim_vo = self.v_head_dim
+        kv_indptr = qo_indptr.clone()
+
+        assert self._prefill_main is not None
+        self._prefill_main.plan(
+            qo_indptr=qo_indptr,
+            kv_indptr=kv_indptr,
+            num_qo_heads=num_qo_heads,
+            num_kv_heads=num_kv_heads,
+            head_dim_qk=head_dim_qk,
+            head_dim_vo=head_dim_vo,
+            causal=True,
+            sm_scale=global_hyperparameters.sm_scale,
+            window_left=global_hyperparameters.window_left,
+            logits_soft_cap=global_hyperparameters.logits_soft_cap,
+            q_data_type=prefill_metadata.q_data_type,
+            o_data_type=prefill_metadata.output_dtype,
+        )
+
+        if has_context:
+            chunked_context = prefill_metadata.chunked_context
+            assert chunked_context is not None
+            for i in range(num_chunks):
+                kv_indptr_chunk = chunked_context.cu_seq_lens[i]
+
+                self._prefill_chunks[i].plan(
+                    qo_indptr=qo_indptr,
+                    kv_indptr=kv_indptr_chunk,
+                    num_qo_heads=num_qo_heads,
+                    num_kv_heads=num_kv_heads,
+                    head_dim_qk=head_dim_qk,
+                    head_dim_vo=head_dim_vo,
+                    causal=False,
+                    sm_scale=global_hyperparameters.sm_scale,
+                    window_left=global_hyperparameters.window_left,
+                    logits_soft_cap=global_hyperparameters.logits_soft_cap,
+                    q_data_type=prefill_metadata.q_data_type,
+                    o_data_type=prefill_metadata.output_dtype,
+                )
+
+    def run_prefill_new_tokens(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        return_softmax_lse: bool,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        assert self._prefill_main is not None
+
+        ret = self._prefill_main.run(
+            q=q,
+            k=k,
+            v=v,
+            return_lse=return_softmax_lse,
+        )
+
+        if isinstance(ret, tuple):
+            # Convert from (q_len, num_heads) to (num_heads, q_len)
+            return ret[0], ret[1].transpose(0, 1).contiguous()
+        return ret
+
+    def run_prefill_context_chunk(
+        self,
+        chunk_idx: int,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        attn_out, lse = self._prefill_chunks[chunk_idx].run(
+            q=q,
+            k=k,
+            v=v,
+            return_lse=True,
+        )
+
+        # Convert from (q_len, num_heads) to (num_heads, q_len)
+        return attn_out, lse.transpose(0, 1).contiguous()
diff --git a/vllm/v1/attention/backends/mla/prefill/registry.py b/vllm/v1/attention/backends/mla/prefill/registry.py
new file mode 100644
index 000000000000..3a3242f60365
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/prefill/registry.py
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Registry for MLA prefill backends.
+
+This module provides an enumeration of all available MLA prefill backends
+and utilities for loading them.
+"""
+
+from enum import Enum, EnumMeta
+from typing import TYPE_CHECKING
+
+from vllm.utils.import_utils import resolve_obj_by_qualname
+
+if TYPE_CHECKING:
+    from vllm.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
+
+
+class _MLAPrefillBackendEnumMeta(EnumMeta):
+    """Metaclass for MLAPrefillBackendEnum to provide better error messages."""
+
+    def __getitem__(cls, name: str):
+        try:
+            return super().__getitem__(name)
+        except KeyError:
+            members = cls.__members__.keys()
+            valid_backends = ", ".join(members)
+            raise ValueError(
+                f"Unknown MLA prefill backend: '{name}'. "
+                f"Valid options are: {valid_backends}"
+            ) from None
+
+
+class MLAPrefillBackendEnum(Enum, metaclass=_MLAPrefillBackendEnumMeta):
+    """Enumeration of all supported MLA prefill backends."""
+
+    FLASH_ATTN = (
+        "vllm.v1.attention.backends.mla.prefill.flash_attn.FlashAttnPrefillBackend"
+    )
+    FLASHINFER = (
+        "vllm.v1.attention.backends.mla.prefill.flashinfer.FlashInferPrefillBackend"
+    )
+    TRTLLM_RAGGED = (
+        "vllm.v1.attention.backends.mla.prefill.trtllm_ragged."
+        "TrtllmRaggedPrefillBackend"
+    )
+
+    def get_path(self) -> str:
+        """Get the fully qualified class path for this backend."""
+        return self.value
+
+    def get_class(self) -> "type[MLAPrefillBackend]":
+        """Lazy load and return the backend class."""
+        return resolve_obj_by_qualname(self.get_path())
diff --git a/vllm/v1/attention/backends/mla/prefill/selector.py b/vllm/v1/attention/backends/mla/prefill/selector.py
new file mode 100644
index 000000000000..fdb8be6d65d9
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/prefill/selector.py
@@ -0,0 +1,183 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Selector for MLA prefill backends.
+
+This module provides functions for selecting the appropriate MLA prefill
+backend based on device capabilities and configuration.
+"""
+
+from functools import cache
+from typing import TYPE_CHECKING, NamedTuple
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backends.mla.prefill.registry import MLAPrefillBackendEnum
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+    from vllm.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
+
+logger = init_logger(__name__)
+
+
+class MLAPrefillSelectorConfig(NamedTuple):
+    """Hashable configuration for MLA prefill backend selection.
+
+    This is analogous to AttentionSelectorConfig and contains model-specific
+    configuration needed to select an MLA prefill backend, extracted from
+    VllmConfig into a hashable form for caching.
+    """
+
+    dtype: torch.dtype
+    is_r1_compatible: bool
+
+
+def is_deepseek_r1_mla_compatible(vllm_config: "VllmConfig") -> bool:
+    """Check if model has DeepSeek R1 compatible MLA dimensions.
+
+    DeepSeek R1 MLA dimensions are:
+    - qk_nope_head_dim = 128
+    - qk_rope_head_dim = 64
+    - v_head_dim = 128
+    """
+    if vllm_config.model_config is None:
+        return False
+    hf_text_config = vllm_config.model_config.hf_text_config
+    qk_nope_head_dim = getattr(hf_text_config, "qk_nope_head_dim", 1)
+    qk_rope_head_dim = getattr(hf_text_config, "qk_rope_head_dim", 1)
+    v_head_dim = getattr(hf_text_config, "v_head_dim", 1)
+    return qk_nope_head_dim == 128 and qk_rope_head_dim == 64 and v_head_dim == 128
+
+
+def _get_mla_prefill_backend_priorities(
+    device_capability: DeviceCapability,
+) -> list[MLAPrefillBackendEnum]:
+    """Get MLA prefill backend priorities based on device capability.
+
+    Args:
+        device_capability: The device's compute capability.
+
+    Returns:
+        List of backends in priority order (highest priority first).
+    """
+    if device_capability.major == 10:  # Blackwell
+        return [
+            MLAPrefillBackendEnum.FLASH_ATTN,
+            MLAPrefillBackendEnum.TRTLLM_RAGGED,
+            MLAPrefillBackendEnum.FLASHINFER,
+        ]
+    else:  # Hopper (SM90) and older
+        return [
+            MLAPrefillBackendEnum.FLASH_ATTN,
+        ]
+
+
+def get_mla_prefill_backend(
+    vllm_config: "VllmConfig",
+) -> "type[MLAPrefillBackend]":
+    """Select the MLA prefill backend based on configuration and device.
+
+    This function first checks for explicit user preferences via
+    mla_prefill_backend in AttentionConfig, then falls back to automatic
+    priority-based selection.
+
+    Args:
+        vllm_config: The vLLM configuration.
+
+    Returns:
+        The selected prefill backend class.
+    """
+    from vllm.platforms import current_platform
+
+    device_capability = current_platform.get_device_capability()
+    if device_capability is None:
+        logger.info_once(
+            "Device capability not available, using FlashAttention MLA prefill backend."
+        )
+        return MLAPrefillBackendEnum.FLASH_ATTN.get_class()
+
+    attention_config = vllm_config.attention_config
+
+    selector_config = MLAPrefillSelectorConfig(
+        dtype=vllm_config.model_config.dtype,
+        is_r1_compatible=is_deepseek_r1_mla_compatible(vllm_config),
+    )
+
+    if attention_config.mla_prefill_backend is not None:
+        selected_backend = attention_config.mla_prefill_backend
+        backend_cls: type[MLAPrefillBackend] | None = None
+        try:
+            backend_cls = selected_backend.get_class()
+            invalid_reasons = backend_cls.validate_configuration(
+                device_capability, selector_config
+            )
+        except ImportError:
+            invalid_reasons = ["ImportError"]
+        if invalid_reasons:
+            raise ValueError(
+                f"Selected MLA prefill backend {selected_backend.name} "
+                f"is not valid for this configuration. "
+                f"Reason: {invalid_reasons}"
+            )
+        assert backend_cls is not None
+        logger.info("Using %s MLA prefill backend.", selected_backend.name)
+        return backend_cls
+
+    return _auto_select_mla_prefill_backend(
+        device_capability,
+        selector_config,
+    )
+
+
+@cache
+def _auto_select_mla_prefill_backend(
+    device_capability: DeviceCapability,
+    selector_config: MLAPrefillSelectorConfig,
+) -> "type[MLAPrefillBackend]":
+    """Auto-select the best available MLA prefill backend.
+
+    Args:
+        device_capability: The device's compute capability.
+        selector_config: Hashable configuration for backend selection.
+
+    Returns:
+        The selected prefill backend class.
+    """
+    priorities = _get_mla_prefill_backend_priorities(device_capability)
+    all_invalid_reasons: dict[str, list[str]] = {}
+
+    for backend_enum in priorities:
+        backend_cls: type[MLAPrefillBackend] | None = None
+        try:
+            backend_cls = backend_enum.get_class()
+            invalid_reasons = backend_cls.validate_configuration(
+                device_capability, selector_config
+            )
+        except ImportError:
+            invalid_reasons = ["ImportError"]
+        if not invalid_reasons:
+            assert backend_cls is not None
+            logger.info_once("Using %s MLA prefill backend.", backend_enum.name)
+            return backend_cls
+        all_invalid_reasons[backend_enum.name] = invalid_reasons
+
+    reasons_str = (
+        "{"
+        + ", ".join(
+            f"{name}: [{', '.join(reasons)}]"
+            for name, reasons in all_invalid_reasons.items()
+        )
+        + "}"
+    )
+    config_str = repr(selector_config)
+    logger.debug_once(
+        "Some MLA prefill backends are not valid with %s. Reasons: %s.",
+        config_str,
+        reasons_str,
+    )
+
+    raise ValueError(
+        f"No valid MLA prefill backend found with {config_str}. Reasons: {reasons_str}."
+    )
diff --git a/vllm/v1/attention/backends/mla/prefill/trtllm_ragged.py b/vllm/v1/attention/backends/mla/prefill/trtllm_ragged.py
new file mode 100644
index 000000000000..7462dc39e7bc
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/prefill/trtllm_ragged.py
@@ -0,0 +1,174 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""TRT-LLM Ragged backend for MLA prefill."""
+
+from typing import TYPE_CHECKING
+
+import torch
+
+import vllm.envs as envs
+from vllm.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
+from vllm.v1.worker.workspace import current_workspace_manager
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+    from vllm.model_executor.layers.attention.mla_attention import (
+        MLACommonPrefillMetadata,
+    )
+    from vllm.platforms.interface import DeviceCapability
+
+
+class TrtllmRaggedPrefillBackend(MLAPrefillBackend):
+    """TRT-LLM Ragged backend for MLA prefill."""
+
+    requires_r1_mla_dimensions = True
+
+    @staticmethod
+    def get_name() -> str:
+        return "TRTLLM_RAGGED"
+
+    @classmethod
+    def supports_compute_capability(cls, device_capability: "DeviceCapability") -> bool:
+        return device_capability.major == 10
+
+    @classmethod
+    def is_available(cls) -> bool:
+        try:
+            from flashinfer.prefill import (
+                trtllm_ragged_attention_deepseek,  # noqa: F401
+            )
+
+            return True
+        except ImportError:
+            return False
+
+    def __init__(
+        self,
+        num_heads: int,
+        scale: float,
+        kv_lora_rank: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        vllm_config: "VllmConfig",
+    ) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            scale=scale,
+            kv_lora_rank=kv_lora_rank,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            vllm_config=vllm_config,
+        )
+
+    def _get_workspace_buffer(self) -> torch.Tensor:
+        (workspace_buffer,) = current_workspace_manager().get_simultaneous(
+            (
+                (envs.VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE,),
+                torch.uint8,
+            ),
+        )
+        return workspace_buffer
+
+    def prepare_metadata(
+        self,
+        prefill_metadata: "MLACommonPrefillMetadata",
+    ) -> None:
+        super().prepare_metadata(prefill_metadata)
+        self._query_seq_lens = (
+            prefill_metadata.query_start_loc[1:] - prefill_metadata.query_start_loc[:-1]
+        )
+
+    def run_prefill_new_tokens(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        return_softmax_lse: bool,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        from flashinfer.prefill import trtllm_ragged_attention_deepseek
+
+        workspace_buffer = self._get_workspace_buffer()
+        out = torch.empty(
+            q.shape[0],
+            q.shape[1],
+            v.shape[2],
+            device=q.device,
+            dtype=self._prefill_metadata.output_dtype,
+        )
+
+        ret = trtllm_ragged_attention_deepseek(
+            query=q,
+            key=k,
+            value=v,
+            workspace_buffer=workspace_buffer,
+            seq_lens=self._query_seq_lens,
+            max_q_len=self._prefill_metadata.max_query_len,
+            max_kv_len=self._prefill_metadata.max_query_len,
+            bmm1_scale=self.scale,
+            bmm2_scale=1.0,
+            o_sf_scale=1.0,
+            batch_size=self._query_seq_lens.shape[0],
+            window_left=-1,
+            cum_seq_lens_q=self._prefill_metadata.query_start_loc,
+            cum_seq_lens_kv=self._prefill_metadata.query_start_loc,
+            enable_pdl=False,
+            is_causal=True,
+            return_lse=return_softmax_lse,
+            out=out,
+        )
+
+        if isinstance(ret, tuple):
+            # Convert from (q_len, num_heads) to (num_heads, q_len)
+            return ret[0], ret[1].transpose(0, 1).contiguous()
+        return ret
+
+    def run_prefill_context_chunk(
+        self,
+        chunk_idx: int,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        from flashinfer.prefill import trtllm_ragged_attention_deepseek
+
+        assert self._prefill_metadata.chunked_context is not None
+        assert self._prefill_metadata.chunked_context.seq_lens[chunk_idx] is not None
+        workspace_buffer = self._get_workspace_buffer()
+
+        out = torch.empty(
+            q.shape[0],
+            q.shape[1],
+            v.shape[2],
+            device=q.device,
+            dtype=self._prefill_metadata.output_dtype,
+        )
+
+        attn_out, lse = trtllm_ragged_attention_deepseek(
+            query=q,
+            key=k,
+            value=v,
+            workspace_buffer=workspace_buffer,
+            seq_lens=self._prefill_metadata.chunked_context.seq_lens[chunk_idx],
+            max_q_len=self._prefill_metadata.max_query_len,
+            max_kv_len=self._prefill_metadata.chunked_context.max_seq_lens[chunk_idx],
+            bmm1_scale=self.scale,
+            bmm2_scale=1.0,
+            o_sf_scale=1.0,
+            batch_size=self._prefill_metadata.chunked_context.seq_lens[chunk_idx].shape[
+                0
+            ],
+            window_left=-1,
+            cum_seq_lens_q=self._prefill_metadata.query_start_loc,
+            cum_seq_lens_kv=self._prefill_metadata.chunked_context.cu_seq_lens[
+                chunk_idx
+            ],
+            enable_pdl=False,
+            is_causal=False,
+            return_lse=True,
+            out=out,
+        )
+
+        # Convert from (q_len, num_heads) to (num_heads, q_len)
+        return attn_out, lse.transpose(0, 1).contiguous()
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index a66a97311fbc..2106226118ef 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -396,6 +396,7 @@ class AiterMLAHelper:
     """
 
     _AITER_MIN_MLA_HEADS: Final = 16
+    _AITER_UNSUPPORTED_HEADS = [32]
 
     @staticmethod
     def check_num_heads_validity(num_heads: int):
@@ -419,6 +420,9 @@ def get_actual_mla_num_heads(num_heads: int) -> int:
 
     @staticmethod
     def get_mla_padded_q(num_heads: int, q: torch.Tensor) -> torch.Tensor:
+        assert num_heads not in AiterMLAHelper._AITER_UNSUPPORTED_HEADS, (
+            f"unsupported head_num: {num_heads}"
+        )
         return (
             q
             if num_heads >= AiterMLAHelper._AITER_MIN_MLA_HEADS
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
index 503bb509b105..dc343b639f6c 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
@@ -7,6 +7,7 @@
 import numpy as np
 import torch
 
+from vllm import _custom_ops as ops
 from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import VllmConfig
 from vllm.config.cache import CacheDType
@@ -14,6 +15,7 @@
 from vllm.model_executor.layers.attention.mla_attention import (
     get_mla_dims,
 )
+from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.v1.attention.backend import (
     AttentionBackend,
@@ -25,9 +27,6 @@
     MultipleOf,
     SparseMLAAttentionImpl,
 )
-from vllm.v1.attention.backends.mla.flashmla_sparse import (
-    triton_convert_req_index_to_global_index,
-)
 from vllm.v1.attention.backends.mla.rocm_aiter_mla import (
     AiterMLAHelper,
 )
@@ -38,6 +37,188 @@
 logger = init_logger(__name__)
 
 
+@triton.jit
+def _convert_req_index_to_global_index_kernel(
+    req_id_ptr,  # int32 [num_tokens]
+    block_table_ptr,  # int32 [num_requests, max_num_blocks_per_req]
+    token_indices_ptr,  # int32 [num_tokens, NUM_TOPK_TOKENS]
+    cu_seqlens_ptr,  # int32 [num_tokens + 1]
+    out_ptr,  # int32 [num_tokens, NUM_TOPK_TOKENS]
+    # shapes (compile-time where possible)
+    max_num_blocks_per_req: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    BLOCK_N: tl.constexpr,  # tile width along columns
+    # strides (in elements)
+    bt_stride0,
+    bt_stride1,
+    ti_stride0,
+    ti_stride1,
+):
+    # program_id(0) -> token_id (row)
+    # program_id(1) -> tile index along columns
+    token_id = tl.program_id(0)
+    tile_id = tl.program_id(1)
+
+    # Each program covers BLOCK_N consecutive columns
+    indice_id = tile_id * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    # Load request id for this token (no mask: grid is exact)
+    req = tl.load(req_id_ptr + token_id)
+
+    # Load cumulative sequence lengths to get starting index of this request
+    seq_start = tl.load(cu_seqlens_ptr + token_id)
+    seq_end = tl.load(cu_seqlens_ptr + token_id + 1)
+
+    if tile_id * BLOCK_N + seq_start >= seq_end:
+        return
+
+    # Load token indices for this tile
+    ti_ptr = token_indices_ptr + token_id * ti_stride0 + indice_id * ti_stride1
+    tok = tl.load(ti_ptr)  # int32
+
+    # Only token == -1 should propagate as -1
+    is_invalid_tok = tok < 0
+
+    # Compute block id and in-block offset
+    block_id = tok // BLOCK_SIZE
+    inblock_off = tok % BLOCK_SIZE
+
+    # Guard block_table access
+    valid_block = (block_id < max_num_blocks_per_req) & (block_id >= 0)
+    bt_ptr = block_table_ptr + req * bt_stride0 + block_id * bt_stride1
+    base = tl.load(bt_ptr, mask=valid_block, other=0)
+
+    # # If token == -1 OR block_id OOB, output 0; else base * BLOCK_SIZE + offset
+    out_val = tl.where(
+        is_invalid_tok | (~valid_block), 0, base * BLOCK_SIZE + inblock_off
+    )
+    out_ptr_ij = out_ptr + seq_start + indice_id
+    out_ptr_ij_mask = (seq_start + indice_id) < seq_end
+
+    # store the results with mask
+    tl.store(out_ptr_ij, out_val, mask=out_ptr_ij_mask)
+
+
+def triton_convert_req_index_to_global_index(
+    req_id: torch.Tensor,  # int32 [num_tokens]
+    block_table: torch.Tensor,  # int32 [num_requests, max_num_blocks_per_req]
+    token_indices: torch.Tensor,  # int32 [num_tokens, NUM_TOPK_TOKENS]
+    cu_seqlens: torch.Tensor,  # int32 [num_tokens + 1]
+    paged_kv_indices: torch.Tensor,  # int32 [num_tokens * topk] out_buffer
+    BLOCK_SIZE: int = 64,
+    NUM_TOPK_TOKENS: int = 2048,
+    BLOCK_N: int = 128,  # tile width along columns
+):
+    """
+    out[token_id, indice_id] =
+        block_table[req_id[token_id],
+            token_indices[token_id, indice_id] // BLOCK_SIZE] * BLOCK_SIZE
+        + token_indices[token_id, indice_id] % BLOCK_SIZE
+
+    Only when token_indices[token_id, indice_id] == -1 do we output -1.
+    For safety, we also output -1 if the derived block_id would be
+        out-of-bounds.
+    """
+    assert req_id.dtype == torch.int32
+    assert block_table.dtype == torch.int32
+    assert token_indices.dtype == torch.int32
+    assert token_indices.shape[1] == NUM_TOPK_TOKENS
+    assert NUM_TOPK_TOKENS % BLOCK_N == 0, (
+        f"NUM_TOPK_TOKENS ({NUM_TOPK_TOKENS}) must be divisible byBLOCK_N ({BLOCK_N})"
+    )
+    # print("req_id: ", req_id, flush=True)
+    num_tokens = req_id.shape[0]
+    _, max_num_blocks_per_req = block_table.shape
+    tiles_per_row = NUM_TOPK_TOKENS // BLOCK_N
+
+    # Ensure contiguous tensors on the same device
+    req_id_c = req_id.contiguous()
+    block_table_c = block_table.contiguous()
+    token_indices_c = token_indices.contiguous()
+
+    # Strides in elements
+    bt_stride0, bt_stride1 = block_table_c.stride()
+    ti_stride0, ti_stride1 = token_indices_c.stride()
+
+    # Exact 2D grid: tokens × column tiles
+    grid = (num_tokens, tiles_per_row)
+
+    _convert_req_index_to_global_index_kernel[grid](
+        req_id_c,
+        block_table_c,
+        token_indices_c,
+        cu_seqlens,
+        paged_kv_indices,
+        # shapes / constexprs
+        max_num_blocks_per_req,
+        BLOCK_SIZE,
+        BLOCK_N,
+        # strides
+        bt_stride0,
+        bt_stride1,
+        ti_stride0,
+        ti_stride1,
+    )
+    return
+
+
+@triton.jit
+def generate_sparse_seqlen_kernel(
+    seq_len_ptr,  # [num_seq]
+    cu_query_lens_ptr,  # [num_seq]
+    out_ptr,  # [num_query_tokens]
+    topk_token: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    seq_id = tl.program_id(0)
+    query_offset = tl.program_id(1) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    query_start = tl.load(cu_query_lens_ptr + seq_id)
+    query_end = tl.load(cu_query_lens_ptr + seq_id + 1)
+    if query_start + tl.program_id(1) * BLOCK_SIZE > query_end:
+        return
+    query_len = query_end - query_start
+    query_mask = query_offset + query_start < query_end
+    seq_len = tl.load(seq_len_ptr + seq_id)
+    # Just return since the out_ptr is zero initialized.
+    if seq_len == 0:
+        return
+    context_start_point = seq_len - query_len
+    sparse_seqlen = context_start_point + query_offset
+    sparse_seqlen_masked = tl.where(
+        sparse_seqlen + 1 < topk_token, sparse_seqlen + 1, topk_token
+    )
+    tl.store(
+        out_ptr + query_start + query_offset, sparse_seqlen_masked, mask=query_mask
+    )
+
+
+def generate_sparse_seqlen_triton(
+    query_lens: torch.Tensor,
+    seq_lens: torch.Tensor,
+    cu_query_lens: torch.Tensor,
+    topk_token: int,
+    num_tokens: int,
+    max_query_len: int,
+):
+    num_seqs = query_lens.size(0)
+    # zero initialize the tensor to make sure invalid positions will be zero
+    out = torch.zeros([num_tokens], dtype=torch.int32, device=query_lens.device)
+    block_size = 64
+    num_block_per_row = triton.cdiv(max_query_len, block_size)
+    grid = (
+        num_seqs,
+        num_block_per_row,
+    )
+    generate_sparse_seqlen_kernel[grid](
+        seq_lens,
+        cu_query_lens,
+        out,
+        topk_token,
+        block_size,
+    )
+    return out
+
+
 @triton.jit
 def fetch_id_to_ragged_kernel(
     in_tensor_ptr,  # [num_seq, topk]
@@ -86,11 +267,13 @@ class ROCMAiterMLASparseBackend(AttentionBackend):
         "auto",
         "float16",
         "bfloat16",
+        "fp8",
+        "fp8_e4m3",
     ]
 
     @staticmethod
     def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
-        return [1]
+        return [1, 64]
 
     @staticmethod
     def get_name() -> str:
@@ -144,7 +327,7 @@ class ROCMAiterMLASparseMetadata(AttentionMetadata):
     paged_kv_last_page_len: torch.Tensor
     paged_kv_indices: torch.Tensor
     paged_kv_indptr: torch.Tensor
-    paged_kv_indptr_rest: torch.Tensor
+    attn_out_dtype: torch.dtype
 
     block_size: int = 1
     topk_tokens: int = 2048
@@ -167,6 +350,7 @@ def __init__(
     ):
         self.kv_cache_spec = kv_cache_spec
         self.model_config = vllm_config.model_config
+        self.model_dtype = vllm_config.model_config.dtype
         parallel_config = vllm_config.parallel_config
         self.device = device
         max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens
@@ -174,9 +358,6 @@ def __init__(
         self.num_heads = self.model_config.get_num_attention_heads(parallel_config)
         self.mla_dims = get_mla_dims(self.model_config)
         self.topk_tokens = vllm_config.model_config.hf_config.index_topk
-        self.topk_tokens_tensor = torch.tensor(
-            [self.topk_tokens], device=device, dtype=torch.int32
-        )
         self.max_model_len_tensor = torch.tensor(
             [self.model_config.max_model_len], device=device, dtype=torch.int32
         )
@@ -222,18 +403,33 @@ def build(
         )
         # Zero-fill for cudagraphs
         self.req_id_per_token_buffer.fill_(0)
+        self.paged_kv_indices.fill_(0)
+        self.paged_kv_indptr.fill_(0)
         self.req_id_per_token_buffer[: req_id_per_token.shape[0]].copy_(
             torch.from_numpy(req_id_per_token), non_blocking=True
         )
-        self.paged_kv_indices.fill_(0)
-        self.paged_kv_indptr.fill_(0)
+        query_lens = (
+            common_attn_metadata.query_start_loc[1:]
+            - common_attn_metadata.query_start_loc[:-1]
+        )
+        seq_lens = common_attn_metadata.seq_lens
+        sparse_seqlen = generate_sparse_seqlen_triton(
+            query_lens,
+            seq_lens,
+            common_attn_metadata.query_start_loc,
+            self.topk_tokens,
+            num_tokens,
+            common_attn_metadata.max_query_len,
+        )
+
+        torch.cumsum(sparse_seqlen, dim=0, out=self.paged_kv_indptr[1 : num_tokens + 1])
+        self.paged_kv_indptr[num_tokens + 1 :].fill_(self.paged_kv_indptr[num_tokens])
 
         req_id_per_token = self.req_id_per_token_buffer[:num_tokens]
         qo_indptr = self.qo_indptr[: num_tokens + 1]
         paged_kv_last_page_len = self.paged_kv_last_page_len[:num_tokens]
-        paged_kv_indices = self.paged_kv_indices[: num_tokens * self.topk_tokens]
         paged_kv_indptr = self.paged_kv_indptr[: num_tokens + 1]
-        paged_kv_indptr_rest = self.paged_kv_indptr[num_tokens + 1 :]
+        paged_kv_indices = self.paged_kv_indices[: num_tokens * self.topk_tokens]
 
         metadata = ROCMAiterMLASparseMetadata(
             num_reqs=common_attn_metadata.num_reqs,
@@ -245,12 +441,12 @@ def build(
             block_table=common_attn_metadata.block_table_tensor,
             req_id_per_token=req_id_per_token,
             block_size=self.kv_cache_spec.block_size,
+            attn_out_dtype=self.model_dtype,
             topk_tokens=self.topk_tokens,
             qo_indptr=qo_indptr,
             paged_kv_last_page_len=paged_kv_last_page_len,
             paged_kv_indices=paged_kv_indices,
             paged_kv_indptr=paged_kv_indptr,
-            paged_kv_indptr_rest=paged_kv_indptr_rest,
         )
         return metadata
 
@@ -314,29 +510,20 @@ def __init__(
         assert indexer is not None
         self.topk_indices_buffer: torch.Tensor | None = indexer.topk_indices_buffer
 
-    def _forward_bf16_kv(
+    def _forward_mla(
         self,
+        layer: AttentionLayer,
         q: torch.Tensor,  # [sq, heads, d_qk]
         kv_c_and_k_pe_cache: torch.Tensor,  # [blocks, heads, d_qk]
-        topk_indices: torch.Tensor,  # [sq, topk]
         attn_metadata: ROCMAiterMLASparseMetadata,
     ) -> torch.Tensor:
         num_tokens = q.shape[0]
         mla_num_heads = AiterMLAHelper.get_actual_mla_num_heads(self.num_heads)
         output = torch.empty(
             [num_tokens, mla_num_heads, self.kv_lora_rank],
-            dtype=q.dtype,
+            dtype=attn_metadata.attn_out_dtype,
             device=q.device,
         )
-        seq_len = (topk_indices != -1).sum(dim=-1)
-        torch.cumsum(seq_len, dim=0, out=attn_metadata.paged_kv_indptr[1:])
-        attn_metadata.paged_kv_indptr_rest.fill_(attn_metadata.paged_kv_indptr[-1])
-        fetch_id_to_ragged_triton(
-            topk_indices,
-            attn_metadata.paged_kv_indptr,
-            attn_metadata.paged_kv_indices,
-            attn_metadata.topk_tokens,
-        )
 
         rocm_aiter_ops.mla_decode_fwd(
             q,
@@ -348,6 +535,8 @@ def _forward_bf16_kv(
             attn_metadata.paged_kv_indptr,
             attn_metadata.paged_kv_indices,
             attn_metadata.paged_kv_last_page_len,
+            q_scale=layer._q_scale,
+            kv_scale=layer._k_scale,
         )
 
         return AiterMLAHelper.get_mla_unpadded_o(self.num_heads, output)
@@ -366,23 +555,32 @@ def forward_mqa(
         if isinstance(q, tuple):
             q = torch.cat(q, dim=-1)
 
-        num_actual_toks = q.shape[0]
+        num_actual_toks = attn_metadata.num_actual_tokens
 
         # Get topk indices
         assert self.topk_indices_buffer is not None
         topk_indices = self.topk_indices_buffer[:num_actual_toks]
 
-        topk_indices_global = triton_convert_req_index_to_global_index(
+        triton_convert_req_index_to_global_index(
             attn_metadata.req_id_per_token,
             attn_metadata.block_table,
             topk_indices,
+            attn_metadata.paged_kv_indptr,
+            attn_metadata.paged_kv_indices,
             BLOCK_SIZE=attn_metadata.block_size,
             NUM_TOPK_TOKENS=attn_metadata.topk_tokens,
         )
 
+        # write the latent and rope to kv cache
+        fp8_attention = self.kv_cache_dtype.startswith("fp8")
+        if fp8_attention:
+            original_q_shape = q.shape
+            kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.view(current_platform.fp8_dtype())
+            q, _ = ops.scaled_fp8_quant(q.view(q.shape[0], -1), layer._q_scale)
+            q = q.view(original_q_shape)
         mla_padded_q = AiterMLAHelper.get_mla_padded_q(self.num_heads, q)
-        attn_out = self._forward_bf16_kv(
-            mla_padded_q, kv_c_and_k_pe_cache, topk_indices_global, attn_metadata
+        attn_out = self._forward_mla(
+            layer, mla_padded_q, kv_c_and_k_pe_cache, attn_metadata
         )
 
         return attn_out, None
diff --git a/vllm/v1/attention/backends/mla/sparse_swa.py b/vllm/v1/attention/backends/mla/sparse_swa.py
index b17fd5d34418..28564e6a97d3 100644
--- a/vllm/v1/attention/backends/mla/sparse_swa.py
+++ b/vllm/v1/attention/backends/mla/sparse_swa.py
@@ -7,6 +7,7 @@
 
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.v1.attention.backend import (
     AttentionBackend,
@@ -360,7 +361,7 @@ def build_tile_scheduler(
             _LAYER_TYPE_C4A: None,
             _LAYER_TYPE_C128A: None,
         }
-        if num_decode_tokens == 0:
+        if num_decode_tokens == 0 or current_platform.is_rocm():
             return out
         for layer_type in self._layer_types:
             # get_mla_metadata() is the official FlashMLA entry point that
diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py
index 7aa8a646f415..c45a631008e5 100644
--- a/vllm/v1/attention/backends/mla/triton_mla.py
+++ b/vllm/v1/attention/backends/mla/triton_mla.py
@@ -123,18 +123,6 @@ def __init__(
 
         self._sm_count = current_platform.num_compute_units()
 
-    def _flash_attn_varlen_diff_headdims(
-        self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs
-    ):
-        return super()._flash_attn_varlen_diff_headdims(
-            q,
-            k,
-            v,
-            return_softmax_lse=return_softmax_lse,
-            softmax_scale=softmax_scale,
-            **kwargs,
-        )
-
     def forward_mqa(
         self,
         q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
diff --git a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
index 55ed5c5b3c49..f56b58c43e7f 100644
--- a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
+++ b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
@@ -200,19 +200,9 @@ def forward(
         key_cache, value_cache = kv_cache.unbind(0)
 
         softmax_scale = self.scale
-        fp8_post_attn_v_rescale = False
         if is_quantized_kv_cache(self.kv_cache_dtype):
             key_cache = key_cache.view(self.fp8_dtype)
             value_cache = value_cache.view(self.fp8_dtype)
-            # When Q is FP8, triton kernel skips K/V dequant (for fp8xfp8 matmul).
-            # Compensate by absorbing q_scale and k_scale into softmax_scale, and
-            # v_scale into output_scale (or post-multiplying if no fusion).
-            if query.dtype == self.fp8_dtype:
-                softmax_scale = self.scale * layer._q_scale_float * layer._k_scale_float
-                if output_scale is not None:
-                    output_scale = output_scale / layer._v_scale_float
-                else:
-                    fp8_post_attn_v_rescale = True
 
         cu_seqlens_q = attn_metadata.query_start_loc
         seqused_k = attn_metadata.seq_lens
@@ -220,11 +210,6 @@ def forward(
         max_seqlen_k = attn_metadata.max_seq_len
         block_table = attn_metadata.block_table
 
-        descale_shape = (
-            cu_seqlens_q.shape[0] - 1,
-            key.shape[1] if key is not None else self.num_kv_heads,
-        )
-
         self.unified_attention(
             q=query[:num_actual_tokens],
             k=key_cache,
@@ -240,16 +225,13 @@ def forward(
             window_size=self.sliding_window,
             block_table=block_table,
             softcap=self.logits_soft_cap,
-            q_descale=None,  # q_scale absorbed into softmax_scale
-            k_descale=layer._k_scale.expand(descale_shape),
-            v_descale=layer._v_scale.expand(descale_shape),
+            q_descale=layer._q_scale if query.dtype == self.fp8_dtype else None,
+            k_descale=layer._k_scale,
+            v_descale=layer._v_scale,
             sinks=self.sinks,
             output_scale=output_scale,
         )
 
-        if fp8_post_attn_v_rescale:
-            output[:num_actual_tokens].mul_(layer._v_scale_float)
-
         return output
 
     def do_kv_cache_update(
diff --git a/vllm/v1/attention/ops/dcp_alltoall.py b/vllm/v1/attention/ops/dcp_alltoall.py
index 92f50f63e3ef..1469a5c754d6 100644
--- a/vllm/v1/attention/ops/dcp_alltoall.py
+++ b/vllm/v1/attention/ops/dcp_alltoall.py
@@ -9,10 +9,8 @@
 A2A exchanges partial attention outputs and their LSE values across
 ranks, then combines them with exact LSE-weighted reduction.
 
-This reduces the number of NCCL calls per attention layer from 3
-(AG for Q, AG for K metadata, RS for output) to 2 (A2A for output,
-A2A for LSE), lowering per-step communication overhead for long-context
-decode where NCCL latency is a significant fraction of step time.
+This reduces the number of NCCL calls per attention layer by exchanging
+the partial output and LSE in a single packed All-to-All payload.
 
 Usage:
     vllm serve model --tp 16 --dcp 16 --dcp-comm-backend a2a
@@ -28,6 +26,10 @@
 import torch.distributed as dist
 
 from vllm.triton_utils import tl, triton
+from vllm.v1.worker.workspace import (
+    current_workspace_manager,
+    is_workspace_manager_initialized,
+)
 
 if TYPE_CHECKING:
     from vllm.distributed.parallel_state import GroupCoordinator
@@ -44,7 +46,6 @@ def _lse_weighted_combine(
     CPU reference implementation for LSE-weighted combination.
 
     This is a pure PyTorch implementation used for testing and validation.
-    For GPU execution, use dcp_lse_combine_triton instead.
 
     Args:
         outputs: Partial attention outputs [N, B, H, D]
@@ -102,57 +103,137 @@ def _lse_weighted_combine(
     return result
 
 
+def _dcp_a2a_lse_pack_dim(output_dtype: torch.dtype) -> int:
+    bits = torch.finfo(output_dtype).bits
+    if bits == 16:
+        return 2
+    if bits == 32:
+        return 1
+    raise ValueError(f"Cannot pack fp32 LSE into output dtype {output_dtype}.")
+
+
+def _dcp_a2a_send_recv_buffers(
+    shape: tuple[int, ...],
+    device: torch.device,
+    dtype: torch.dtype,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    if is_workspace_manager_initialized():
+        send_buffer, recv_buffer = current_workspace_manager().get_simultaneous(
+            (shape, dtype),
+            (shape, dtype),
+        )
+        return send_buffer, recv_buffer
+
+    return (
+        torch.empty(shape, device=device, dtype=dtype),
+        torch.empty(shape, device=device, dtype=dtype),
+    )
+
+
 @triton.jit
-def _dcp_lse_combine_kernel(
-    # Input pointers
-    recv_output_ptr,
-    recv_lse_ptr,
-    # Output pointers
+def _dcp_a2a_pack_send_kernel(
     out_ptr,
-    out_lse_ptr,
-    # Strides for recv_output [N, B, H_local, D]
-    ro_stride_N,
-    ro_stride_B,
-    ro_stride_H,
-    ro_stride_D,
-    # Strides for recv_lse [N, B, H_local]
-    rl_stride_N,
-    rl_stride_B,
-    rl_stride_H,
-    # Strides for output [B, H_local, D]
-    o_stride_B,
-    o_stride_H,
-    o_stride_D,
-    # Constants
+    lse_ptr,
+    send_ptr,
+    out_stride_B,
+    out_stride_H,
+    out_stride_D,
+    lse_stride_B,
+    lse_stride_H,
+    send_stride_N,
+    send_stride_B,
+    send_stride_H,
+    send_stride_D,
     N: tl.constexpr,
     HEAD_DIM: tl.constexpr,
-    IS_BASE_E: tl.constexpr,
-    RETURN_LSE: tl.constexpr,
+    H_PER_RANK: tl.constexpr,
+    LSE_PACK_DIM: tl.constexpr,
 ):
-    """
-    Triton kernel for LSE-weighted combination of partial attention outputs.
+    batch_idx = tl.program_id(0).to(tl.int64)
+    local_head_idx = tl.program_id(1).to(tl.int64)
+    d_offsets = tl.arange(0, HEAD_DIM)
 
-    After All-to-All, each rank has:
-    - recv_output [N, B, H_local, D]: partial outputs from all KV shards
-    - recv_lse [N, B, H_local]: partial LSEs from all KV shards
+    for rank_idx in tl.static_range(N):
+        src_head_idx = rank_idx * H_PER_RANK + local_head_idx
+        send_base = (
+            rank_idx * send_stride_N
+            + batch_idx * send_stride_B
+            + local_head_idx * send_stride_H
+        )
 
-    This kernel computes the weighted combination locally (no communication).
+        out_offsets = (
+            batch_idx * out_stride_B
+            + src_head_idx * out_stride_H
+            + d_offsets * out_stride_D
+        )
+        tl.store(
+            send_ptr + send_base + d_offsets * send_stride_D,
+            tl.load(out_ptr + out_offsets),
+        )
 
-    Grid: (B, H_local)
-    Each program handles one (batch, head) and processes all D elements.
-    """
+        lse_val = tl.load(
+            lse_ptr + batch_idx * lse_stride_B + src_head_idx * lse_stride_H
+        )
+        if LSE_PACK_DIM == 1:
+            tl.store(
+                send_ptr + send_base + HEAD_DIM * send_stride_D,
+                lse_val.to(send_ptr.dtype.element_ty),
+            )
+        else:
+            lse_bits = lse_val.to(tl.uint32, bitcast=True)
+            lo = (lse_bits & 0xFFFF).to(tl.uint16)
+            hi = ((lse_bits >> 16) & 0xFFFF).to(tl.uint16)
+            tl.store(
+                send_ptr + send_base + HEAD_DIM * send_stride_D,
+                lo.to(send_ptr.dtype.element_ty, bitcast=True),
+            )
+            tl.store(
+                send_ptr + send_base + (HEAD_DIM + 1) * send_stride_D,
+                hi.to(send_ptr.dtype.element_ty, bitcast=True),
+            )
+
+
+@triton.jit
+def _dcp_a2a_unpack_combine_kernel(
+    recv_ptr,
+    out_ptr,
+    out_lse_ptr,
+    recv_stride_N,
+    recv_stride_B,
+    recv_stride_H,
+    recv_stride_D,
+    out_stride_B,
+    out_stride_H,
+    out_stride_D,
+    out_lse_stride_B,
+    out_lse_stride_H,
+    N: tl.constexpr,
+    HEAD_DIM: tl.constexpr,
+    IS_BASE_E: tl.constexpr,
+    RETURN_LSE: tl.constexpr,
+    LSE_PACK_DIM: tl.constexpr,
+):
     batch_idx = tl.program_id(0).to(tl.int64)
     head_idx = tl.program_id(1).to(tl.int64)
+    d_offsets = tl.arange(0, HEAD_DIM)
 
-    # Base offset for this (batch, head)
-    base_lse_offset = batch_idx * rl_stride_B + head_idx * rl_stride_H
-    base_out_offset = batch_idx * ro_stride_B + head_idx * ro_stride_H
-
-    # First pass: find max LSE for numerical stability
     lse_max = -float("inf")
-    for n in tl.static_range(N):
-        lse_offset = n * rl_stride_N + base_lse_offset
-        lse_val = tl.load(recv_lse_ptr + lse_offset)
+    for rank_idx in tl.static_range(N):
+        recv_base = (
+            rank_idx * recv_stride_N
+            + batch_idx * recv_stride_B
+            + head_idx * recv_stride_H
+        )
+        if LSE_PACK_DIM == 1:
+            lse_val = tl.load(recv_ptr + recv_base + HEAD_DIM * recv_stride_D).to(
+                tl.float32
+            )
+        else:
+            lo_raw = tl.load(recv_ptr + recv_base + HEAD_DIM * recv_stride_D)
+            hi_raw = tl.load(recv_ptr + recv_base + (HEAD_DIM + 1) * recv_stride_D)
+            lo = lo_raw.to(tl.uint16, bitcast=True).to(tl.uint32)
+            hi = hi_raw.to(tl.uint16, bitcast=True).to(tl.uint32)
+            lse_val = (lo | (hi << 16)).to(tl.float32, bitcast=True)
         lse_val = tl.where(
             (lse_val != lse_val) | (lse_val == float("inf")),
             -float("inf"),
@@ -162,11 +243,23 @@ def _dcp_lse_combine_kernel(
 
     lse_max = tl.where(lse_max == -float("inf"), 0.0, lse_max)
 
-    # Second pass: compute sum of exp(lse - max)
     lse_sum = 0.0
-    for n in tl.static_range(N):
-        lse_offset = n * rl_stride_N + base_lse_offset
-        lse_val = tl.load(recv_lse_ptr + lse_offset)
+    for rank_idx in tl.static_range(N):
+        recv_base = (
+            rank_idx * recv_stride_N
+            + batch_idx * recv_stride_B
+            + head_idx * recv_stride_H
+        )
+        if LSE_PACK_DIM == 1:
+            lse_val = tl.load(recv_ptr + recv_base + HEAD_DIM * recv_stride_D).to(
+                tl.float32
+            )
+        else:
+            lo_raw = tl.load(recv_ptr + recv_base + HEAD_DIM * recv_stride_D)
+            hi_raw = tl.load(recv_ptr + recv_base + (HEAD_DIM + 1) * recv_stride_D)
+            lo = lo_raw.to(tl.uint16, bitcast=True).to(tl.uint32)
+            hi = hi_raw.to(tl.uint16, bitcast=True).to(tl.uint32)
+            lse_val = (lo | (hi << 16)).to(tl.float32, bitcast=True)
         lse_val = tl.where(
             (lse_val != lse_val) | (lse_val == float("inf")),
             -float("inf"),
@@ -177,19 +270,28 @@ def _dcp_lse_combine_kernel(
         else:
             lse_sum += tl.exp2(lse_val - lse_max)
 
-    # Compute global LSE
     if IS_BASE_E:  # noqa: SIM108
         global_lse = tl.log(lse_sum) + lse_max
     else:
         global_lse = tl.log2(lse_sum) + lse_max
 
-    # Third pass: weighted combination across D dimension
-    d_offsets = tl.arange(0, HEAD_DIM)
     acc = tl.zeros([HEAD_DIM], dtype=tl.float32)
-
-    for n in tl.static_range(N):
-        lse_offset = n * rl_stride_N + base_lse_offset
-        lse_val = tl.load(recv_lse_ptr + lse_offset)
+    for rank_idx in tl.static_range(N):
+        recv_base = (
+            rank_idx * recv_stride_N
+            + batch_idx * recv_stride_B
+            + head_idx * recv_stride_H
+        )
+        if LSE_PACK_DIM == 1:
+            lse_val = tl.load(recv_ptr + recv_base + HEAD_DIM * recv_stride_D).to(
+                tl.float32
+            )
+        else:
+            lo_raw = tl.load(recv_ptr + recv_base + HEAD_DIM * recv_stride_D)
+            hi_raw = tl.load(recv_ptr + recv_base + (HEAD_DIM + 1) * recv_stride_D)
+            lo = lo_raw.to(tl.uint16, bitcast=True).to(tl.uint32)
+            hi = hi_raw.to(tl.uint16, bitcast=True).to(tl.uint32)
+            lse_val = (lo | (hi << 16)).to(tl.float32, bitcast=True)
         lse_val = tl.where(
             (lse_val != lse_val) | (lse_val == float("inf")),
             -float("inf"),
@@ -200,80 +302,89 @@ def _dcp_lse_combine_kernel(
         else:
             weight = tl.exp2(lse_val - global_lse)
         weight = tl.where(weight != weight, 0.0, weight)
+        acc += (
+            tl.load(recv_ptr + recv_base + d_offsets * recv_stride_D).to(tl.float32)
+            * weight
+        )
 
-        out_offsets = n * ro_stride_N + base_out_offset + d_offsets * ro_stride_D
-        out_vals = tl.load(recv_output_ptr + out_offsets)
-        acc += out_vals.to(tl.float32) * weight
-
-    # Store result
     final_offsets = (
-        batch_idx * o_stride_B + head_idx * o_stride_H + d_offsets * o_stride_D
+        batch_idx * out_stride_B + head_idx * out_stride_H + d_offsets * out_stride_D
     )
     tl.store(out_ptr + final_offsets, acc)
 
     if RETURN_LSE:
-        tl.store(out_lse_ptr + base_lse_offset, global_lse)
-
+        out_lse_offset = batch_idx * out_lse_stride_B + head_idx * out_lse_stride_H
+        tl.store(out_lse_ptr + out_lse_offset, global_lse)
 
-def dcp_lse_combine_triton(
-    recv_output: torch.Tensor,
-    recv_lse: torch.Tensor,
-    return_lse: bool = False,
-    is_lse_base_on_e: bool = True,
-) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-    """
-    Triton-accelerated LSE-weighted combination for DCP A2A.
 
-    Args:
-        recv_output: [N, B, H_local, D] - partial outputs from all KV shards
-        recv_lse: [N, B, H_local] - partial LSEs from all KV shards
-        return_lse: If True, also return the global LSE
-        is_lse_base_on_e: If True, LSE is base e; if False, base 2
+def _dcp_a2a_pack_send(
+    cp_attn_out: torch.Tensor,
+    cp_attn_lse: torch.Tensor,
+    send_buffer: torch.Tensor,
+    world_size: int,
+    h_per_rank: int,
+    head_dim: int,
+    lse_pack_dim: int,
+) -> None:
+    grid = (cp_attn_out.shape[0], h_per_rank, 1)
+    _dcp_a2a_pack_send_kernel[grid](
+        cp_attn_out,
+        cp_attn_lse,
+        send_buffer,
+        cp_attn_out.stride(0),
+        cp_attn_out.stride(1),
+        cp_attn_out.stride(2),
+        cp_attn_lse.stride(0),
+        cp_attn_lse.stride(1),
+        send_buffer.stride(0),
+        send_buffer.stride(1),
+        send_buffer.stride(2),
+        send_buffer.stride(3),
+        N=world_size,
+        HEAD_DIM=head_dim,
+        H_PER_RANK=h_per_rank,
+        LSE_PACK_DIM=lse_pack_dim,
+    )
 
-    Returns:
-        Combined output [B, H_local, D]
-        If return_lse=True, also returns global_lse [B, H_local]
-    """
-    N, B, H_local, D = recv_output.shape
 
+def _dcp_a2a_unpack_combine(
+    recv_buffer: torch.Tensor,
+    head_dim: int,
+    lse_pack_dim: int,
+    return_lse: bool,
+    is_lse_base_on_e: bool,
+) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+    world_size, num_tokens, h_per_rank, _ = recv_buffer.shape
     out = torch.empty(
-        (B, H_local, D), device=recv_output.device, dtype=recv_output.dtype
+        (num_tokens, h_per_rank, head_dim),
+        device=recv_buffer.device,
+        dtype=recv_buffer.dtype,
     )
-
-    if return_lse:
-        out_lse = torch.empty(
-            (B, H_local), device=recv_lse.device, dtype=recv_lse.dtype
-        )
-    else:
-        out_lse = torch.empty(1, device=recv_lse.device, dtype=recv_lse.dtype)
-
-    ro_stride_N, ro_stride_B, ro_stride_H, ro_stride_D = recv_output.stride()
-    rl_stride_N, rl_stride_B, rl_stride_H = recv_lse.stride()
-    o_stride_B, o_stride_H, o_stride_D = out.stride()
-
-    grid = (B, H_local, 1)
-
-    _dcp_lse_combine_kernel[grid](
-        recv_output,
-        recv_lse,
+    out_lse = torch.empty(
+        (num_tokens, h_per_rank) if return_lse else (1, 1),
+        device=recv_buffer.device,
+        dtype=torch.float32 if return_lse else recv_buffer.dtype,
+    )
+    grid = (num_tokens, h_per_rank, 1)
+    _dcp_a2a_unpack_combine_kernel[grid](
+        recv_buffer,
         out,
         out_lse,
-        ro_stride_N,
-        ro_stride_B,
-        ro_stride_H,
-        ro_stride_D,
-        rl_stride_N,
-        rl_stride_B,
-        rl_stride_H,
-        o_stride_B,
-        o_stride_H,
-        o_stride_D,
-        N=N,
-        HEAD_DIM=D,
+        recv_buffer.stride(0),
+        recv_buffer.stride(1),
+        recv_buffer.stride(2),
+        recv_buffer.stride(3),
+        out.stride(0),
+        out.stride(1),
+        out.stride(2),
+        out_lse.stride(0),
+        out_lse.stride(1),
+        N=world_size,
+        HEAD_DIM=head_dim,
         IS_BASE_E=is_lse_base_on_e,
         RETURN_LSE=return_lse,
+        LSE_PACK_DIM=lse_pack_dim,
     )
-
     if return_lse:
         return out, out_lse
     return out
@@ -290,17 +401,8 @@ def dcp_a2a_lse_reduce(
     """
     Combine partial attention outputs across DCP ranks using All-to-All.
 
-    Each rank holds attention output for all heads but only a local shard
-    of the KV cache. This function:
-    1. Exchanges partial outputs across ranks via All-to-All
-    2. Exchanges LSE values via All-to-All
-    3. Combines them with exact LSE-weighted reduction (Triton kernel)
-
-    Tensor flow:
-        Input:  cp_attn_out [B, H, D] - all heads, local KV shard
-        Reshape: [N, B, H/N, D] - split heads across ranks
-        A2A:    Two all_to_all_single calls (output and LSE)
-        Combine: recv [N, B, H/N, D] + lse [N, B, H/N] -> [B, H/N, D]
+    The output and fp32 LSE are packed into a single output-dtype buffer, sent
+    with one All-to-All, then unpacked and combined with exact LSE weighting.
 
     Args:
         cp_attn_out: [B, H, D] where B=num_tokens, H=total_heads, D=head_dim
@@ -321,43 +423,36 @@ def dcp_a2a_lse_reduce(
             return cp_attn_out, cp_attn_lse
         return cp_attn_out
 
-    local_output = cp_attn_out.contiguous()
-    local_lse = cp_attn_lse.contiguous()
-
-    B, H, D = local_output.shape
+    B, H, D = cp_attn_out.shape
+    if H % world_size != 0:
+        raise ValueError(f"H={H} must be divisible by DCP world size {world_size}.")
     H_per_rank = H // world_size
+    lse_pack_dim = _dcp_a2a_lse_pack_dim(cp_attn_out.dtype)
 
-    # Reshape for All-to-All: [B, H, D] -> [N, B, H/N, D]
-    # Split heads into N chunks, each destined for a different rank
-    send_output = (
-        local_output.view(B, world_size, H_per_rank, D).permute(1, 0, 2, 3).contiguous()
+    send_buffer, recv_buffer = _dcp_a2a_send_recv_buffers(
+        (world_size, B, H_per_rank, D + lse_pack_dim),
+        device=cp_attn_out.device,
+        dtype=cp_attn_out.dtype,
     )
-    recv_output = torch.empty_like(send_output)
-
-    # Same for LSE: [B, H] -> [N, B, H/N]
-    send_lse = local_lse.view(B, world_size, H_per_rank).permute(1, 0, 2).contiguous()
-    recv_lse = torch.empty_like(send_lse)
 
-    # All-to-All for partial attention outputs and LSE values (async overlap)
-    work_output = dist.all_to_all_single(
-        recv_output.view(-1),
-        send_output.view(-1),
-        group=cp_group.device_group,
-        async_op=True,
+    _dcp_a2a_pack_send(
+        cp_attn_out,
+        cp_attn_lse,
+        send_buffer,
+        world_size,
+        H_per_rank,
+        D,
+        lse_pack_dim,
     )
-    work_lse = dist.all_to_all_single(
-        recv_lse.view(-1),
-        send_lse.view(-1),
+
+    work = dist.all_to_all_single(
+        recv_buffer.view(-1),
+        send_buffer.view(-1),
         group=cp_group.device_group,
         async_op=True,
     )
-    work_output.wait()
-    work_lse.wait()
-
-    # LSE-weighted combination via Triton kernel (local, no communication)
-    return dcp_lse_combine_triton(
-        recv_output,
-        recv_lse,
-        return_lse=return_lse,
-        is_lse_base_on_e=is_lse_base_on_e,
+    work.wait()
+
+    return _dcp_a2a_unpack_combine(
+        recv_buffer, D, lse_pack_dim, return_lse, is_lse_base_on_e
     )
diff --git a/vllm/v1/attention/ops/deepseek_v4_ops/fused_compress_quant_cache.py b/vllm/v1/attention/ops/deepseek_v4_ops/fused_compress_quant_cache.py
index 26b076f34238..2f97d8733c95 100644
--- a/vllm/v1/attention/ops/deepseek_v4_ops/fused_compress_quant_cache.py
+++ b/vllm/v1/attention/ops/deepseek_v4_ops/fused_compress_quant_cache.py
@@ -21,7 +21,7 @@
 
 from vllm.triton_utils import tl, triton
 
-from .fused_indexer_q import _e2m1_nibble
+from .fused_indexer_q import _fp32x2_to_fp4x2
 
 
 # =============================================================================
@@ -566,18 +566,18 @@ def _fused_kv_compress_norm_rope_insert_indexer_mxfp4_attn(
         tl.max(tl.abs(even_2d), axis=1),
         tl.max(tl.abs(odd_2d), axis=1),
     )
-    amax = tl.maximum(amax, 1e-4)
+    amax = tl.maximum(amax, 6.0 * (2**-126))
 
     # ue8m0 block scale: 2^ceil(log2(amax / 6.0)), stored as (exp + 127) byte.
-    log2_ratio = tl.ceil(tl.log2(amax / 6.0))
+    log2_ratio = tl.ceil(tl.log2(amax * (1.0 / 6.0)))
     log2_ratio = tl.minimum(tl.maximum(log2_ratio, -127.0), 127.0)
     inv_scale = tl.exp2(-log2_ratio)
     ue8m0 = (log2_ratio + 127.0).to(tl.uint8)  # [N_QUANT_BLOCKS]
 
     inv_scale_col = tl.reshape(inv_scale, (N_QUANT_BLOCKS, 1))
-    lo_nib = _e2m1_nibble(even_2d * inv_scale_col)  # (N_BLOCKS, HALF_BLOCK) uint8
-    hi_nib = _e2m1_nibble(odd_2d * inv_scale_col)
-    packed = lo_nib | (hi_nib << 4)
+    packed = _fp32x2_to_fp4x2(
+        even_2d * inv_scale_col, odd_2d * inv_scale_col
+    )  # (N_BLOCKS, HALF_BLOCK) uint8
     packed_flat = tl.reshape(packed, (TOKEN_STRIDE,))
 
     tl.store(val_ptr + tl.arange(0, TOKEN_STRIDE), packed_flat)
diff --git a/vllm/v1/attention/ops/deepseek_v4_ops/fused_indexer_q.py b/vllm/v1/attention/ops/deepseek_v4_ops/fused_indexer_q.py
index 0254a46752c6..f94fc013f5c6 100644
--- a/vllm/v1/attention/ops/deepseek_v4_ops/fused_indexer_q.py
+++ b/vllm/v1/attention/ops/deepseek_v4_ops/fused_indexer_q.py
@@ -24,36 +24,22 @@ def _get_cos_sin(
 
 
 @triton.jit
-def _e2m1_nibble(x):
-    """Quantize fp32 x (already scale-divided) to E2M1 4-bit nibble in uint8.
-    Matches torch.bucketize with boundaries
-    [0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0] and right=False (each boundary
-    belongs to the lower bucket), plus sign bit."""
-    abs_x = tl.minimum(tl.abs(x), 6.0)
-    code = tl.where(
-        abs_x <= 0.25,
-        0.0,
-        tl.where(
-            abs_x <= 0.75,
-            1.0,
-            tl.where(
-                abs_x <= 1.25,
-                2.0,
-                tl.where(
-                    abs_x <= 1.75,
-                    3.0,
-                    tl.where(
-                        abs_x <= 2.5,
-                        4.0,
-                        tl.where(abs_x <= 3.5, 5.0, tl.where(abs_x <= 5.0, 6.0, 7.0)),
-                    ),
-                ),
-            ),
-        ),
-    )
-    code_u8 = code.to(tl.uint8)
-    sign = ((x < 0) & (code_u8 != 0)).to(tl.uint8)
-    return code_u8 | (sign << 3)
+def _fp32x2_to_fp4x2(x_lo, x_hi):
+    # NOTE: $1 is high nibble, $2 is low nibble
+    return tl.inline_asm_elementwise(
+        """
+        {
+            .reg .b8 tmp;
+            cvt.rn.satfinite.e2m1x2.f32 tmp, $1, $2;
+            cvt.u32.u8 $0, tmp;
+        }
+        """,
+        constraints="=r,f,f",
+        args=[x_hi, x_lo],
+        dtype=tl.uint32,
+        is_pure=True,
+        pack=1,
+    ).to(tl.uint8)
 
 
 @triton.jit
@@ -65,17 +51,16 @@ def _quantize_mxfp4_pair(x_lo, x_hi):
         - ue8m0  : scalar uint8    (block scale = 2^(ue8m0 - 127))
     """
     amax = tl.maximum(tl.max(tl.abs(x_lo)), tl.max(tl.abs(x_hi)))
-    amax = tl.maximum(amax, 1e-4)
+    # 6 * 2^-126 is from https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/inference/kernel.py#L163
+    amax = tl.maximum(amax, 6.0 * (2**-126))
     # ue8m0 block scale: 2^ceil(log2(amax/6.0)).
-    log2_ratio = tl.math.ceil(tl.math.log2(amax / 6.0))
+    log2_ratio = tl.math.ceil(tl.math.log2(amax * (1.0 / 6.0)))
     log2_ratio = tl.minimum(tl.maximum(log2_ratio, -127.0), 127.0)
     scale = tl.math.exp2(log2_ratio)
     ue8m0 = (log2_ratio + 127.0).to(tl.uint8)
 
     inv_scale = 1.0 / scale
-    lo_nib = _e2m1_nibble(x_lo * inv_scale)
-    hi_nib = _e2m1_nibble(x_hi * inv_scale)
-    packed = lo_nib | (hi_nib << 4)
+    packed = _fp32x2_to_fp4x2(x_lo * inv_scale, x_hi * inv_scale)
     return packed, ue8m0
 
 
diff --git a/vllm/v1/attention/ops/deepseek_v4_ops/fused_inv_rope_fp8_quant.py b/vllm/v1/attention/ops/deepseek_v4_ops/fused_inv_rope_fp8_quant.py
index d9ad22ae0556..68d33f1aa105 100644
--- a/vllm/v1/attention/ops/deepseek_v4_ops/fused_inv_rope_fp8_quant.py
+++ b/vllm/v1/attention/ops/deepseek_v4_ops/fused_inv_rope_fp8_quant.py
@@ -9,7 +9,9 @@
 
 import torch
 
-from vllm.triton_utils import maybe_launch_pdl, tl, triton
+from vllm.platforms import current_platform
+from vllm.triton_utils import tl, triton
+from vllm.utils.torch_utils import direct_register_custom_op
 
 
 @triton.jit
@@ -180,34 +182,75 @@ def fused_inv_rope_fp8_quant(
     fp8_dtype = torch.float8_e4m3fn
     fp8_max = torch.finfo(fp8_dtype).max
 
-    fp8_buf = torch.empty(
-        (n_groups, num_tokens, d),
-        dtype=fp8_dtype,
-        device=o.device,
-    )
-
     tma_aligned_T = get_tma_aligned_size(num_tokens, 4)
     if tma_aligned_scales:
         packed_sf_k = (num_scale_blocks + 3) // 4
-        scale_buf = torch.empty(
-            n_groups * packed_sf_k * tma_aligned_T,
-            dtype=torch.int32,
-            device=o.device,
-        ).as_strided(
-            (n_groups, num_tokens, packed_sf_k),
-            (packed_sf_k * tma_aligned_T, 1, tma_aligned_T),
-        )
+        scale_inner = packed_sf_k
     else:
-        scale_buf = torch.empty(
-            n_groups * num_scale_blocks * tma_aligned_T,
-            dtype=torch.float32,
-            device=o.device,
-        ).as_strided(
-            (n_groups, num_tokens, num_scale_blocks),
-            (num_scale_blocks * tma_aligned_T, 1, tma_aligned_T),
-        )
+        scale_inner = num_scale_blocks
+
+    # Run kernel through a custom op so inductor sees an opaque boundary.
+    # It's a pytorch bug, see https://github.com/vllm-project/vllm/issues/41106
+    fp8_buf, scale_buf = torch.ops.vllm.fused_inv_rope_fp8_quant_kernel(
+        o,
+        positions,
+        cos_sin_cache,
+        heads_per_group,
+        quant_group_size,
+        chunks_per_head,
+        nope_dim % quant_group_size,
+        rope_dim // 2,
+        tma_aligned_scales,
+        fp8_max,
+        tma_aligned_T,
+        num_tokens,
+        n_groups,
+        d,
+        scale_inner,
+    )
+    return fp8_buf.transpose(0, 1), scale_buf.transpose(0, 1)
+
 
-    common_args = dict(
+def _fused_inv_rope_fp8_quant_kernel_impl(
+    o: torch.Tensor,
+    positions: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    heads_per_group: int,
+    quant_group_size: int,
+    chunks_per_head: int,
+    rope_start: int,
+    half_rope: int,
+    tma_aligned_scales: bool,
+    fp8_max: float,
+    tma_aligned_T: int,
+    num_tokens: int,
+    n_groups: int,
+    d: int,
+    scale_inner: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    fp8_buf = torch.empty(
+        (n_groups, num_tokens, d),
+        dtype=torch.float8_e4m3fn,
+        device=o.device,
+    )
+    scale_dtype = torch.int32 if tma_aligned_scales else torch.float32
+    scale_buf = torch.empty(
+        n_groups * scale_inner * tma_aligned_T,
+        dtype=scale_dtype,
+        device=o.device,
+    ).as_strided(
+        (n_groups, num_tokens, scale_inner),
+        (scale_inner * tma_aligned_T, 1, tma_aligned_T),
+    )
+    grid = (tma_aligned_T, n_groups * heads_per_group)
+    pdl_kwargs = {} if current_platform.is_rocm() else {"launch_pdl": False}
+    _fused_inv_rope_fp8_quant_per_head[grid](
+        o,
+        positions,
+        cos_sin_cache,
+        fp8_buf,
+        scale_buf,
+        num_tokens,
         heads_per_group=heads_per_group,
         o_stride_token=o.stride(0),
         o_stride_head=o.stride(1),
@@ -220,25 +263,52 @@ def fused_inv_rope_fp8_quant(
         eps=1e-10,
         QUANT_GROUP_SIZE=quant_group_size,
         CHUNKS_PER_HEAD=chunks_per_head,
-        ROPE_START=nope_dim % quant_group_size,
-        HALF_ROPE=rope_dim // 2,
+        ROPE_START=rope_start,
+        HALF_ROPE=half_rope,
         TMA_ALIGNED_SCALES=tma_aligned_scales,
         num_stages=1,
-        # PDL is a NVIDIA Hopper-only Triton launch attribute; omit on
-        # other backends (e.g. ROCm) to avoid KeyError in JITKernel.
-        **maybe_launch_pdl(),
+        **pdl_kwargs,
+        num_warps=1,
     )
+    return fp8_buf, scale_buf
 
-    grid = (tma_aligned_T, n_groups * heads_per_group)
-    _fused_inv_rope_fp8_quant_per_head[grid](
-        o,
-        positions,
-        cos_sin_cache,
-        fp8_buf,
-        scale_buf,
-        num_tokens,
-        **common_args,
-        num_warps=1,
+
+def _fused_inv_rope_fp8_quant_kernel_fake(
+    o: torch.Tensor,
+    positions: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    heads_per_group: int,
+    quant_group_size: int,
+    chunks_per_head: int,
+    rope_start: int,
+    half_rope: int,
+    tma_aligned_scales: bool,
+    fp8_max: float,
+    tma_aligned_T: int,
+    num_tokens: int,
+    n_groups: int,
+    d: int,
+    scale_inner: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    fp8_buf = torch.empty(
+        (n_groups, num_tokens, d),
+        dtype=torch.float8_e4m3fn,
+        device=o.device,
     )
+    scale_dtype = torch.int32 if tma_aligned_scales else torch.float32
+    scale_buf = torch.empty(
+        n_groups * scale_inner * tma_aligned_T,
+        dtype=scale_dtype,
+        device=o.device,
+    ).as_strided(
+        (n_groups, num_tokens, scale_inner),
+        (scale_inner * tma_aligned_T, 1, tma_aligned_T),
+    )
+    return fp8_buf, scale_buf
 
-    return fp8_buf.transpose(0, 1), scale_buf.transpose(0, 1)
+
+direct_register_custom_op(
+    op_name="fused_inv_rope_fp8_quant_kernel",
+    op_func=_fused_inv_rope_fp8_quant_kernel_impl,
+    fake_impl=_fused_inv_rope_fp8_quant_kernel_fake,
+)
diff --git a/vllm/v1/attention/ops/rocm_aiter_mla_sparse.py b/vllm/v1/attention/ops/rocm_aiter_mla_sparse.py
index 81cc489db0d8..5d0343ffd607 100644
--- a/vllm/v1/attention/ops/rocm_aiter_mla_sparse.py
+++ b/vllm/v1/attention/ops/rocm_aiter_mla_sparse.py
@@ -2,9 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
 import importlib
+import math
 from importlib.util import find_spec
 
 import torch
+import torch.nn.functional as F
 
 from vllm.forward_context import get_forward_context
 from vllm.platforms import current_platform
@@ -13,8 +15,10 @@
 from vllm.v1.attention.backends.mla.indexer import DeepseekV32IndexerMetadata
 from vllm.v1.attention.ops.common import pack_seq_triton, unpack_seq_triton
 
-if current_platform.is_cuda_alike():
-    from vllm import _custom_ops as ops
+if current_platform.is_rocm():
+    from vllm.platforms.rocm import _ON_GFX942
+else:
+    _ON_GFX942 = False
 
 
 @triton.jit
@@ -97,7 +101,8 @@ def indexer_k_quant_and_cache_triton(
     # In real layout, we store the first portion as kv cache value
     # and second portion as kv cache scale
     kv_cache = kv_cache.view(num_blocks, -1)
-    kv_cache_value = kv_cache[:, : block_size * head_dim]
+    fp8_dtype = current_platform.fp8_dtype()
+    kv_cache_value = kv_cache[:, : block_size * head_dim].view(fp8_dtype)
     kv_cache_scale = kv_cache[:, block_size * head_dim :].view(torch.float32)
     head_tile_size = head_tile_size // kv_cache.element_size()
     grid = (num_tokens,)
@@ -111,7 +116,7 @@ def indexer_k_quant_and_cache_triton(
         block_size,
         num_tokens,
         head_dim,
-        "NHD",
+        "SHUFFLE",
         block_tile_size,
         head_tile_size,
         IS_FNUZ=current_platform.fp8_dtype() == torch.float8_e4m3fnuz,
@@ -212,7 +217,7 @@ def cp_gather_indexer_k_quant_cache_triton(
         block_table_stride,
         k_cache_value.stride(0),
         k_cache_scale.stride(0),
-        "NHD",
+        "SHUFFLE",
         head_dim,
         block_tile_size,
         head_tile_size,
@@ -232,6 +237,43 @@ def fp8_paged_mqa_logits_torch(
 
     fp8_dtype = current_platform.fp8_dtype()
     batch_size, next_n, _, dim = q.size()
+    if next_n == 1:
+        block_size = kv_cache.shape[1]
+        logits = torch.full(
+            [batch_size, max_model_len],
+            float("-inf"),
+            device=q.device,
+            dtype=torch.float32,
+        )
+        if context_lens.dim() > 1:
+            context_lens = context_lens.squeeze(-1)
+        kv_cache_flat = kv_cache.view(-1, block_size * (dim + 4))
+        for i in range(batch_size):
+            q_i = q[i, 0].to(torch.float32)
+            q_scale = weights[i]
+            seq_len = int(context_lens[i].item())
+            assert seq_len <= max_model_len
+            num_pages = cdiv(seq_len, block_size)
+            padded_seq_len = num_pages * block_size
+            pages = block_tables[i, :num_pages]
+            cache = kv_cache_flat[pages]
+            scale_offset = block_size * dim
+            cache_value = (
+                cache[..., :scale_offset].view(dtype=fp8_dtype).to(torch.float32)
+            )
+            cache_scale = (
+                cache[..., scale_offset:].view(dtype=torch.float32).contiguous()
+            )
+            cache_value = cache_value.view(padded_seq_len, dim)
+            cache_scale = cache_scale.view(padded_seq_len)
+            score = F.linear(cache_value, q_i)
+            score = F.relu(score)
+            score *= q_scale[None, :]
+            score = score.sum(dim=1)
+            score *= cache_scale
+            logits[i, :seq_len] = score[:seq_len]
+        return logits
+
     kv_cache, scale = kv_cache[..., :dim], kv_cache[..., dim:]
     scale = scale.contiguous().view(torch.float)
     q = q.float()
@@ -243,20 +285,30 @@ def fp8_paged_mqa_logits_torch(
         device=q.device,
         dtype=torch.float32,
     )
-    context_lens = context_lens.tolist()
     for i in range(batch_size):
         context_len = context_lens[i]
-        q_offsets = torch.arange(context_len - next_n, context_len, device="cuda")
+        if context_len.ndim == 0:
+            context_len_i = int(context_len.item())
+            q_offsets = torch.arange(
+                context_len_i - next_n, context_len_i, device=q.device
+            )
+            context_limit = torch.full(
+                (next_n,), context_len_i, dtype=torch.int32, device=q.device
+            )
+        else:
+            context_limit = context_len.to(device=q.device, dtype=torch.int32)
+            q_offsets = context_limit - 1
         weight_slice = (
             weights[i * next_n : (i + 1) * next_n, :].transpose(0, 1).contiguous()
         )
-        for block_rk in range(cdiv(context_len, block_size)):
+        max_context_len = int(context_limit.max().item())
+        for block_rk in range(cdiv(max_context_len, block_size)):
             block_idx = block_tables[i][block_rk]
             qx, kx = q[i], kv_cache[block_idx]
             k_offsets = torch.arange(
-                block_rk * block_size, (block_rk + 1) * block_size, device="cuda"
+                block_rk * block_size, (block_rk + 1) * block_size, device=q.device
             )
-            mask = (k_offsets[None, :] < context_len) & (
+            mask = (k_offsets[None, :] < context_limit[:, None]) & (
                 k_offsets[None, :] <= q_offsets[:, None]
             )
             s = torch.where(
@@ -325,10 +377,39 @@ def rocm_fp8_paged_mqa_logits(
     from vllm._aiter_ops import rocm_aiter_ops
 
     aiter_paged_mqa_logits_module = None
+    # if rocm_aiter_ops.is_enabled():
+    batch_size, next_n, heads, head_dim = q_fp8.shape
+    num_blocks, block_size, _, _ = kv_cache_fp8.shape
+
     if rocm_aiter_ops.is_enabled():
         aiter_paged_mqa_logits_module = paged_mqa_logits_module()
 
     if aiter_paged_mqa_logits_module is not None:
+        if _ON_GFX942:
+            deepgemm_fp8_paged_mqa_logits = (
+                aiter_paged_mqa_logits_module.deepgemm_fp8_paged_mqa_logits
+            )
+            batch_size, next_n, heads, _ = q_fp8.shape
+            out_logits = torch.full(
+                [batch_size * next_n, max_model_len],
+                float("-inf"),
+                device="cuda",
+                dtype=torch.float32,
+            )
+            deepgemm_fp8_paged_mqa_logits(
+                q_fp8,
+                kv_cache_fp8,
+                weights,
+                out_logits,
+                context_lens,
+                block_tables,
+                max_model_len,
+                ChunkK=256,
+                Preshuffle=block_size == 64,
+                KVBlockSize=block_size,
+                WavePerEU=2,
+            )
+            return out_logits
         deepgemm_fp8_paged_mqa_logits_stage1 = (
             aiter_paged_mqa_logits_module.deepgemm_fp8_paged_mqa_logits_stage1
         )
@@ -339,8 +420,6 @@ def rocm_fp8_paged_mqa_logits(
             device="cuda",
             dtype=torch.float32,
         )
-        # TODO: 1. Replace _stage1 and out_qk.sum with another fused variant;
-        #       2. Remove ChunkQ when AITER PR #2891 merged
         deepgemm_fp8_paged_mqa_logits_stage1(
             q_fp8,
             kv_cache_fp8,
@@ -461,6 +540,27 @@ def rocm_fp8_mqa_logits(
         return fp8_mqa_logits_torch(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke)
 
 
+def _topk_indices_torch(logits: torch.Tensor, topk_tokens: int) -> torch.Tensor:
+    k = min(topk_tokens, logits.shape[-1])
+    values, indices = torch.topk(logits, k=k, dim=-1)
+    indices = indices.to(torch.int32)
+    indices = torch.where(
+        values == float("-inf"),
+        torch.full_like(indices, -1, dtype=torch.int32),
+        indices,
+    )
+    if k == topk_tokens:
+        return indices
+    padded = torch.full(
+        (logits.shape[0], topk_tokens),
+        -1,
+        dtype=torch.int32,
+        device=logits.device,
+    )
+    padded[:, :k] = indices
+    return padded
+
+
 def rocm_aiter_sparse_attn_indexer_fake(
     hidden_states: torch.Tensor,
     k_cache_prefix: LayerNameType,
@@ -479,8 +579,9 @@ def rocm_aiter_sparse_attn_indexer_fake(
     # profile run
     # NOTE(Chen): create the max possible flattened_kv. So that
     # profile_run can get correct memory usage.
+    device = hidden_states.device if k is None else k.device
     _flattened_kv = torch.empty(
-        [total_seq_lens, head_dim + 4], device=k.device, dtype=torch.uint8
+        [total_seq_lens, head_dim + 4], device=device, dtype=torch.uint8
     )
     fp8_dtype = current_platform.fp8_dtype()
     _k_fp8 = _flattened_kv[..., :head_dim].view(fp8_dtype).contiguous()
@@ -488,7 +589,7 @@ def rocm_aiter_sparse_attn_indexer_fake(
     return topk_indices_buffer
 
 
-def rocm_aiter_sparse_attn_indexer(
+def rocm_aiter_sparse_attn_indexer_native(
     hidden_states: torch.Tensor,
     k_cache_prefix: LayerNameType,
     kv_cache: torch.Tensor,
@@ -502,10 +603,12 @@ def rocm_aiter_sparse_attn_indexer(
     max_model_len: int,
     total_seq_lens: int,
     topk_indices_buffer: torch.Tensor | None,
+    skip_k_cache_insert: bool = False,
 ) -> torch.Tensor:
     # careful! this will be None in dummy run
     attn_metadata = get_forward_context().attn_metadata
     fp8_dtype = current_platform.fp8_dtype()
+    from vllm import _custom_ops as ops
     from vllm.utils.torch_utils import _resolve_layer_name
 
     k_cache_prefix = _resolve_layer_name(k_cache_prefix)
@@ -534,19 +637,33 @@ def rocm_aiter_sparse_attn_indexer(
     has_decode = layer_attn_metadata.num_decodes > 0
     has_prefill = layer_attn_metadata.num_prefills > 0
     num_decode_tokens = layer_attn_metadata.num_decode_tokens
+    device = hidden_states.device if k is None else k.device
 
     # during speculative decoding, k may be padded to the CUDA graph batch
     # size while slot_mapping only covers actual tokens.
     num_tokens = slot_mapping.shape[0]
-    k = k[:num_tokens]
+    if k is not None:
+        k = k[:num_tokens]
+    elif not skip_k_cache_insert:
+        raise ValueError("k must be provided when skip_k_cache_insert is False")
 
-    ops.indexer_k_quant_and_cache(
-        k,
-        kv_cache,
-        slot_mapping,
-        quant_block_size,
-        scale_fmt,
-    )
+    if not skip_k_cache_insert:
+        if _ON_GFX942:
+            ops.indexer_k_quant_and_cache(
+                k,
+                kv_cache,
+                slot_mapping,
+                quant_block_size,
+                scale_fmt,
+            )
+        else:
+            indexer_k_quant_and_cache_triton(
+                k,
+                kv_cache,
+                slot_mapping,
+                quant_block_size,
+                scale_fmt,
+            )
 
     topk_indices_buffer[: hidden_states.shape[0]] = -1
     if has_prefill:
@@ -555,22 +672,31 @@ def rocm_aiter_sparse_attn_indexer(
         for chunk in prefill_metadata.chunks:
             k_fp8 = torch.empty(
                 [chunk.total_seq_lens, head_dim],
-                device=k.device,
+                device=device,
                 dtype=fp8_dtype,
             )
             k_scale = torch.empty(
                 [chunk.total_seq_lens, 4],
-                device=k.device,
+                device=device,
                 dtype=torch.uint8,
             )
-
-            ops.cp_gather_indexer_k_quant_cache(
-                kv_cache,
-                k_fp8,
-                k_scale,
-                chunk.block_table,
-                chunk.cu_seq_lens,
-            )
+            if _ON_GFX942:
+                ops.cp_gather_indexer_k_quant_cache(
+                    kv_cache,
+                    k_fp8,
+                    k_scale,
+                    chunk.block_table,
+                    chunk.cu_seq_lens,
+                )
+            else:
+                cp_gather_indexer_k_quant_cache_triton(
+                    kv_cache,
+                    k_fp8,
+                    k_scale,
+                    chunk.block_table,
+                    chunk.cu_seq_lens,
+                    token_to_seq=chunk.token_to_seq,
+                )
 
             logits = rocm_fp8_mqa_logits(
                 q_fp8[chunk.token_start : chunk.token_end],
@@ -579,21 +705,10 @@ def rocm_aiter_sparse_attn_indexer(
                 chunk.cu_seqlen_ks,
                 chunk.cu_seqlen_ke,
             )
-            num_rows = logits.shape[0]
-            assert topk_tokens == 2048, "top_k_per_row assumes size 2048"
             topk_indices = topk_indices_buffer[
                 chunk.token_start : chunk.token_end, :topk_tokens
             ]
-            torch.ops._C.top_k_per_row_prefill(
-                logits,
-                chunk.cu_seqlen_ks,
-                chunk.cu_seqlen_ke,
-                topk_indices,
-                num_rows,
-                logits.stride(0),
-                logits.stride(1),
-                topk_tokens,
-            )
+            topk_indices.copy_(_topk_indices_torch(logits, topk_tokens))
 
     if has_decode:
         decode_metadata = layer_attn_metadata.decode
@@ -630,19 +745,8 @@ def rocm_aiter_sparse_attn_indexer(
             max_model_len=max_model_len,
         )
 
-        num_rows = logits.shape[0]
-        assert topk_tokens == 2048, "top_k_per_row assumes size 2048"
         topk_indices = topk_indices_buffer[:num_decode_tokens, :topk_tokens]
-        torch.ops._C.top_k_per_row_decode(
-            logits,
-            next_n,
-            decode_metadata.seq_lens,
-            topk_indices,
-            num_rows,
-            logits.stride(0),
-            logits.stride(1),
-            topk_tokens,
-        )
+        topk_indices.copy_(_topk_indices_torch(logits, topk_tokens)[:num_decode_tokens])
 
         if decode_metadata.requires_padding:
             # if padded, we need to unpack
@@ -656,3 +760,370 @@ def rocm_aiter_sparse_attn_indexer(
             )
 
     return topk_indices_buffer
+
+
+def rocm_aiter_sparse_attn_indexer(
+    hidden_states: torch.Tensor,
+    k_cache_prefix: LayerNameType,
+    kv_cache: torch.Tensor,
+    q_fp8: torch.Tensor,
+    k: torch.Tensor,
+    weights: torch.Tensor,
+    quant_block_size: int,
+    scale_fmt: str | None,
+    topk_tokens: int,
+    head_dim: int,
+    max_model_len: int,
+    total_seq_lens: int,
+    topk_indices_buffer: torch.Tensor | None,
+) -> torch.Tensor:
+    return rocm_aiter_sparse_attn_indexer_native(
+        hidden_states,
+        k_cache_prefix,
+        kv_cache,
+        q_fp8,
+        k,
+        weights,
+        quant_block_size,
+        scale_fmt,
+        topk_tokens,
+        head_dim,
+        max_model_len,
+        total_seq_lens,
+        topk_indices_buffer,
+        skip_k_cache_insert=False,
+    )
+
+
+def _decode_e8m0_scales(scale: torch.Tensor) -> torch.Tensor:
+    if scale.dtype == torch.float8_e8m0fnu:
+        from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+            _upcast_e8m0_to_fp32,
+        )
+
+        return _upcast_e8m0_to_fp32(scale).contiguous()
+    return scale.to(torch.float32)
+
+
+def _expand_2d_block_scales(
+    scale: torch.Tensor,
+    rows: int,
+    cols: int,
+) -> torch.Tensor:
+    scale = _decode_e8m0_scales(scale)
+    row_blocks, col_blocks = scale.shape[-2:]
+    row_block = math.ceil(rows / row_blocks)
+    col_block = math.ceil(cols / col_blocks)
+    scale = torch.repeat_interleave(scale, row_block, dim=-2)[..., :rows, :]
+    scale = torch.repeat_interleave(scale, col_block, dim=-1)[..., :, :cols]
+    return scale
+
+
+def _apply_gptj_inv_rope_ref(
+    x: torch.Tensor,
+    positions: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    rope_dim: int,
+) -> torch.Tensor:
+    if rope_dim == 0 or x.numel() == 0:
+        return x
+    half_rot = rope_dim // 2
+    nope_dim = x.shape[-1] - rope_dim
+    dtype = x.dtype
+    x = x.to(torch.float32)
+    cache = cos_sin_cache.index_select(0, positions.to(torch.long))
+    cos = cache[:, :half_rot].to(torch.float32)
+    sin = cache[:, half_rot : 2 * half_rot].to(torch.float32)
+    view_shape = (positions.shape[0],) + (1,) * (x.dim() - 2) + (half_rot,)
+    cos = cos.view(view_shape)
+    sin = sin.view(view_shape)
+    rope = x[..., nope_dim:]
+    y_even = rope[..., 0::2]
+    y_odd = rope[..., 1::2]
+    rope_out = torch.stack(
+        (y_even * cos + y_odd * sin, y_odd * cos - y_even * sin),
+        dim=-1,
+    ).flatten(-2)
+    x = x.clone()
+    x[..., nope_dim:] = rope_out
+    return x.to(dtype)
+
+
+def _apply_inv_rope_ref(
+    rotary_emb: torch.nn.Module,
+    x: torch.Tensor,
+    positions: torch.Tensor,
+    rope_dim: int,
+) -> torch.Tensor:
+    if hasattr(rotary_emb, "forward_native"):
+        try:
+            query, _ = rotary_emb.forward_native(
+                positions,
+                x.clone(),
+                None,
+                inverse=True,
+            )
+            return query
+        except TypeError:
+            pass
+    return _apply_gptj_inv_rope_ref(x, positions, rotary_emb.cos_sin_cache, rope_dim)
+
+
+def rocm_inv_rope_einsum(
+    rotary_emb: torch.nn.Module,
+    o: torch.Tensor,
+    positions: torch.Tensor,
+    rope_head_dim: int,
+    n_local_groups: int,
+    o_lora_rank: int,
+    wo_a: torch.nn.Module,
+) -> torch.Tensor:
+    """Reference inverse-RoPE + WO_A einsum path used on ROCm."""
+    o_ref = _apply_inv_rope_ref(rotary_emb, o, positions, rope_head_dim).to(
+        torch.bfloat16
+    )
+    o_ref = o_ref.view(o.shape[0], n_local_groups, -1)
+
+    hidden_dim = o_ref.shape[-1]
+    if hasattr(wo_a, "weight_scale_inv"):
+        wo_a_weight = wo_a.weight.view(n_local_groups, o_lora_rank, hidden_dim).to(
+            torch.float32
+        )
+        wo_a_scale = _expand_2d_block_scales(
+            wo_a.weight_scale_inv.view(
+                n_local_groups, -1, wo_a.weight_scale_inv.shape[-1]
+            ),
+            o_lora_rank,
+            hidden_dim,
+        )
+        wo_a_weight = (wo_a_weight * wo_a_scale).to(torch.bfloat16)
+    else:
+        wo_a_weight = wo_a.weight.view(n_local_groups, o_lora_rank, hidden_dim).to(
+            torch.bfloat16
+        )
+
+    return torch.einsum("tgd,grd->tgr", o_ref, wo_a_weight)
+
+
+def rocm_ref_sparse_attn_prefill(
+    q: torch.Tensor,
+    kv: torch.Tensor,
+    indices: torch.Tensor,
+    topk_length: torch.Tensor | None,
+    scale: float,
+    head_dim: int,
+    attn_sink: torch.Tensor | None,
+) -> torch.Tensor:
+    indices = indices.clone().squeeze(1)
+    s_q, h_q, d_qk = q.shape
+    topk = indices.shape[-1]
+    s_kv = kv.shape[0]
+    if topk_length is not None:
+        mask = torch.arange(topk, device=indices.device).unsqueeze(
+            0
+        ) >= topk_length.unsqueeze(1)
+        indices[mask] = -1
+    invalid_mask = (indices < 0) | (indices >= s_kv)
+    indices[invalid_mask] = 0
+
+    qf = q.float()
+    gathered_kv = kv.index_select(0, indices.flatten()).reshape(s_q, topk, d_qk).float()
+    scores = qf @ gathered_kv.transpose(1, 2)
+    scores *= scale
+    scores[invalid_mask.unsqueeze(1).expand_as(scores)] = float("-inf")
+
+    orig_lse = torch.logsumexp(scores, dim=-1)
+    lse_for_o = orig_lse
+    if attn_sink is not None:
+        lse_for_o = torch.logsumexp(
+            torch.stack(
+                [orig_lse, attn_sink[:h_q].view(1, h_q).expand_as(orig_lse)],
+                dim=0,
+            ),
+            dim=0,
+        )
+    lse_for_o = lse_for_o.clone()
+    lse_for_o[lse_for_o == float("-inf")] = float("+inf")
+    probs = torch.exp(scores - lse_for_o.unsqueeze(-1))
+    out = probs @ gathered_kv[..., :head_dim]
+    lonely_q_mask = orig_lse == float("-inf")
+    out[lonely_q_mask.unsqueeze(-1).expand_as(out)] = 0.0
+    return out.to(torch.bfloat16)
+
+
+def rocm_sparse_attn_prefill(
+    q: torch.Tensor,
+    kv: torch.Tensor,
+    indices: torch.Tensor,
+    topk_length: torch.Tensor | None,
+    scale: float,
+    head_dim: int,
+    attn_sink: torch.Tensor | None,
+    output: torch.Tensor,
+) -> None:
+    output_chunk = rocm_ref_sparse_attn_prefill(
+        q=q,
+        kv=kv,
+        indices=indices,
+        topk_length=topk_length,
+        scale=scale,
+        head_dim=head_dim,
+        attn_sink=attn_sink,
+    )
+    output.copy_(output_chunk.to(output.dtype))
+
+
+def rocm_dequantize_blocked_k_cache(
+    quant_k_cache: torch.Tensor,
+    head_dim: int,
+    nope_head_dim: int,
+    rope_head_dim: int,
+) -> torch.Tensor:
+    fp8_dtype = current_platform.fp8_dtype()
+    tile_size = 64
+    num_tiles = nope_head_dim // tile_size
+
+    num_blocks, block_size, _ = quant_k_cache.shape
+    quant_k_cache = quant_k_cache.view(num_blocks, -1)
+    input_nope_rope = quant_k_cache[
+        :, : block_size * (nope_head_dim + 2 * rope_head_dim)
+    ].view(num_blocks, block_size, nope_head_dim + 2 * rope_head_dim)
+    input_nope = input_nope_rope[:, :, :nope_head_dim].view(fp8_dtype)
+    input_rope = input_nope_rope[:, :, nope_head_dim:].view(torch.bfloat16)
+    input_scale = (
+        quant_k_cache[:, block_size * (nope_head_dim + 2 * rope_head_dim) :]
+        .view(num_blocks, block_size, 8)[:, :, :num_tiles]
+        .view(torch.float8_e8m0fnu)
+    )
+
+    result = torch.empty(
+        (num_blocks, block_size, 1, head_dim),
+        dtype=torch.bfloat16,
+        device=quant_k_cache.device,
+    )
+    result[..., nope_head_dim:] = input_rope.unsqueeze(2)
+    for tile_idx in range(num_tiles):
+        cur_nope = input_nope[
+            ..., tile_idx * tile_size : (tile_idx + 1) * tile_size
+        ].to(torch.bfloat16)
+        cur_scales = input_scale[:, :, tile_idx].to(torch.bfloat16).unsqueeze(-1)
+        result[..., tile_idx * tile_size : (tile_idx + 1) * tile_size] = (
+            cur_nope * cur_scales
+        ).unsqueeze(2)
+    return result
+
+
+def rocm_ref_sparse_attn_decode(
+    q: torch.Tensor,
+    blocked_k: torch.Tensor,
+    indices_in_kvcache: torch.Tensor,
+    topk_length: torch.Tensor | None,
+    scale: float,
+    head_dim: int,
+    attn_sink: torch.Tensor | None,
+    extra_blocked_k: torch.Tensor | None = None,
+    extra_indices_in_kvcache: torch.Tensor | None = None,
+    extra_topk_length: torch.Tensor | None = None,
+) -> torch.Tensor:
+    b, s_q, h_q, d_qk = q.shape
+
+    def process_scope(
+        cur_blocked_k: torch.Tensor,
+        cur_indices: torch.Tensor,
+        cur_topk_length: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        cur_indices = cur_indices.reshape(b, s_q, -1)
+        topk = cur_indices.size(-1)
+        fixed_indices = torch.clamp_min(cur_indices, 0)
+        gathered_kv = (
+            cur_blocked_k.view(-1, d_qk)
+            .index_select(0, fixed_indices.view(-1))
+            .view(b, s_q, topk, d_qk)
+        )
+        invalid_mask = cur_indices == -1
+        if cur_topk_length is not None:
+            cur_topk_length = cur_topk_length.reshape(b)
+            invalid_mask |= torch.arange(0, topk, device=invalid_mask.device).view(
+                1, 1, topk
+            ) >= cur_topk_length.view(b, 1, 1)
+        return gathered_kv, invalid_mask
+
+    gathered_kv, invalid_mask = process_scope(
+        blocked_k, indices_in_kvcache, topk_length
+    )
+    if extra_blocked_k is not None:
+        assert extra_indices_in_kvcache is not None
+        gathered_kv1, invalid_mask1 = process_scope(
+            extra_blocked_k, extra_indices_in_kvcache, extra_topk_length
+        )
+        gathered_kv = torch.cat([gathered_kv, gathered_kv1], dim=2)
+        invalid_mask = torch.cat([invalid_mask, invalid_mask1], dim=2)
+
+    gathered_kv = gathered_kv.view(b * s_q, -1, d_qk).float()
+    gathered_kv[gathered_kv != gathered_kv] = 0.0
+    qf = q.float().view(b * s_q, h_q, d_qk)
+    attn_weight = qf @ gathered_kv.transpose(-1, -2)
+    attn_weight *= scale
+    attn_weight[
+        invalid_mask.view(b * s_q, 1, -1).expand(b * s_q, h_q, invalid_mask.size(-1))
+    ] = float("-inf")
+    lse = attn_weight.logsumexp(dim=-1)
+    attn_weight = torch.exp(attn_weight - lse.unsqueeze(-1))
+    output = attn_weight @ gathered_kv[..., :head_dim]
+    output = output.view(b, s_q, h_q, head_dim)
+    lse = lse.view(b, s_q, h_q)
+
+    if attn_sink is not None:
+        output *= (1.0 / (1.0 + torch.exp(attn_sink.view(1, 1, h_q) - lse))).unsqueeze(
+            -1
+        )
+
+    lonely_q_mask = lse == float("-inf")
+    output[lonely_q_mask.unsqueeze(-1).expand_as(output)] = 0.0
+    return output.squeeze(1).to(torch.bfloat16)
+
+
+def rocm_forward_decode_fallback(
+    q: torch.Tensor,
+    kv_cache: torch.Tensor | None,
+    swa_k_cache: torch.Tensor,
+    swa_only: bool,
+    topk_indices: torch.Tensor | None,
+    topk_lens: torch.Tensor | None,
+    swa_indices: torch.Tensor,
+    swa_lens: torch.Tensor,
+    attn_sink: torch.Tensor | None,
+    scale: float,
+    head_dim: int,
+    nope_head_dim: int,
+    rope_head_dim: int,
+    output: torch.Tensor,
+) -> None:
+    blocked_swa = rocm_dequantize_blocked_k_cache(
+        swa_k_cache,
+        head_dim=head_dim,
+        nope_head_dim=nope_head_dim,
+        rope_head_dim=rope_head_dim,
+    )
+    blocked_extra = None
+    if not swa_only:
+        assert kv_cache is not None
+        blocked_extra = rocm_dequantize_blocked_k_cache(
+            kv_cache,
+            head_dim=head_dim,
+            nope_head_dim=nope_head_dim,
+            rope_head_dim=rope_head_dim,
+        )
+    attn_out = rocm_ref_sparse_attn_decode(
+        q=q.unsqueeze(1),
+        blocked_k=blocked_swa,
+        indices_in_kvcache=swa_indices.unsqueeze(1),
+        topk_length=swa_lens,
+        scale=scale,
+        head_dim=head_dim,
+        attn_sink=attn_sink[: q.shape[1]] if attn_sink is not None else None,
+        extra_blocked_k=blocked_extra,
+        extra_indices_in_kvcache=topk_indices,
+        extra_topk_length=topk_lens,
+    )
+    output.copy_(attn_out.to(output.dtype))
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index d53666f0d460..65993e804153 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -85,6 +85,7 @@ def get_num_blocks_to_allocate(
         num_encoder_tokens: int,
         total_computed_tokens: int,
         num_tokens_main_model: int,
+        apply_admission_cap: bool = False,
     ) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
@@ -101,6 +102,10 @@ def get_num_blocks_to_allocate(
             num_tokens_main_model: The number of tokens for the main model (aka target
                 model in spec decode). w/o spec decode, it is num_tokens;
                 with spec decode, it is num_tokens - num_lookahead_tokens.
+            apply_admission_cap: If True, apply the recycling-aware
+                per-request admission cap (SWA / chunked-local). Set only by
+                the full-sequence admission gate; per-step allocation must
+                leave it False so the predictor matches `allocate_new_blocks`.
 
         Returns:
             The number of blocks to allocate.
@@ -111,7 +116,12 @@ def get_num_blocks_to_allocate(
                 # For cross-attention, we issue a single static allocation
                 # of blocks based on the number of encoder input tokens.
                 num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
-                    request_id, num_encoder_tokens, [], 0, num_encoder_tokens
+                    request_id,
+                    num_encoder_tokens,
+                    [],
+                    0,
+                    num_encoder_tokens,
+                    apply_admission_cap=apply_admission_cap,
                 )
             else:
                 num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
@@ -120,6 +130,7 @@ def get_num_blocks_to_allocate(
                     new_computed_blocks[i],
                     total_computed_tokens,
                     num_tokens_main_model,
+                    apply_admission_cap=apply_admission_cap,
                 )
         return num_blocks_to_allocate
 
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 83aa26bd96f0..431776870cf4 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -222,45 +222,6 @@ def get_computed_blocks(self, request: Request) -> tuple[KVCacheBlocks, int]:
 
         return self.create_kv_cache_blocks(computed_blocks), num_new_computed_tokens
 
-    def can_fit_full_sequence(
-        self,
-        request: Request,
-        num_new_computed_tokens: int = 0,
-        new_computed_blocks: KVCacheBlocks | None = None,
-        num_external_computed_tokens: int = 0,
-        num_encoder_tokens: int = 0,
-    ) -> bool:
-        """Check if the KV cache has enough free blocks to hold the full
-        sequence, accounting for prefix cache hits and sliding window.
-
-        This is used as an admission gate to prevent over-admitting requests
-        when chunked prefill would otherwise only check the first chunk.
-        """
-        if new_computed_blocks is not None:
-            new_computed_block_list = new_computed_blocks.blocks
-        else:
-            new_computed_block_list = self.empty_kv_cache_blocks.blocks
-
-        num_local_computed_tokens = (
-            request.num_computed_tokens + num_new_computed_tokens
-        )
-        total_computed_tokens = min(
-            num_local_computed_tokens + num_external_computed_tokens,
-            self.max_model_len,
-        )
-        full_num_tokens = min(request.num_tokens, self.max_model_len)
-
-        num_blocks_to_allocate = self.coordinator.get_num_blocks_to_allocate(
-            request_id=request.request_id,
-            num_tokens=full_num_tokens,
-            new_computed_blocks=new_computed_block_list,
-            num_encoder_tokens=num_encoder_tokens,
-            total_computed_tokens=total_computed_tokens,
-            num_tokens_main_model=full_num_tokens,
-        )
-
-        return num_blocks_to_allocate <= self.block_pool.get_num_free_blocks()
-
     def allocate_slots(
         self,
         request: Request,
@@ -271,6 +232,7 @@ def allocate_slots(
         num_external_computed_tokens: int = 0,
         delay_cache_blocks: bool = False,
         num_encoder_tokens: int = 0,
+        full_sequence_must_fit: bool = False,
     ) -> KVCacheBlocks | None:
         """Add slots for a request with new tokens to append.
 
@@ -292,6 +254,10 @@ def allocate_slots(
             num_encoder_tokens: The number of encoder tokens to allocate for
                 cross-attention in encoder-decoder models(e.g., Whisper).
                 For decoder-only models, this should be 0.
+            full_sequence_must_fit: Only allocate blocks if the KV cache has enough
+                free blocks to hold the full sequence, accounting for prefix cache hits
+                and sliding window. Used as an admission gate to prevent over-admitting
+                requests when chunked prefill would otherwise only check the first chunk
 
         Blocks layout:
         ```
@@ -365,10 +331,26 @@ def allocate_slots(
             num_local_computed_tokens + num_external_computed_tokens,
             self.max_model_len,
         )
+
+        if full_sequence_must_fit:
+            # First check and fail if the full request sequence won't fit.
+            full_num_tokens = min(request.num_tokens, self.max_model_len)
+
+            num_blocks_to_allocate = self.coordinator.get_num_blocks_to_allocate(
+                request_id=request.request_id,
+                num_tokens=full_num_tokens,
+                new_computed_blocks=new_computed_block_list,
+                num_encoder_tokens=num_encoder_tokens,
+                total_computed_tokens=total_computed_tokens,
+                num_tokens_main_model=full_num_tokens,
+                apply_admission_cap=True,
+            )
+            if num_blocks_to_allocate > self.block_pool.get_num_free_blocks():
+                return None
+
         num_tokens_main_model = total_computed_tokens + num_new_tokens
         num_tokens_need_slot = min(
-            num_tokens_main_model + num_lookahead_tokens,
-            self.max_model_len,
+            num_tokens_main_model + num_lookahead_tokens, self.max_model_len
         )
 
         # Free the blocks that are skipped during the attention computation
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 3e0e7fcb8c5b..b57e10b67faa 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -890,31 +890,48 @@ def get_max_concurrency_for_kv_cache_config(
     return max_concurrency
 
 
-def may_override_num_blocks(
-    vllm_config: VllmConfig, num_blocks: int, suppress_log: bool = False
-) -> int:
+def may_override_num_blocks(vllm_config: VllmConfig, num_blocks: int) -> int:
     """
     Override the number of kv cache blocks if `num_gpu_blocks_override` is set.
+    The override is logged once, at the call site in `get_kv_cache_configs`.
     """
     if vllm_config.cache_config.num_gpu_blocks_override is not None:
-        num_gpu_blocks_override = vllm_config.cache_config.num_gpu_blocks_override
-        if not suppress_log:
-            logger.info(
-                "Overriding num_gpu_blocks=%d with num_gpu_blocks_override=%d",
-                num_blocks,
-                num_gpu_blocks_override,
-            )
-        num_blocks = num_gpu_blocks_override
-
+        num_blocks = vllm_config.cache_config.num_gpu_blocks_override
     return num_blocks
 
 
+def _pool_bytes_per_block(kv_cache_groups: list[KVCacheGroupSpec]) -> int:
+    """
+    Bytes consumed by one block in the worker's shared KV cache pool, mirroring
+    the divisor used by `get_kv_cache_config_from_groups` to convert
+    `available_memory` into `num_blocks`. Used to compute the effective KV cache
+    capacity once `num_gpu_blocks_override` is applied.
+    """
+    if len(kv_cache_groups) == 1 and isinstance(
+        kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs
+    ):
+        return kv_cache_groups[0].kv_cache_spec.page_size_bytes
+    if all(
+        isinstance(g.kv_cache_spec, UniformTypeKVCacheSpecs) for g in kv_cache_groups
+    ):
+        # DeepseekV4: shared layout sized by the largest per-page-size bucket.
+        full_mla_spec = cast(UniformTypeKVCacheSpecs, kv_cache_groups[0].kv_cache_spec)
+        layer_tuple_page_bytes = sum(full_mla_spec.get_page_sizes())
+        num_layer_tuples = max(
+            cast(UniformTypeKVCacheSpecs, g.kv_cache_spec).get_num_layer_tuples()
+            for g in kv_cache_groups
+        )
+        return layer_tuple_page_bytes * num_layer_tuples
+    group_size = max(len(g.layer_names) for g in kv_cache_groups)
+    page_size = get_uniform_page_size([g.kv_cache_spec for g in kv_cache_groups])
+    return page_size * group_size
+
+
 def get_num_blocks(
     vllm_config: VllmConfig,
     num_layers: int,
     available_memory: int,
     page_size: int,
-    suppress_log: bool = False,
 ) -> int:
     """
     Get the number of kv cache blocks.
@@ -924,15 +941,10 @@ def get_num_blocks(
         num_layers: The number of layers
         available_memory: Memory available for KV cache in bytes.
         page_size: The page size of the KV cache.
-        suppress_log: Whether to suppress override log messages. Used when creating a
-            temporary/dummy KV cache config, e.g. during CG memory profiling
     """
     num_blocks = int(available_memory // page_size // num_layers)
     num_blocks = max(num_blocks, 0)
-    num_blocks = may_override_num_blocks(
-        vllm_config, num_blocks, suppress_log=suppress_log
-    )
-    return num_blocks
+    return may_override_num_blocks(vllm_config, num_blocks)
 
 
 def get_uniform_page_size(kv_cache_specs: Iterable[KVCacheSpec]) -> int:
@@ -1220,7 +1232,6 @@ def get_kv_cache_config_from_groups(
     vllm_config: VllmConfig,
     kv_cache_groups: list[KVCacheGroupSpec],
     available_memory: int,
-    suppress_log: bool = False,
 ) -> KVCacheConfig:
     """
     Generate the KV cache configuration from the KV cache groups and spec
@@ -1252,9 +1263,7 @@ def get_kv_cache_config_from_groups(
         num_blocks = (
             available_memory // kv_cache_groups[0].kv_cache_spec.page_size_bytes
         )
-        num_blocks = may_override_num_blocks(
-            vllm_config, num_blocks, suppress_log=suppress_log
-        )
+        num_blocks = may_override_num_blocks(vllm_config, num_blocks)
         per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs
         kv_cache_tensors = [
             KVCacheTensor(
@@ -1288,11 +1297,7 @@ def get_kv_cache_config_from_groups(
         )
         assert group_size > 0, "group_size must be greater than 0"
         num_blocks = get_num_blocks(
-            vllm_config,
-            group_size,
-            available_memory,
-            page_size,
-            suppress_log=suppress_log,
+            vllm_config, group_size, available_memory, page_size
         )
         kv_cache_tensors = []
         for i in range(group_size):
@@ -1688,36 +1693,24 @@ def _report_kv_cache_config(
         vllm_config: The global VllmConfig
         kv_cache_config: The resolved KV cache configuration
     """
-    min_block_size = min(
-        [group.kv_cache_spec.block_size for group in kv_cache_config.kv_cache_groups]
-    )
-
-    # Log the KV cache size and maximum concurrency.
-    num_tokens = (
-        kv_cache_config.num_blocks
-        // len(kv_cache_config.kv_cache_groups)
-        * min_block_size
-    )
-    dcp_size = vllm_config.parallel_config.decode_context_parallel_size
-    pcp_size = vllm_config.parallel_config.prefill_context_parallel_size
-    if pcp_size * dcp_size > 1:
-        num_tokens *= pcp_size * dcp_size
-        logger.info(
-            "Multiplying the GPU KV cache size by the cp_world_size %d "
-            "(pcp_world_size %d * dcp_world_size %d).",
-            pcp_size * dcp_size,
-            pcp_size,
-            dcp_size,
-        )
-    num_tokens_str = f"{num_tokens:,}"
-    logger.info_once("GPU KV cache size: %s tokens", num_tokens_str)
-    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
+    max_model_len = vllm_config.model_config.max_model_len
     max_concurrency = get_max_concurrency_for_kv_cache_config(
         vllm_config, kv_cache_config
     )
+
+    # GPU KV cache size in tokens = max_concurrency * max_model_len: the total
+    # tokens of context the pool can hold at peak utilization. Sourcing this
+    # from the concurrency calculation handles hybrid layouts correctly: SWA /
+    # chunked-local groups have a per-request block count that's capped by
+    # their window, so a naive `num_blocks // num_groups * block_size` formula
+    # underestimates capacity for these models. DCP/PCP sharding is already
+    # accounted for in each spec's `max_memory_usage_bytes`.
+    num_tokens = int(max_concurrency * max_model_len)
+
+    logger.info_once("GPU KV cache size: %s tokens", f"{num_tokens:,}")
     logger.info_once(
         "Maximum concurrency for %s tokens per request: %.2fx",
-        max_model_len_str,
+        f"{max_model_len:,}",
         max_concurrency,
     )
 
@@ -1988,6 +1981,28 @@ def get_kv_cache_configs(
         for worker_spec in kv_cache_specs
     ]
 
+    # If `num_gpu_blocks_override` is set, the cache size that will actually
+    # be allocated is decoupled from the profiled `available_memory`:
+    # `may_override_num_blocks` in `get_kv_cache_config_from_groups` clamps
+    # `num_blocks` to the override. Reflect that in `available_memory` here so
+    # auto-fit, the admission check, and the per-worker config builder all
+    # plan against the same effective capacity.
+    override = vllm_config.cache_config.num_gpu_blocks_override
+    if override is not None:
+        adjusted_memory: list[int] = []
+        for groups, avail_mem in zip(projected_groups_per_worker, available_memory):
+            if not groups:
+                adjusted_memory.append(avail_mem)
+                continue
+            bytes_per_block = _pool_bytes_per_block(groups)
+            logger.info(
+                "Overriding num_gpu_blocks=%d with num_gpu_blocks_override=%d",
+                avail_mem // bytes_per_block,
+                override,
+            )
+            adjusted_memory.append(override * bytes_per_block)
+        available_memory = adjusted_memory
+
     if vllm_config.model_config.original_max_model_len == -1:
         _auto_fit_max_model_len(
             vllm_config, projected_groups_per_worker, available_memory
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index bdb97decadfe..b2e9dd8b1719 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -38,6 +38,7 @@ class NewRequestData:
     num_computed_tokens: int
     lora_request: LoRARequest | None
     prompt_embeds: "torch.Tensor | None" = None
+    prompt_is_token_ids: list[bool] | None = None
 
     # Only used for v2 model runner.
     prefill_token_ids: list[int] | None = None
@@ -59,6 +60,7 @@ def from_request(
             num_computed_tokens=request.num_computed_tokens,
             lora_request=request.lora_request,
             prompt_embeds=request.prompt_embeds,
+            prompt_is_token_ids=request.prompt_is_token_ids,
             prefill_token_ids=prefill_token_ids,
         )
 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 395fa80bfe53..032767cdf3b0 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -741,20 +741,6 @@ def schedule(self) -> SchedulerOutput:
                         for i in encoder_inputs_to_schedule
                     )
 
-                if (
-                    self.scheduler_reserve_full_isl
-                    and not self.kv_cache_manager.can_fit_full_sequence(
-                        request,
-                        num_new_computed_tokens=num_new_local_computed_tokens,
-                        new_computed_blocks=new_computed_blocks,
-                        num_external_computed_tokens=num_external_computed_tokens,
-                        num_encoder_tokens=num_encoder_tokens,
-                    )
-                ):
-                    if request.has_encoder_inputs:
-                        self.encoder_cache_manager.free(request)
-                    break
-
                 new_blocks = self.kv_cache_manager.allocate_slots(
                     request,
                     num_new_tokens,
@@ -764,6 +750,7 @@ def schedule(self) -> SchedulerOutput:
                     num_external_computed_tokens=num_external_computed_tokens,
                     delay_cache_blocks=load_kv_async,
                     num_encoder_tokens=num_encoder_tokens,
+                    full_sequence_must_fit=self.scheduler_reserve_full_isl,
                 )
 
                 if new_blocks is None:
@@ -1448,7 +1435,7 @@ def update_from_output(
             # Extract sample logprobs if needed.
             if (
                 request.sampling_params is not None
-                and request.sampling_params.logprobs is not None
+                and request.sampling_params.num_logprobs is not None
                 and logprobs
             ):
                 new_logprobs = logprobs.slice_request(req_index, len(new_token_ids))
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 0aa08f35801f..e8d3a6f75688 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -92,6 +92,7 @@ def get_num_blocks_to_allocate(
         new_computed_blocks: Sequence[KVCacheBlock],
         total_computed_tokens: int,
         num_tokens_main_model: int,
+        apply_admission_cap: bool = False,
     ) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
@@ -107,13 +108,16 @@ def get_num_blocks_to_allocate(
             num_tokens_main_model: The number of tokens for the main model (aka target
                 model in spec decode). w/o spec decode, it is num_tokens;
                 with spec decode, it is num_tokens - num_lookahead_tokens.
+            apply_admission_cap: If True, clamp by `num_required_blocks` by
+                `_max_admission_blocks_per_request`for recycling-aware specs
+                (SWA, chunked-local).
 
         Returns:
             The number of blocks to allocate.
         """
 
         num_required_blocks = cdiv(num_tokens, self.block_size)
-        if self._max_admission_blocks_per_request is not None:
+        if apply_admission_cap and self._max_admission_blocks_per_request is not None:
             # Recycling-aware specs (SWA, chunked-local) cap the per-request
             # reservation here so admission matches the startup pool sizer
             # (`SlidingWindowSpec.max_admission_blocks_per_request` / its
@@ -893,6 +897,7 @@ def get_num_blocks_to_allocate(
         new_computed_blocks: Sequence[KVCacheBlock],
         total_computed_tokens: int,
         num_tokens_main_model: int,
+        apply_admission_cap: bool = False,
     ) -> int:
         assert isinstance(self.kv_cache_spec, MambaSpec)
         if (
@@ -917,6 +922,7 @@ def get_num_blocks_to_allocate(
                 new_computed_blocks,
                 total_computed_tokens,
                 num_tokens_main_model,
+                apply_admission_cap=apply_admission_cap,
             )
         else:
             # We don't allocate blocks for lookahead tokens in align mode, because if
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index d5c5dba63475..8172ead08319 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -94,6 +94,12 @@ class EngineCoreRequest(
     data_parallel_rank: int | None
     prompt_embeds: torch.Tensor | None = None
 
+    # Per-position mask for mixed-mode inputs (e.g chat completion with
+    # prompt_embeds content parts). `True` means the position is a real
+    # token ID; `False` means the position uses a pre-computed entry from
+    # `prompt_embeds`. `None` for pure-tokens and pure-embeds requests.
+    prompt_is_token_ids: list[bool] | None = None
+
     # Index of the client, used to ensure outputs are sent back to the same
     # client for this request when scaling out the front-end.
     client_index: int = 0
@@ -114,6 +120,7 @@ class EngineCoreRequest(
     external_req_id: str | None = None
 
     reasoning_ended: bool | None = None
+    reasoning_parser_kwargs: dict[str, Any] | None = None
 
     @property
     def params(self) -> SamplingParams | PoolingParams:
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 45ae416529ec..0e55a685bd94 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -293,6 +293,7 @@ async def add_request(
         data_parallel_rank: int | None = None,
         prompt_text: str | None = None,
         reasoning_ended: bool | None = None,
+        reasoning_parser_kwargs: dict[str, Any] | None = None,
     ) -> RequestOutputCollector:
         """Add new request to the AsyncLLM."""
 
@@ -313,7 +314,7 @@ async def add_request(
             )
 
         if isinstance(prompt, AsyncGenerator):
-            if reasoning_ended is not None:
+            if reasoning_ended is not None or reasoning_parser_kwargs is not None:
                 raise NotImplementedError
 
             # Streaming input case.
@@ -361,6 +362,8 @@ async def add_request(
 
         if reasoning_ended is not None:
             request.reasoning_ended = reasoning_ended
+        if reasoning_parser_kwargs is not None:
+            request.reasoning_parser_kwargs = reasoning_parser_kwargs
 
         self.input_processor.assign_request_id(request)
 
@@ -534,6 +537,7 @@ async def generate(
         priority: int = 0,
         data_parallel_rank: int | None = None,
         reasoning_ended: bool | None = None,
+        reasoning_parser_kwargs: dict[str, Any] | None = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """
         Main function called by the API server to kick off a request
@@ -563,6 +567,7 @@ async def generate(
                 data_parallel_rank=data_parallel_rank,
                 prompt_text=prompt_text,
                 reasoning_ended=reasoning_ended,
+                reasoning_parser_kwargs=reasoning_parser_kwargs,
             )
 
             # The output_handler task pushes items into the queue.
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 36864ba738bf..11c5ee19a664 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -1571,7 +1571,8 @@ def engine_idle_callback(engine: "EngineCoreProc", future: Future[Any]) -> None:
 
         pause_state = PauseState.PAUSED_ALL if mode == "keep" else PauseState.PAUSED_NEW
         self.scheduler.set_pause_state(pause_state)
-        if not self.has_work():
+
+        if self._pause_complete():
             if clear_cache:
                 self._reset_caches()
             return None
@@ -1580,6 +1581,13 @@ def engine_idle_callback(engine: "EngineCoreProc", future: Future[Any]) -> None:
         self._idle_state_callbacks.append(partial(engine_idle_callback, future=future))
         return future
 
+    def _pause_complete(self) -> bool:
+        """Returns True if the pause has fully completed and the caller can
+        return ``None`` synchronously; False if the pause is still pending
+        and the caller should register an idle-state callback to finish it.
+        """
+        return not self.has_work()
+
     def _send_finish_outputs_to_client(
         self, req_ids: list[str], client_index: int, finish_reason: FinishReason
     ) -> None:
@@ -1635,6 +1643,14 @@ def __init__(
         self.current_wave = 0
         self.last_counts = (0, 0)
 
+        # Two-phase pause protocol state. When pending_pause is True, the
+        # engine keeps stepping (dummy batches) while waiting for all DP
+        # ranks to also set pending_pause. Once all ranks agree via
+        # all-reduce, ignore_start_dp_wave is set so that stale
+        # START_DP_WAVE messages cannot re-wake the engines.
+        self.pending_pause = False
+        self.ignore_start_dp_wave = False
+
         from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState
 
         self.eep_scaling_state: ElasticEPScalingState | None = None
@@ -1664,6 +1680,7 @@ def _init_data_parallel(self, vllm_config: VllmConfig):
         assert 0 <= local_dp_rank <= dp_rank < dp_size
 
         self.dp_rank = dp_rank
+        self.dp_size = dp_size
         dp_group, dp_store = parallel_config.stateless_init_dp_group(return_store=True)
         self.dp_group, self.dp_store = dp_group, dp_store
 
@@ -1672,6 +1689,24 @@ def shutdown(self):
         if dp_group := getattr(self, "dp_group", None):
             stateless_destroy_torch_distributed_process_group(dp_group)
 
+    def _pause_complete(self) -> bool:
+        """Two-phase DP-aware pause.
+
+        Phase 1: Set local pause state and ``pending_pause`` flag. If the
+        engines are idle, kick-start them by setting ``engines_running`` to
+        True so ranks enter the stepping loop and reach the all-reduce
+        consensus checkpoint in ``_has_global_unfinished_reqs``.
+
+        Phase 2 (in ``_has_global_unfinished_reqs``): Once the all-reduce
+        confirms that **all** ranks have ``pending_pause`` set, collectively
+        stop stepping and set ``ignore_start_dp_wave`` so that stale
+        ``START_DP_WAVE`` messages cannot re-wake any engine.
+        """
+        self.pending_pause = True
+        self.engines_running = True
+
+        return False
+
     def add_request(self, request: Request, request_wave: int = 0):
         super().add_request(request, request_wave)
         if self.has_coordinator and request_wave != self.current_wave:
@@ -1681,36 +1716,60 @@ def add_request(self, request: Request, request_wave: int = 0):
                 not self.engines_running
                 and self.scheduler.pause_state == PauseState.UNPAUSED
             ):
-                self.engines_running = True
                 # Request received for an already-completed wave, notify
                 # front-end that we need to start the next one.
+                self.engines_running = True
                 self.output_queue.put_nowait(
                     (-1, EngineCoreOutputs(start_wave=self.current_wave))
                 )
 
     def resume_scheduler(self):
-        super().resume_scheduler()
-        if (
-            self.has_coordinator
-            and not self.engines_running
-            and self.scheduler.has_unfinished_requests()
-        ):
-            # Wake up other DP engines.
-            self.output_queue.put_nowait(
-                (-1, EngineCoreOutputs(start_wave=self.current_wave))
+        if self.pending_pause or (self.engines_running and self.ignore_start_dp_wave):
+            raise RuntimeError(
+                "resume_scheduler called while pause is still in "
+                "flight. Wait for the pause future to resolve before "
+                "resuming."
             )
+        if self.engines_running:
+            logger.debug("Resume called while engines are not paused, ignoring.")
+            return
+
+        super().resume_scheduler()
+        self.ignore_start_dp_wave = False
+
+        # Barrier: wait for all DP ranks to have resumed (and cleared
+        # ignore_start_dp_wave) before any rank starts stepping. Uses
+        # the existing all-reduce which is safe because engines are
+        # stopped.
+        has_global_unfinished = ParallelConfig.has_unfinished_dp(
+            self.dp_group, self.scheduler.has_unfinished_requests()
+        )
+
+        if has_global_unfinished:
+            self.engines_running = True
+
+    def barrier(self):
+        """Blocking barrier on the DP process group (test-only utility)."""
+        import torch.distributed as dist
+
+        dist.barrier(group=self.dp_group)
 
     def _handle_client_request(
         self, request_type: EngineCoreRequestType, request: Any
     ) -> None:
         if request_type == EngineCoreRequestType.START_DP_WAVE:
+            if self.ignore_start_dp_wave:
+                return
             new_wave, exclude_eng_index = request
             if exclude_eng_index != self.engine_index and (
                 new_wave >= self.current_wave
             ):
                 self.current_wave = new_wave
                 if not self.engines_running:
-                    logger.debug("EngineCore starting idle loop for wave %d.", new_wave)
+                    logger.debug(
+                        "EngineCore starting idle loop for wave %d.",
+                        new_wave,
+                    )
                     self.engines_running = True
         else:
             super()._handle_client_request(request_type, request)
@@ -1790,7 +1849,18 @@ def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
         if self.step_counter % 32 != 0:
             return True
 
-        return ParallelConfig.has_unfinished_dp(self.dp_group, local_unfinished)
+        has_unfinished, pause_consensus = ParallelConfig.sync_dp_state(
+            self.dp_group,
+            has_unfinished=local_unfinished,
+            pending_pause=self.pending_pause,
+        )
+
+        if pause_consensus:
+            self.ignore_start_dp_wave = True
+            self.pending_pause = False
+            logger.debug("DP pause consensus reached, ignoring START_DP_WAVE.")
+
+        return has_unfinished
 
     def reinitialize_distributed(
         self, reconfig_request: ReconfigureDistributedRequest
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index b01163b36d05..c579c92baf37 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -292,11 +292,13 @@ def process_inputs(
 
         # Mypy can be conservative for TypedDict unions; normalize access.
         if decoder_inputs["type"] == "embeds":
-            prompt_token_ids = None
             prompt_embeds = decoder_inputs["prompt_embeds"]
+            prompt_token_ids = decoder_inputs.get("prompt_token_ids")
+            prompt_is_token_ids = decoder_inputs.get("is_token_ids")
         else:
             prompt_token_ids = decoder_inputs["prompt_token_ids"]
             prompt_embeds = None
+            prompt_is_token_ids = None
 
         sampling_params = None
         pooling_params = None
@@ -361,6 +363,7 @@ def process_inputs(
             request_id=request_id,
             prompt_token_ids=prompt_token_ids,
             prompt_embeds=prompt_embeds,
+            prompt_is_token_ids=prompt_is_token_ids,
             mm_features=mm_features,
             sampling_params=sampling_params,
             pooling_params=pooling_params,
diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py
index 9ada6eda48ce..74a45ab1e4d4 100644
--- a/vllm/v1/engine/logprobs.py
+++ b/vllm/v1/engine/logprobs.py
@@ -47,7 +47,7 @@ def from_new_request(
     ) -> "LogprobsProcessor":
         sampling_params = request.sampling_params
         assert sampling_params is not None
-        num_logprobs = sampling_params.logprobs
+        num_logprobs = sampling_params.num_logprobs
         num_prompt_logprobs = sampling_params.prompt_logprobs
         return cls(
             tokenizer=tokenizer,
diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py
index 53cad2bc153f..1f0b9bbb19d5 100644
--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -403,6 +403,11 @@ def __init__(
             range(dp_size), local_dp_ranks, placement_groups
         ):
             dp_vllm_config = copy.deepcopy(vllm_config)
+            if dp_size > 1:
+                # Append the DP rank to instance_id so that per-engine
+                # identifiers (e.g. Ray actor names in RayExecutorV2) are
+                # unique across DP replicas.
+                dp_vllm_config.instance_id = f"{dp_vllm_config.instance_id}_dp{index}"
             dp_vllm_config.parallel_config.placement_group = pg
             local_client = index < local_engine_count
 
@@ -952,7 +957,7 @@ def get_engine_zmq_addresses(
 
     # In offline mode there is an LLM instance per DP rank and
     # one core engine per LLM, see
-    # examples/offline_inference/data_parallel.py.
+    # examples/features/data_parallel/data_parallel_offline.py.
     offline_mode = local_start_index is not None
 
     # client_local_only = True for cases where this front-end
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index b616c3b7b8ad..d006946079e7 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -147,7 +147,7 @@ class ExecutorWithExternalLauncher(UniProcExecutor):
     offline inference with tensor parallelism.
 
     see https://github.com/vllm-project/vllm/issues/11400 for
-    the motivation, and examples/offline_inference/torchrun_example.py
+    the motivation, and examples/features/torchrun/torchrun_example_offline.py
     for the usage example.
 
     The key idea: although it is tensor-parallel inference, we only
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 2545c440368a..19438fb1e42d 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -151,6 +151,16 @@ def page_size_bytes(self) -> int:
 
     @property
     def real_page_size_bytes(self) -> int:
+        if self.kv_quant_mode.is_nvfp4:
+            # Packed layout: fp4 data + fp8 block scales per head.
+            full_dim = nvfp4_kv_cache_full_dim(self.head_size)
+            return (
+                2
+                * self.block_size
+                * self.num_kv_heads
+                * full_dim
+                * get_dtype_size(self.dtype)
+            )
         return (
             2
             * self.block_size
diff --git a/vllm/v1/kv_offload/abstract.py b/vllm/v1/kv_offload/abstract.py
deleted file mode 100644
index 8f809ceaa08a..000000000000
--- a/vllm/v1/kv_offload/abstract.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-OffloadingManager class for managing KV data offloading in vLLM v1
-
-This class runs in the scheduler, tracks which blocks are offloaded
-and their address.
-
-The class provides the following primitives:
-    lookup() - check whether a single block is offloaded and ready.
-    prepare_load() - prepare given blocks to be read.
-        The given blocks will be protected from eviction.
-        This function returns a LoadSpec which encapsulates
-        information required for performing the load.
-    touch() - marks the give blocks as recently used. Can be used
-        to track block's LRU. This function is separated from the
-        prepare_load function to allow setting block recency even
-        for blocks which do not need reading from the cache, such as
-        blocks that are cached by the GPU prefix cache.
-    complete_load() - mark blocks which were previously prepared to be
-        loaded as done loading. This is to re-allow their eviction.
-    prepare_store() - prepare the given blocks to be written.
-        Returns a StoreSpec encapsulating offloading information,
-        as well as a list of blocks that were evicted as a result.
-    complete_store() - marks a previous store as completed.
-        Following this call, the given blocks will become loadable.
-"""
-
-from abc import ABC, abstractmethod
-from collections.abc import Iterable
-from dataclasses import dataclass
-from typing import Any, NewType
-
-# `OffloadKey` identifies an offloaded block. It combines a block hash with
-# its KV cache group index, encoded as raw bytes to avoid tuple GC overhead.
-# Use the helper functions below to construct / decompose keys.
-OffloadKey = NewType("OffloadKey", bytes)
-
-
-def make_offload_key(block_hash: bytes, group_idx: int) -> OffloadKey:
-    """Pack a block hash and group index into an `OffloadKey`."""
-    return OffloadKey(block_hash + group_idx.to_bytes(4, "big", signed=False))
-
-
-def get_offload_block_hash(key: OffloadKey) -> bytes:
-    """Extract the block hash from an `OffloadKey`."""
-    return key[:-4]
-
-
-def get_offload_group_idx(key: OffloadKey) -> int:
-    """Extract the group index from an `OffloadKey`."""
-    return int.from_bytes(key[-4:], "big", signed=False)
-
-
-@dataclass
-class ReqContext:
-    kv_transfer_params: dict[str, Any] | None = None
-
-
-class LoadStoreSpec(ABC):
-    """
-    Abstract metadata that encapsulates information allowing a worker
-    to load, and optionally also to store, blocks of KV data.
-    """
-
-    @staticmethod
-    @abstractmethod
-    def medium() -> str:
-        """
-        Returns a string representation of the medium type
-        this store/load targets.
-        """
-        pass
-
-
-@dataclass
-class PrepareStoreOutput:
-    keys_to_store: list[OffloadKey]
-    store_spec: LoadStoreSpec
-    evicted_keys: list[OffloadKey]
-
-
-@dataclass
-class OffloadingEvent:
-    keys: list[OffloadKey]
-    medium: str
-    # True if blocks are removed, False if stored
-    removed: bool
-
-
-class OffloadingManager(ABC):
-    @abstractmethod
-    def lookup(self, key: OffloadKey, req_context: ReqContext) -> bool | None:
-        """
-        Checks whether a single block is offloaded and ready to be read.
-
-        Args:
-            key: the key identifying the block to lookup.
-            req_context: per-request context (e.g. kv_transfer_params).
-
-        Returns:
-            True if the block is offloaded and ready, False if not,
-            or None if the lookup should be retried later.
-            Returning None will delay the request handling by the vLLM
-            scheduler.
-        """
-        pass
-
-    @abstractmethod
-    def prepare_load(
-        self,
-        keys: Iterable[OffloadKey],
-        req_context: ReqContext,
-    ) -> LoadStoreSpec:
-        """
-        Prepare the given blocks to be read.
-        The given blocks will be protected from eviction until
-        complete_load is called.
-        It assumes all given blocks are offloaded.
-
-        Args:
-            keys: the keys identifying the blocks.
-            req_context: per-request context (e.g. kv_transfer_params).
-
-        Returns:
-            A LoadStoreSpec that can be used by a worker to locate and load
-            the actual offloaded KV data.
-        """
-        pass
-
-    def touch(self, keys: Iterable[OffloadKey]):
-        """
-        Mark the given blocks as recently used.
-        This could in practice mean moving them to the end of an LRU list.
-
-        Args:
-            keys: the keys identifying the blocks.
-        """
-        return
-
-    def complete_load(self, keys: Iterable[OffloadKey]):
-        """
-        Marks previous blocks that were prepared to load as done loading.
-
-        Args:
-            keys: the keys identifying the blocks.
-        """
-        return
-
-    @abstractmethod
-    def prepare_store(
-        self,
-        keys: Iterable[OffloadKey],
-        req_context: ReqContext,
-    ) -> PrepareStoreOutput | None:
-        """
-        Prepare the given blocks to be offloaded.
-        The given blocks will be protected from eviction until
-        complete_store is called.
-
-        Args:
-            keys: the keys identifying the blocks.
-            req_context: per-request context (e.g. kv_transfer_params).
-
-        Returns:
-            A PrepareStoreOutput indicating which blocks need storing,
-            where to store them (LoadStoreSpec), and list of blocks that
-            were evicted as a result.
-            None is returned if the blocks cannot be stored.
-        """
-        pass
-
-    def complete_store(self, keys: Iterable[OffloadKey], success: bool = True):
-        """
-        Marks blocks which were previously prepared to be stored, as stored.
-        Following this call, the blocks become loadable.
-        If if_success is False, blocks that were not marked as stored will be
-        removed.
-
-        Args:
-            keys: the keys identifying the blocks.
-            success: whether the blocks were stored successfully.
-        """
-        return
-
-    def take_events(self) -> Iterable[OffloadingEvent]:
-        """
-        Take the offloading events from the manager.
-
-        Yields:
-            New OffloadingEvents collected since the last call.
-        """
-        return ()
-
-    def shutdown(self) -> None:
-        """Shutdown the manager and release any resources."""
-        return
diff --git a/vllm/v1/kv_offload/base.py b/vllm/v1/kv_offload/base.py
new file mode 100644
index 000000000000..3d403ea50837
--- /dev/null
+++ b/vllm/v1/kv_offload/base.py
@@ -0,0 +1,398 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Core abstractions for KV cache offloading in vLLM v1.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from collections.abc import Collection, Iterable, Iterator, Sequence
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, NewType
+
+import numpy as np
+import torch
+
+from vllm.logger import init_logger
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+    from vllm.v1.kv_cache_interface import KVCacheConfig
+    from vllm.v1.kv_offload.worker.worker import OffloadingHandler
+
+# `OffloadKey` identifies an offloaded block. It combines a block hash with
+# its KV cache group index, encoded as raw bytes to avoid tuple GC overhead.
+# Use the helper functions below to construct / decompose keys.
+OffloadKey = NewType("OffloadKey", bytes)
+
+logger = init_logger(__name__)
+
+
+def make_offload_key(block_hash: bytes, group_idx: int) -> OffloadKey:
+    """Pack a block hash and group index into an `OffloadKey`."""
+    return OffloadKey(block_hash + group_idx.to_bytes(4, "big", signed=False))
+
+
+def get_offload_block_hash(key: OffloadKey) -> bytes:
+    """Extract the block hash from an `OffloadKey`."""
+    return key[:-4]
+
+
+def get_offload_group_idx(key: OffloadKey) -> int:
+    """Extract the group index from an `OffloadKey`."""
+    return int.from_bytes(key[-4:], "big", signed=False)
+
+
+@dataclass
+class ReqContext:
+    kv_transfer_params: dict[str, Any] | None = None
+
+
+class LoadStoreSpec(ABC):
+    """
+    Abstract metadata that encapsulates information allowing a worker
+    to load, and optionally also to store, blocks of KV data.
+    """
+
+    @staticmethod
+    @abstractmethod
+    def medium() -> str:
+        """
+        Returns a string representation of the medium type
+        this store/load targets.
+        """
+        pass
+
+
+@dataclass
+class PrepareStoreOutput:
+    keys_to_store: list[OffloadKey]
+    store_spec: LoadStoreSpec
+    evicted_keys: list[OffloadKey]
+
+
+@dataclass
+class OffloadingEvent:
+    keys: list[OffloadKey]
+    medium: str
+    # True if blocks are removed, False if stored
+    removed: bool
+
+
+"""
+OffloadingManager class for managing KV data offloading in vLLM v1
+
+This class runs in the scheduler, tracks which blocks are offloaded
+and their address.
+
+The class provides the following primitives:
+    lookup() - check whether a single block is offloaded and ready.
+    prepare_load() - prepare given blocks to be read.
+        The given blocks will be protected from eviction.
+        This function returns a LoadSpec which encapsulates
+        information required for performing the load.
+    touch() - marks the give blocks as recently used. Can be used
+        to track block's LRU. This function is separated from the
+        prepare_load function to allow setting block recency even
+        for blocks which do not need reading from the cache, such as
+        blocks that are cached by the GPU prefix cache.
+    complete_load() - mark blocks which were previously prepared to be
+        loaded as done loading. This is to re-allow their eviction.
+    prepare_store() - prepare the given blocks to be written.
+        Returns a StoreSpec encapsulating offloading information,
+        as well as a list of blocks that were evicted as a result.
+    complete_store() - marks a previous store as completed.
+        Following this call, the given blocks will become loadable.
+"""
+
+
+class OffloadingManager(ABC):
+    @abstractmethod
+    def lookup(self, key: OffloadKey, req_context: ReqContext) -> bool | None:
+        """
+        Checks whether a single block is offloaded and ready to be read.
+
+        Args:
+            key: the key identifying the block to lookup.
+            req_context: per-request context (e.g. kv_transfer_params).
+
+        Returns:
+            True if the block is offloaded and ready, False if not,
+            or None if the lookup should be retried later.
+            Returning None will delay the request handling by the vLLM
+            scheduler.
+        """
+        pass
+
+    @abstractmethod
+    def prepare_load(
+        self,
+        keys: Collection[OffloadKey],
+        req_context: ReqContext,
+    ) -> LoadStoreSpec:
+        """
+        Prepare the given blocks to be read.
+        The given blocks will be protected from eviction until
+        complete_load is called.
+        It assumes all given blocks are offloaded.
+
+        Args:
+            keys: the keys identifying the blocks.
+            req_context: per-request context (e.g. kv_transfer_params).
+
+        Returns:
+            A LoadStoreSpec that can be used by a worker to locate and load
+            the actual offloaded KV data.
+        """
+        pass
+
+    def touch(self, keys: Collection[OffloadKey]):
+        """
+        Mark the given blocks as recently used.
+        This could in practice mean moving them to the end of an LRU list.
+
+        Args:
+            keys: the keys identifying the blocks.
+        """
+        return
+
+    def complete_load(self, keys: Collection[OffloadKey]):
+        """
+        Marks previous blocks that were prepared to load as done loading.
+
+        Args:
+            keys: the keys identifying the blocks.
+        """
+        return
+
+    @abstractmethod
+    def prepare_store(
+        self,
+        keys: Collection[OffloadKey],
+        req_context: ReqContext,
+    ) -> PrepareStoreOutput | None:
+        """
+        Prepare the given blocks to be offloaded.
+        The given blocks will be protected from eviction until
+        complete_store is called.
+
+        Args:
+            keys: the keys identifying the blocks.
+            req_context: per-request context (e.g. kv_transfer_params).
+
+        Returns:
+            A PrepareStoreOutput indicating which blocks need storing,
+            where to store them (LoadStoreSpec), and list of blocks that
+            were evicted as a result.
+            None is returned if the blocks cannot be stored.
+        """
+        pass
+
+    def complete_store(self, keys: Collection[OffloadKey], success: bool = True):
+        """
+        Marks blocks which were previously prepared to be stored, as stored.
+        Following this call, the blocks become loadable.
+        If if_success is False, blocks that were not marked as stored will be
+        removed.
+
+        Args:
+            keys: the keys identifying the blocks.
+            success: whether the blocks were stored successfully.
+        """
+        return
+
+    def take_events(self) -> Iterable[OffloadingEvent]:
+        """
+        Take the offloading events from the manager.
+
+        Yields:
+            New OffloadingEvents collected since the last call.
+        """
+        return ()
+
+    def shutdown(self) -> None:
+        """Shutdown the manager and release any resources."""
+        return
+
+
+class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
+    """
+    Spec for loading/storing KV blocks from given block numbers.
+    """
+
+    def __init__(self, block_ids: list[int]):
+        self.block_ids = np.array(block_ids, dtype=np.int64)
+
+    def __repr__(self) -> str:
+        return repr(self.block_ids)
+
+
+class GPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to GPU memory.
+
+    If there are multiple KV groups, the blocks are expected to be
+    ordered by the group index.
+    In that case, group_sizes[i] determines the number of blocks
+    per the i-th KV group, and thus sum(group_sizes) == len(block_ids).
+    group_sizes=None indicates a single KV group.
+
+    If block_indices is given, each group (determined by group_sizes) of block IDs
+    will correspond to logically contiguous blocks, e.g. blocks 5-10 of a some request.
+    block_indices[i] will represent the block index of the first block in group #i.
+    Thus, len(block_indices) == len(group_sizes) = number of KV cache groups.
+    This information is required in order to support off/loading from offloaded blocks
+    which are larger than GPU blocks.
+    In such cases, the first GPU block per each group may be unaligned to the offloaded
+    block size, and so knowing block_indices[i] allows the worker to correctly
+    skip part of the first matching offloaded block.
+    """
+
+    def __init__(
+        self,
+        block_ids: list[int],
+        group_sizes: Sequence[int],
+        block_indices: Sequence[int],
+    ):
+        super().__init__(block_ids)
+        assert sum(group_sizes) == len(block_ids)
+        assert len(block_indices) == len(group_sizes)
+        self.group_sizes: Sequence[int] = group_sizes
+        self.block_indices: Sequence[int] = block_indices
+
+    @staticmethod
+    def medium() -> str:
+        return "GPU"
+
+
+@dataclass
+class CanonicalKVCacheTensor:
+    """
+    A canonicalized KV cache tensor whose first dimension is num_blocks.
+
+    For attention backends where the raw tensor has num_blocks at a
+    non-leading physical dimension (e.g. FlashAttention's
+    (2, num_blocks, ...) layout), the tensor is split so that each
+    resulting CanonicalKVCacheTensor starts with (num_blocks, ...).
+    """
+
+    # The KV cache tensor with shape (num_blocks, ...)
+    tensor: torch.Tensor
+    # The (possibly padded) page size per block in bytes
+    page_size_bytes: int
+
+
+@dataclass
+class CanonicalKVCacheRef:
+    """
+    Per-layer (or group of layers) reference to a specific (by index)
+    CanonicalKVCacheTensor and records the un-padded page size used by that layer.
+    """
+
+    # Index into the list of CanonicalKVCacheTensor objects
+    tensor_idx: int
+    # The un-padded page size per block in bytes
+    page_size_bytes: int
+
+
+@dataclass
+class CanonicalKVCaches:
+    """
+    Canonicalized block-level representation of the KV caches.
+
+    Composed of:
+        - Unique list of KV cache data tensors,
+          each with shape (num_blocks, page_size_in_bytes) and int8 dtype.
+        - Per-group data references of the tensors.
+          i.e. how each KV cache group maps to the tensors.
+    """
+
+    # Ordered list of unique block tensors, each with shape
+    # (num_blocks, ...).
+    tensors: list[CanonicalKVCacheTensor]
+    # Per-KV-cache-group list of data references that map each layer
+    # in the group to the appropriate entry in the tensors list.
+    group_data_refs: list[list[CanonicalKVCacheRef]]
+
+
+class OffloadingSpec(ABC):
+    """Spec for an offloading connector"""
+
+    def __init__(self, vllm_config: VllmConfig, kv_cache_config: KVCacheConfig):
+        logger.warning(
+            "Initializing OffloadingSpec. This API is experimental and "
+            "subject to change in the future as we iterate the design."
+        )
+        self.vllm_config = vllm_config
+        self.kv_cache_config = kv_cache_config
+
+        kv_transfer_config = vllm_config.kv_transfer_config
+        assert kv_transfer_config is not None
+        self.extra_config = kv_transfer_config.kv_connector_extra_config
+
+        parallel_config = vllm_config.parallel_config
+        context_parallel_factor = (
+            parallel_config.decode_context_parallel_size
+            * parallel_config.prefill_context_parallel_size
+        )
+
+        # block size used by vLLM for hashing request tokens for the sake
+        # of enabling prefix caching
+        self.hash_block_size = (
+            vllm_config.cache_config.block_size * context_parallel_factor
+        )
+        # gpu block size per group
+        self.gpu_block_size: tuple[int, ...] = tuple(
+            kv_cache_group.kv_cache_spec.block_size * context_parallel_factor
+            for kv_cache_group in kv_cache_config.kv_cache_groups
+        )
+
+        for block_size in self.gpu_block_size:
+            assert block_size % self.hash_block_size == 0, (
+                f"gpu_block_size={block_size} not divisible by "
+                f"hash_block_size={self.hash_block_size}. "
+                f"Hybrid models (e.g. Mamba+Attention) need "
+                f"--enable-prefix-caching to align block sizes."
+            )
+
+        # offloaded_block_size / gpu_block_size
+        self.block_size_factor: int = 1
+
+        offloaded_block_size = self.extra_config.get("block_size")
+        if offloaded_block_size is not None:
+            offloaded_block_size_int = int(offloaded_block_size)
+            gpu_block_sizes = set(self.gpu_block_size)
+            assert len(gpu_block_sizes) == 1, (
+                "If 'block_size' is specified in kv_connector_extra_config, "
+                "there must be at least one KV cache group, "
+                "and all groups must have the same block size."
+            )
+            gpu_block_size = gpu_block_sizes.pop()
+
+            assert offloaded_block_size_int % gpu_block_size == 0
+            self.block_size_factor = offloaded_block_size_int // gpu_block_size
+
+    @abstractmethod
+    def get_manager(self) -> OffloadingManager:
+        """
+        Get an OffloadingManager that will be used
+        by the scheduler-side offloading connector to track
+        offloaded blocks and manage evictions.
+        """
+        pass
+
+    @abstractmethod
+    def get_handlers(
+        self, kv_caches: CanonicalKVCaches
+    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
+        """
+        Get offloading handlers along with their respective src and dst types.
+
+        Args:
+            kv_caches: Canonicalized KV caches.
+
+        Yields:
+            Tuples of (src_type, dst_type, offloading_handler).
+        """
+        pass
diff --git a/vllm/v1/kv_offload/cpu/common.py b/vllm/v1/kv_offload/cpu/common.py
new file mode 100644
index 000000000000..cf5b2b39dd6b
--- /dev/null
+++ b/vllm/v1/kv_offload/cpu/common.py
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.v1.kv_offload.base import BlockIDsLoadStoreSpec
+
+
+class CPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to CPU memory.
+    """
+
+    @staticmethod
+    def medium() -> str:
+        return "CPU"
diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/cpu/gpu_worker.py
similarity index 99%
rename from vllm/v1/kv_offload/worker/cpu_gpu.py
rename to vllm/v1/kv_offload/cpu/gpu_worker.py
index aab57ef2be4d..c8cd0ad3b46c 100644
--- a/vllm/v1/kv_offload/worker/cpu_gpu.py
+++ b/vllm/v1/kv_offload/cpu/gpu_worker.py
@@ -11,9 +11,13 @@
 from vllm.logger import init_logger
 from vllm.utils.math_utils import cdiv
 from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.v1.kv_offload.base import (
+    BlockIDsLoadStoreSpec,
+    CanonicalKVCacheRef,
+    CanonicalKVCaches,
+    GPULoadStoreSpec,
+)
 from vllm.v1.kv_offload.cpu.shared_offload_region import SharedOffloadRegion
-from vllm.v1.kv_offload.mediums import BlockIDsLoadStoreSpec, GPULoadStoreSpec
-from vllm.v1.kv_offload.spec import CanonicalKVCacheRef, CanonicalKVCaches
 from vllm.v1.kv_offload.worker.worker import (
     OffloadingHandler,
     TransferResult,
diff --git a/vllm/v1/kv_offload/cpu/manager.py b/vllm/v1/kv_offload/cpu/manager.py
index fcfaa919a3b3..80bcb568f99a 100644
--- a/vllm/v1/kv_offload/cpu/manager.py
+++ b/vllm/v1/kv_offload/cpu/manager.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Iterable
+from collections.abc import Collection, Iterable
 from typing import Literal
 
-from vllm.v1.kv_offload.abstract import (
+from vllm.v1.kv_offload.base import (
     LoadStoreSpec,
     OffloadingEvent,
     OffloadingManager,
@@ -11,10 +11,10 @@
     PrepareStoreOutput,
     ReqContext,
 )
-from vllm.v1.kv_offload.cpu.policies.abstract import BlockStatus, CachePolicy
+from vllm.v1.kv_offload.cpu.common import CPULoadStoreSpec
 from vllm.v1.kv_offload.cpu.policies.arc import ARCCachePolicy
+from vllm.v1.kv_offload.cpu.policies.base import BlockStatus, CachePolicy
 from vllm.v1.kv_offload.cpu.policies.lru import LRUCachePolicy
-from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
 
 _CACHE_POLICIES: dict[str, type[CachePolicy]] = {
     "lru": LRUCachePolicy,
@@ -86,11 +86,15 @@ def _get_load_store_spec(
 
     def lookup(self, key: OffloadKey, req_context: ReqContext) -> bool | None:
         block = self._policy.get(key)
-        return block is not None and block.is_ready
+        if block is None:
+            return False
+        if not block.is_ready:
+            return None  # write in-flight; caller should retry
+        return True
 
     def prepare_load(
         self,
-        keys: Iterable[OffloadKey],
+        keys: Collection[OffloadKey],
         req_context: ReqContext,
     ) -> LoadStoreSpec:
         blocks = []
@@ -102,10 +106,10 @@ def prepare_load(
             blocks.append(block)
         return self._get_load_store_spec(keys, blocks)
 
-    def touch(self, keys: Iterable[OffloadKey]) -> None:
+    def touch(self, keys: Collection[OffloadKey]) -> None:
         self._policy.touch(keys)
 
-    def complete_load(self, keys: Iterable[OffloadKey]) -> None:
+    def complete_load(self, keys: Collection[OffloadKey]) -> None:
         for key in keys:
             block = self._policy.get(key)
             assert block is not None, f"Block {key!r} not found"
@@ -114,13 +118,11 @@ def complete_load(self, keys: Iterable[OffloadKey]) -> None:
 
     def prepare_store(
         self,
-        keys: Iterable[OffloadKey],
+        keys: Collection[OffloadKey],
         req_context: ReqContext,
     ) -> PrepareStoreOutput | None:
-        keys_list = list(keys)
-
         # filter out blocks that are already stored
-        keys_to_store = [k for k in keys_list if self._policy.get(k) is None]
+        keys_to_store = [k for k in keys if self._policy.get(k) is None]
 
         if not keys_to_store:
             return PrepareStoreOutput(
@@ -135,7 +137,7 @@ def prepare_store(
         if num_blocks_to_evict > 0:
             # Blocks from the original input are excluded from eviction candidates:
             # a block that was already stored must remain in the cache after this call.
-            protected = set(keys_list)
+            protected = set(keys)
             evicted = self._policy.evict(num_blocks_to_evict, protected)
             if evicted is None:
                 return None
@@ -169,7 +171,9 @@ def prepare_store(
             evicted_keys=to_evict,
         )
 
-    def complete_store(self, keys: Iterable[OffloadKey], success: bool = True) -> None:
+    def complete_store(
+        self, keys: Collection[OffloadKey], success: bool = True
+    ) -> None:
         stored_keys: list[OffloadKey] = []
 
         if success:
diff --git a/vllm/v1/kv_offload/cpu/policies/arc.py b/vllm/v1/kv_offload/cpu/policies/arc.py
index 07dfa4af4b30..e2af991c0cc0 100644
--- a/vllm/v1/kv_offload/cpu/policies/arc.py
+++ b/vllm/v1/kv_offload/cpu/policies/arc.py
@@ -3,8 +3,8 @@
 from collections import OrderedDict
 from collections.abc import Iterable
 
-from vllm.v1.kv_offload.abstract import OffloadKey
-from vllm.v1.kv_offload.cpu.policies.abstract import BlockStatus, CachePolicy
+from vllm.v1.kv_offload.base import OffloadKey
+from vllm.v1.kv_offload.cpu.policies.base import BlockStatus, CachePolicy
 
 
 class ARCCachePolicy(CachePolicy):
diff --git a/vllm/v1/kv_offload/cpu/policies/abstract.py b/vllm/v1/kv_offload/cpu/policies/base.py
similarity index 97%
rename from vllm/v1/kv_offload/cpu/policies/abstract.py
rename to vllm/v1/kv_offload/cpu/policies/base.py
index 181df36baa06..ee4916956e65 100644
--- a/vllm/v1/kv_offload/cpu/policies/abstract.py
+++ b/vllm/v1/kv_offload/cpu/policies/base.py
@@ -4,7 +4,7 @@
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
 
-from vllm.v1.kv_offload.abstract import OffloadKey
+from vllm.v1.kv_offload.base import OffloadKey
 
 
 class BlockStatus(ctypes.Structure):
diff --git a/vllm/v1/kv_offload/cpu/policies/lru.py b/vllm/v1/kv_offload/cpu/policies/lru.py
index be04831cf5f0..bf9a8b66e657 100644
--- a/vllm/v1/kv_offload/cpu/policies/lru.py
+++ b/vllm/v1/kv_offload/cpu/policies/lru.py
@@ -3,8 +3,8 @@
 from collections import OrderedDict
 from collections.abc import Iterable
 
-from vllm.v1.kv_offload.abstract import OffloadKey
-from vllm.v1.kv_offload.cpu.policies.abstract import BlockStatus, CachePolicy
+from vllm.v1.kv_offload.base import OffloadKey
+from vllm.v1.kv_offload.cpu.policies.base import BlockStatus, CachePolicy
 
 
 class LRUCachePolicy(CachePolicy):
diff --git a/vllm/v1/kv_offload/cpu/spec.py b/vllm/v1/kv_offload/cpu/spec.py
index 8d4b744a0b4f..54046d98f452 100644
--- a/vllm/v1/kv_offload/cpu/spec.py
+++ b/vllm/v1/kv_offload/cpu/spec.py
@@ -5,12 +5,17 @@
 from vllm.config import VllmConfig
 from vllm.platforms import current_platform
 from vllm.v1.kv_cache_interface import KVCacheConfig
-from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
+from vllm.v1.kv_offload.base import (
+    CanonicalKVCaches,
+    GPULoadStoreSpec,
+    LoadStoreSpec,
+    OffloadingManager,
+    OffloadingSpec,
+)
+from vllm.v1.kv_offload.cpu.common import CPULoadStoreSpec
+from vllm.v1.kv_offload.cpu.gpu_worker import CpuGpuOffloadingHandlers
 from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager
-from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.reuse_manager import FilterReusedOffloadingManager
-from vllm.v1.kv_offload.spec import CanonicalKVCaches, OffloadingSpec
-from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers
 from vllm.v1.kv_offload.worker.worker import OffloadingHandler
 
 
diff --git a/vllm/v1/kv_offload/factory.py b/vllm/v1/kv_offload/factory.py
index ecbaebb0d967..ecfc270b6432 100644
--- a/vllm/v1/kv_offload/factory.py
+++ b/vllm/v1/kv_offload/factory.py
@@ -5,7 +5,7 @@
 from typing import TYPE_CHECKING
 
 from vllm.logger import init_logger
-from vllm.v1.kv_offload.spec import OffloadingSpec
+from vllm.v1.kv_offload.base import OffloadingSpec
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
diff --git a/vllm/v1/kv_offload/mediums.py b/vllm/v1/kv_offload/mediums.py
deleted file mode 100644
index 02e36a80a8e7..000000000000
--- a/vllm/v1/kv_offload/mediums.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from abc import ABC
-from collections.abc import Sequence
-
-import numpy as np
-
-from vllm.v1.kv_offload.abstract import LoadStoreSpec
-
-
-class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
-    """
-    Spec for loading/storing KV blocks from given block numbers.
-    """
-
-    def __init__(self, block_ids: list[int]):
-        self.block_ids = np.array(block_ids, dtype=np.int64)
-
-    def __repr__(self) -> str:
-        return repr(self.block_ids)
-
-
-class GPULoadStoreSpec(BlockIDsLoadStoreSpec):
-    """
-    Spec for loading/storing a KV block to GPU memory.
-
-    If there are multiple KV groups, the blocks are expected to be
-    ordered by the group index.
-    In that case, group_sizes[i] determines the number of blocks
-    per the i-th KV group, and thus sum(group_sizes) == len(block_ids).
-    group_sizes=None indicates a single KV group.
-
-    If block_indices is given, each group (determined by group_sizes) of block IDs
-    will correspond to logically contiguous blocks, e.g. blocks 5-10 of a some request.
-    block_indices[i] will represent the block index of the first block in group #i.
-    Thus, len(block_indices) == len(group_sizes) = number of KV cache groups.
-    This information is required in order to support off/loading from offloaded blocks
-    which are larger than GPU blocks.
-    In such cases, the first GPU block per each group may be unaligned to the offloaded
-    block size, and so knowing block_indices[i] allows the worker to correctly
-    skip part of the first matching offloaded block.
-    """
-
-    def __init__(
-        self,
-        block_ids: list[int],
-        group_sizes: Sequence[int],
-        block_indices: Sequence[int],
-    ):
-        super().__init__(block_ids)
-        assert sum(group_sizes) == len(block_ids)
-        assert len(block_indices) == len(group_sizes)
-        self.group_sizes: Sequence[int] = group_sizes
-        self.block_indices: Sequence[int] = block_indices
-
-    @staticmethod
-    def medium() -> str:
-        return "GPU"
-
-
-class CPULoadStoreSpec(BlockIDsLoadStoreSpec):
-    """
-    Spec for loading/storing a KV block to CPU memory.
-    """
-
-    @staticmethod
-    def medium() -> str:
-        return "CPU"
diff --git a/vllm/v1/kv_offload/reuse_manager.py b/vllm/v1/kv_offload/reuse_manager.py
index 96b8f969e758..6cb0a5f7591c 100644
--- a/vllm/v1/kv_offload/reuse_manager.py
+++ b/vllm/v1/kv_offload/reuse_manager.py
@@ -8,9 +8,9 @@
 """
 
 from collections import OrderedDict
-from collections.abc import Iterable
+from collections.abc import Collection, Iterable
 
-from vllm.v1.kv_offload.abstract import (
+from vllm.v1.kv_offload.base import (
     LoadStoreSpec,
     OffloadingEvent,
     OffloadingManager,
@@ -79,7 +79,7 @@ def lookup(self, key: OffloadKey, req_context: ReqContext) -> bool | None:
         return self._backing.lookup(key, req_context)
 
     def prepare_store(
-        self, keys: Iterable[OffloadKey], req_context: ReqContext
+        self, keys: Collection[OffloadKey], req_context: ReqContext
     ) -> PrepareStoreOutput | None:
         """Filter out blocks below threshold, then delegate to backing.
 
@@ -87,7 +87,6 @@ def prepare_store(
         ``prepare_store`` so that blocks that would be skipped do not
         consume any CPU offload capacity.
         """
-        keys = list(keys)
         eligible = [
             key for key in keys if self.counts.get(key, 0) >= self.store_threshold
         ]
@@ -102,17 +101,19 @@ def prepare_store(
     # ------------------------------------------------------------------
 
     def prepare_load(
-        self, keys: Iterable[OffloadKey], req_context: ReqContext
+        self, keys: Collection[OffloadKey], req_context: ReqContext
     ) -> LoadStoreSpec:
         return self._backing.prepare_load(keys, req_context)
 
-    def touch(self, keys: Iterable[OffloadKey]) -> None:
+    def touch(self, keys: Collection[OffloadKey]) -> None:
         return self._backing.touch(keys)
 
-    def complete_load(self, keys: Iterable[OffloadKey]) -> None:
+    def complete_load(self, keys: Collection[OffloadKey]) -> None:
         return self._backing.complete_load(keys)
 
-    def complete_store(self, keys: Iterable[OffloadKey], success: bool = True) -> None:
+    def complete_store(
+        self, keys: Collection[OffloadKey], success: bool = True
+    ) -> None:
         return self._backing.complete_store(keys, success)
 
     def take_events(self) -> Iterable[OffloadingEvent]:
diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py
deleted file mode 100644
index b66b04ffbd65..000000000000
--- a/vllm/v1/kv_offload/spec.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from abc import ABC, abstractmethod
-from collections.abc import Iterator
-from dataclasses import dataclass
-from typing import TYPE_CHECKING
-
-import torch
-
-from vllm.logger import init_logger
-from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
-from vllm.v1.kv_offload.worker.worker import OffloadingHandler
-
-if TYPE_CHECKING:
-    from vllm.config import VllmConfig
-    from vllm.v1.kv_cache_interface import KVCacheConfig
-
-logger = init_logger(__name__)
-
-
-@dataclass
-class CanonicalKVCacheTensor:
-    """
-    A canonicalized KV cache tensor whose first dimension is num_blocks.
-
-    For attention backends where the raw tensor has num_blocks at a
-    non-leading physical dimension (e.g. FlashAttention's
-    (2, num_blocks, ...) layout), the tensor is split so that each
-    resulting CanonicalKVCacheTensor starts with (num_blocks, ...).
-    """
-
-    # The KV cache tensor with shape (num_blocks, ...)
-    tensor: torch.Tensor
-    # The (possibly padded) page size per block in bytes
-    page_size_bytes: int
-
-
-@dataclass
-class CanonicalKVCacheRef:
-    """
-    Per-layer (or group of layers) reference to a specific (by index)
-    CanonicalKVCacheTensor and records the un-padded page size used by that layer.
-    """
-
-    # Index into the list of CanonicalKVCacheTensor objects
-    tensor_idx: int
-    # The un-padded page size per block in bytes
-    page_size_bytes: int
-
-
-@dataclass
-class CanonicalKVCaches:
-    """
-    Canonicalized block-level representation of the KV caches.
-
-    Composed of:
-        - Unique list of KV cache data tensors,
-          each with shape (num_blocks, page_size_in_bytes) and int8 dtype.
-        - Per-group data references of the tensors.
-          i.e. how each KV cache group maps to the tensors.
-    """
-
-    # Ordered list of unique block tensors, each with shape
-    # (num_blocks, ...).
-    tensors: list[CanonicalKVCacheTensor]
-    # Per-KV-cache-group list of data references that map each layer
-    # in the group to the appropriate entry in the tensors list.
-    group_data_refs: list[list[CanonicalKVCacheRef]]
-
-
-class OffloadingSpec(ABC):
-    """Spec for an offloading connector"""
-
-    def __init__(self, vllm_config: "VllmConfig", kv_cache_config: "KVCacheConfig"):
-        logger.warning(
-            "Initializing OffloadingSpec. This API is experimental and "
-            "subject to change in the future as we iterate the design."
-        )
-        self.vllm_config = vllm_config
-        self.kv_cache_config = kv_cache_config
-
-        kv_transfer_config = vllm_config.kv_transfer_config
-        assert kv_transfer_config is not None
-        self.extra_config = kv_transfer_config.kv_connector_extra_config
-
-        # block size used by vLLM for hashing request tokens for the sake
-        # of enabling prefix caching
-        self.hash_block_size = vllm_config.cache_config.block_size
-        # gpu block size per group
-        self.gpu_block_size: tuple[int, ...] = tuple(
-            kv_cache_group.kv_cache_spec.block_size
-            for kv_cache_group in kv_cache_config.kv_cache_groups
-        )
-
-        for block_size in self.gpu_block_size:
-            assert block_size % self.hash_block_size == 0, (
-                f"gpu_block_size={block_size} not divisible by "
-                f"hash_block_size={self.hash_block_size}. "
-                f"Hybrid models (e.g. Mamba+Attention) need "
-                f"--enable-prefix-caching to align block sizes."
-            )
-
-        # offloaded_block_size / gpu_block_size
-        self.block_size_factor: int = 1
-
-        offloaded_block_size = self.extra_config.get("block_size")
-        if offloaded_block_size is not None:
-            offloaded_block_size_int = int(offloaded_block_size)
-            gpu_block_sizes = set(self.gpu_block_size)
-            assert len(gpu_block_sizes) == 1, (
-                "If 'block_size' is specified in kv_connector_extra_config, "
-                "there must be at least one KV cache group, "
-                "and all groups must have the same block size."
-            )
-            gpu_block_size = gpu_block_sizes.pop()
-
-            assert offloaded_block_size_int % gpu_block_size == 0
-            self.block_size_factor = offloaded_block_size_int // gpu_block_size
-
-    @abstractmethod
-    def get_manager(self) -> OffloadingManager:
-        """
-        Get an OffloadingManager that will be used
-        by the scheduler-side offloading connector to track
-        offloaded blocks and manage evictions.
-        """
-        pass
-
-    @abstractmethod
-    def get_handlers(
-        self, kv_caches: CanonicalKVCaches
-    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
-        """
-        Get offloading handlers along with their respective src and dst types.
-
-        Args:
-            kv_caches: Canonicalized KV caches.
-
-        Yields:
-            Tuples of (src_type, dst_type, offloading_handler).
-        """
-        pass
diff --git a/vllm/v1/kv_offload/worker/worker.py b/vllm/v1/kv_offload/worker/worker.py
index 2a47cc8e9d07..2f0dd2471631 100644
--- a/vllm/v1/kv_offload/worker/worker.py
+++ b/vllm/v1/kv_offload/worker/worker.py
@@ -4,7 +4,7 @@
 from dataclasses import dataclass
 
 from vllm.logger import init_logger
-from vllm.v1.kv_offload.abstract import LoadStoreSpec
+from vllm.v1.kv_offload.base import LoadStoreSpec
 
 # a single transfer spec (src_blocks_spec, dst_blocks_spec)
 TransferSpec = tuple[LoadStoreSpec, LoadStoreSpec]
diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py
index a11b92680779..7e2100546e82 100644
--- a/vllm/v1/metrics/ray_wrappers.py
+++ b/vllm/v1/metrics/ray_wrappers.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
 import time
 
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorProm
@@ -28,10 +29,13 @@ def _get_replica_id() -> str | None:
 
 
 class RayPrometheusMetric:
+    _is_labeled: bool = False
+
     def __init__(self):
         if ray_metrics is None:
             raise ImportError("RayPrometheusMetric requires Ray to be installed.")
         self.metric: Metric = None
+        self._tags: dict[str, str] = {"ReplicaId": _get_replica_id() or ""}
 
     @staticmethod
     def _get_tag_keys(labelnames: list[str] | None) -> tuple[str, ...]:
@@ -39,7 +43,7 @@ def _get_tag_keys(labelnames: list[str] | None) -> tuple[str, ...]:
         labels.append("ReplicaId")
         return tuple(labels)
 
-    def labels(self, *labels, **labelskwargs):
+    def _build_tags(self, *labels, **labelskwargs) -> dict[str, str]:
         if labels:
             # -1 because ReplicaId was added automatically
             expected = len(self.metric._tag_keys) - 1
@@ -52,12 +56,15 @@ def labels(self, *labels, **labelskwargs):
 
         labelskwargs["ReplicaId"] = _get_replica_id() or ""
 
-        if labelskwargs:
-            for k, v in labelskwargs.items():
-                if not isinstance(v, str):
-                    labelskwargs[k] = str(v)
-            self.metric.set_default_tags(labelskwargs)
-        return self
+        return {k: v if isinstance(v, str) else str(v) for k, v in labelskwargs.items()}
+
+    def labels(self, *labels, **labelskwargs) -> "RayPrometheusMetric":
+        if self._is_labeled:
+            raise ValueError("labels() cannot be called on an already-labeled metric.")
+        clone = copy.copy(self)
+        clone._tags = self._build_tags(*labels, **labelskwargs)
+        clone._is_labeled = True
+        return clone
 
     @staticmethod
     def _get_sanitized_opentelemetry_name(name: str) -> str:
@@ -91,6 +98,7 @@ def __init__(
         # implemented at the observability layer (Prometheus/Grafana).
         del multiprocess_mode
 
+        super().__init__()
         tag_keys = self._get_tag_keys(labelnames)
         name = self._get_sanitized_opentelemetry_name(name)
 
@@ -101,11 +109,11 @@ def __init__(
         )
 
     def set(self, value: int | float):
-        return self.metric.set(value)
+        return self.metric.set(value, tags=self._tags)
 
     def set_to_current_time(self):
         # ray metrics doesn't have set_to_current time, https://docs.ray.io/en/latest/_modules/ray/util/metrics.html
-        return self.metric.set(time.time())
+        return self.set(time.time())
 
 
 class RayCounterWrapper(RayPrometheusMetric):
@@ -118,6 +126,7 @@ def __init__(
         documentation: str | None = "",
         labelnames: list[str] | None = None,
     ):
+        super().__init__()
         tag_keys = self._get_tag_keys(labelnames)
         name = self._get_sanitized_opentelemetry_name(name)
         self.metric = ray_metrics.Counter(
@@ -129,7 +138,7 @@ def __init__(
     def inc(self, value: int | float = 1.0):
         if value == 0:
             return
-        return self.metric.inc(value)
+        return self.metric.inc(value, tags=self._tags)
 
 
 class RayHistogramWrapper(RayPrometheusMetric):
@@ -143,6 +152,7 @@ def __init__(
         labelnames: list[str] | None = None,
         buckets: list[float] | None = None,
     ):
+        super().__init__()
         tag_keys = self._get_tag_keys(labelnames)
         name = self._get_sanitized_opentelemetry_name(name)
 
@@ -155,7 +165,7 @@ def __init__(
         )
 
     def observe(self, value: int | float):
-        return self.metric.observe(value)
+        return self.metric.observe(value, tags=self._tags)
 
 
 class RaySpecDecodingProm(SpecDecodingProm):
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 678d57580cc7..738a68c83680 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -66,6 +66,7 @@ def __init__(
         client_index: int = 0,
         arrival_time: float | None = None,
         prompt_embeds: torch.Tensor | None = None,
+        prompt_is_token_ids: list[bool] | None = None,
         mm_features: list[MultiModalFeatureSpec] | None = None,
         lora_request: "LoRARequest | None" = None,
         cache_salt: str | None = None,
@@ -74,6 +75,7 @@ def __init__(
         block_hasher: Callable[["Request"], list["BlockHash"]] | None = None,
         resumable: bool = False,
         reasoning_ended: bool | None = None,
+        reasoning_parser_kwargs: dict[str, Any] | None = None,
     ) -> None:
         self.request_id = request_id
         self.client_index = client_index
@@ -86,6 +88,9 @@ def __init__(
         )
         if self.structured_output_request is not None:
             self.structured_output_request.reasoning_ended = reasoning_ended
+            self.structured_output_request.reasoning_parser_kwargs = (
+                reasoning_parser_kwargs
+            )
         self.arrival_time = arrival_time if arrival_time is not None else time.time()
 
         self.status = RequestStatus.WAITING
@@ -114,6 +119,10 @@ def __init__(
 
         self.prompt_token_ids = prompt_token_ids
         self.prompt_embeds = prompt_embeds
+        # Per-position mask used in mixed-mode (chat completion with
+        # prompt_embeds). `None` except when both `prompt_token_ids` and
+        # `prompt_embeds` are set and their positions are interleaved.
+        self.prompt_is_token_ids = prompt_is_token_ids
         # Cache per-block prompt-embed hashes to avoid rehashing the same
         # tensor slices when generating extra keys.
         self._prompt_embeds_per_block_hashes: dict[tuple[int, int], bytes] = {}
@@ -184,6 +193,7 @@ def from_engine_core_request(
             client_index=request.client_index,
             prompt_token_ids=request.prompt_token_ids,
             prompt_embeds=request.prompt_embeds,
+            prompt_is_token_ids=request.prompt_is_token_ids,
             mm_features=request.mm_features,
             sampling_params=request.sampling_params,
             pooling_params=request.pooling_params,
@@ -195,6 +205,7 @@ def from_engine_core_request(
             block_hasher=block_hasher,
             resumable=request.resumable,
             reasoning_ended=request.reasoning_ended,
+            reasoning_parser_kwargs=request.reasoning_parser_kwargs,
         )
 
     def append_output_token_ids(
diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py
index fb4a046fc057..2cb89e1ea950 100644
--- a/vllm/v1/sample/logits_processor/__init__.py
+++ b/vllm/v1/sample/logits_processor/__init__.py
@@ -18,7 +18,6 @@
     LogitBiasLogitsProcessor,
     MinPLogitsProcessor,
     MinTokensLogitsProcessor,
-    ThinkingTokenBudgetLogitsProcessor,
     process_dict_updates,
 )
 from vllm.v1.sample.logits_processor.interface import (
@@ -51,7 +50,6 @@
     MinTokensLogitsProcessor,
     LogitBiasLogitsProcessor,
     MinPLogitsProcessor,
-    ThinkingTokenBudgetLogitsProcessor,
 ]
 
 
@@ -356,5 +354,4 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor:
     "STR_POOLING_REJECTS_LOGITSPROCS",
     "LOGITSPROCS_GROUP",
     "AdapterLogitsProcessor",
-    "ThinkingTokenBudgetLogitsProcessor",
 ]
diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py
index 1739452b44a0..11a52711d671 100644
--- a/vllm/v1/sample/logits_processor/builtin.py
+++ b/vllm/v1/sample/logits_processor/builtin.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Callable, Sequence
-from typing import TYPE_CHECKING, Any, TypeVar
+from typing import TYPE_CHECKING, TypeVar
 
 import numpy as np
 import torch
@@ -291,267 +291,6 @@ def apply_with_spec_decode(
         return logits
 
 
-class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
-    """Limits the number of tokens allowed inside a 'thinking' section."""
-
-    def __init__(
-        self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool
-    ):
-        reasoning_config = vllm_config.reasoning_config
-        max_num_reqs = vllm_config.scheduler_config.max_num_seqs
-
-        # Check if thinking is enabled
-        self.is_enabled = reasoning_config is not None and reasoning_config.enabled
-
-        self.reasoning_start_token_ids = getattr(
-            reasoning_config, "reasoning_start_token_ids", []
-        )
-        self.reasoning_end_token_ids = getattr(
-            reasoning_config, "reasoning_end_token_ids", []
-        )
-
-        self.pin_memory = is_pin_memory
-        self.device = device
-        # Per-request state tracking for thinking token management
-        # Key: request_index, Value: state dict containing:
-        # "in_think": bool - currently in thinking mode
-        # "in_end": bool - currently forcing end tokens output
-        # "check_count_down": int - steps remaining until next think
-        #                            start/end token parsing
-        # "think_count": int - number of thinking tokens generated
-        # "end_count": int - number of end tokens forced so far
-        # "thinking_token_budget": int - max allowed thinking tokens
-        # "output_tok_ids": list[int] - generated output tokens
-        # "prev_output_length": int - previous output length for
-        #                               incremental processing
-        self._state: dict[int, dict[str, Any]] = {}
-
-        # Preallocate reusable tensors
-        self.mask = torch.zeros(max_num_reqs, dtype=torch.bool, device=device)
-        self.force_token_ids = torch.full(
-            (max_num_reqs,), -1, dtype=torch.long, device=device
-        )
-
-    @staticmethod
-    def _find_last_sequence_index(target_list: list[int], token_ids: list[int]) -> int:
-        """
-        Returns the index of the last occurrence of token_ids in target_list.
-
-        Args:
-          target_list (list[int]): The list of token IDs.
-          token_ids (list[int]): The sequence of token IDs to find.
-        """
-        if not token_ids:
-            return -1
-        for i in range(len(target_list) - len(token_ids), -1, -1):
-            if target_list[i : i + len(token_ids)] == token_ids:
-                return i
-        return -1
-
-    def _init_state_entry(
-        self, prompt_tok_ids: list[int] | None, thinking_token_budget: int
-    ) -> dict[str, Any]:
-        """Initializes the tracking state for a given sequence index."""
-        if prompt_tok_ids is None:
-            last_start = -1
-            last_end = -1
-            in_think = False
-            think_count = 0
-        else:
-            last_start = self._find_last_sequence_index(
-                prompt_tok_ids, self.reasoning_start_token_ids
-            )
-            last_end = self._find_last_sequence_index(
-                prompt_tok_ids, self.reasoning_end_token_ids
-            )
-            in_think = last_start > last_end
-            if in_think:
-                think_count = len(prompt_tok_ids) - (
-                    last_start + len(self.reasoning_start_token_ids)
-                )
-            else:
-                think_count = 0
-
-        return {
-            "in_think": in_think,  # Currently in thinking mode
-            "in_end": in_think and thinking_token_budget == 0,
-            "check_count_down": thinking_token_budget,
-            "think_count": think_count,  # Number of tokens in thinking section
-            "end_count": 0,  # Number of end tokens forced so far
-            "prompt_tok_ids": prompt_tok_ids,
-            "output_tok_ids": [],
-            "thinking_token_budget": thinking_token_budget,
-            "prev_output_length": 0,
-            # Track previous output length for incremental updates
-        }
-
-    def _update_think_state(self, state: dict[str, Any]):
-        """Updates the state based on newly generated output tokens."""
-        if not state.get("in_end", False) and state.get("check_count_down", 0) > 0:
-            state["check_count_down"] -= 1
-            return
-
-        output = state.get("output_tok_ids", [])
-        if not output:
-            return
-
-        # Track previous output length for incremental processing
-        prev_length = state.get("prev_output_length", 0)
-        current_length = len(output)
-
-        if current_length <= prev_length:
-            return
-
-        # Process only newly added tokens
-        new_tokens = output[prev_length:]
-        state["prev_output_length"] = current_length
-
-        # Check if new tokens contain think start or end sequences
-        start_len = len(self.reasoning_start_token_ids)
-        end_len = len(self.reasoning_end_token_ids)
-
-        # Look for think sequences in recent tokens (including boundary)
-        # Check overlapping regions where sequences might span boundaries
-        check_start_idx = max(0, prev_length - max(start_len, end_len) + 1)
-        recent_tokens = output[check_start_idx:]
-
-        # Find any think start/end sequences in recent tokens
-        recent_start_pos = self._find_last_sequence_index(
-            recent_tokens, self.reasoning_start_token_ids
-        )
-        recent_end_pos = self._find_last_sequence_index(
-            recent_tokens, self.reasoning_end_token_ids
-        )
-
-        # Update state based on recent sequences
-        if not state["in_end"]:
-            if recent_start_pos >= 0 and recent_end_pos >= 0:
-                if recent_start_pos > recent_end_pos:
-                    # Case: ...<end>...<start>... - entering think mode
-                    absolute_start_pos = check_start_idx + recent_start_pos
-                    new_think_count = current_length - (absolute_start_pos + start_len)
-                    state["in_think"] = True
-                    state["think_count"] = new_think_count
-                else:
-                    # Case: ...<start>...<end>... - exiting think mode
-                    state["in_think"] = False
-                    state["think_count"] = 0
-            elif recent_start_pos >= 0:
-                # Found think start - entering think mode
-                absolute_start_pos = check_start_idx + recent_start_pos
-                new_think_count = current_length - (absolute_start_pos + start_len)
-                state["in_think"] = True
-                state["think_count"] = new_think_count
-            elif recent_end_pos >= 0:
-                # Found think end - exiting think mode
-                state["in_think"] = False
-                state["think_count"] = 0
-            elif state["in_think"]:
-                # Continue thinking mode, increment count by new tokens
-                state["think_count"] += len(new_tokens)
-
-            # Set countdown based on current state
-            if state["in_think"]:
-                remaining_budget = max(
-                    0, state["thinking_token_budget"] - state["think_count"]
-                )
-                state["check_count_down"] = max(0, remaining_budget - 1)
-            else:
-                state["check_count_down"] = state["thinking_token_budget"]
-
-            # Check if need to transition to end mode
-            if (
-                state["in_think"]
-                and state["think_count"] >= state["thinking_token_budget"]
-            ):
-                state["in_think"] = False
-                state["in_end"] = True
-                state["end_count"] = 0
-                state["check_count_down"] = state["thinking_token_budget"]
-        else:
-            # In end mode
-            state["end_count"] += 1
-            if state["end_count"] >= len(self.reasoning_end_token_ids):
-                state.update(
-                    {
-                        "in_end": False,
-                        "end_count": 0,
-                        "check_count_down": state["thinking_token_budget"],
-                    }
-                )
-
-    def is_argmax_invariant(self) -> bool:
-        """This logits processor can change the outcome of
-        greedy sampling by forcing that the thinking section
-        ends after a certain number of tokens."""
-        return False
-
-    def update_state(self, batch_update: BatchUpdate | None):
-        if not self.is_enabled:
-            return
-        if batch_update:
-            for index, params, prompt_tok_ids, output_tok_ids in batch_update.added:
-                thinking_token_budget = params.thinking_token_budget
-
-                if thinking_token_budget is not None:
-                    self._state[index] = self._init_state_entry(
-                        prompt_tok_ids, thinking_token_budget
-                    )
-                    self._state[index]["output_tok_ids"] = output_tok_ids
-                else:
-                    # Remove state if no thinking budget
-                    self._state.pop(index, None)
-
-            for index in batch_update.removed:
-                self._state.pop(index, {})
-
-            for i1, i2, direction in batch_update.moved:
-                if direction == MoveDirectionality.SWAP:
-                    state1 = self._state.pop(i1, None)
-                    state2 = self._state.pop(i2, None)
-                    if state1 is not None:
-                        self._state[i2] = state1
-                    if state2 is not None:
-                        self._state[i1] = state2
-                else:
-                    state = self._state.pop(i1, None)
-                    if state is not None:
-                        self._state[i2] = state
-
-        for state in self._state.values():
-            self._update_think_state(state)
-
-    def apply(self, logits: torch.Tensor) -> torch.Tensor:
-        if not self.is_enabled or not self._state:
-            return logits
-
-        batch_size = logits.size(0)
-        self.mask[:batch_size] = False
-
-        for i in range(batch_size):
-            state = self._state.get(i)
-            if state and state["in_end"]:
-                self.mask[i] = True
-                self.force_token_ids[i] = self.reasoning_end_token_ids[
-                    state["end_count"]
-                ]
-
-        # Check in CPU first not to sync with GPU
-        has_active_thinking = any(
-            state.get("in_end", False) for state in self._state.values()
-        )
-
-        if has_active_thinking:
-            current_mask = self.mask[:batch_size]
-            active_indices = current_mask.nonzero(as_tuple=False).view(-1)
-            if len(active_indices) > 0:
-                force_tokens = self.force_token_ids[active_indices]
-                # Apply a large value for the end thinking token id index
-                logits[active_indices, force_tokens] = 1e9
-
-        return logits
-
-
 def process_dict_updates(
     req_entries: dict[int, T],
     batch_update: BatchUpdate | None,
diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py
index 4682cde1098b..fa4ceac8e71e 100644
--- a/vllm/v1/sample/metadata.py
+++ b/vllm/v1/sample/metadata.py
@@ -1,11 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from __future__ import annotations
+
 from dataclasses import dataclass
 
 import torch
 
 from vllm.v1.sample.logits_processor import LogitsProcessors
+from vllm.v1.sample.thinking_budget_state import ThinkingBudgetStateHolder
 
 
 @dataclass
@@ -47,3 +50,6 @@ class SamplingMetadata:
 
     # Speculative token ids
     spec_token_ids: list[list[int]] | None = None
+    # When non-None, use ``holder.has_tracked_requests()`` to see if this batch applies
+    # thinking-token-budget logits (holder may exist with an empty tracking set).
+    thinking_budget_state_holder: ThinkingBudgetStateHolder | None = None
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 33f7090e4e3d..363b113f0a4f 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -41,23 +41,35 @@ def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None:
 
                 capability = current_platform.get_device_capability()
                 assert capability is not None
-                if not FlashInferBackend.supports_compute_capability(capability):
+                if FlashInferBackend.supports_compute_capability(capability):
+                    logger.info_once(
+                        "Using FlashInfer for top-p & top-k sampling.",
+                        scope="global",
+                    )
+                    self.forward = self.forward_cuda
+                elif envs.is_set("VLLM_USE_FLASHINFER_SAMPLER"):
+                    # User explicitly opted in but the GPU can't run FlashInfer.
                     capability_str = capability.as_version_str()
                     raise RuntimeError(
                         "FlashInfer does not support compute capability "
                         f"{capability_str}, unset VLLM_USE_FLASHINFER_SAMPLER=1."
                     )
-                # Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1.
-                logger.info_once(
-                    "Using FlashInfer for top-p & top-k sampling.",
-                    scope="global",
-                )
-                self.forward = self.forward_cuda
+                else:
+                    # Default-on path; hardware can't run FlashInfer →
+                    # quietly fall back to the PyTorch-native sampler
+                    # instead of failing server startup.
+                    logger.warning_once(
+                        "FlashInfer top-p/top-k sampling not supported on "
+                        "compute capability %s; falling back to PyTorch-native "
+                        "sampler. Set VLLM_USE_FLASHINFER_SAMPLER=0 to silence.",
+                        capability.as_version_str(),
+                    )
+                    self.forward = self.forward_native
             else:
-                logger.debug_once(
-                    "FlashInfer top-p/top-k sampling is available but disabled "
-                    "by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in "
-                    "after verifying accuracy for your workloads."
+                # User explicitly set VLLM_USE_FLASHINFER_SAMPLER=0.
+                logger.info_once(
+                    "FlashInfer top-p/top-k sampling disabled via "
+                    "VLLM_USE_FLASHINFER_SAMPLER=0; using PyTorch-native sampler."
                 )
                 self.forward = self.forward_native
 
@@ -70,6 +82,11 @@ def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None:
                 self.forward = self.forward_native
             else:
                 self.forward = self.forward_cpu
+        elif current_platform.is_xpu():
+            if envs.VLLM_XPU_USE_SAMPLER_KERNEL:
+                self.forward = self.forward_xpu
+            else:
+                self.forward = self.forward_native
         elif (
             logprobs_mode not in ("processed_logits", "processed_logprobs")
             and rocm_aiter_ops.is_enabled()
@@ -120,9 +137,9 @@ def forward_cuda(
         p: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """More optimized implementation for top-k and top-p sampling."""
-        # We prefer `random_sample` over `flashinfer_sample` when sorting is
-        # not needed. This is because `random_sample` does not require
-        # CPU-GPU synchronization while `flashinfer_sample` does.
+        # Fall back to the PyTorch-native path when FlashInfer has nothing
+        # to do (no top-k / top-p filter) or when per-request generators
+        # are present (unsupported by FlashInfer 0.2.3+).
         if (k is None and p is None) or generators:
             if generators:
                 logger.debug_once(
@@ -231,6 +248,49 @@ def aiter_sample(
             return torch.multinomial(renorm_probs, num_samples=1).view(-1)
         raise RuntimeError("aiter_sample was called with no active top-k or top-p.")
 
+    def forward_xpu(
+        self,
+        logits: torch.Tensor,
+        generators: dict[int, torch.Generator],
+        k: torch.Tensor | None,
+        p: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        if generators:
+            logger.warning_once(
+                "xpu kernel topk_topp_sampler does not support "
+                "per-request generators. Falling back to "
+                "PyTorch-native implementation."
+            )
+            return self.forward_native(logits, generators, k, p)
+        random_sampled = torch.empty(
+            logits.shape[0], dtype=torch.int64, device=logits.device
+        )
+        logits_to_return = None
+        if (
+            self.logprobs_mode == "processed_logits"
+            or self.logprobs_mode == "processed_logprobs"
+        ):
+            logits_to_return = torch.empty_like(logits)
+
+        assert len(generators) != logits.shape[0], (
+            "xpu kernel topk_topp_sampler does not support batch-wise generators."
+        )
+        generator = torch.xpu.default_generators[logits.device.index]
+
+        state = generator.get_state()
+        seed, offset = state.view(torch.int64)
+        seeds = torch.tensor(
+            [seed, offset], dtype=torch.int64, device=torch.device("cpu")
+        )
+        # The XPU kernel expects k as int64 (Long), but the input batch
+        # stores top_k as int32. Cast here to avoid dtype mismatch.
+        if k is not None:
+            k = k.to(torch.int64)
+        torch.ops.vllm.xpu_topk_topp_sampler(
+            random_sampled, logits_to_return, logits, k, p, self.logprobs_mode, seeds
+        )
+        return random_sampled, logits_to_return
+
 
 # Note: this is a workaround for
 # https://github.com/pytorch/pytorch/pull/151218
@@ -361,10 +421,6 @@ def flashinfer_sample(
     NOTE: The outputs of this function do not necessarily match the outputs of
     the `random_sample` function. It only guarantees that the outputs are
     statistically equivalent.
-
-    NOTE: This function includes CPU-GPU synchronization, while `random_sample`
-    does not. Call this function at the end of the forward pass to minimize
-    the synchronization overhead.
     """
     import flashinfer
 
diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py
index 2b63893c0496..678654cb78a4 100644
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -290,16 +290,24 @@ def apply_logits_processors(
         any_penalties_or_bad_words = (
             sampling_metadata.bad_words_token_ids or has_penalties
         )
+        holder = sampling_metadata.thinking_budget_state_holder
+        needs_thinking = holder is not None and holder.has_tracked_requests()
 
         output_token_ids = sampling_metadata.output_token_ids
-        if any_penalties_or_bad_words:
+        if any_penalties_or_bad_words or needs_thinking:
             output_token_ids = self._combine_outputs_with_spec_tokens(
                 output_token_ids,
                 sampling_metadata.spec_token_ids,
             )
 
         # Calculate indices of target logits.
-        if sampling_metadata.allowed_token_ids_mask is not None or has_penalties:
+        repeat_indices: torch.Tensor | None = None
+        need_repeat_indices = (
+            sampling_metadata.allowed_token_ids_mask is not None
+            or has_penalties
+            or needs_thinking
+        )
+        if need_repeat_indices:
             num_requests = len(metadata.num_draft_tokens)
             num_draft_tokens = torch.tensor(metadata.num_draft_tokens, device="cpu")
             original_indices = torch.arange(num_requests, device="cpu")
@@ -327,7 +335,12 @@ def apply_logits_processors(
                 logits = processor.apply_with_spec_decode(
                     logits, metadata.num_draft_tokens
                 )
-
+        if holder is not None and holder.has_tracked_requests():
+            logits = holder.apply_to_logits(
+                logits,
+                predict_bonus_token=False,
+                spec_token_ids=sampling_metadata.spec_token_ids,
+            )
         return logits
 
     @staticmethod
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index 5341351352e3..a77eafba2556 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -364,9 +364,13 @@ def apply_logits_processors(
         any_penalties_or_bad_words = (
             bool(bad_words_token_ids) or not sampling_metadata.no_penalties
         )
+        holder = sampling_metadata.thinking_budget_state_holder
+        needs_thinking_combine = holder is not None and holder.has_tracked_requests()
 
         output_token_ids = sampling_metadata.output_token_ids
-        if predict_bonus_token and any_penalties_or_bad_words:
+        if predict_bonus_token and (
+            any_penalties_or_bad_words or needs_thinking_combine
+        ):
             # Combine base outputs with spec tokens when speculative decoding
             # is enabled.
             output_token_ids = self._combine_outputs_with_spec_tokens(
@@ -388,6 +392,17 @@ def apply_logits_processors(
 
         # Apply penalties (e.g., freq_penalties).
         logits = self.apply_penalties(logits, sampling_metadata, output_token_ids)
+        if holder is not None and holder.has_tracked_requests():
+            holder.update_state(
+                output_token_ids,
+                sampling_metadata.spec_token_ids,
+                repeat_indices=None,
+            )
+            logits = holder.apply_to_logits(
+                logits,
+                predict_bonus_token,
+                sampling_metadata.spec_token_ids,
+            )
         return logits
 
     @staticmethod
diff --git a/vllm/v1/sample/thinking_budget_state.py b/vllm/v1/sample/thinking_budget_state.py
new file mode 100644
index 000000000000..74599a1e8c55
--- /dev/null
+++ b/vllm/v1/sample/thinking_budget_state.py
@@ -0,0 +1,528 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Per-batch thinking token budget state; applied after penalties at sample time."""
+
+from typing import TYPE_CHECKING, Any
+
+import torch
+
+from vllm.v1.sample.logits_processor.interface import (
+    BatchUpdate,
+    MoveDirectionality,
+)
+
+if TYPE_CHECKING:
+    from vllm.config.reasoning import ReasoningConfig
+
+
+def maybe_create_thinking_budget_state_holder(
+    reasoning_config: "ReasoningConfig | None",
+    max_num_seqs: int,
+    num_spec_tokens: int,
+    device: torch.device,
+    is_pin_memory: bool,
+) -> "ThinkingBudgetStateHolder | None":
+    if reasoning_config is None:
+        return None
+    return ThinkingBudgetStateHolder(
+        reasoning_config, max_num_seqs, num_spec_tokens, device, is_pin_memory
+    )
+
+
+class ThinkingBudgetStateHolder:
+    """Tracks thinking sections and forces end tokens when budget is exceeded."""
+
+    think_start_token_ids: list[int]
+    think_end_token_ids: list[int]
+
+    def __init__(
+        self,
+        reasoning_config: "ReasoningConfig | None",
+        max_num_seqs: int,
+        num_spec_tokens: int,
+        device: torch.device,
+        is_pin_memory: bool,
+    ):
+        _ = is_pin_memory  # API parity with logits processors
+        max_num_reqs = max_num_seqs
+        self.in_spec_mode = num_spec_tokens > 0
+        self.num_spec_tokens = num_spec_tokens
+
+        # No separate enable flag: a non-``None`` ``reasoning_config`` is the switch.
+        self.is_enabled = reasoning_config is not None
+
+        if reasoning_config is None:
+            self.think_start_token_ids = []
+            self.think_end_token_ids = []
+        else:
+            rs = reasoning_config.reasoning_start_token_ids
+            re = reasoning_config.reasoning_end_token_ids
+            self.think_start_token_ids = rs if rs else []
+            self.think_end_token_ids = re if re else []
+
+        self.device = device
+        self._state: dict[int, dict[str, Any]] = {}
+        self.cu_num_tokens: dict[int, int] = {}
+
+        if self.num_spec_tokens > 0:
+            self.mask = torch.zeros(
+                max_num_reqs * (self.num_spec_tokens + 1),
+                dtype=torch.bool,
+                device=device,
+            )
+            self.force_token_ids = torch.full(
+                (max_num_reqs * (self.num_spec_tokens + 1),),
+                -1,
+                dtype=torch.long,
+                device=device,
+            )
+        else:
+            self.mask = torch.zeros(max_num_reqs, dtype=torch.bool, device=device)
+            self.force_token_ids = torch.full(
+                (max_num_reqs,), -1, dtype=torch.long, device=device
+            )
+
+    def has_tracked_requests(self) -> bool:
+        """True when ``sync_batch`` has state for a ``thinking_token_budget`` row.
+
+        Used to decide whether sampling needs output-token rows and spec combining;
+        distinct from merely having a holder instance (reasoning may be on with no
+        budgeted requests in this batch).
+        """
+        return bool(self._state)
+
+    def sync_batch(self, batch_update: BatchUpdate | None) -> None:
+        """Add/remove/move per-request state only (no _update_think_state)."""
+        if not self.is_enabled or not batch_update:
+            return
+        for index in batch_update.removed:
+            self._state.pop(index, None)
+
+        for index, params, prompt_tok_ids, output_tok_ids in batch_update.added:
+            thinking_token_budget = params.thinking_token_budget
+            if thinking_token_budget is not None:
+                self._state[index] = self._init_state_entry(
+                    prompt_tok_ids, thinking_token_budget
+                )
+                self._state[index]["output_tok_ids"] = output_tok_ids
+                self._state[index]["spec_token_ids"] = []
+            else:
+                self._state.pop(index, None)
+
+        for i1, i2, direction in batch_update.moved:
+            if direction == MoveDirectionality.SWAP:
+                state1 = self._state.get(i1)
+                state2 = self._state.get(i2)
+                if state1 is not None:
+                    self._state[i2] = state1
+                if state2 is not None:
+                    self._state[i1] = state2
+            else:
+                state = self._state.pop(i1, None)
+                if state is not None:
+                    self._state[i2] = state
+
+    def update_state(
+        self,
+        output_token_ids: list[list[int]],
+        spec_token_ids: list[list[int]] | None,
+        repeat_indices: torch.Tensor | None = None,
+    ) -> None:
+        """Refresh output/spec from sampling rows and recompute think state."""
+        if not self.is_enabled or not self._state:
+            return
+
+        spec_lists = spec_token_ids or []
+        last_row_for_req: dict[int, int] | None = None
+        if repeat_indices is not None:
+            last_row_for_req = {}
+            rpt = repeat_indices.cpu().tolist()
+            for batch_row, req_i in enumerate(rpt):
+                last_row_for_req[req_i] = batch_row
+
+        for seq_idx, state in list(self._state.items()):
+            if last_row_for_req is not None:
+                output_row: int | None = last_row_for_req.get(seq_idx)
+                if output_row is None or output_row >= len(output_token_ids):
+                    continue
+                state["output_tok_ids"] = output_token_ids[output_row]
+            elif seq_idx >= len(output_token_ids):
+                continue
+            else:
+                state["output_tok_ids"] = output_token_ids[seq_idx]
+            if seq_idx < len(spec_lists):
+                state["spec_token_ids"] = list(spec_lists[seq_idx])
+            else:
+                state["spec_token_ids"] = []
+            state["in_spec_mode"] = self.in_spec_mode
+            state["force_index"] = []
+            if len(state["output_tok_ids"]) > 0:
+                spec_len = len(state["spec_token_ids"])
+                # Only strip draft suffix when there are spec tokens; ``[:-0]`` would
+                # clear the whole list (Python treats stop index 0 as "up to empty").
+                if spec_len > 0 and len(state["output_tok_ids"]) >= spec_len:
+                    state["output_tok_ids"] = state["output_tok_ids"][:-spec_len]
+            self._update_think_state(state)
+
+    def apply_to_logits(
+        self,
+        logits: torch.Tensor,
+        predict_bonus_token: bool,
+        spec_token_ids: list[list[int]] | None,
+    ) -> torch.Tensor:
+        """Mask and bump logits for forced end-of-thinking tokens."""
+        if not self.is_enabled or not self._state:
+            return logits
+        spec_lists = spec_token_ids or []
+        return self._apply_forcing_to_logits(logits, predict_bonus_token, spec_lists)
+
+    @staticmethod
+    def _find_last_sequence_index(target_list: list[int], token_ids: list[int]) -> int:
+        if not token_ids:
+            return -1
+        for i in range(len(target_list) - len(token_ids), -1, -1):
+            if target_list[i : i + len(token_ids)] == token_ids:
+                return i
+        return -1
+
+    def _init_state_entry(
+        self, prompt_tok_ids: list[int] | None, thinking_token_budget: int
+    ) -> dict[str, Any]:
+        if prompt_tok_ids is None:
+            last_start = -1
+            last_end = -1
+            in_think = False
+            think_count = 0
+            start_thinking = -1
+            countdown = thinking_token_budget
+            continue_thinking = False
+            in_end = False
+        else:
+            start_thinking = -1
+            countdown = thinking_token_budget
+            continue_thinking = False
+            in_end = False
+            last_start = self._find_last_sequence_index(
+                prompt_tok_ids, self.think_start_token_ids
+            )
+            last_end = self._find_last_sequence_index(
+                prompt_tok_ids, self.think_end_token_ids
+            )
+            in_think = last_start > last_end
+            # load metrics such as think count, start thinking
+            # if request is in thinking mode, already
+            if in_think:
+                think_count = len(prompt_tok_ids) - (
+                    last_start + len(self.think_start_token_ids)
+                )
+                start_thinking = len(prompt_tok_ids) - think_count - 1
+                countdown -= think_count
+                continue_thinking = True
+                # check if the token is exhausted within prompt
+                token_exhausted = thinking_token_budget - think_count
+                in_end = token_exhausted <= 0
+            else:
+                think_count = 0
+
+        return {
+            "in_think": in_think,
+            "in_end": in_end,
+            "check_count_down": countdown,
+            "think_count": think_count,
+            "end_count": 0,
+            "prompt_tok_ids": prompt_tok_ids,
+            "output_tok_ids": [],
+            "thinking_token_budget": thinking_token_budget,
+            "prev_output_length": 0,
+            "spec_token_ids": [],
+            "force_index": [],
+            "start_thinking": start_thinking,
+            "end_thinking": -1,
+            "in_spec_mode": False,
+            "bonus_token_forced": False,
+            "continue_thinking": continue_thinking,
+        }
+
+    def _update_think_state(self, state: dict[str, Any]) -> None:
+        if state.get("thinking_token_budget", -1) == -1:
+            return
+        if len(self.think_end_token_ids) == 0:
+            state["thinking_token_budget"] = -1
+            state["in_end"] = False
+            state["force_index"] = []
+            return
+
+        if state["start_thinking"] == -1:
+            start_thinking = self._find_last_sequence_index(
+                state.get("output_tok_ids", []), self.think_start_token_ids
+            )
+            state["start_thinking"] = start_thinking
+        if state["end_thinking"] == -1:
+            end_thinking = self._find_last_sequence_index(
+                state.get("output_tok_ids", []), self.think_end_token_ids
+            )
+            state["end_thinking"] = end_thinking
+
+        if state["start_thinking"] == -1:
+            return
+
+        if state["continue_thinking"]:
+            sampled_tokens_from_previous_step = len(
+                state.get("output_tok_ids", [])
+            ) - state.get("prev_output_length", 0)
+        else:
+            if state["prev_output_length"] == 0:
+                sampled_tokens_from_previous_step = len(
+                    state.get("output_tok_ids", [])
+                ) - len(self.think_start_token_ids)
+            else:
+                sampled_tokens_from_previous_step = (
+                    len(state.get("output_tok_ids", [])) - state["prev_output_length"]
+                )
+        current_step_countdown = (
+            state["check_count_down"] - sampled_tokens_from_previous_step
+        )
+        predicted_countdown = current_step_countdown - len(state["spec_token_ids"]) - 1
+        # We only proceed further if we have counted down the thinking budget
+        # to 0 or less and when we are in the "in think" mode.
+        if (
+            not state.get("in_end", False)
+            and predicted_countdown >= 0
+            and state["start_thinking"] > -1
+        ):
+            state["check_count_down"] = current_step_countdown
+            state["prev_output_length"] = len(state.get("output_tok_ids", []))
+            return
+        output = state.get("output_tok_ids", [])
+        if not output:
+            # When in_end was set at init (budget=0, prompt already in think),
+            # we must force the first generated token to be the end token;
+            # otherwise apply() sees in_end=True but force_index=[] and
+            # allows an extra thinking token.
+            if state.get("in_end", False):
+                state["force_index"] = [0]
+            return
+
+        # Track previous output length for incremental processing
+        prev_length = state.get("prev_output_length", 0)
+        current_length = len(output)
+
+        if current_length <= prev_length:
+            if state.get("in_end", False):
+                remaining_budget = state["thinking_token_budget"] - state["think_count"]
+                spec_len = len(state["spec_token_ids"])
+                if spec_len > 0:
+                    if 0 < remaining_budget < spec_len:
+                        state["force_index"] = [remaining_budget]
+                    elif remaining_budget <= 0:
+                        state["force_index"] = [0]
+                    else:
+                        state["force_index"] = [spec_len]
+                else:
+                    state["force_index"] = [0]
+            return
+
+        state["prev_output_length"] = current_length
+
+        start_len = len(self.think_start_token_ids)
+        absolute_start_pos = state["start_thinking"]
+
+        if state["continue_thinking"] and state["end_thinking"] > -1:
+            absolute_end_pos = state["end_thinking"] + len(
+                state.get("prompt_tok_ids") or []
+            )
+        else:
+            absolute_end_pos = state["end_thinking"]
+        # Update state based on recent sequences
+        # This is the case where we are in end mode, but the rejection sampler
+        # rejected a token before the end token,
+        # so we need to go back to think mode and wait for the next end token
+        # eg with 999: [2,4,5,999] -> [3,-1,-1,-1]
+        if state["in_end"] and state["end_count"] == 0:
+            new_tokens = output[prev_length:]
+            stopping_thinking = (
+                self.think_end_token_ids[state["end_count"]] in new_tokens
+            )
+            if not stopping_thinking:
+                state["in_think"] = True
+                state["in_end"] = False
+                state["end_count"] = 0
+                state["bonus_token_forced"] = False
+
+        if not state["in_end"]:
+            if absolute_start_pos >= 0 and absolute_end_pos >= 0:
+                # Case: ...<end>...<start>... - entering think mode
+                if absolute_start_pos > absolute_end_pos:
+                    new_think_count = current_length - (absolute_start_pos + start_len)
+                    state["in_think"] = True
+                    state["think_count"] = new_think_count
+                else:
+                    # Case: ...<start>...<end>... - exiting think mode
+                    state["in_think"] = False
+                    state["think_count"] = 0
+
+            elif absolute_start_pos >= 0 and not state["continue_thinking"]:
+                # Found think start - entering think mode
+                new_think_count = current_length - (absolute_start_pos + start_len)
+                state["in_think"] = True
+                state["think_count"] = new_think_count
+
+            elif absolute_end_pos >= 0:
+                # Found think end - exiting think mode
+                state["in_think"] = False
+                state["think_count"] = 0
+
+            elif state["in_think"]:
+                # Continue thinking mode, increment count by new tokens
+                prompt_tok_ids = state.get("prompt_tok_ids") or []
+                think_tokens_in_prompt = len(prompt_tok_ids) - (
+                    absolute_start_pos + start_len
+                )
+                state["think_count"] = (
+                    len(state["output_tok_ids"]) + think_tokens_in_prompt
+                )
+            if state["in_think"]:
+                remaining_budget = max(
+                    0, state["thinking_token_budget"] - state["think_count"]
+                )
+                state["check_count_down"] = remaining_budget
+            else:
+                state["check_count_down"] = state["thinking_token_budget"]
+
+            total_thinking_tokens = (
+                state["think_count"] + len(state["spec_token_ids"]) + 1
+            )
+            # Check if need to transition to end mode
+            # If we have more thinking tokens than the budget,
+            # we need to transition to end mode
+            if (
+                state["in_think"]
+                and total_thinking_tokens > state["thinking_token_budget"]
+            ):
+                # Calculate force_index: position within spec_token_ids where
+                # forcing starts. If we're already over budget without spec
+                # tokens, force from position 0. Force from the position
+                # where budget is exceeded.
+                state["in_think"] = False
+                state["in_end"] = True
+                state["end_count"] = 0
+                state["check_count_down"] = state["thinking_token_budget"]
+                remaining_budget = state["thinking_token_budget"] - state["think_count"]
+                spec_len = len(state["spec_token_ids"])
+                if 0 < remaining_budget < spec_len:
+                    state["force_index"] = [remaining_budget]
+
+                elif remaining_budget <= 0:
+                    state["force_index"] = [0]
+
+                else:
+                    # remaining_budget >= spec_len: all spec tokens are within
+                    # budget; force the bonus token position
+                    state["force_index"] = [len(state["spec_token_ids"])]
+
+        else:
+            state["force_index"] = []
+            if len(state["spec_token_ids"]) > 0:
+                for i, token_id in enumerate(state["spec_token_ids"]):
+                    if state["end_count"] + 1 < len(self.think_end_token_ids):
+                        if token_id == self.think_end_token_ids[state["end_count"] + 1]:
+                            state["end_count"] += 1
+                        else:
+                            state["end_count"] += 1
+                            state["force_index"] = [i]
+                            break
+                    else:
+                        state["end_count"] += 1
+                if len(state["force_index"]) == 0:
+                    state["end_count"] += 1
+                    state["force_index"] = [len(state["spec_token_ids"])]
+            else:
+                state["end_count"] += 1
+                state["force_index"] = [0]
+            if state["end_count"] >= len(self.think_end_token_ids):
+                state.update(
+                    {
+                        "in_end": False,
+                        "end_count": 0,
+                        "check_count_down": state["thinking_token_budget"],
+                    }
+                )
+
+    def _apply_forcing_to_logits(
+        self,
+        logits: torch.Tensor,
+        predict_bonus_token: bool,
+        spec_token_ids_for_layout: list[list[int]],
+    ) -> torch.Tensor:
+        self.mask[:] = False
+        cumulative_total = 0
+        self.cu_num_tokens.clear()
+
+        n_layout = len(spec_token_ids_for_layout)
+        if self._state:
+            n_layout = max(n_layout, max(self._state.keys()) + 1)
+
+        for index in range(n_layout):
+            self.cu_num_tokens[index] = cumulative_total
+            spec_tokens = (
+                spec_token_ids_for_layout[index]
+                if index < len(spec_token_ids_for_layout)
+                else []
+            )
+            if self.in_spec_mode:
+                cumulative_total += len(spec_tokens) if not predict_bonus_token else 1
+            else:
+                cumulative_total += 1
+
+        for seq_idx in sorted(self._state.keys()):
+            if seq_idx not in self.cu_num_tokens:
+                continue
+            state = self._state[seq_idx]
+            if state.get("in_end", False):
+                # logits processor in spec mode are called twice
+                # once for bonus token logits and
+                # second time for the target logits
+                # in case the force index is bonus token index
+                # we change the force index to 0
+                if predict_bonus_token:
+                    if state.get("force_index") and state["force_index"][0] < len(
+                        state["spec_token_ids"]
+                    ):
+                        continue
+                    else:
+                        state["force_index"] = [0]
+                # continue enforcing the end thinking tokens
+                if state["end_count"] > 0:
+                    state["bonus_token_forced"] = False
+                if state and not state["bonus_token_forced"]:
+                    force_index = state.get("force_index", [])
+                    if len(force_index) == 0:
+                        continue
+                    end_count = state.get("end_count", 0)
+                    for force_idx in force_index:
+                        if end_count < len(self.think_end_token_ids):
+                            mask_idx = self.cu_num_tokens[seq_idx] + force_idx
+                            if mask_idx < len(self.mask) and mask_idx < logits.shape[0]:
+                                self.mask[mask_idx] = True
+                                self.force_token_ids[mask_idx] = (
+                                    self.think_end_token_ids[end_count]
+                                )
+                            if predict_bonus_token:
+                                if state["end_count"] > 0:
+                                    state["bonus_token_forced"] = False
+                                    state["force_index"] = []
+                                else:
+                                    state["bonus_token_forced"] = True
+
+        has_active_thinking = any(
+            state.get("in_end", False) for state in self._state.values()
+        )
+
+        if has_active_thinking:
+            active_indices = self.mask.nonzero(as_tuple=False).view(-1)
+
+            if len(active_indices) > 0:
+                force_tokens = self.force_token_ids[active_indices]
+                logits[active_indices, force_tokens] = 1e9
+
+        return logits
diff --git a/vllm/v1/simple_kv_offload/cuda_mem_ops.py b/vllm/v1/simple_kv_offload/cuda_mem_ops.py
index 03338421c457..b4c68aff3ca9 100644
--- a/vllm/v1/simple_kv_offload/cuda_mem_ops.py
+++ b/vllm/v1/simple_kv_offload/cuda_mem_ops.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Low-level CUDA memory helpers: pinning and batch DMA transfers."""
+"""Low-level CUDA/HIP memory helpers: pinning and batch DMA transfers."""
 
 import ctypes
 from typing import Any, NamedTuple
@@ -9,6 +9,7 @@
 import torch
 
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
@@ -39,7 +40,7 @@ class _CUmemcpyAttributes(ctypes.Structure):
 
 
 _BATCH_MEMCPY_FUNC_TYPE = ctypes.CFUNCTYPE(
-    ctypes.c_uint,  # CUresult
+    ctypes.c_uint,  # CUresult / hipError_t
     ctypes.c_void_p,
     ctypes.c_void_p,
     ctypes.c_void_p,
@@ -56,7 +57,42 @@ class _CUmemcpyAttributes(ctypes.Structure):
 
 
 def _resolve_batch_memcpy():
-    """Resolve cuMemcpyBatchAsync via cuGetProcAddress (one-time)."""
+    """Resolve the platform batch-memcpy entry point (one-time).
+
+    * CUDA: ``cuMemcpyBatchAsync`` via ``cuGetProcAddress`` (uses
+      srcAccessOrder=STREAM via one attributes entry).
+    * ROCm: ``hipMemcpyBatchAsync`` from libamdhip64 (ROCm 7.1+). ROCm
+      7.2.1 or 7.2.2 rejects any call with ``numAttrs > 0``
+      (see ROCm/clr @ rocm-7.2.1 hipamd/src/hip_memory.cpp:2819-2822), so
+      we call with ``numAttrs=0``.
+
+    Raises ``RuntimeError`` if the symbol is unavailable (older CUDA
+    driver, ROCm < 7.1, unusual install). The connector requires the
+    batch API.
+    """
+    if current_platform.is_rocm():
+        try:
+            lib = ctypes.CDLL("libamdhip64.so", mode=ctypes.RTLD_GLOBAL)
+            fn = lib.hipMemcpyBatchAsync
+        except (OSError, AttributeError) as e:
+            raise RuntimeError(
+                "hipMemcpyBatchAsync is unavailable in this ROCm install; "
+                "SimpleCPUOffloadConnector requires ROCm 7.1+."
+            ) from e
+        fn.restype = ctypes.c_uint
+        fn.argtypes = [
+            ctypes.c_void_p,  # dsts
+            ctypes.c_void_p,  # srcs
+            ctypes.c_void_p,  # sizes
+            ctypes.c_size_t,  # count
+            ctypes.c_void_p,  # attrs
+            ctypes.c_void_p,  # attrIdxs
+            ctypes.c_size_t,  # numAttrs
+            ctypes.c_void_p,  # failIdx
+            ctypes.c_void_p,  # stream
+        ]
+        return fn
+
     from cuda.bindings import driver as drv
 
     err, ptr, _ = drv.cuGetProcAddress(b"cuMemcpyBatchAsync", 12080, 0)
@@ -70,6 +106,8 @@ class BatchMemcpyParams(NamedTuple):
     dst_bases: np.ndarray  # [num_layers] uint64
     bpb: np.ndarray  # [num_layers] uint64 — bytes per block
     num_layers: int
+    # CUDA only: one attributes entry with srcAccessOrder=ANY. Unused on
+    # ROCm (7.2.1 or 7.2.2) because the current runtime rejects numAttrs > 0.
     attrs: _CUmemcpyAttributes
     attrs_idx: ctypes.c_size_t
     # NOTE: cuMemcpyBatchAsync_v2() removed fail_idx field, but we use
@@ -99,8 +137,10 @@ def build_params(
         dst_bases.append(d.data_ptr())
         bpb.append(s_bpb)
 
-    # Refer to https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6f1ff58e3065df3eb4b573dba77ad31f for details.  # noqa: E501
-    attrs = _CUmemcpyAttributes(srcAccessOrder=3)  # ANY
+    # ``srcAccessOrder=3`` == CU_MEMCPY_SRC_ACCESS_ORDER_ANY /
+    # hipMemcpySrcAccessOrderAny. See
+    # https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6f1ff58e3065df3eb4b573dba77ad31f  # noqa: E501
+    attrs = _CUmemcpyAttributes(srcAccessOrder=3)
 
     return BatchMemcpyParams(
         src_bases=np.array(src_bases, dtype=np.uint64),
@@ -119,7 +159,7 @@ def copy_blocks(
     dst_block_ids: list[int],
     params: BatchMemcpyParams,
 ) -> None:
-    """Copy blocks via cuMemcpyBatchAsync."""
+    """Copy blocks via cuMemcpyBatchAsync / hipMemcpyBatchAsync."""
     n = len(src_block_ids)
     if n == 0:
         return
@@ -134,8 +174,13 @@ def copy_blocks(
         params.dst_bases[:, None] + dst_ids[None, :] * params.bpb[:, None]
     ).ravel()
     sz_all = np.repeat(params.bpb, n)
-
     total = n * params.num_layers
+
+    # ROCm 7.2.1/7.2.2 rejects any call with numAttrs>0 (hipMemcpyBatchAsync
+    # hipamd/src/hip_memory.cpp:2819-2822); CUDA uses one attrs entry so
+    # srcAccessOrder is honored. attrs / attrsIdxs are ignored when
+    # numAttrs==0, so we pass the same values from both paths.
+    num_attrs = 0 if current_platform.is_rocm() else 1
     err = _batch_memcpy_fn(
         dst_all.ctypes.data,
         src_all.ctypes.data,
@@ -143,11 +188,11 @@ def copy_blocks(
         total,
         ctypes.addressof(params.attrs),
         ctypes.byref(params.attrs_idx),
-        1,
+        num_attrs,
         ctypes.byref(params.fail_idx),
         params.stream_handle,
     )
     if err != 0:
         raise RuntimeError(
-            f"cuMemcpyBatchAsync failed: err={err} failIdx={params.fail_idx.value}"
+            f"batch memcpy failed: err={err} failIdx={params.fail_idx.value}"
         )
diff --git a/vllm/v1/spec_decode/dflash.py b/vllm/v1/spec_decode/dflash.py
index 2143b6a3a98e..db74044f4fde 100644
--- a/vllm/v1/spec_decode/dflash.py
+++ b/vllm/v1/spec_decode/dflash.py
@@ -80,9 +80,8 @@ def _create_draft_vllm_config(self) -> VllmConfig:
         )
 
     @override
-    def _raise_if_multimodal(self):
+    def _warn_if_multimodal(self):
         # Override to allow multimodal inputs since DFlash supports Qwen3.5 models
-        # Support for multimodal inputs has not been tested.
         pass
 
     @override
diff --git a/vllm/v1/spec_decode/gemma4.py b/vllm/v1/spec_decode/gemma4.py
new file mode 100644
index 000000000000..b0a02774faf6
--- /dev/null
+++ b/vllm/v1/spec_decode/gemma4.py
@@ -0,0 +1,335 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Gemma4 MTP (Multi-Token Prediction) proposer for speculative decoding.
+
+The Gemma4 assistant model runs all decoder layers per draft step
+(producing one token), and all its attention layers share KV cache
+with the target model via cross-model KV sharing.
+"""
+
+from collections import defaultdict
+from copy import copy
+
+import torch
+import torch.nn as nn
+
+from vllm.config import VllmConfig, get_layers_from_vllm_config, replace
+from vllm.logger import init_logger
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.v1.attention.backend import CommonAttentionMetadata
+from vllm.v1.kv_cache_interface import (
+    KVCacheConfig,
+    KVCacheSpec,
+    UniformTypeKVCacheSpecs,
+)
+from vllm.v1.spec_decode.llm_base_proposer import SpecDecodeBaseProposer
+from vllm.v1.worker.utils import AttentionGroup
+
+logger = init_logger(__name__)
+
+
+class Gemma4Proposer(SpecDecodeBaseProposer):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        device: torch.device,
+        runner=None,
+    ):
+        super().__init__(
+            vllm_config,
+            device,
+            pass_hidden_states_to_model=True,
+            runner=runner,
+        )
+        # All draft steps predict from the same position (the last
+        # target-model position), so positions and seq_lens must not
+        # advance between steps.
+        self.constant_draft_positions = True
+
+        # Per-group block tables for multi-group KV cache models.
+        # Populated by gpu_model_runner during _prepare_inputs.
+        self._per_group_block_tables: dict[int, torch.Tensor] = {}
+
+        # Centroids CUDA graphs — populated in load_model if centroids
+        # masking is active. _centroids_sizes is pre-sorted for fast
+        # lookup in _greedy_sample.
+        self._centroids_sizes: list[int] = []
+        self._centroids_graphs: dict[int, torch.cuda.CUDAGraph] = {}
+        self._centroids_inputs: dict[int, torch.Tensor] = {}
+        self._centroids_outputs: dict[int, torch.Tensor] = {}
+
+    def set_per_group_block_table(self, gid: int, block_table: torch.Tensor) -> None:
+        self._per_group_block_tables[gid] = block_table
+
+    def model_returns_tuple(self) -> bool:
+        # forward() returns (draft_hidden_states, backbone_hidden_states).
+        # The proposer uses draft_hidden_states for compute_logits and
+        # backbone_hidden_states for the hidden-state feedback buffer.
+        return True
+
+    def build_per_group_and_layer_attn_metadata(
+        self,
+        common_attn_metadata: CommonAttentionMetadata,
+        draft_index: int = 0,
+    ) -> tuple[list[object], dict[str, object]]:
+        """Build attention metadata using the correct block table per group.
+
+        Gemma4 has multiple KV cache groups (sliding vs full attention)
+        with different block tables.  The base class receives a single
+        common_attn_metadata whose block_table belongs to one group.
+        We swap in the correct block table for each draft attention group.
+        """
+        per_group_attn_metadata: list[object] = []
+        per_layer_attn_metadata: dict[str, object] = {}
+        for attn_group in self.draft_attn_groups:
+            gid = attn_group.kv_cache_group_id
+            if gid in self._per_group_block_tables:
+                cm = copy(common_attn_metadata)
+                cm.block_table_tensor = self._per_group_block_tables[gid]
+            else:
+                cm = common_attn_metadata
+            attn_metadata = attn_group.get_metadata_builder().build_for_drafting(
+                common_attn_metadata=cm, draft_index=draft_index
+            )
+            per_group_attn_metadata.append(attn_metadata)
+            for layer_name in attn_group.layer_names:
+                per_layer_attn_metadata[layer_name] = attn_metadata
+        return per_group_attn_metadata, per_layer_attn_metadata
+
+    def _greedy_sample(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if self._centroids_sizes:
+            T = hidden_states.shape[0]
+            for size in self._centroids_sizes:
+                if size >= T:
+                    self._centroids_inputs[size][:T].copy_(hidden_states)
+                    self._centroids_graphs[size].replay()
+                    return self._centroids_outputs[size][:T].clone()
+            return self.model.get_top_tokens(hidden_states)
+        return super()._greedy_sample(hidden_states)
+
+    def _setup_centroids_cuda_graphs(self) -> None:
+        """Capture CUDA graphs for centroids get_top_tokens at key sizes."""
+        masked_emb = self.model.masked_embedding
+        lm_head_weight = self.model._get_full_lm_head_weight()
+
+        for size in [1, 2, 4, 8, 16, 32, 64]:
+            static_input = torch.zeros(
+                size,
+                masked_emb.hidden_size,
+                dtype=self.dtype,
+                device=self.device,
+            )
+            for _ in range(3):
+                masked_emb.get_top_tokens(static_input, lm_head_weight)
+            torch.accelerator.synchronize()
+
+            g = torch.cuda.CUDAGraph()
+            with torch.cuda.graph(g):
+                static_output = masked_emb.get_top_tokens(
+                    static_input,
+                    lm_head_weight,
+                )
+            self._centroids_graphs[size] = g
+            self._centroids_inputs[size] = static_input
+            self._centroids_outputs[size] = static_output
+
+        self._centroids_sizes = sorted(self._centroids_graphs)
+        logger.info(
+            "Gemma4 MTP: captured centroids CUDA graphs for sizes %s.",
+            self._centroids_sizes,
+        )
+
+    def _create_draft_vllm_config(self) -> VllmConfig:
+        """Preserve the target's forced TRITON_ATTN backend for draft layers.
+
+        Gemma4 forces TRITON_ATTN due to heterogeneous head dimensions
+        (head_dim=256 sliding, global_head_dim=512 full). The base class
+        resets attention_config.backend to None for draft models, causing
+        sliding layers to fall back to FLASH_ATTN which cannot handle
+        KV-shared cache. Override to carry the target's backend through.
+        """
+        base = super()._create_draft_vllm_config()
+        target_backend = self.vllm_config.attention_config.backend
+        if target_backend is not None:
+            base = replace(
+                base,
+                attention_config=replace(
+                    base.attention_config,
+                    backend=target_backend,
+                ),
+            )
+        return base
+
+    def _maybe_share_lm_head(self, target_language_model: nn.Module) -> None:
+        """Gemma4 MTP always keeps its own draft-dim lm_head.
+
+        The draft model's lm_head operates in draft hidden_size (e.g. 256),
+        which differs from the target's backbone hidden_size (e.g. 1536).
+        Sharing would break compute_logits (and centroids masking when
+        use_ordered_embeddings is enabled).
+        """
+        logger.info(
+            "Gemma4 MTP: keeping draft model's own lm_head (draft_dim != backbone_dim)."
+        )
+
+    def load_model(self, target_model: nn.Module) -> None:
+        target_attn_layer_names = set(
+            get_layers_from_vllm_config(
+                self.vllm_config,
+                AttentionLayerBase,  # type: ignore[type-abstract]
+            ).keys()
+        )
+
+        super().load_model(target_model)
+
+        self._setup_gemma4_kv_sharing(target_attn_layer_names)
+
+        if getattr(self.model, "masked_embedding", None) is not None:
+            self._setup_centroids_cuda_graphs()
+
+    def validate_same_kv_cache_group(self, kv_cache_config: KVCacheConfig) -> None:
+        """Draft layers span multiple KV cache groups (sliding + full
+        attention with different head dimensions), so skip the base
+        class single-group assertion."""
+
+    def initialize_attn_backend(
+        self,
+        kv_cache_config: KVCacheConfig,
+        kernel_block_sizes: list[int] | None = None,
+    ) -> None:
+        """Create separate AttentionGroup objects per KV cache spec
+        so that each head-dim variant gets its own metadata builder."""
+        all_attn_layers = get_layers_from_vllm_config(
+            self.vllm_config,
+            AttentionLayerBase,  # type: ignore[type-abstract]
+        )
+
+        layer_to_gid: dict[str, int] = {}
+        layer_to_spec: dict[str, KVCacheSpec] = {}
+        for gid, group in enumerate(kv_cache_config.kv_cache_groups):
+            group_spec = group.kv_cache_spec
+            for ln in group.layer_names:
+                layer_to_gid[ln] = gid
+                if isinstance(group_spec, UniformTypeKVCacheSpecs):
+                    if ln in group_spec.kv_cache_specs:
+                        layer_to_spec[ln] = group_spec.kv_cache_specs[ln]
+                    else:
+                        tgt = getattr(
+                            all_attn_layers.get(ln),
+                            "kv_sharing_target_layer_name",
+                            None,
+                        )
+                        if tgt and tgt in group_spec.kv_cache_specs:
+                            layer_to_spec[ln] = group_spec.kv_cache_specs[tgt]
+                        else:
+                            layer_to_spec[ln] = group_spec
+                else:
+                    layer_to_spec[ln] = group_spec
+
+        attention_groups: dict[tuple[tuple[str, str], KVCacheSpec], AttentionGroup] = {}
+        for layer_name in self._draft_attn_layer_names:
+            if layer_name not in layer_to_spec:
+                continue
+            attn_layer = all_attn_layers[layer_name]
+            attn_backend = attn_layer.get_attn_backend()
+            spec = layer_to_spec[layer_name]
+            gid = layer_to_gid[layer_name]
+            group_key = (attn_backend.full_cls_name(), spec)
+
+            if group_key not in attention_groups:
+                kernel_block_size = (
+                    kernel_block_sizes[gid]
+                    if kernel_block_sizes is not None and gid < len(kernel_block_sizes)
+                    else None
+                )
+                attn_group = AttentionGroup(
+                    backend=attn_backend,
+                    layer_names=[layer_name],
+                    kv_cache_spec=spec,
+                    kv_cache_group_id=gid,
+                )
+                attn_group.create_metadata_builders(
+                    self.vllm_config,
+                    self.device,
+                    kernel_block_size=kernel_block_size,
+                )
+                attention_groups[group_key] = attn_group
+            else:
+                attention_groups[group_key].layer_names.append(layer_name)
+
+        self.draft_attn_groups = list(attention_groups.values())
+        if self.draft_attn_groups:
+            self.kv_cache_gid = self.draft_attn_groups[0].kv_cache_group_id
+            self.block_size = (
+                self.draft_attn_groups[0]
+                .get_metadata_builder()
+                .kv_cache_spec.block_size
+            )
+        else:
+            self.kv_cache_gid = 0
+            self.block_size = kv_cache_config.kv_cache_groups[
+                0
+            ].kv_cache_spec.block_size
+        logger.debug("Using block size %d for drafting layers", self.block_size)
+
+    def _setup_gemma4_kv_sharing(
+        self,
+        target_attn_layer_names: set[str],
+    ) -> None:
+        """Wire draft layers to share KV with the target model.
+
+        Each draft decoder layer is mapped to the last non-KV-shared
+        target layer of the same attention type (sliding or full).
+        """
+        draft_config = self.speculative_config.draft_model_config.hf_config
+        draft_text_config = draft_config.get_text_config()
+        target_config = self.vllm_config.model_config.hf_config
+        target_text_config = target_config.get_text_config()
+        target_layer_types = getattr(target_text_config, "layer_types", [])
+
+        if not (hasattr(self.model, "model") and hasattr(self.model.model, "layers")):
+            return
+
+        target_num_kv_shared = getattr(target_text_config, "num_kv_shared_layers", 0)
+        num_non_shared = len(target_layer_types) - target_num_kv_shared
+        type_to_target_indices: dict[str, list[int]] = defaultdict(list)
+        for idx, lt in enumerate(target_layer_types[:num_non_shared]):
+            type_to_target_indices[lt].append(idx)
+
+        target_prefix = "model.layers"
+        for name in target_attn_layer_names:
+            if ".layers." in name:
+                target_prefix = name.split(".layers.")[0] + ".layers"
+                break
+
+        draft_layer_types = getattr(draft_text_config, "layer_types", [])
+        for draft_idx, layer in enumerate(self.model.model.layers):
+            if not hasattr(layer, "self_attn"):
+                continue
+            attn = getattr(layer.self_attn, "attn", None)
+            if attn is None:
+                continue
+
+            draft_layer_type = (
+                draft_layer_types[draft_idx]
+                if draft_idx < len(draft_layer_types)
+                else "full_attention"
+            )
+            candidates = type_to_target_indices.get(draft_layer_type, [])
+            if not candidates:
+                logger.warning(
+                    "No target layer of type '%s' for draft layer %d",
+                    draft_layer_type,
+                    draft_idx,
+                )
+                continue
+
+            target_idx = candidates[-1]
+            target_layer_name = f"{target_prefix}.{target_idx}.self_attn.attn"
+            attn.kv_sharing_target_layer_name = target_layer_name
+            logger.info(
+                "Gemma4 MTP: draft layer %d (%s) -> %s",
+                draft_idx,
+                draft_layer_type,
+                target_layer_name,
+            )
diff --git a/vllm/v1/spec_decode/llm_base_proposer.py b/vllm/v1/spec_decode/llm_base_proposer.py
index 1c59704de602..8ee349a1cc0d 100644
--- a/vllm/v1/spec_decode/llm_base_proposer.py
+++ b/vllm/v1/spec_decode/llm_base_proposer.py
@@ -105,6 +105,12 @@ def __init__(
         )
         self.needs_extra_input_slots = self.net_num_new_slots_per_request > 0
 
+        # When True, all draft steps reuse the same position as the
+        # first step instead of advancing by one each iteration.
+        # Used by draft models with Q-only attention that share KV
+        # with the target and always predict from the same position.
+        self.constant_draft_positions: bool = False
+
         self.parallel_drafting_token_id: int = 0
         self.parallel_drafting_hidden_state_tensor: torch.Tensor | None = None
         if self.parallel_drafting:
@@ -193,7 +199,7 @@ def __init__(
 
         if self.needs_extra_input_slots:
             self._raise_if_padded_drafter_batch_disabled()
-            self._raise_if_multimodal()
+            self._warn_if_multimodal()
             self._raise_if_mrope()
 
         self.is_rejected_token_mask: torch.Tensor | None = None
@@ -309,11 +315,12 @@ def _raise_if_padded_drafter_batch_disabled(self):
                 "disable_padded_drafter_batch in the speculative_config."
             )
 
-    def _raise_if_multimodal(self):
+    def _warn_if_multimodal(self):
         if self.supports_mm_inputs:
-            raise NotImplementedError(
+            logger.warning(
                 "Speculative Decoding with draft models or parallel drafting "
-                "does not support multimodal models yet"
+                "does not fully support multimodal models yet. "
+                "Proceeding with text-only speculative decoding."
             )
 
     def _raise_if_mrope(self):
@@ -388,9 +395,9 @@ def _get_slot_mapping(
         return {name: view for name in self._draft_attn_layer_names}
 
     def initialize_cudagraph_keys(self, cudagraph_mode: CUDAGraphMode) -> None:
-        """Initialize cudagraph dispatcher keys for eagle.
+        """Initialize cudagraph dispatcher keys for the drafter.
 
-        Eagle only supports PIECEWISE cudagraphs (via mixed_mode).
+        Only supports PIECEWISE cudagraphs (via mixed_mode).
         This should be called after adjust_cudagraph_sizes_for_spec_decode.
         """
         if (
@@ -499,6 +506,12 @@ def propose(
             positions = self.positions[token_indices_to_sample]
         hidden_states = hidden_states[token_indices_to_sample]
 
+        if self.constant_draft_positions:
+            # Write the sampling positions into the front of the
+            # positions buffer so that subsequent loop iterations
+            # (which read via _get_positions) use the correct values.
+            self.positions[:batch_size] = positions
+
         if any(isinstance(md, TreeAttentionMetadata) for md in per_group_attn_metadata):
             # Draft using tree attention - requires full logits for top-k
             logits = self.model.compute_logits(sample_hidden_states)
@@ -556,59 +569,25 @@ def propose(
             # cast to int32 is crucial when eagle model is compiled.
             # tensor.argmax() returns int64 by default.
             input_ids = draft_token_ids_list[-1].int()
-            # Use fused kernel for slot mapping and metadata updates.
-            # Write clamped positions directly into the positions buffer to
-            # avoid an extra D2D copy for the common (non-mrope) case.
-            positions_1d = positions[0] if self.uses_mrope else positions
-            if self.uses_mrope:
-                out_pos = self.mrope_positions[0, :batch_size]
-            elif self.uses_xdrope_dim > 0 and self.draft_uses_xdrope_dim > 0:
-                out_pos = self.xdrope_positions[0, :batch_size]
-            else:
-                out_pos = self.positions[:batch_size]
-            eagle_step_update_slot_mapping_and_metadata(
-                positions_1d=positions_1d,
-                block_table_tensor=common_attn_metadata.block_table_tensor,
-                seq_lens=common_attn_metadata.seq_lens,
-                block_size=block_size,
-                max_model_len=self.max_model_len,
-                out_clamped_positions=out_pos,
-                out_slot_mapping=self._slot_mapping_buffer[:input_batch_size],
-                input_batch_size=input_batch_size,
-            )
-            common_attn_metadata.slot_mapping = self._slot_mapping_buffer[:batch_size]
-            if self.uses_mrope:
-                self.mrope_positions[1:, :batch_size] = self.mrope_positions[
-                    0, :batch_size
-                ]
-                positions = self.mrope_positions[:, :batch_size]
-            elif self.uses_xdrope_dim > 0 and self.draft_uses_xdrope_dim > 0:
-                self.xdrope_positions[1:, :batch_size] = self.xdrope_positions[
-                    0, :batch_size
-                ]
-                positions = self.xdrope_positions[0, :batch_size]
-            else:
-                positions = self.positions[:batch_size]
-            # Increment the maximum sequence length. We increment max_seq_len
-            # unconditionally even though some seq_lens may have been capped above,
-            # as max_seq_len serves as an upper bound for sequence lengths.
-            common_attn_metadata.max_seq_len = min(
-                common_attn_metadata.max_seq_len + 1, self.max_model_len
-            )
 
-            # Also update the CPU-side shadow; NOTE: this is hacky and should be
-            # removed in when common_attn_metadata.seq_lens_cpu is deprecated.
-            if common_attn_metadata._seq_lens_cpu is not None:
-                common_attn_metadata._seq_lens_cpu += 1
-            if common_attn_metadata._num_computed_tokens_cpu is not None:
-                common_attn_metadata._num_computed_tokens_cpu += 1
-            if common_attn_metadata.seq_lens_cpu_upper_bound is not None:
-                common_attn_metadata.seq_lens_cpu_upper_bound += 1
-
-            # Rebuild attention metadata
-            _, per_layer_attn_metadata = self.build_per_group_and_layer_attn_metadata(
-                common_attn_metadata, draft_index=token_index + 1
-            )
+            if not self.constant_draft_positions:
+                positions = self._update_positions_dependent_metadata(
+                    positions,
+                    common_attn_metadata,
+                    batch_size,
+                    input_batch_size,
+                    block_size,
+                )
+
+            # Rebuild attention metadata. When draft positions are constant
+            # (e.g. Gemma4 MTP), common_attn_metadata is invariant across
+            # loop iterations so we build once and reuse.
+            if not self.constant_draft_positions or token_index == 0:
+                _, per_layer_attn_metadata = (
+                    self.build_per_group_and_layer_attn_metadata(
+                        common_attn_metadata, draft_index=token_index + 1
+                    )
+                )
 
             # copy inputs to buffer for cudagraph
             self.input_ids[:batch_size] = input_ids
@@ -654,6 +633,58 @@ def propose(
         draft_token_ids = torch.stack(draft_token_ids_list, dim=1)
         return draft_token_ids
 
+    def _update_positions_dependent_metadata(
+        self,
+        positions: torch.Tensor,
+        common_attn_metadata,
+        batch_size: int,
+        input_batch_size: int,
+        block_size: int,
+    ) -> torch.Tensor:
+        """Update positions, slot mappings, and sequence metadata for the
+        next draft step. Returns the updated positions tensor."""
+        positions_1d = positions[0] if self.uses_mrope else positions
+        if self.uses_mrope:
+            out_pos = self.mrope_positions[0, :batch_size]
+        elif self.uses_xdrope_dim > 0 and self.draft_uses_xdrope_dim > 0:
+            out_pos = self.xdrope_positions[0, :batch_size]
+        else:
+            out_pos = self.positions[:batch_size]
+        eagle_step_update_slot_mapping_and_metadata(
+            positions_1d=positions_1d,
+            block_table_tensor=common_attn_metadata.block_table_tensor,
+            seq_lens=common_attn_metadata.seq_lens,
+            block_size=block_size,
+            max_model_len=self.max_model_len,
+            out_clamped_positions=out_pos,
+            out_slot_mapping=self._slot_mapping_buffer[:input_batch_size],
+            input_batch_size=input_batch_size,
+        )
+        common_attn_metadata.slot_mapping = self._slot_mapping_buffer[:batch_size]
+        if self.uses_mrope:
+            self.mrope_positions[1:, :batch_size] = self.mrope_positions[0, :batch_size]
+            positions = self.mrope_positions[:, :batch_size]
+        elif self.uses_xdrope_dim > 0 and self.draft_uses_xdrope_dim > 0:
+            self.xdrope_positions[1:, :batch_size] = self.xdrope_positions[
+                0, :batch_size
+            ]
+            positions = self.xdrope_positions[0, :batch_size]
+        else:
+            positions = self.positions[:batch_size]
+        common_attn_metadata.max_seq_len = min(
+            common_attn_metadata.max_seq_len + 1,
+            self.max_model_len,
+        )
+
+        if common_attn_metadata._seq_lens_cpu is not None:
+            common_attn_metadata._seq_lens_cpu += 1
+        if common_attn_metadata._num_computed_tokens_cpu is not None:
+            common_attn_metadata._num_computed_tokens_cpu += 1
+        if common_attn_metadata.seq_lens_cpu_upper_bound is not None:
+            common_attn_metadata.seq_lens_cpu_upper_bound += 1
+
+        return positions
+
     def set_inputs_first_pass(
         self,
         target_token_ids: torch.Tensor,
@@ -883,16 +914,12 @@ def prepare_next_token_ids_padded(
         is not sampled and comes from `request.get_token_id()` instead. This is denoted
         the "backup" token id. It also counts rejected tokens via `sampled_token_ids`.
         """
-        # Precompute get_token_id for when there is no valid next token
+        # Precompute backup token IDs for discarded requests.
         num_reqs = gpu_input_batch.num_reqs
-        seq_lens_list = (gpu_input_batch.num_tokens_no_spec[:num_reqs] - 1).tolist()
-        self.backup_next_token_ids.np[:num_reqs] = np.array(
-            [
-                requests[gpu_input_batch.req_ids[i]].get_token_id(seq_lens_list[i])
-                for i in range(num_reqs)
-            ],
-            dtype=np.int32,
-        )
+        for i in range(num_reqs):
+            self.backup_next_token_ids.np[i] = requests[
+                gpu_input_batch.req_ids[i]
+            ].get_token_id(gpu_input_batch.num_tokens_no_spec[i] - 1)
         self.backup_next_token_ids.copy_to_gpu(num_reqs)
         backup_tokens_gpu = self.backup_next_token_ids.gpu
 
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 213b49f28d91..3080fe30b0b5 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -37,7 +37,10 @@ class StructuredOutputManager:
 
     def __init__(self, vllm_config: VllmConfig):
         self.backend: StructuredOutputBackend | None = None
-        self.reasoner: ReasoningParser | None = None
+        # We only store the class of the reasoner in the manager.
+        # The parser instance is request-scoped because some reasoning parsers
+        # depend on per-request chat-template kwargs.
+        self.reasoner_cls: type[ReasoningParser] | None = None
         self.vllm_config = vllm_config
 
         # When in external_launcher mode, async grammar compilation causes deadlocks
@@ -85,15 +88,29 @@ def __init__(self, vllm_config: VllmConfig):
                 self.vllm_config.structured_outputs_config.reasoning_parser
             )
             if reasoning_parser:
-                reasoner_cls = ReasoningParserManager.get_reasoning_parser(
+                self.reasoner_cls = ReasoningParserManager.get_reasoning_parser(
                     reasoning_parser
                 )
-                self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
 
         self.enable_in_reasoning = (
             self.vllm_config.structured_outputs_config.enable_in_reasoning
         )
 
+    def _get_reasoner(self, request: "Request") -> "ReasoningParser | None":
+        structured_req = request.structured_output_request
+        if structured_req is None or self.reasoner_cls is None:
+            return None
+
+        if structured_req.reasoner is None:
+            # Lazily build the request-local parser so the structured-output
+            # gate observes the same template kwargs used by the frontend.
+            parser_kwargs = structured_req.reasoning_parser_kwargs or {}
+            structured_req.reasoner = self.reasoner_cls(
+                tokenizer=self.tokenizer,
+                **parser_kwargs,
+            )
+        return structured_req.reasoner
+
     def grammar_init(self, request: "Request") -> None:
         if request.structured_output_request is None:
             return
@@ -285,7 +302,8 @@ def should_fill_bitmask(self, request: "Request") -> bool:
         # NOTE (Hanchen) if enable_in_reasoning is True, it means that
         # the model needs to be constrained in reasoning. So we should always
         # enable the bitmask filling.
-        if self.reasoner is not None:
+        reasoner = self._get_reasoner(request)
+        if reasoner is not None:
             if self.enable_in_reasoning:
                 return True
             assert request.structured_output_request is not None
@@ -295,7 +313,7 @@ def should_fill_bitmask(self, request: "Request") -> bool:
                 # After unifying the `openai_gptoss` and non-`openai_gptoss` styles,
                 # it can be removed.
                 request.structured_output_request.reasoning_ended = (
-                    self.reasoner.is_reasoning_end(request.prompt_token_ids or [])
+                    reasoner.is_reasoning_end(request.prompt_token_ids or [])
                 )
             return request.structured_output_request.reasoning_ended
         return True
@@ -311,7 +329,8 @@ def should_advance(self, request: "Request") -> bool:
             assert request.structured_output_request.grammar is not None
         # by default, we should always advance
         # for cases that don't use thinking mode.
-        if self.reasoner is None:
+        reasoner = self._get_reasoner(request)
+        if reasoner is None:
             return True
 
         # if the model needs structured in reasoning, we should advance
@@ -328,7 +347,7 @@ def should_advance(self, request: "Request") -> bool:
         start = (
             delta_from if delta_from >= 0 else max(len(all_token_ids) + delta_from, 0)
         )
-        if self.reasoner.is_reasoning_end_streaming(
+        if reasoner.is_reasoning_end_streaming(
             all_token_ids, itertools.islice(all_token_ids, start, None)
         ):
             # Reasoning just ended, so we shouldn't advance til
diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py
index b921a71b3cf1..dfa8c7efcae4 100644
--- a/vllm/v1/structured_output/request.py
+++ b/vllm/v1/structured_output/request.py
@@ -5,7 +5,7 @@
 import json
 from concurrent.futures import Future
 from concurrent.futures._base import TimeoutError
-from typing import cast
+from typing import TYPE_CHECKING, Any, cast
 
 from vllm.sampling_params import SamplingParams, StructuredOutputsParams
 from vllm.v1.structured_output.backend_types import (
@@ -14,12 +14,19 @@
     StructuredOutputOptions,
 )
 
+if TYPE_CHECKING:
+    from vllm.reasoning import ReasoningParser
+
 
 @dataclasses.dataclass
 class StructuredOutputRequest:
     params: StructuredOutputsParams
     _grammar: Future[StructuredOutputGrammar] | StructuredOutputGrammar | None = None
     reasoning_ended: bool | None = None
+    reasoning_parser_kwargs: dict[str, Any] | None = None
+    # Cached per request; do not share reasoning parsers across requests because
+    # their behavior can depend on reasoning_parser_kwargs.
+    reasoner: "ReasoningParser | None" = None
 
     @staticmethod
     def from_sampling_params(
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index f46e8a8ed63c..87a2aac9d4ca 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -257,6 +257,13 @@ def __init__(
                 f"must match block_sizes length ({len(block_sizes)})"
             )
 
+        # Align to a multiple of (128 / block_size) as required
+        # by some attention backends such as TRTLLM (#39324)
+        max_num_blocks = [
+            cdiv(n, 128 // bs) * (128 // bs) if bs <= 128 else n
+            for n, bs in zip(max_num_blocks, block_sizes)
+        ]
+
         self.block_tables = [
             BlockTable(
                 block_size,
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index 61fe44d251b3..22c1b5bd0917 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -21,6 +21,9 @@
 
 class CPUModelRunner(GPUModelRunner):
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
+        # avoid calling accelerator APIs for methods inherited from super class
+        _set_torch_accelerator_to_noop()
+
         with _torch_cuda_wrapper():
             super().__init__(vllm_config, device)
 
@@ -244,3 +247,11 @@ def _set_global_compilation_settings(config: VllmConfig):
         yield
     finally:
         torch_inductor_config.freezing = freezing_value
+
+
+def _set_torch_accelerator_to_noop() -> None:
+    def noop(*args: Any, **kwargs: Any) -> None:
+        pass
+
+    torch.accelerator.synchronize = noop
+    torch.accelerator.empty_cache = noop
diff --git a/vllm/v1/worker/gpu/block_table.py b/vllm/v1/worker/gpu/block_table.py
index e79a7afbd81e..3061278b019b 100644
--- a/vllm/v1/worker/gpu/block_table.py
+++ b/vllm/v1/worker/gpu/block_table.py
@@ -41,24 +41,18 @@ def __init__(
             # As a result, one block on the current rank covers `block_size * cp_size`
             # tokens in the full, global (unsharded) sequence.
             max_num_blocks = cdiv(self.max_model_len, block_size * self.cp_size)
+            # Align to a multiple of (128 / block_size) as required
+            # by some attention backends such as TRTLLM (#39324)
+            if block_size <= 128:
+                alignment = 128 // block_size
+                max_num_blocks = cdiv(max_num_blocks, alignment) * alignment
             block_table = StagedWriteTensor(
                 (self.max_num_reqs, max_num_blocks),
                 dtype=torch.int32,
                 device=device,
             )
             self.block_tables.append(block_table)
-        self.block_table_ptrs = self._make_ptr_tensor(
-            [b.gpu for b in self.block_tables]
-        )
-        self.block_table_strides = torch.tensor(
-            [b.gpu.stride(0) for b in self.block_tables],
-            dtype=torch.int64,
-            device=self.device,
-        )
 
-        self.block_sizes_tensor = torch.tensor(
-            self.block_sizes, dtype=torch.int32, device=self.device
-        )
         self.num_blocks = UvaBackedTensor(
             (self.num_kv_cache_groups, self.max_num_reqs),
             dtype=torch.int32,
@@ -69,7 +63,6 @@ def __init__(
         self.input_block_tables: list[torch.Tensor] = [
             torch.zeros_like(b.gpu) for b in self.block_tables
         ]
-        self.input_block_table_ptrs = self._make_ptr_tensor(self.input_block_tables)
 
         self.slot_mappings = torch.zeros(
             self.num_kv_cache_groups,
@@ -78,12 +71,33 @@ def __init__(
             device=self.device,
         )
 
+        self.init_block_table_layout_tensors()
+
     def _make_ptr_tensor(self, x: Iterable[torch.Tensor]) -> torch.Tensor:
         # NOTE(woosuk): Use uint64 instead of int64 to cover all possible addresses.
         return torch.tensor(
             [t.data_ptr() for t in x], dtype=torch.uint64, device=self.device
         )
 
+    def init_block_table_layout_tensors(self) -> None:
+        # Called at init and after a CuMem kv_cache wake-up. The ptr tensors
+        # cache raw data_ptr() values that go stale once the underlying tensors
+        # are reallocated on wake; block_sizes_tensor needs re-populating
+        # because its storage lives under the kv_cache pool tag and comes back
+        # with undefined contents.
+        self.block_table_ptrs = self._make_ptr_tensor(
+            [b.gpu for b in self.block_tables]
+        )
+        self.block_table_strides = torch.tensor(
+            [b.gpu.stride(0) for b in self.block_tables],
+            dtype=torch.int64,
+            device=self.device,
+        )
+        self.block_sizes_tensor = torch.tensor(
+            self.block_sizes, dtype=torch.int32, device=self.device
+        )
+        self.input_block_table_ptrs = self._make_ptr_tensor(self.input_block_tables)
+
     def append_block_ids(
         self,
         req_index: int,
diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py
index 6bd404985dcd..662f92e58158 100644
--- a/vllm/v1/worker/gpu/cudagraph_utils.py
+++ b/vllm/v1/worker/gpu/cudagraph_utils.py
@@ -9,6 +9,7 @@
 import torch.nn as nn
 from tqdm import tqdm
 
+from vllm.compilation.counter import compilation_counter
 from vllm.config import VllmConfig
 from vllm.config.compilation import CUDAGraphMode
 from vllm.distributed.parallel_state import (
@@ -237,6 +238,7 @@ def capture(
                             # the next forward pass.
                             get_offloader().join_after_forward()
                         self.graphs[desc] = graph
+                        compilation_counter.num_cudagraph_captured += 1
         self._graphs_captured = True
         return captured_attn_states
 
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index a3eb1589a3f5..bf882a8af311 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -27,6 +27,7 @@
 import torch
 import torch.nn as nn
 
+from vllm.compilation.counter import compilation_counter
 from vllm.config import VllmConfig
 from vllm.config.compilation import CUDAGraphMode
 from vllm.distributed.parallel_state import (
@@ -90,6 +91,7 @@
 from vllm.v1.worker.gpu.sample.output import SamplerOutput
 from vllm.v1.worker.gpu.sample.prompt_logprob import PromptLogprobsWorker
 from vllm.v1.worker.gpu.sample.sampler import Sampler
+from vllm.v1.worker.gpu.shutdown import free_before_shutdown
 from vllm.v1.worker.gpu.spec_decode import init_speculator
 from vllm.v1.worker.gpu.spec_decode.eagle.eagle3_utils import (
     set_eagle3_aux_hidden_state_layers,
@@ -558,6 +560,9 @@ def profile_run(self) -> None:
         del hidden_states, sample_hidden_states
         gc.collect()
 
+    def post_kv_cache_wake_up(self) -> None:
+        self.block_tables.init_block_table_layout_tensors()
+
     def reset_mm_cache(self) -> None:
         if self.encoder_cache is not None:
             self.encoder_cache.reset_mm_cache()
@@ -584,6 +589,8 @@ def capture_model(self) -> int:
             )
             return 0
 
+        compilation_counter.num_gpu_runner_capture_triggers += 1
+
         start_time = time.perf_counter()
         gc.collect()
         torch.accelerator.empty_cache()
@@ -1333,6 +1340,24 @@ def postprocess_pool(self, input_batch: InputBatch) -> None:
             input_batch.num_scheduled_tokens
         )
 
+    def shutdown(self) -> None:
+        """Release GPU tensors (model weights, KV caches, workspace) so that
+        memory is reclaimable when running in the same process."""
+        torch.accelerator.synchronize()
+        if hasattr(self, "kv_caches"):
+            self.kv_caches.clear()
+        if hasattr(self, "attn_groups"):
+            self.attn_groups.clear()
+        if hasattr(self, "kv_cache_config"):
+            del self.kv_cache_config
+        free_before_shutdown(self.vllm_config)
+        if hasattr(self, "model"):
+            del self.model
+
+        gc.collect()
+        torch.accelerator.empty_cache()
+        logger.debug("Cleaned up model weights, KV caches, and workspace")
+
     ########### EPLB methods start ###########
     @property
     def eplb_state(self):
diff --git a/vllm/v1/worker/gpu/sample/gumbel.py b/vllm/v1/worker/gpu/sample/gumbel.py
index 62912491492e..a02dd62026ad 100644
--- a/vllm/v1/worker/gpu/sample/gumbel.py
+++ b/vllm/v1/worker/gpu/sample/gumbel.py
@@ -76,6 +76,8 @@ def gumbel_block_argmax(
     pos_ptr,
     processed_logits_ptr,
     processed_logits_stride,
+    processed_logits_col_ptr,
+    vocab_size,
     APPLY_TEMPERATURE: tl.constexpr,
 ):
     req_state_idx = tl.load(expanded_idx_mapping_ptr + token_idx)
@@ -88,8 +90,15 @@ def gumbel_block_argmax(
 
     if processed_logits_ptr is not None:
         # Store the temperature-applied logits.
+        if processed_logits_col_ptr is not None:
+            col = tl.load(processed_logits_col_ptr)
+        else:
+            col = 0
         tl.store(
-            processed_logits_ptr + req_state_idx * processed_logits_stride + block,
+            processed_logits_ptr
+            + req_state_idx * processed_logits_stride
+            + col * vocab_size
+            + block,
             logits,
             mask=mask,
         )
@@ -121,6 +130,7 @@ def _gumbel_sample_kernel(
     local_max_stride,
     processed_logits_ptr,
     processed_logits_stride,
+    processed_logits_col_ptr,
     logits_ptr,
     logits_stride,
     expanded_idx_mapping_ptr,
@@ -153,6 +163,8 @@ def _gumbel_sample_kernel(
         pos_ptr,
         processed_logits_ptr,
         processed_logits_stride,
+        processed_logits_col_ptr,
+        vocab_size,
         APPLY_TEMPERATURE=APPLY_TEMPERATURE,
     )
     token_id = block_idx * BLOCK_SIZE + idx
@@ -167,7 +179,8 @@ def gumbel_sample(
     seed: torch.Tensor,  # [max_num_reqs]
     pos: torch.Tensor,  # [num_tokens]
     apply_temperature: bool,
-    processed_logits_out: torch.Tensor | None = None,  # [num_reqs, vocab_size]
+    output_processed_logits: torch.Tensor | None = None,
+    output_processed_logits_col: torch.Tensor | None = None,
 ) -> torch.Tensor:
     num_tokens, vocab_size = logits.shape
     BLOCK_SIZE = 1024
@@ -179,8 +192,9 @@ def gumbel_sample(
         local_argmax.stride(0),
         local_max,
         local_max.stride(0),
-        processed_logits_out,
-        processed_logits_out.stride(0) if processed_logits_out is not None else 0,
+        output_processed_logits,
+        output_processed_logits.stride(0) if output_processed_logits is not None else 0,
+        output_processed_logits_col,
         logits,
         logits.stride(0),
         expanded_idx_mapping,
diff --git a/vllm/v1/worker/gpu/sample/logprob.py b/vllm/v1/worker/gpu/sample/logprob.py
index 4317cad9ce7f..7530337fcd12 100644
--- a/vllm/v1/worker/gpu/sample/logprob.py
+++ b/vllm/v1/worker/gpu/sample/logprob.py
@@ -1,10 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import numpy as np
 import torch
 
+from vllm.sampling_params import MAX_LOGPROB_TOKEN_IDS, SamplingParams
 from vllm.triton_utils import tl, triton
 from vllm.v1.outputs import LogprobsTensors
+from vllm.v1.worker.gpu.buffer_utils import StagedWriteTensor, UvaBackedTensor
 
 
 @triton.jit
@@ -75,6 +78,9 @@ def _ranks_kernel(
 def compute_token_logprobs(
     logits: torch.Tensor, token_ids: torch.Tensor
 ) -> torch.Tensor:
+    # NOTE(woosuk): To save GPU memory, we do not materialize the full
+    # [batch_size, vocab_size] logprobs tensor. The kernel computes
+    # max + logsumexp per row and only emits logprobs at `token_ids`.
     batch_size, vocab_size = logits.shape
     token_ids = token_ids.to(torch.int64)
     num_logprobs = token_ids.shape[1]
@@ -97,18 +103,52 @@ def compute_topk_logprobs(
     num_logprobs: int,
     sampled_token_ids: torch.Tensor,
     cu_num_logits: list[int] | None = None,
+    logprob_token_ids_state: "LogprobTokenIdsState | None" = None,
+    expanded_idx_mapping: torch.Tensor | None = None,
+    max_per_req_token_ids: int = 0,
 ) -> LogprobsTensors:
     assert num_logprobs >= 0
     batch_size, vocab_size = logits.shape
-    logprob_token_ids = sampled_token_ids.unsqueeze(-1)
-    if num_logprobs > 0:
-        topk_indices = torch.topk(logits, num_logprobs, dim=-1).indices
-        logprob_token_ids = torch.cat((logprob_token_ids, topk_indices), dim=1)
-
-    # NOTE(woosuk): Here, to save GPU memory, we do not materialize the full
-    # logprobs tensor. Instead, we only compute and return the logprobs of
-    # the topk + 1 tokens.
-    logprobs = compute_token_logprobs(logits, logprob_token_ids)
+
+    if max_per_req_token_ids == 0:
+        # Fast path: no request asked for custom logprob_token_ids.
+        logprob_token_ids = sampled_token_ids.unsqueeze(-1)
+        if num_logprobs > 0:
+            topk_indices = torch.topk(logits, num_logprobs, dim=-1).indices
+            logprob_token_ids = torch.cat((logprob_token_ids, topk_indices), dim=1)
+        logprobs = compute_token_logprobs(logits, logprob_token_ids)
+    else:
+        # Some requests specified logprob_token_ids. Build the [batch_size,
+        # 1 + max_cols] token_ids matrix and validity mask on the GPU via a
+        # single triton kernel, overriding the topk columns with per-request
+        # tokens where applicable.
+        assert logprob_token_ids_state is not None
+        assert expanded_idx_mapping is not None
+        topk_indices = None
+        if num_logprobs > 0:
+            topk_indices = torch.topk(logits, num_logprobs, dim=-1).indices
+
+        num_cols = max(num_logprobs, max_per_req_token_ids)
+        logprob_token_ids = sampled_token_ids.new_zeros((batch_size, 1 + num_cols))
+        valid_mask = torch.zeros_like(logprob_token_ids, dtype=torch.bool)
+        _fill_logprob_token_ids_kernel[(batch_size,)](
+            logprob_token_ids,
+            logprob_token_ids.stride(0),
+            valid_mask,
+            valid_mask.stride(0),
+            sampled_token_ids,
+            topk_indices if topk_indices is not None else logprob_token_ids,
+            topk_indices.stride(0) if topk_indices is not None else 0,
+            expanded_idx_mapping,
+            logprob_token_ids_state.num_token_ids.gpu,
+            logprob_token_ids_state.token_ids.gpu,
+            logprob_token_ids_state.token_ids.gpu.stride(0),
+            NUM_TOPK=num_logprobs,
+            PADDED_COLS=triton.next_power_of_2(num_cols),
+        )
+        logprobs = compute_token_logprobs(logits, logprob_token_ids)
+        logprobs = logprobs.masked_fill(~valid_mask, float("-inf"))
+
     token_ranks = torch.empty(batch_size, dtype=torch.int64, device=logits.device)
     _ranks_kernel[(batch_size,)](
         token_ranks,
@@ -124,3 +164,87 @@ def compute_topk_logprobs(
         selected_token_ranks=token_ranks,
         cu_num_generated_tokens=cu_num_logits,
     )
+
+
+@triton.jit
+def _fill_logprob_token_ids_kernel(
+    # [batch_size, 1 + num_cols]
+    out_token_ids_ptr,
+    out_token_ids_stride,
+    # [batch_size, 1 + num_cols]
+    out_valid_mask_ptr,
+    out_valid_mask_stride,
+    sampled_token_ids_ptr,  # [batch_size]
+    topk_indices_ptr,  # [batch_size, NUM_TOPK] (unused when NUM_TOPK == 0)
+    topk_indices_stride,
+    expanded_idx_mapping_ptr,  # [batch_size] -> req_state_idx
+    num_per_req_token_ids_ptr,  # [max_num_reqs]
+    per_req_token_ids_ptr,  # [max_num_reqs, MAX_LOGPROB_TOKEN_IDS]
+    per_req_token_ids_stride,
+    NUM_TOPK: tl.constexpr,
+    PADDED_COLS: tl.constexpr,
+):
+    batch_idx = tl.program_id(0)
+
+    # Column 0: always the sampled token, always valid.
+    sampled = tl.load(sampled_token_ids_ptr + batch_idx)
+    tl.store(out_token_ids_ptr + batch_idx * out_token_ids_stride, sampled)
+    tl.store(out_valid_mask_ptr + batch_idx * out_valid_mask_stride, 1)
+
+    req_state_idx = tl.load(expanded_idx_mapping_ptr + batch_idx)
+    num_custom = tl.load(num_per_req_token_ids_ptr + req_state_idx)
+
+    col = tl.arange(0, PADDED_COLS)
+    tid_base = out_token_ids_ptr + batch_idx * out_token_ids_stride + 1
+    mask_base = out_valid_mask_ptr + batch_idx * out_valid_mask_stride + 1
+
+    if num_custom > 0:
+        # Override topk with per-request custom tokens.
+        src = per_req_token_ids_ptr + req_state_idx * per_req_token_ids_stride
+        valid = col < num_custom
+        # per_req_token_ids is int32; output is int64.
+        tokens = tl.load(src + col, mask=valid, other=0).to(tl.int64)
+    else:
+        # Fill with topk indices (no-op when NUM_TOPK == 0).
+        src = topk_indices_ptr + batch_idx * topk_indices_stride
+        valid = col < NUM_TOPK
+        tokens = tl.load(src + col, mask=valid, other=0)
+
+    tl.store(tid_base + col, tokens, mask=valid)
+    tl.store(mask_base + col, tl.full([PADDED_COLS], 1, tl.int1), mask=valid)
+
+
+class LogprobTokenIdsState:
+    """Per-request override of which token ids' logprobs to return.
+
+    See `SamplingParams.logprob_token_ids`.
+    """
+
+    def __init__(self, max_num_reqs: int, device: torch.device):
+        self.max_num_reqs = max_num_reqs
+        self.num_token_ids = UvaBackedTensor(max_num_reqs, dtype=torch.int32)
+        self.token_ids = StagedWriteTensor(
+            (max_num_reqs, MAX_LOGPROB_TOKEN_IDS),
+            dtype=torch.int32,
+            device=device,
+        )
+
+    def add_request(self, req_idx: int, sampling_params: SamplingParams) -> None:
+        token_ids = sampling_params.logprob_token_ids
+        if not token_ids:
+            self.num_token_ids.np[req_idx] = 0
+            return
+        n = len(token_ids)
+        if n > MAX_LOGPROB_TOKEN_IDS:
+            raise ValueError(
+                f"Too many logprob_token_ids: {n}. The max is {MAX_LOGPROB_TOKEN_IDS}."
+            )
+        self.num_token_ids.np[req_idx] = n
+        self.token_ids.stage_write(req_idx, 0, token_ids)
+
+    def apply_staged_writes(self) -> None:
+        self.num_token_ids.copy_to_uva()
+        self.token_ids.apply_write()
+
+    def max_num_token_ids(self, idx_mapping_np: np.ndarray) -> int:
+        return int(self.num_token_ids.np[idx_mapping_np].max(initial=0))
diff --git a/vllm/v1/worker/gpu/sample/prompt_logprob.py b/vllm/v1/worker/gpu/sample/prompt_logprob.py
index 11dbf6985279..baa48ebf900c 100644
--- a/vllm/v1/worker/gpu/sample/prompt_logprob.py
+++ b/vllm/v1/worker/gpu/sample/prompt_logprob.py
@@ -55,10 +55,8 @@ def compute_prompt_logprobs(
 
         num_prompt_logprobs = self.num_prompt_logprobs[idx_mapping_np]
         prompt_lens = prompt_lens[idx_mapping_np]
-        # NOTE(woosuk): -1 because the last prompt token's hidden state is not
-        # needed for prompt logprobs.
         computed_prefill = num_computed_prefill_tokens[idx_mapping_np]
-        includes_prompt = computed_prefill < prompt_lens - 1
+        includes_prompt = computed_prefill < prompt_lens
         # NOTE(woosuk): If the request was resumed after preemption, its prompt
         # logprobs must have been computed before preemption. Skip.
         resumed_after_prompt = prompt_lens < prefill_lens[idx_mapping_np]
diff --git a/vllm/v1/worker/gpu/sample/sampler.py b/vllm/v1/worker/gpu/sample/sampler.py
index 6f73ca87ac67..5d91d5b2f097 100644
--- a/vllm/v1/worker/gpu/sample/sampler.py
+++ b/vllm/v1/worker/gpu/sample/sampler.py
@@ -12,7 +12,10 @@
 from vllm.v1.worker.gpu.sample.bad_words import BadWordsState
 from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
 from vllm.v1.worker.gpu.sample.logit_bias import LogitBiasState
-from vllm.v1.worker.gpu.sample.logprob import compute_topk_logprobs
+from vllm.v1.worker.gpu.sample.logprob import (
+    LogprobTokenIdsState,
+    compute_topk_logprobs,
+)
 from vllm.v1.worker.gpu.sample.output import SamplerOutput
 from vllm.v1.worker.gpu.sample.penalties import PenaltiesState
 from vllm.v1.worker.gpu.sample.states import NO_LOGPROBS, SamplingStates
@@ -38,6 +41,7 @@ def __init__(
         self.penalties_state = PenaltiesState(req_states)
         self.logit_bias_state = LogitBiasState(max_num_reqs, device)
         self.bad_words_state = BadWordsState(req_states)
+        self.logprob_token_ids_state = LogprobTokenIdsState(max_num_reqs, device)
         self.num_speculative_tokens = num_speculative_tokens
 
     def add_request(
@@ -47,12 +51,14 @@ def add_request(
         self.penalties_state.add_request(req_idx, sampling_params)
         self.logit_bias_state.add_request(req_idx, prompt_len, sampling_params)
         self.bad_words_state.add_request(req_idx, sampling_params)
+        self.logprob_token_ids_state.add_request(req_idx, sampling_params)
 
     def apply_staged_writes(self) -> None:
         self.sampling_states.apply_staged_writes()
         self.penalties_state.apply_staged_writes()
         self.logit_bias_state.apply_staged_writes()
         self.bad_words_state.apply_staged_writes()
+        self.logprob_token_ids_state.apply_staged_writes()
 
     def __call__(
         self,
@@ -79,13 +85,23 @@ def __call__(
         )
 
         max_num_logprobs = self.sampling_states.max_num_logprobs(idx_mapping_np)
-        if max_num_logprobs != NO_LOGPROBS:
+        max_per_req_token_ids = self.logprob_token_ids_state.max_num_token_ids(
+            idx_mapping_np
+        )
+        if max_num_logprobs != NO_LOGPROBS or max_per_req_token_ids > 0:
             if self.logprobs_mode == "processed_logprobs":
                 logits = processed_logits
             expanded_logits = logits.shape[0] != idx_mapping_np.shape[0]
             cu_num_logits = cu_num_logits_np.tolist() if expanded_logits else None
+            num_logprobs = max_num_logprobs if max_num_logprobs != NO_LOGPROBS else 0
             logprobs_tensors = compute_topk_logprobs(
-                logits, max_num_logprobs, sampled, cu_num_logits
+                logits,
+                num_logprobs,
+                sampled,
+                cu_num_logits,
+                logprob_token_ids_state=self.logprob_token_ids_state,
+                expanded_idx_mapping=input_batch.expanded_idx_mapping,
+                max_per_req_token_ids=max_per_req_token_ids,
             )
         else:
             logprobs_tensors = None
diff --git a/vllm/v1/worker/gpu/shutdown.py b/vllm/v1/worker/gpu/shutdown.py
new file mode 100644
index 000000000000..830083962347
--- /dev/null
+++ b/vllm/v1/worker/gpu/shutdown.py
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def free_before_shutdown(vllm_config: VllmConfig) -> None:
+    from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
+    from vllm.v1.worker.workspace import reset_workspace_manager
+
+    cache_config = vllm_config.cache_config
+    cache_config.num_gpu_blocks = None
+
+    compilation_config = vllm_config.compilation_config
+    compilation_config.static_forward_context.clear()
+
+    _ROPE_DICT.clear()
+    reset_workspace_manager()
diff --git a/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py b/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
index c6b0aa364f53..efe510f16e22 100644
--- a/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
+++ b/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
@@ -89,9 +89,13 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
             dtype=torch.int64,
             device=device,
         )
+        self.current_draft_step = torch.tensor(0, dtype=torch.int64, device=device)
         self.last_token_indices = torch.zeros(
             self.max_num_reqs, dtype=torch.int64, device=device
         )
+        self.arange = torch.arange(
+            self.max_num_reqs + 1, dtype=torch.int32, device="cpu"
+        )
 
         self.supports_mm_inputs = MULTIMODAL_REGISTRY.supports_multimodal_inputs(
             self.draft_model_config
@@ -228,9 +232,10 @@ def _sample_draft(
         logits: torch.Tensor,
         idx_mapping: torch.Tensor,
         pos: torch.Tensor,
-        step: int,
+        draft_step: torch.Tensor,
+        draft_logits: torch.Tensor | None,
     ) -> torch.Tensor:
-        if self.draft_logits is not None:
+        if draft_logits is not None:
             # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise
             # used for draft and target sampling.
             return gumbel_sample(
@@ -240,7 +245,8 @@ def _sample_draft(
                 self.seeds,
                 pos + 1,
                 apply_temperature=True,
-                processed_logits_out=self.draft_logits[:, step],
+                output_processed_logits=draft_logits,
+                output_processed_logits_col=draft_step,
             )
         else:
             return logits.argmax(dim=-1)
@@ -274,11 +280,63 @@ def prefill(
             logits,
             idx_mapping,
             pos,
-            step=0,
+            self.current_draft_step,
+            self.draft_logits,
         )
         self.hidden_states[:num_reqs] = hidden_states[last_token_indices]
         self.input_buffers.positions[:num_reqs] = pos
 
+    def multi_step_decode(
+        self,
+        num_reqs: int,
+        skip_attn: bool,
+        batch_desc: BatchExecutionDescriptor,
+        num_tokens_across_dp: torch.Tensor | None,
+    ) -> None:
+        positions = self.input_buffers.positions[:num_reqs]
+        query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1]
+        idx_mapping = self.idx_mapping[:num_reqs]
+
+        for step in range(1, self.num_speculative_steps):
+            attn_metadata = None
+            slot_mappings_by_layer = None
+            if not skip_attn:
+                # Build attention metadata and slot mappings for each draft
+                # decode step. It is necessary to rebuild the attention
+                # metadata even when replaying the FULL graph so that any
+                # attention metadata builder state is updated.
+                slot_mappings = self.block_tables.compute_slot_mappings(
+                    idx_mapping,
+                    query_start_loc,
+                    positions,
+                    batch_desc.num_tokens,
+                )
+                slot_mappings_by_layer = build_slot_mappings_by_layer(
+                    slot_mappings, self.kv_cache_config
+                )
+                attn_metadata = self._build_draft_attn_metadata(
+                    num_reqs=num_reqs,
+                    num_reqs_padded=batch_desc.num_reqs or num_reqs,
+                    num_tokens_padded=batch_desc.num_tokens,
+                )
+
+            # Update the current draft step.
+            self.current_draft_step.fill_(step)
+
+            # Generate draft tokens for the current step.
+            if batch_desc.cg_mode == CUDAGraphMode.FULL:
+                assert self.decode_cudagraph_manager is not None
+                self.decode_cudagraph_manager.run_fullgraph(batch_desc)
+            else:
+                self.generate_draft(
+                    num_reqs,
+                    batch_desc.num_tokens,
+                    attn_metadata,
+                    slot_mappings_by_layer,
+                    num_tokens_across_dp=num_tokens_across_dp,
+                    cudagraph_runtime_mode=batch_desc.cg_mode,
+                )
+
     def generate_draft(
         self,
         num_reqs: int,
@@ -288,59 +346,52 @@ def generate_draft(
         num_tokens_across_dp: torch.Tensor | None,
         cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
     ) -> None:
-        pos = self.input_buffers.positions[:num_reqs]
-        query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1]
         idx_mapping = self.idx_mapping[:num_reqs]
-        for step in range(1, self.num_speculative_steps):
-            # Run the eagle model.
-            last_hidden_states, hidden_states = self.run_model(
-                num_tokens_padded,
-                attn_metadata,
-                slot_mappings,
-                num_tokens_across_dp,
-                cudagraph_runtime_mode,
-            )
-            last_hidden_states = last_hidden_states[:num_reqs]
-            hidden_states = hidden_states[:num_reqs]
-            logits = self.model.compute_logits(last_hidden_states)
+        positions = self.input_buffers.positions[:num_reqs]
+        # Run the eagle model forward pass.
+        last_hidden_states, hidden_states = self.run_model(
+            num_tokens_padded,
+            attn_metadata,
+            slot_mappings,
+            num_tokens_across_dp,
+            cudagraph_runtime_mode,
+        )
+        last_hidden_states = last_hidden_states[:num_reqs]
 
-            draft_tokens = self._sample_draft(
-                logits,
-                idx_mapping,
-                pos,
-                step=step,
-            )
-            self.draft_tokens[:num_reqs, step] = draft_tokens
-
-            if step < self.num_speculative_steps - 1:
-                # Update the inputs for the next step.
-                update_eagle_inputs(
-                    draft_tokens,
-                    hidden_states,
-                    self.input_buffers,
-                    self.hidden_states,
-                    self.max_model_len,
-                )
-                if attn_metadata is not None:
-                    self.block_tables.compute_slot_mappings(
-                        idx_mapping, query_start_loc, pos, num_tokens_padded
-                    )
+        # Sample the draft tokens.
+        logits = self.model.compute_logits(last_hidden_states)
+        draft_tokens = self._sample_draft(
+            logits,
+            idx_mapping,
+            positions,
+            self.current_draft_step,
+            self.draft_logits,
+        )
+
+        # Update the inputs for the next step.
+        update_eagle_draft_inputs(
+            draft_tokens,
+            self.current_draft_step,
+            hidden_states,
+            self.draft_tokens,
+            self.hidden_states,
+            self.input_buffers,
+            num_reqs,
+            self.max_model_len,
+            self.num_speculative_steps,
+        )
 
     def _build_draft_attn_metadata(
         self,
         num_reqs: int,
         num_reqs_padded: int,
         num_tokens_padded: int,
-        max_query_len: int,
     ) -> dict[str, Any] | None:
         if not self.draft_attn_layer_names:
             return None
 
-        query_start_loc_cpu = (
-            torch.arange(num_reqs_padded + 1, dtype=torch.int32, device="cpu").clamp_(
-                max=num_reqs
-            )
-            * max_query_len
+        query_start_loc_cpu = torch.clamp(
+            self.arange[: num_reqs_padded + 1], max=num_reqs
         )
         block_tables = [
             x[:num_reqs_padded] for x in self.block_tables.input_block_tables
@@ -354,7 +405,7 @@ def _build_draft_attn_metadata(
                 : num_reqs_padded + 1
             ],
             query_start_loc_cpu=query_start_loc_cpu,
-            max_query_len=max_query_len,
+            max_query_len=1,
             seq_lens=self.input_buffers.seq_lens[:num_reqs_padded],
             max_seq_len=self.max_model_len,
             block_tables=block_tables,
@@ -373,7 +424,7 @@ def capture(
         self.last_token_indices.zero_()
 
         # Capture the prefill routine (model forward + compute_logits +
-        # gumbel_sample).
+        # sample).
         # For FULL graphs, the entire routine is recorded as one graph.
         # For PIECEWISE, only the model's compiled regions are captured
         # and the rest (compute_logits, gumbel_sample) runs eagerly.
@@ -387,10 +438,9 @@ def capture(
         if self.num_speculative_steps == 1:
             return
 
-        # Capture the decode draft generation loop (model forward +
-        # compute_logits + gumbel_sample + update_eagle_inputs, for
-        # each step). For FULL graphs, the entire multi-step loop is
-        # recorded as one graph.
+        # Capture the decode draft generation routine (model forward +
+        # compute_logits + sample + update_eagle_inputs) for a single
+        # step.
         assert self.decode_cudagraph_manager is not None
         self.decode_cudagraph_manager.capture(
             self.generate_draft,
@@ -461,9 +511,10 @@ def propose(
 
         # Get the input ids and last token indices for the speculator.
         prepare_eagle_inputs(
+            self.last_token_indices,
+            self.current_draft_step,
             self.input_buffers,
             input_batch,
-            self.last_token_indices,
             num_sampled,
             num_rejected,
             last_sampled,
@@ -473,12 +524,18 @@ def propose(
 
         # When all requests are decoding (no true prefills), each has
         # num_speculative_steps + 1 tokens, enabling FULL graph replay.
-        # Mixed or prefill-only batches fall back to PIECEWISE.
+        uniform_token_count = get_uniform_token_count(
+            num_reqs,
+            # Use the actual number of tokens without padding added by
+            # the target model during FULL cudagraph.
+            input_batch.num_tokens,
+            max_query_len,
+        )
         prefill_batch_desc, num_tokens_across_dp = dispatch_cg_and_sync_dp(
             self.prefill_cudagraph_manager,
             num_reqs,
             num_tokens,
-            get_uniform_token_count(num_reqs, num_tokens, max_query_len),
+            uniform_token_count,
             dp_size=self.dp_size,
             dp_rank=self.dp_rank,
             need_eager=is_profile,
@@ -528,48 +585,21 @@ def propose(
             need_eager=is_profile,
         )
 
-        attn_metadata_updated = None
-        slot_mappings_updated = None
-        if not (dummy_run and skip_attn_for_dummy_run):
-            # Build attention metadata and slot mappings for the draft
-            # decode steps. It is necessary to rebuild the attention
-            # metadata even when replaying the FULL graph so that any
-            # attention metadata builder state is updated.
-            slot_mappings = self.block_tables.compute_slot_mappings(
-                self.idx_mapping[:num_reqs],
-                self.input_buffers.query_start_loc[: num_reqs + 1],
-                self.input_buffers.positions[:num_reqs],
-                decode_batch_desc.num_tokens,
-            )
-            slot_mappings_updated = build_slot_mappings_by_layer(
-                slot_mappings, self.kv_cache_config
-            )
-            attn_metadata_updated = self._build_draft_attn_metadata(
-                num_reqs=num_reqs,
-                num_reqs_padded=decode_batch_desc.num_reqs or num_reqs,
-                num_tokens_padded=decode_batch_desc.num_tokens,
-                max_query_len=1,
-            )
+        # Generate the remaining num_speculative_steps - 1 draft tokens.
+        self.multi_step_decode(
+            num_reqs,
+            dummy_run and skip_attn_for_dummy_run,
+            decode_batch_desc,
+            num_tokens_across_dp,
+        )
 
-        if decode_batch_desc.cg_mode == CUDAGraphMode.FULL:
-            # Replay the full graph for draft generation.
-            assert self.decode_cudagraph_manager is not None
-            self.decode_cudagraph_manager.run_fullgraph(decode_batch_desc)
-        else:
-            self.generate_draft(
-                num_reqs,
-                decode_batch_desc.num_tokens,
-                attn_metadata_updated,
-                slot_mappings_updated,
-                num_tokens_across_dp=num_tokens_across_dp,
-                cudagraph_runtime_mode=decode_batch_desc.cg_mode,
-            )
         return self.draft_tokens[:num_reqs]
 
 
 @triton.jit
 def _prepare_eagle_inputs_kernel(
     last_token_indices_ptr,
+    eagle_current_draft_step_ptr,
     eagle_input_ids_ptr,
     eagle_positions_ptr,
     eagle_query_start_loc_ptr,
@@ -630,6 +660,8 @@ def _prepare_eagle_inputs_kernel(
     # Copy sequence lengths.
     tl.store(eagle_seq_lens_ptr + req_idx, seq_len)
     if req_idx == (num_reqs - 1):
+        # Reset the current draft step to 0.
+        tl.store(eagle_current_draft_step_ptr, 0)
         # Pad query_start_loc for CUDA graphs.
         for i in range(num_reqs, max_num_reqs + 1, BLOCK_SIZE):
             block = i + tl.arange(0, BLOCK_SIZE)
@@ -648,10 +680,11 @@ def _prepare_eagle_inputs_kernel(
 
 
 def prepare_eagle_inputs(
-    input_buffers: InputBuffers,
-    input_batch: InputBatch,
     # [num_reqs]
     last_token_indices: torch.Tensor,
+    current_draft_step: torch.Tensor,
+    input_buffers: InputBuffers,
+    input_batch: InputBatch,
     # [num_reqs]
     num_sampled: torch.Tensor,
     # [num_reqs]
@@ -665,6 +698,7 @@ def prepare_eagle_inputs(
     num_reqs = input_batch.num_reqs
     _prepare_eagle_inputs_kernel[(num_reqs,)](
         last_token_indices,
+        current_draft_step,
         input_buffers.input_ids,
         input_buffers.positions,
         input_buffers.query_start_loc,
@@ -685,7 +719,7 @@ def prepare_eagle_inputs(
 
 
 @triton.jit
-def _prepare_eagle_docode_kernel(
+def _prepare_eagle_decode_kernel(
     draft_tokens_ptr,
     draft_tokens_stride,
     target_seq_lens_ptr,
@@ -742,7 +776,7 @@ def prepare_eagle_decode(
     max_num_reqs: int,
 ):
     num_reqs = draft_tokens.shape[0]
-    _prepare_eagle_docode_kernel[(num_reqs + 1,)](
+    _prepare_eagle_decode_kernel[(num_reqs + 1,)](
         draft_tokens,
         draft_tokens.stride(0),
         target_seq_lens,
@@ -758,36 +792,55 @@ def prepare_eagle_decode(
 
 
 @triton.jit
-def _update_eagle_inputs_kernel(
+def _update_eagle_draft_inputs_kernel(
+    output_draft_tokens_ptr,
+    output_draft_tokens_stride,
+    next_input_hidden_states_ptr,
+    next_input_hidden_states_stride,
     input_ids_ptr,
     positions_ptr,
-    input_hidden_states_ptr,
-    input_hidden_states_stride,
     seq_lens_ptr,
-    max_model_len,
     draft_tokens_ptr,
-    output_hidden_states_ptr,
-    output_hidden_states_stride,
+    current_draft_step_ptr,
+    hidden_states_ptr,
+    hidden_states_stride,
     hidden_size,
+    max_model_len,
+    num_speculative_steps,
     BLOCK_SIZE: tl.constexpr,
 ):
     req_idx = tl.program_id(0)
 
-    # Draft token -> Input ID.
+    # Write the sampled draft token into self.draft_tokens[req_idx, step].
     draft_token = tl.load(draft_tokens_ptr + req_idx)
+    step = tl.load(current_draft_step_ptr)
+    tl.store(
+        output_draft_tokens_ptr + req_idx * output_draft_tokens_stride + step,
+        draft_token,
+    )
+
+    if step >= num_speculative_steps - 1:
+        # This is the final step. Skip updating draft forward inputs.
+        return
+
+    # Write the sampled draft token into the input ids tensor for the next
+    # forward pass.
     tl.store(input_ids_ptr + req_idx, draft_token)
 
-    # Output hidden states -> Input hidden states.
+    # Copy hidden states into the input hidden states tensor for the next
+    # forward pass.
     for i in range(0, hidden_size, BLOCK_SIZE):
         block = i + tl.arange(0, BLOCK_SIZE)
         mask = block < hidden_size
-        output_hidden_states = tl.load(
-            output_hidden_states_ptr + req_idx * output_hidden_states_stride + block,
+        hidden_states = tl.load(
+            hidden_states_ptr + req_idx * hidden_states_stride + block,
             mask=mask,
         )
         tl.store(
-            input_hidden_states_ptr + req_idx * input_hidden_states_stride + block,
-            output_hidden_states,
+            next_input_hidden_states_ptr
+            + req_idx * next_input_hidden_states_stride
+            + block,
+            hidden_states,
             mask=mask,
         )
 
@@ -803,24 +856,32 @@ def _update_eagle_inputs_kernel(
     tl.store(seq_lens_ptr + req_idx, seq_len)
 
 
-def update_eagle_inputs(
+def update_eagle_draft_inputs(
     draft_tokens: torch.Tensor,
-    output_hidden_states: torch.Tensor,
-    input_buffers: InputBuffers,
+    current_draft_step: torch.Tensor,
     hidden_states: torch.Tensor,
+    output_draft_tokens: torch.Tensor,
+    next_input_hidden_states: torch.Tensor,
+    input_buffers: InputBuffers,
+    num_reqs: int,
     max_model_len: int,
+    num_speculative_steps: int,
 ):
-    num_reqs, hidden_size = output_hidden_states.shape
-    _update_eagle_inputs_kernel[(num_reqs,)](
+    _, hidden_size = hidden_states.shape
+    _update_eagle_draft_inputs_kernel[(num_reqs,)](
+        output_draft_tokens,
+        output_draft_tokens.stride(0),
+        next_input_hidden_states,
+        next_input_hidden_states.stride(0),
         input_buffers.input_ids,
         input_buffers.positions,
-        hidden_states,
-        hidden_states.stride(0),
         input_buffers.seq_lens,
-        max_model_len,
         draft_tokens,
-        output_hidden_states,
-        output_hidden_states.stride(0),
+        current_draft_step,
+        hidden_states,
+        hidden_states.stride(0),
         hidden_size,
+        max_model_len,
+        num_speculative_steps,
         BLOCK_SIZE=1024,
     )
diff --git a/vllm/v1/worker/gpu/spec_decode/probabilistic_rejection_sampler_utils.py b/vllm/v1/worker/gpu/spec_decode/probabilistic_rejection_sampler_utils.py
index 9d86372e624b..10b29433efb2 100644
--- a/vllm/v1/worker/gpu/spec_decode/probabilistic_rejection_sampler_utils.py
+++ b/vllm/v1/worker/gpu/spec_decode/probabilistic_rejection_sampler_utils.py
@@ -392,8 +392,10 @@ def _resample_kernel(
         temp_ptr,
         seed_ptr,
         pos_ptr,
-        None,
-        0,
+        None,  # processed_logits_ptr
+        0,  # processed_logits_stride
+        None,  # processed_logits_col_ptr
+        vocab_size,
         APPLY_TEMPERATURE=False,
     )
     token_id = block_idx * BLOCK_SIZE + idx
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 89e63f3def7a..e9de08342b82 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -8,6 +8,7 @@
 import numpy as np
 import torch
 
+from vllm.config.reasoning import ReasoningConfig
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import MultiModalFeatureSpec
 from vllm.pooling_params import PoolingParams
@@ -22,6 +23,9 @@
     MoveDirectionality,
 )
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.sample.thinking_budget_state import (
+    maybe_create_thinking_budget_state_holder,
+)
 from vllm.v1.utils import copy_slice
 from vllm.v1.worker.block_table import MultiGroupBlockTable
 
@@ -45,6 +49,12 @@ class CachedRequestState:
 
     lora_request: LoRARequest | None = None
     prompt_embeds: torch.Tensor | None = None
+    # To accumulate prompt logprobs tensor chunks across prefill steps.
+    in_progress_prompt_logprobs_cpu: LogprobsTensors | None = None
+
+    # Per-position mask for mixed-mode inputs (e.g chat completion with
+    # prompt_embeds content parts). See `Request.prompt_is_token_ids`.
+    prompt_is_token_ids: list[bool] | None = None
 
     # Used when both async_scheduling and spec_decode are enabled.
     prev_num_draft_len: int = 0
@@ -92,12 +102,20 @@ def __init__(
         max_num_blocks_per_req: list[int] | None = None,
         logitsprocs: LogitsProcessors | None = None,
         logitsprocs_need_output_token_ids: bool = False,
-        is_spec_decode: bool = False,
+        num_spec_tokens: int = 0,
         is_pooling_model: bool = False,
         cp_kv_cache_interleave_size: int = 1,
+        reasoning_config: ReasoningConfig | None = None,
     ):
+        self.thinking_budget_state_holder = maybe_create_thinking_budget_state_holder(
+            reasoning_config,
+            max_num_reqs,
+            num_spec_tokens,
+            device,
+            pin_memory,
+        )
+        self.thinking_token_budget_reqs: set[str] = set()
         self.is_pooling_model = is_pooling_model
-        self.is_spec_decode = is_spec_decode
         self.max_num_reqs = max_num_reqs
         self.max_model_len = max_model_len
         self.max_num_batched_tokens = max_num_batched_tokens
@@ -239,9 +257,6 @@ def __init__(
         # More efficient than num_logprobs=-1 when only a few tokens are needed
         self.logprob_token_ids: dict[str, list[int]] = {}
 
-        # To accumulate prompt logprobs tensor chunks across prefill steps.
-        self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {}
-
         # Internal representation of per-step batch state changes, used for
         # reordering persistent batch and generating logitsprocs batch state
         # updates. Should reset each step.
@@ -344,7 +359,12 @@ def add_request(
         end_idx = start_idx + len(request.output_token_ids)
         if request.prompt_token_ids is not None:
             self.token_ids_cpu[req_index, :num_prompt_tokens] = request.prompt_token_ids
-            self.is_token_ids[req_index, :num_prompt_tokens] = True
+            if request.prompt_is_token_ids is not None:
+                self.is_token_ids[req_index, :num_prompt_tokens] = (
+                    request.prompt_is_token_ids
+                )
+            else:
+                self.is_token_ids[req_index, :num_prompt_tokens] = True
         else:
             self.is_token_ids[req_index, :num_prompt_tokens] = False
         if request.prompt_embeds is not None:
@@ -484,6 +504,7 @@ def update_req_spec_token_ids(
         start_index = self.num_tokens_no_spec[req_index]
         end_token_index = start_index + num_spec_tokens
         self.token_ids_cpu[req_index, start_index:end_token_index] = spec_token_ids
+        self.is_token_ids[req_index, start_index:end_token_index] = True
         cur_spec_token_ids.extend(spec_token_ids)
 
     def remove_request(self, req_id: str) -> int | None:
@@ -531,7 +552,6 @@ def remove_request(self, req_id: str) -> int | None:
         self.generators.pop(req_index, None)
         self.num_logprobs.pop(req_id, None)
         self.logprob_token_ids.pop(req_id, None)
-        self.in_progress_prompt_logprobs_cpu.pop(req_id, None)
         if self.prev_req_id_to_index is not None:
             self.prev_req_id_to_index.pop(req_id, None)
 
@@ -540,6 +560,7 @@ def remove_request(self, req_id: str) -> int | None:
             # False means we don't fill with -inf.
             self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
         self.bad_words_token_ids.pop(req_index, None)
+        self.thinking_token_budget_reqs.discard(req_id)
         return req_index
 
     def swap_states(self, i1: int, i2: int) -> None:
@@ -800,6 +821,8 @@ def refresh_metadata(self):
         # reset batch update tracking.
         # Update sampling metadata if batch state is changed.
         batch_update = self.batch_update_builder.get_and_reset(self.num_reqs)
+        if self.thinking_budget_state_holder is not None and batch_update:
+            self.thinking_budget_state_holder.sync_batch(batch_update)
         for logit_proc in self.logitsprocs.all:
             logit_proc.update_state(batch_update)
         if batch_update:
@@ -853,10 +876,15 @@ def _make_sampling_metadata(self) -> SamplingMetadata:
 
         # Only set output_token_ids if required by the current requests'
         # sampling parameters.
+        holder = self.thinking_budget_state_holder
+        thinking_budget_tracks_reqs = (
+            holder is not None and holder.has_tracked_requests()
+        )
         needs_output_token_ids = (
             not self.no_penalties
             or bool(self.bad_words_token_ids)
             or self.logitsprocs_need_output_token_ids
+            or not thinking_budget_tracks_reqs
         )
         output_token_ids = (
             cast(list[list[int]], self.req_output_token_ids)
@@ -902,6 +930,7 @@ def _make_sampling_metadata(self) -> SamplingMetadata:
             allowed_token_ids_mask=allowed_token_ids_mask,
             bad_words_token_ids=self.bad_words_token_ids,
             logitsprocs=self.logitsprocs,
+            thinking_budget_state_holder=self.thinking_budget_state_holder,
         )
 
     def get_pooling_params(self) -> list[PoolingParams]:
@@ -1076,6 +1105,13 @@ def no_penalties(self) -> bool:
             and len(self.repetition_penalties_reqs) == 0
         )
 
+    @property
+    def no_thinking_budget(self) -> bool:
+        return (
+            self.thinking_budget_state_holder is None
+            or len(self.thinking_token_budget_reqs) == 0
+        )
+
     @property
     def max_num_logprobs(self) -> int | None:
         return max(self.num_logprobs.values()) if self.num_logprobs else None
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a0ba47f945a7..f712f2c499f2 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -169,6 +169,7 @@
 from vllm.v1.spec_decode.draft_model import DraftModelProposer
 from vllm.v1.spec_decode.eagle import EagleProposer
 from vllm.v1.spec_decode.extract_hidden_states import ExtractHiddenStatesProposer
+from vllm.v1.spec_decode.gemma4 import Gemma4Proposer
 from vllm.v1.spec_decode.medusa import MedusaProposer
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.spec_decode.ngram_proposer_gpu import (
@@ -524,6 +525,7 @@ def __init__(
                 | DraftModelProposer
                 | MedusaProposer
                 | ExtractHiddenStatesProposer
+                | Gemma4Proposer
             )
             if self.speculative_config.method == "ngram":
                 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
@@ -552,6 +554,8 @@ def __init__(
                 self._ngram_pinned_val_buf = torch.zeros(
                     self.max_num_reqs, dtype=torch.int32, pin_memory=True
                 )
+            elif self.speculative_config.use_gemma4_mtp():
+                self.drafter = Gemma4Proposer(self.vllm_config, self.device, self)
             elif self.speculative_config.use_dflash():
                 self.drafter = DFlashProposer(self.vllm_config, self.device, self)
                 self.use_aux_hidden_state_outputs = True
@@ -629,7 +633,7 @@ def __init__(
             vocab_size=self.model_config.get_vocab_size(),
             block_sizes=[placeholder_block_size],
             kernel_block_sizes=[placeholder_block_size],
-            is_spec_decode=bool(self.vllm_config.speculative_config),
+            num_spec_tokens=self.num_spec_tokens,
             logitsprocs=build_logitsprocs(
                 self.vllm_config,
                 self.device,
@@ -645,6 +649,7 @@ def __init__(
             or self.vllm_config.reasoning_config is not None,
             is_pooling_model=self.is_pooling_model,
             cp_kv_cache_interleave_size=self.parallel_config.cp_kv_cache_interleave_size,
+            reasoning_config=self.vllm_config.reasoning_config,
         )
 
         # Separate cuda stream for overlapping transfer of sampled token ids from
@@ -883,6 +888,9 @@ def reset_encoder_cache(self) -> None:
         self.encoder_cache.clear()
         self.late_interaction_runner.clear()
 
+    def post_kv_cache_wake_up(self) -> None:
+        self.init_fp8_kv_scales()
+
     @torch.inference_mode()
     def init_fp8_kv_scales(self) -> None:
         """
@@ -1159,6 +1167,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> Callable | None
                 req_id=req_id,
                 prompt_token_ids=new_req_data.prompt_token_ids,
                 prompt_embeds=new_req_data.prompt_embeds,
+                prompt_is_token_ids=new_req_data.prompt_is_token_ids,
                 mm_features=new_req_data.mm_features,
                 sampling_params=sampling_params,
                 pooling_params=pooling_params,
@@ -1338,13 +1347,27 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> Callable | None
             # For the last rank, we don't need to update the token_ids_cpu
             # because the sampled tokens are already cached.
             if not is_last_rank:
-                # Add new_token_ids to token_ids_cpu.
-                start_token_index = num_computed_tokens
-                end_token_index = num_computed_tokens + len(new_token_ids)
-                self.input_batch.token_ids_cpu[
-                    req_index, start_token_index:end_token_index
-                ] = new_token_ids
-                self.input_batch.num_tokens_no_spec[req_index] = end_token_index
+                start_token_index = self.input_batch.num_tokens_no_spec[req_index]
+                # For chunked prefill, num_computed_tokens may less
+                # than num_tokens_no_spec.
+                # Async scheduled PP: no new_token_ids, advance num_tokens_no_spec
+                # according to num_computed_tokens.
+                end_token_index = max(
+                    start_token_index,
+                    num_computed_tokens + len(new_token_ids),
+                )
+                if end_token_index > start_token_index:
+                    if new_token_ids:
+                        # Add new_token_ids to token_ids_cpu.
+                        num_new_tokens = end_token_index - start_token_index
+                        tokens_to_append = new_token_ids[-num_new_tokens:]
+                        self.input_batch.token_ids_cpu[
+                            req_index, start_token_index:end_token_index
+                        ] = tokens_to_append
+                    self.input_batch.is_token_ids[
+                        req_index, start_token_index:end_token_index
+                    ] = True
+                    self.input_batch.num_tokens_no_spec[req_index] = end_token_index
 
             # Add spec_token_ids to token_ids_cpu.
             self.input_batch.update_req_spec_token_ids(req_state, scheduled_spec_tokens)
@@ -1501,10 +1524,16 @@ def _init_mrope_positions(self, req_state: CachedRequestState):
         )
         mrope_model = cast(SupportsMRoPE, model)
 
+        # `prompt_embeds` is a passthrough modality (no grid_thw), models'
+        # M-RoPE code assumes per-feature grid info, so filter it out. The
+        # prompt_embeds positions are treated as text positions for M-RoPE.
+        mrope_features = [
+            f for f in req_state.mm_features if f.modality != "prompt_embeds"
+        ]
         req_state.mrope_positions, req_state.mrope_position_delta = (
             mrope_model.get_mrope_input_positions(
                 req_state.prompt_token_ids,
-                req_state.mm_features,
+                mrope_features,
             )
         )
 
@@ -2299,11 +2328,18 @@ def _build_attn_group_metadata(
                 cm.slot_mapping = slot_mappings[kv_cache_gid]
 
             if self.speculative_config and spec_decode_common_attn_metadata is None:
-                if isinstance(self.drafter, (EagleProposer, DFlashProposer)):
+                if isinstance(
+                    self.drafter, (EagleProposer, DFlashProposer, Gemma4Proposer)
+                ):
                     if self.drafter.kv_cache_gid == kv_cache_gid:
                         spec_decode_common_attn_metadata = cm
                 else:
                     spec_decode_common_attn_metadata = cm
+            # Capture per-group block tables for multi-group proposers.
+            if self.speculative_config and isinstance(self.drafter, Gemma4Proposer):
+                self.drafter.set_per_group_block_table(
+                    kv_cache_gid, cm.block_table_tensor
+                )
 
             for attn_gid in range(len(self.attn_groups[kv_cache_gid])):
                 if ubatch_slices is not None:
@@ -2740,6 +2776,33 @@ def _execute_mm_encoder(
         if not mm_kwargs:
             return []
 
+        # `prompt_embeds` is a passthrough modality, the tensor is already in
+        # the model embedding space, so no encoder runs. Inject each
+        # `prompt_embeds` tensor directly into the encoder cache here so that
+        # `_gather_mm_embeddings` can splice it via the standard `is_mm_embed`
+        # path.
+        pe_indices = [
+            i
+            for i, (modality, _) in enumerate(mm_kwargs)
+            if modality == "prompt_embeds"
+        ]
+        if pe_indices:
+            for i in pe_indices:
+                pe_tensor = mm_kwargs[i][1]["embedding"].data
+                assert isinstance(pe_tensor, torch.Tensor)
+
+                self.encoder_cache[mm_hashes[i]] = pe_tensor.to(self.device)
+                self.maybe_save_ec_to_connector(self.encoder_cache, mm_hashes[i])
+            # Filter out `prompt_embeds` items from mm_kwargs/mm_hashes/mm_lora_refs
+            # since they don't require further encoder processing.
+            mm_hashes = [h for i, h in enumerate(mm_hashes) if i not in pe_indices]
+            mm_kwargs = [k for i, k in enumerate(mm_kwargs) if i not in pe_indices]
+            mm_lora_refs = [
+                r for i, r in enumerate(mm_lora_refs) if i not in pe_indices
+            ]
+            if not mm_kwargs:
+                return []  # nothing left to encode after filtering out `prompt_embeds`
+
         should_time = bool(
             self.observability_config
             and self.observability_config.enable_mm_processor_stats
@@ -4144,7 +4207,7 @@ def sample_tokens(
             kv_connector_output = self.kv_connector_output
             self.kv_connector_output = None
             # receive sampled token ids from the last PP rank.
-            if self.use_async_scheduling and get_pp_group().world_size > 1:
+            if self.use_async_scheduling and not get_pp_group().is_last_rank:
                 self._pp_receive_prev_sampled_token_ids_to_input_batch()
             if not kv_connector_output:
                 return None  # type: ignore[return-value]
@@ -4238,7 +4301,8 @@ def propose_draft_token_ids(sampled_token_ids):
                     EagleProposer
                     | DFlashProposer
                     | DraftModelProposer
-                    | ExtractHiddenStatesProposer,
+                    | ExtractHiddenStatesProposer
+                    | Gemma4Proposer,
                 )
                 sampled_token_ids = sampler_output.sampled_token_ids
                 if input_fits_in_drafter:
@@ -4634,7 +4698,8 @@ def propose_draft_token_ids(
             or spec_config.uses_draft_model()
         ):
             assert isinstance(
-                self.drafter, EagleProposer | DFlashProposer | DraftModelProposer
+                self.drafter,
+                EagleProposer | DFlashProposer | DraftModelProposer | Gemma4Proposer,
             )
 
             if spec_config.disable_padded_drafter_batch:
@@ -5056,7 +5121,6 @@ def _get_prompt_logprobs_dict(
         if not num_prompt_logprobs_dict:
             return {}
 
-        in_progress_dict = self.input_batch.in_progress_prompt_logprobs_cpu
         prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
 
         # Since prompt logprobs are a rare feature, prioritize simple,
@@ -5080,14 +5144,14 @@ def _get_prompt_logprobs_dict(
             )
 
             # Set up target LogprobsTensors object.
-            logprobs_tensors = in_progress_dict.get(req_id)
-            if not logprobs_tensors:
+            logprobs_tensors = request.in_progress_prompt_logprobs_cpu
+            if logprobs_tensors is None:
                 # Create empty logprobs CPU tensors for the entire prompt.
                 # If chunked, we'll copy in slice by slice.
                 logprobs_tensors = LogprobsTensors.empty_cpu(
                     num_prompt_tokens - 1, num_prompt_logprobs + 1
                 )
-                in_progress_dict[req_id] = logprobs_tensors
+                request.in_progress_prompt_logprobs_cpu = logprobs_tensors
 
             # Determine number of logits to retrieve.
             start_idx = request.num_computed_tokens
@@ -5144,7 +5208,7 @@ def _get_prompt_logprobs_dict(
         # num_prompt_logprobs_dict.
         for req_id in completed_prefill_reqs:
             del num_prompt_logprobs_dict[req_id]
-            del in_progress_dict[req_id]
+            self.requests[req_id].in_progress_prompt_logprobs_cpu = None
 
         # Must synchronize the non-blocking GPU->CPU transfers.
         if prompt_logprobs_dict:
@@ -5557,7 +5621,8 @@ def _dummy_run(
                     EagleProposer
                     | DFlashProposer
                     | DraftModelProposer
-                    | ExtractHiddenStatesProposer,
+                    | ExtractHiddenStatesProposer
+                    | Gemma4Proposer,
                 )
                 assert self.speculative_config is not None
                 # Eagle currently only supports PIECEWISE cudagraphs.
@@ -5663,6 +5728,26 @@ def _dummy_sampler_run(
             sampler_output = self.sampler(
                 logits=logits, sampling_metadata=dummy_metadata
             )
+            # Also warm forward_native (taken when generators dict is non-empty),
+            # but skip the extra call in 'processed_logits' / 'processed_logprobs'
+            # modes — there TopKTopPSampler binds forward = forward_native at
+            # init time, so the warmup call is redundant and only inflates peak
+            # memory during profile_run.
+            # No .clone() of logits: warmup output is discarded, so any in-place
+            # mutation by forward_native does not affect correctness.
+            if self.sampler.logprobs_mode not in (
+                "processed_logits",
+                "processed_logprobs",
+            ):
+                self.sampler(
+                    logits=logits,
+                    sampling_metadata=replace(
+                        dummy_metadata,
+                        generators={
+                            0: torch.Generator(device=self.device).manual_seed(0)
+                        },
+                    ),
+                )
         except RuntimeError as e:
             if "out of memory" in str(e):
                 raise RuntimeError(
@@ -5874,7 +5959,7 @@ def _init_minimal_kv_cache_for_profiling(self) -> None:
         saved_override = self.cache_config.num_gpu_blocks_override
         self.cache_config.num_gpu_blocks_override = min_blocks
         minimal_config = get_kv_cache_config_from_groups(
-            self.vllm_config, kv_cache_groups, available_memory=0, suppress_log=True
+            self.vllm_config, kv_cache_groups, available_memory=0
         )
         self.cache_config.num_gpu_blocks_override = saved_override
 
@@ -6338,7 +6423,8 @@ def initialize_metadata_builders(
             or self.speculative_config.uses_draft_model()
         ):
             assert isinstance(
-                self.drafter, EagleProposer | DFlashProposer | DraftModelProposer
+                self.drafter,
+                EagleProposer | DFlashProposer | DraftModelProposer | Gemma4Proposer,
             )
             self.drafter.initialize_attn_backend(kv_cache_config, kernel_block_sizes)
 
@@ -6391,7 +6477,10 @@ def _check_and_update_cudagraph_mode(
         ):
             assert isinstance(
                 self.drafter,
-                EagleProposer | DFlashProposer | ExtractHiddenStatesProposer,
+                EagleProposer
+                | DFlashProposer
+                | ExtractHiddenStatesProposer
+                | Gemma4Proposer,
             )
             self.drafter.initialize_cudagraph_keys(cudagraph_mode)
 
@@ -6504,10 +6593,11 @@ def may_reinitialize_input_batch(
                 block_sizes=block_sizes,
                 kernel_block_sizes=kernel_block_sizes,
                 max_num_blocks_per_req=max_num_blocks,
-                is_spec_decode=bool(self.vllm_config.speculative_config),
+                num_spec_tokens=self.num_spec_tokens,
                 logitsprocs=self.input_batch.logitsprocs,
                 logitsprocs_need_output_token_ids=self.input_batch.logitsprocs_need_output_token_ids,
                 is_pooling_model=self.is_pooling_model,
+                reasoning_config=self.vllm_config.reasoning_config,
             )
 
         assert self._init_block_sizes == block_sizes, (
diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py
index 01f18a11948d..a8c54c4f066e 100644
--- a/vllm/v1/worker/gpu_ubatch_wrapper.py
+++ b/vllm/v1/worker/gpu_ubatch_wrapper.py
@@ -71,8 +71,8 @@ def __init__(
                 A function that sets the number of SMs for computation.
         """
 
-        assert current_platform.is_cuda(), (
-            "SM control is currently only supported on CUDA"
+        assert current_platform.is_cuda() or current_platform.is_rocm(), (
+            "SM/CU control is supported on CUDA and ROCm platforms"
         )
         device = torch.accelerator.current_device_index()
         total_sms = num_compute_units(device)
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 19d0a68142e2..2c89fd04cfb1 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -5,12 +5,13 @@
 import gc
 import os
 from collections.abc import Callable
-from contextlib import AbstractContextManager, nullcontext
+from contextlib import AbstractContextManager, contextmanager, nullcontext
 from datetime import timedelta
 from types import NoneType
 from typing import TYPE_CHECKING, Any
 
 import numpy as np
+import regex as re
 import torch
 import torch.nn as nn
 
@@ -46,7 +47,7 @@
 from vllm.tracing import instrument
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling
-from vllm.utils.torch_utils import is_quantized_kv_cache, set_random_seed
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import (
@@ -192,15 +193,8 @@ def wake_up(self, tags: list[str] | None = None) -> None:
                     buffer.data.copy_(self._sleep_saved_buffers[name].data)
             self._sleep_saved_buffers = {}
 
-        # If the KV cache has just been woken up,
-        # the internal state of cache_engine must be reset,
-        # especially the FP8 scaling factor.
-        if (
-            (tags is None or "kv_cache" in tags)
-            and is_quantized_kv_cache(self.cache_config.cache_dtype)
-            and hasattr(self.model_runner, "init_fp8_kv_scales")
-        ):
-            self.model_runner.init_fp8_kv_scales()
+        if tags is None or "kv_cache" in tags:
+            self.model_runner.post_kv_cache_wake_up()
 
     def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager:
         if not self.vllm_config.model_config.enable_sleep_mode:
@@ -215,6 +209,30 @@ def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager:
             )
         return allocator.use_memory_pool(tag=tag)
 
+    @contextmanager
+    def _scoped_allocator_max_split(self, max_split_size_mb: int):
+        """Temporarily set max_split_size_mb to reduce allocator fragmentation at the
+        cost of more cudaMalloc calls (negligible in practice). Restores the original
+        value on exit."""
+        if not current_platform.is_cuda():
+            yield
+            return
+
+        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+        match = re.search(r"max_split_size_mb:(\d+)", conf)
+        original_value = match.group(1) if match else None
+
+        torch._C._accelerator_setAllocatorSettings(
+            f"max_split_size_mb:{max_split_size_mb}"
+        )
+        try:
+            yield
+        finally:
+            # PyTorch defaults to SIZE_MAX (no limit).
+            _SIZE_MAX_MB = (2**64 - 1) // (1024 * 1024)
+            restore = original_value if original_value else str(_SIZE_MAX_MB)
+            torch._C._accelerator_setAllocatorSettings(f"max_split_size_mb:{restore}")
+
     @instrument(span_name="Init device")
     def init_device(self):
         if self.device_config.device_type == "cuda":
@@ -319,6 +337,8 @@ def load_model(self, *, load_dummy_weights: bool = False) -> None:
         with (
             self._maybe_get_memory_pool_context(tag="weights"),
             set_current_vllm_config(self.vllm_config),
+            # 20 MiB is the minimum PyTorch allows for max_split_size_mb.
+            self._scoped_allocator_max_split(max_split_size_mb=20),
         ):
             self.model_runner.load_model(load_dummy_weights=load_dummy_weights)
 
@@ -378,7 +398,7 @@ def determine_available_memory(self) -> int:
             # differently and can produce incorrect/negative estimates.
             cudagraph_memory_estimate = 0
             if (
-                not current_platform.is_rocm()
+                current_platform.is_cuda()
                 and self.vllm_config.compilation_config.cudagraph_mode
                 != CUDAGraphMode.NONE
             ):
@@ -691,6 +711,14 @@ def compile_or_warm_up_model(self) -> CompilationTimes:
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
 
+        # All warmup is done — start monitoring for unexpected JIT
+        # compilations that would cause latency spikes during inference.
+        from vllm.triton_utils.jit_monitor import (
+            activate as activate_triton_jit_monitor,
+        )
+
+        activate_triton_jit_monitor()
+
         return CompilationTimes(
             language_model=self.compilation_config.compilation_time,
             encoder=self.compilation_config.encoder_compilation_time,
diff --git a/vllm/v1/worker/ubatch_utils.py b/vllm/v1/worker/ubatch_utils.py
index 1338b46996fc..f4a76529023c 100644
--- a/vllm/v1/worker/ubatch_utils.py
+++ b/vllm/v1/worker/ubatch_utils.py
@@ -214,7 +214,9 @@ def _make_metadata_with_slice(
             seq_lens_cpu_upper_bound[-1] -= tokens_skipped
 
     assert seq_lens_cpu_upper_bound is not None
-    max_seq_len = int(seq_lens_cpu_upper_bound.max())
+    # Preserve the max_seq_len override set during CUDA-graph capture so
+    # the attention backend selects the correct kernel for SWA layers.
+    max_seq_len = max(int(seq_lens_cpu_upper_bound.max()), attn_metadata.max_seq_len)
 
     num_requests = request_slice.stop - request_slice.start
     num_actual_tokens = token_slice.stop - token_slice.start