diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index bad2796266a7..76207e5b368e 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -460,7 +460,7 @@ steps:
   - tests/lora
   - vllm/platforms/rocm.py
   commands:
-  - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemodel_lora.py
+  - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_qwen3_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemodel_lora.py
 
 #------------------------------------------------------  mi250 · model_executor  -------------------------------------------------------#
 
@@ -880,7 +880,7 @@ steps:
   - vllm/platforms/rocm.py
   commands:
   - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
-  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
+  - ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
 
 - label: V1 e2e (2 GPUs) # TBD
   timeout_in_minutes: 180
@@ -929,6 +929,7 @@ steps:
   - tests/tokenizers_
   - tests/reasoning
   - tests/tool_parsers
+  - tests/parser
   - tests/transformers_utils
   - tests/config
   commands:
@@ -942,6 +943,7 @@ steps:
   - pytest -v -s tokenizers_
   - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py
   - pytest -v -s tool_parsers
+  - pytest -v -s parser
   - pytest -v -s transformers_utils
   - pytest -v -s config
 
@@ -1100,13 +1102,13 @@ steps:
   - vllm/compilation/
   - vllm/model_executor/layers
   - tests/compile/passes/distributed/
+  - tests/compile/fusions_e2e/
   - vllm/_aiter_ops.py
   - vllm/platforms/rocm.py
   commands:
   - export VLLM_TEST_CLEAN_GPU_MEMORY=1
   - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
-  - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
-  - pytest -v -s tests/compile/passes/distributed/test_tp2_ar_rms.py::test_tp2_ar_rms_fusions
+  - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py::test_tp2_ar_rms_fusions
 
 #-----------------------------------------------------------  mi300 · cuda  ------------------------------------------------------------#
 
@@ -1320,7 +1322,6 @@ steps:
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py
-  - pytest -v -s entrypoints/openai/speech_to_text/
   - pytest -v -s entrypoints/test_chat_utils.py
 
 - label: Entrypoints Integration (API Server openai - Part 3) # TBD
@@ -1336,7 +1337,21 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
+
+- label: Entrypoints Integration (Speech to Text) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
+  agent_pool: mi300_1
+  fast_check: true
+  torch_nightly: true
+  working_dir: "/vllm-workspace/tests"  # the pytest path under commands is relative to this directory
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/speech_to_text  # the test directory this step runs
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/speech_to_text
 
 - label: Entrypoints Integration (LLM) # TBD
   timeout_in_minutes: 180
@@ -1760,7 +1775,7 @@ steps:
   - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
   - pytest -v -s -x lora/test_chatglm3_tp.py
   - pytest -v -s -x lora/test_llama_tp.py
-  - pytest -v -s -x lora/test_llm_with_multi_loras.py
+  - pytest -v -s -x lora/test_qwen3_with_multi_loras.py
   - pytest -v -s -x lora/test_olmoe_tp.py
   - pytest -v -s -x lora/test_gptoss_tp.py
   - pytest -v -s -x lora/test_qwen35_densemodel_lora.py
@@ -1803,9 +1818,10 @@ steps:
   - tests/models/multimodal/generation
   - tests/models/multimodal/test_mapping.py
   commands:
-  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py
-  - pytest -v -s models/multimodal/test_mapping.py
+  - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@rocm-7.0-v2.3.0'
+  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0'
+  - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB  # NOTE(review): this step's source_file_dependencies list tests/models/multimodal/*, yet it now runs models/language/generation - confirm the replacement is intended
+
 
 - label: Multi-Modal Models (Extended Generation 2) # TBD
   timeout_in_minutes: 180
@@ -1817,8 +1833,10 @@ steps:
   - vllm/
   - tests/models/multimodal/generation
   commands:
-  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+  - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@rocm-7.0-v2.3.0'
+  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0'
+  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'  # NOTE(review): this step depends on tests/models/multimodal/generation, yet it now runs models/language/generation - confirm the replacement is intended
+
 
 - label: Multi-Modal Models (Extended Generation 3) # TBD
   timeout_in_minutes: 180
@@ -2763,7 +2781,6 @@ steps:
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py
-  - pytest -v -s entrypoints/openai/speech_to_text/
   - pytest -v -s entrypoints/test_chat_utils.py
 
 - label: Entrypoints Integration (API Server openai - Part 3) # TBD
@@ -2779,7 +2796,21 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
+
+- label: Entrypoints Integration (Speech to Text) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi355]  # NOTE(review): amdgfx942nightly sits next to amdmi355/mi355_1 here - confirm the gfx942 tag is intended for this pool
+  agent_pool: mi355_1
+  fast_check: true
+  torch_nightly: true
+  working_dir: "/vllm-workspace/tests"  # the pytest path under commands is relative to this directory
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/speech_to_text  # the test directory this step runs
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/speech_to_text
 
 - label: Entrypoints Integration (Pooling) # TBD
   timeout_in_minutes: 180
@@ -3043,7 +3074,7 @@ steps:
   - vllm/
   - tests/models/language/generation
   commands:
-  - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
+  - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@rocm-7.0-v2.3.0'
   - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0'
   - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 
@@ -3318,7 +3349,7 @@ steps:
   - vllm/platforms/rocm.py
   commands:
   - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
-  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
+  - ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
 
 - label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD
   timeout_in_minutes: 180
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
index ba92d3a3aec0..f9ddf2603085 100644
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -11,7 +11,7 @@ steps:
   - tests/entrypoints/
   commands:
   - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text
 
 - label: Entrypoints Integration (LLM)
   key: entrypoints-integration-llm
@@ -44,7 +44,6 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
 
-
 - label: Entrypoints Integration (API Server openai - Part 2)
   key: entrypoints-integration-api-server-openai-part-2
   timeout_in_minutes: 50
@@ -55,7 +54,6 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py
-  - pytest -v -s entrypoints/openai/speech_to_text/
   - pytest -v -s entrypoints/test_chat_utils.py
 
 - label: Entrypoints Integration (API Server openai - Part 3)
@@ -69,7 +67,7 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
 
 - label: Entrypoints Integration (API Server 2)
   key: entrypoints-integration-api-server-2
@@ -86,6 +84,17 @@ steps:
   - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
   - pytest -v -s tool_use
 
+- label: Entrypoints Integration (Speech to Text)
+  key: entrypoints-integration-speech_to_text
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"  # the pytest path under commands is relative to this directory
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/speech_to_text  # the test directory this step runs
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/speech_to_text
+
 - label: Entrypoints Integration (Pooling)
   key: entrypoints-integration-pooling
   timeout_in_minutes: 50
@@ -115,5 +124,5 @@ steps:
   - csrc/
   - vllm/entrypoints/openai/
   - vllm/model_executor/models/whisper.py
-  commands: # LMEval+Transcription WER check
+  commands: # LMEval
   - pytest -s entrypoints/openai/correctness/
diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
index f540eb2fcc2a..8107f9b37ff0 100644
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -9,7 +9,7 @@ steps:
   - vllm/lora
   - tests/lora
   commands:
-    - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemodel_lora.py 
+    - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_qwen3_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemodel_lora.py
   parallelism: 4
 
 
@@ -19,6 +19,7 @@ steps:
   num_devices: 4
   source_file_dependencies:
   - vllm/lora
+  - vllm/model_executor/layers/fused_moe/
   - tests/lora
   commands:
     # FIXIT: find out which code initialize cuda before running the test
@@ -30,7 +31,7 @@ steps:
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
     - pytest -v -s -x lora/test_llama_tp.py
-    - pytest -v -s -x lora/test_llm_with_multi_loras.py
+    - pytest -v -s -x lora/test_qwen3_with_multi_loras.py
     - pytest -v -s -x lora/test_olmoe_tp.py
     - pytest -v -s -x lora/test_gptoss_tp.py
     - pytest -v -s -x lora/test_qwen35_densemodel_lora.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index da04c18017db..2a78201a9e47 100644
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -210,6 +210,7 @@ steps:
 - label: Python-only Installation
   key: python-only-installation
   depends_on: ~
+  optional: true
   timeout_in_minutes: 20
   source_file_dependencies:
   - tests/standalone_tests/python_only_compile.sh
@@ -282,6 +283,7 @@ steps:
   - tests/tokenizers_
   - tests/reasoning
   - tests/tool_parsers
+  - tests/parser
   - tests/transformers_utils
   - tests/config
   device: cpu-small
@@ -296,6 +298,7 @@ steps:
   - pytest -v -s tokenizers_
   - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py
   - pytest -v -s tool_parsers
+  - pytest -v -s parser
   - pytest -v -s transformers_utils
   - pytest -v -s config
 
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index a20c5e7e9dce..44cf10076ee7 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -6,8 +6,8 @@
 /vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery @xuechendi
 /vllm/lora @jeejeelee
 /vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
-/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety @zyongye
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety @zyongye
 /vllm/model_executor/layers/mamba @tdoublep @tomeras91
 /vllm/model_executor/layers/mamba/gdn_linear_attn.py @tdoublep @ZJY0516 @vadiklyutiy
 /vllm/model_executor/layers/rotary_embedding.py @vadiklyutiy
@@ -18,7 +18,8 @@
 /vllm/kernels/helion @ProExpertProg @zou3519
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
-CMakeLists.txt @tlrmchlsmth @LucasWilkinson
+/CMakeLists.txt @tlrmchlsmth @LucasWilkinson @Harry-Chen
+/cmake @tlrmchlsmth @LucasWilkinson @Harry-Chen
 
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
@@ -70,6 +71,10 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1/worker/gpu @WoosukKwon @njhill
 /vllm/v1/worker/gpu/kv_connector.py @orozery
 
+# CI & building
+/.buildkite @Harry-Chen
+/docker/Dockerfile @Harry-Chen
+
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin 
 /tests/distributed/test_multi_node_assignment.py @youkaichao
@@ -77,11 +82,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/distributed/test_same_node.py @youkaichao
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
 /tests/evals @mgoin @vadiklyutiy
-/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 @zyongye
 /tests/kernels/ir @ProExpertProg @tjtanaa
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety @zyongye
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
@@ -147,6 +152,12 @@ mkdocs.yaml @hmellor
 # MTP-specific files
 /vllm/model_executor/models/deepseek_mtp.py @luccafong
 
+# DeepseekV4-specific files
+/vllm/v1/attention/ops/deepseek_v4_ops @zyongye
+/vllm/model_executor/layers/deepseek_compressor.py @zyongye
+/vllm/model_executor/layers/deepseek_v4_attention.py @zyongye
+/vllm/model_executor/layers/sparse_attn_indexer.py @zyongye
+
 # Mistral-specific files
 /vllm/model_executor/models/mistral*.py @patrickvonplaten
 /vllm/model_executor/models/mixtral*.py @patrickvonplaten
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 13788fa87437..fd6c7eeffd06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,8 +13,12 @@ cmake_minimum_required(VERSION 3.26)
 # cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)
 
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CUDA_STANDARD 20)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+set(CMAKE_HIP_STANDARD 20)
+set(CMAKE_HIP_STANDARD_REQUIRED ON)
 
 
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
@@ -105,6 +109,24 @@ else()
   set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.7;8.9;9.0")
 endif()
 
+#
+# spinloop extension (pure CXX; must stay above the non-CUDA device branch so
+# CPU builds define the target before the early return)
+#
+set(VLLM_SPINLOOP_EXT_SRC "csrc/spinloop.cpp")
+set(SPINLOOP_COMPILE_FLAGS "")  # stays empty unless the processor check below matches
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64")  # MATCHES is a case-sensitive regex test
+  list(APPEND SPINLOOP_COMPILE_FLAGS "-mmwaitx")  # NOTE(review): presumably enables the MONITORX/MWAITX intrinsics for csrc/spinloop.cpp - confirm
+endif()
+define_extension_target(
+  spinloop
+  DESTINATION vllm
+  LANGUAGE CXX
+  SOURCES ${VLLM_SPINLOOP_EXT_SRC}
+  COMPILE_FLAGS ${SPINLOOP_COMPILE_FLAGS}  # expands to no arguments when the list is empty
+  USE_SABI 3.11  # NOTE(review): presumably the minimum CPython stable-ABI version - confirm against define_extension_target
+  WITH_SOABI)
+
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index d27a5ea93dea..361f08b51054 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -1,7 +1,7 @@
 include(FetchContent)
 
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_EXTENSIONS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
diff --git a/cmake/external_projects/deepgemm.cmake b/cmake/external_projects/deepgemm.cmake
index 0d7ea43fb7d0..07328c271388 100644
--- a/cmake/external_projects/deepgemm.cmake
+++ b/cmake/external_projects/deepgemm.cmake
@@ -76,7 +76,6 @@ if(DEEPGEMM_ARCHS)
     "${deepgemm_SOURCE_DIR}/third-party/fmt/include")
 
   target_compile_options(_deep_gemm_C PRIVATE
-    $<$<COMPILE_LANGUAGE:CXX>:-std=c++17>
     $<$<COMPILE_LANGUAGE:CXX>:-O3>
     $<$<COMPILE_LANGUAGE:CXX>:-Wno-psabi>
     $<$<COMPILE_LANGUAGE:CXX>:-Wno-deprecated-declarations>)
diff --git a/csrc/cache.h b/csrc/cache.h
index 821d5e719a44..a9e74b0dc2df 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -12,7 +12,8 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
 
 void swap_blocks_batch(const torch::Tensor& src_ptrs,
                        const torch::Tensor& dst_ptrs,
-                       const torch::Tensor& sizes);
+                       const torch::Tensor& sizes,
+                       bool is_src_access_order_any);
 
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                        torch::Tensor& key_cache, torch::Tensor& value_cache,
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 895490f45a79..9130dd2ccae7 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -77,7 +77,8 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
 
 void swap_blocks_batch(const torch::Tensor& src_ptrs,
                        const torch::Tensor& dst_ptrs,
-                       const torch::Tensor& sizes) {
+                       const torch::Tensor& sizes,
+                       bool is_src_access_order_any) {
   TORCH_CHECK(src_ptrs.device().is_cpu(), "src_ptrs must be on CPU");
   TORCH_CHECK(dst_ptrs.device().is_cpu(), "dst_ptrs must be on CPU");
   TORCH_CHECK(sizes.device().is_cpu(), "sizes must be on CPU");
@@ -124,7 +125,12 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
 
   if (batch_fn != nullptr) {
     CUmemcpyAttributes attr = {};
-    attr.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;
+    // ANY lets the DMA engine prefetch source bytes out of stream order,
+    // which is only safe when no GPU stream is concurrently writing the
+    // source.
+    attr.srcAccessOrder = is_src_access_order_any
+                              ? CU_MEMCPY_SRC_ACCESS_ORDER_ANY
+                              : CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;
     size_t attrs_idx = 0;
     size_t fail_idx = 0;
     CUresult result = batch_fn(reinterpret_cast<CUdeviceptr*>(dst_data),
diff --git a/csrc/cache_kernels_fused.cu b/csrc/cache_kernels_fused.cu
index be037b2fdec2..8687ebe1f14c 100644
--- a/csrc/cache_kernels_fused.cu
+++ b/csrc/cache_kernels_fused.cu
@@ -21,28 +21,33 @@ namespace vllm {
 
 // NOTE Be EXTRA careful with raw_kv_scalar_t, for __half and __nv_bfloat16 it's
 // using u16 as the backing type.
-template <typename qk_t, bool IS_NEOX, typename raw_kv_scalar_t,
-          typename cache_t, Fp8KVCacheDataType kv_dt>
+template <typename qk_t, typename cos_sin_t, bool IS_NEOX,
+          typename raw_kv_scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void concat_and_cache_mla_rope_fused_kernel(
     const int64_t* __restrict__ positions,  // [num_tokens]
     qk_t* __restrict__ q_pe,        // [num_tokens, num_q_heads, rot_dim]
     qk_t* __restrict__ k_pe,        // [num_tokens, rot_dim]
     const qk_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
-    const qk_t* __restrict__ rope_cos_sin_cache,  // [max_position, 2,
-                                                  // rot_dim // 2]
+    const cos_sin_t* __restrict__ rope_cos_sin_cache,  // [max_position, 2,
+                                                       // rot_dim // 2]
     const int rot_dim, const int64_t q_pe_stride_token,
     const int64_t q_pe_stride_head, const int64_t k_pe_stride,
     const int64_t kv_c_stride, const int num_q_heads,
     cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank +
                                      // rot_dim)]
-    const int64_t* __restrict__ kv_cache_slot_mapping,  // [num_tokens]
+    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
     const int block_stride, const int entry_stride, const int kv_lora_rank,
     const int block_size, const float* kv_cache_quant_scale) {
   // Each thread block is responsible for one token.
   const int64_t token_idx = blockIdx.x;
+  const int64_t slot_idx = slot_mapping[token_idx];
+  // NOTE: slot_idx can be -1 if the token is padded
+  if (slot_idx < 0) {
+    return;
+  }
   const int64_t pos = positions[token_idx];
 
-  const qk_t* cos_sin_ptr = rope_cos_sin_cache + pos * rot_dim;
+  const cos_sin_t* cos_sin_ptr = rope_cos_sin_cache + pos * rot_dim;
 
   const int embed_dim = rot_dim / 2;
 
@@ -54,8 +59,8 @@ __global__ void concat_and_cache_mla_rope_fused_kernel(
 
     // NOTE: Would be nice to have interleaved sin/cos so we could just load
     // both at the same time.
-    qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx);
-    qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim);
+    qk_t cos = static_cast<qk_t>(VLLM_LDG(cos_sin_ptr + pair_idx));
+    qk_t sin = static_cast<qk_t>(VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim));
 
     qk_t* q_pe_head_ptr =
         q_pe + token_idx * q_pe_stride_token + head_idx * q_pe_stride_head;
@@ -81,21 +86,15 @@ __global__ void concat_and_cache_mla_rope_fused_kernel(
     q_pe_head_ptr[pair_idx_y] = y_dst;
   }
 
-  const int64_t slot_idx = kv_cache_slot_mapping[token_idx];
   const int64_t block_idx = slot_idx / block_size;
   const int64_t entry_idx = slot_idx % block_size;
 
-  // NOTE: slot_idx can be -1 if the token is padded
-  if (slot_idx < 0) {
-    return;
-  }
-
   // K with 1 HEAD
   for (int i = threadIdx.x; i < embed_dim; i += blockDim.x) {
     int pair_idx = i;
 
-    qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx);
-    qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim);
+    qk_t cos = static_cast<qk_t>(VLLM_LDG(cos_sin_ptr + pair_idx));
+    qk_t sin = static_cast<qk_t>(VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim));
 
     qk_t* k_pe_head_ptr = k_pe + token_idx * k_pe_stride;
 
@@ -165,36 +164,43 @@ __global__ void concat_and_cache_mla_rope_fused_kernel(
 
 }  // namespace vllm
 
-#define CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED(RAW_KV_T, CACHE_T, KV_DTYPE)      \
-  do {                                                                         \
-    VLLM_DISPATCH_FLOATING_TYPES(q_pe.scalar_type(), "qk_scalar_type", [&] {   \
-      using qk_t = scalar_t;                                                   \
-      if (rope_is_neox) {                                                      \
-        vllm::concat_and_cache_mla_rope_fused_kernel<qk_t, true, RAW_KV_T,     \
-                                                     CACHE_T, KV_DTYPE>        \
-            <<<grid, block, 0, stream>>>(                                      \
-                positions.data_ptr<int64_t>(), q_pe.data_ptr<qk_t>(),          \
-                k_pe.data_ptr<qk_t>(), kv_c.data_ptr<qk_t>(),                  \
-                rope_cos_sin_cache.data_ptr<qk_t>(), rot_dim,                  \
-                q_pe_stride_token, q_pe_stride_head, k_pe_stride, kv_c_stride, \
-                num_q_heads, reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),  \
-                kv_cache_slot_mapping.data_ptr<int64_t>(), block_stride,       \
-                entry_stride, kv_lora_rank, block_size,                        \
-                kv_cache_quant_scale.data_ptr<float>());                       \
-      } else {                                                                 \
-        vllm::concat_and_cache_mla_rope_fused_kernel<qk_t, false, RAW_KV_T,    \
-                                                     CACHE_T, KV_DTYPE>        \
-            <<<grid, block, 0, stream>>>(                                      \
-                positions.data_ptr<int64_t>(), q_pe.data_ptr<qk_t>(),          \
-                k_pe.data_ptr<qk_t>(), kv_c.data_ptr<qk_t>(),                  \
-                rope_cos_sin_cache.data_ptr<qk_t>(), rot_dim,                  \
-                q_pe_stride_token, q_pe_stride_head, k_pe_stride, kv_c_stride, \
-                num_q_heads, reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),  \
-                kv_cache_slot_mapping.data_ptr<int64_t>(), block_stride,       \
-                entry_stride, kv_lora_rank, block_size,                        \
-                kv_cache_quant_scale.data_ptr<float>());                       \
-      }                                                                        \
-    });                                                                        \
+#define CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED(RAW_KV_T, CACHE_T, KV_DTYPE)     \
+  do { /* single-statement macro wrapper */                                   \
+    VLLM_DISPATCH_FLOATING_TYPES(q_pe.scalar_type(), "qk_scalar_type", [&] {  \
+      using qk_t = scalar_t; /* saved: inner dispatch rebinds scalar_t */     \
+      VLLM_DISPATCH_FLOATING_TYPES(                                           \
+          rope_cos_sin_cache.scalar_type(), "rope_cos_sin_cache_scalar_type", \
+          [&] {                                                               \
+            using cos_sin_t = scalar_t; /* may differ from qk_t */            \
+            if (rope_is_neox) {                                               \
+              vllm::concat_and_cache_mla_rope_fused_kernel<                   \
+                  qk_t, cos_sin_t, true, RAW_KV_T, CACHE_T, KV_DTYPE>         \
+                  <<<grid, block, 0, stream>>>(                               \
+                      positions.data_ptr<int64_t>(), q_pe.data_ptr<qk_t>(),   \
+                      k_pe.data_ptr<qk_t>(), kv_c.data_ptr<qk_t>(),           \
+                      rope_cos_sin_cache.data_ptr<cos_sin_t>(), rot_dim,      \
+                      q_pe_stride_token, q_pe_stride_head, k_pe_stride,       \
+                      kv_c_stride, num_q_heads,                               \
+                      reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),        \
+                      slot_mapping.data_ptr<int64_t>(), block_stride,         \
+                      entry_stride, kv_lora_rank, block_size,                 \
+                      kv_cache_quant_scale.data_ptr<float>());                \
+            } else {                                                          \
+              vllm::concat_and_cache_mla_rope_fused_kernel<                   \
+                  qk_t, cos_sin_t, false, RAW_KV_T, CACHE_T, KV_DTYPE>        \
+                  <<<grid, block, 0, stream>>>(                               \
+                      positions.data_ptr<int64_t>(), q_pe.data_ptr<qk_t>(),   \
+                      k_pe.data_ptr<qk_t>(), kv_c.data_ptr<qk_t>(),           \
+                      rope_cos_sin_cache.data_ptr<cos_sin_t>(), rot_dim,      \
+                      q_pe_stride_token, q_pe_stride_head, k_pe_stride,       \
+                      kv_c_stride, num_q_heads,                               \
+                      reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),        \
+                      slot_mapping.data_ptr<int64_t>(), block_stride,         \
+                      entry_stride, kv_lora_rank, block_size,                 \
+                      kv_cache_quant_scale.data_ptr<float>());                \
+            }                                                                 \
+          });                                                                 \
+    });                                                                       \
   } while (false)
 
 // Executes RoPE on q_pe and k_pe, then writes k_pe and kv_c in the kv cache.
@@ -208,43 +214,52 @@ void concat_and_cache_mla_rope_fused(
     torch::Tensor& kv_c,                // [num_tokens, kv_lora_rank]
     torch::Tensor& rope_cos_sin_cache,  // [max_position, rot_dim]
     bool rope_is_neox,
-    torch::Tensor&
-        kv_cache_slot_mapping,  // [num_tokens] or [num_actual_tokens]
+    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
     torch::Tensor&
         kv_cache,  // [num_blocks, block_size, (kv_lora_rank + rot_dim)]
     const std::string& kv_cache_dtype, torch::Tensor& kv_cache_quant_scale) {
-  const int64_t num_tokens = q_pe.size(0);
+  // NOTE(woosuk): In vLLM V1, query/key/position.size(0) can be different from
+  // slot_mapping.size(0) because of padding for CUDA graphs.
+  // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
+  // both include padding.
+  // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
+  // since key includes padding for CUDA graphs, while slot_mapping does not.
+  // In this case, slot_mapping.size(0) represents the actual number of tokens
+  // before padding.
+  // For compatibility with both cases, we use slot_mapping.size(0) as the
+  // number of tokens.
+  int num_tokens = slot_mapping.size(0);
+  int num_padded_tokens = q_pe.size(0);
+  TORCH_CHECK_GE(num_padded_tokens, num_tokens);
 
   const int num_q_heads = q_pe.size(1);
   const int rot_dim = q_pe.size(2);
   const int kv_lora_rank = kv_c.size(1);
 
-  TORCH_CHECK(positions.size(0) >=
-              num_tokens);  // CUDA Graphs might pad this for us
+  TORCH_CHECK_EQ(positions.size(0), num_padded_tokens);
   TORCH_CHECK_EQ(positions.dim(), 1);
   TORCH_CHECK_EQ(positions.scalar_type(), c10::ScalarType::Long);
 
-  TORCH_CHECK_EQ(q_pe.size(0), num_tokens);
+  TORCH_CHECK_EQ(q_pe.dim(), 3);
+  TORCH_CHECK_EQ(q_pe.size(0), num_padded_tokens);
   TORCH_CHECK_EQ(q_pe.size(1), num_q_heads);
   TORCH_CHECK_EQ(q_pe.size(2), rot_dim);
-  TORCH_CHECK_EQ(q_pe.dim(), 3);
 
-  TORCH_CHECK_EQ(k_pe.size(0), num_tokens);
-  TORCH_CHECK_EQ(k_pe.size(1), rot_dim);
   TORCH_CHECK_EQ(k_pe.dim(), 2);
+  TORCH_CHECK_EQ(k_pe.size(0), num_padded_tokens);
+  TORCH_CHECK_EQ(k_pe.size(1), rot_dim);
   TORCH_CHECK_EQ(k_pe.scalar_type(), q_pe.scalar_type());
 
-  TORCH_CHECK_EQ(kv_c.size(0), num_tokens);
-  TORCH_CHECK_EQ(kv_c.size(1), kv_lora_rank);
   TORCH_CHECK_EQ(kv_c.dim(), 2);
+  TORCH_CHECK_EQ(kv_c.size(0), num_padded_tokens);
+  TORCH_CHECK_EQ(kv_c.size(1), kv_lora_rank);
   TORCH_CHECK_EQ(kv_c.scalar_type(), q_pe.scalar_type());
   TORCH_CHECK_EQ(kv_c.dtype(), q_pe.dtype());
 
   TORCH_CHECK_EQ(rope_cos_sin_cache.size(1), rot_dim);
-  TORCH_CHECK_EQ(rope_cos_sin_cache.scalar_type(), q_pe.scalar_type());
 
-  TORCH_CHECK_EQ(kv_cache_slot_mapping.size(0), num_tokens);
-  TORCH_CHECK_EQ(kv_cache_slot_mapping.scalar_type(), c10::ScalarType::Long);
+  TORCH_CHECK_EQ(slot_mapping.size(0), num_tokens);
+  TORCH_CHECK_EQ(slot_mapping.scalar_type(), c10::ScalarType::Long);
 
   TORCH_CHECK_EQ(kv_cache.size(2), kv_lora_rank + rot_dim);
   TORCH_CHECK_EQ(kv_cache.dim(), 3);
diff --git a/csrc/core/batch_invariant.hpp b/csrc/core/batch_invariant.hpp
index fffe96b86857..8273bc74b1ef 100644
--- a/csrc/core/batch_invariant.hpp
+++ b/csrc/core/batch_invariant.hpp
@@ -1,7 +1,6 @@
 #pragma once
 #include <cstdlib>
 #include <string>
-#include <cctype>
 
 namespace vllm {
 
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
index e910103c4eae..84040a6a2218 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
@@ -1,5 +1,6 @@
 #include "scaled_mm_kernels.hpp"
 #include "scaled_mm_sm100_fp8_dispatch.cuh"
+#include "core/batch_invariant.hpp"
 
 namespace vllm {
 
@@ -13,9 +14,17 @@ void cutlass_scaled_mm_sm100_fp8(
     STD_TORCH_CHECK(bias->scalar_type() == out.scalar_type(),
                     "currently bias dtype must match output dtype ",
                     out.scalar_type());
+    if (vllm_is_batch_invariant()) {
+      return cutlass_scaled_mm_sm100_fp8_batch_invariant_epilogue<true>(
+          out, a, b, a_scales, b_scales, *bias);
+    }
     return cutlass_scaled_mm_sm100_fp8_epilogue<true>(out, a, b, a_scales,
                                                       b_scales, *bias);
   } else {
+    if (vllm_is_batch_invariant()) {
+      return cutlass_scaled_mm_sm100_fp8_batch_invariant_epilogue<false>(
+          out, a, b, a_scales, b_scales);
+    }
     return cutlass_scaled_mm_sm100_fp8_epilogue<false>(out, a, b, a_scales,
                                                        b_scales);
   }
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
index 5cd55f0198c2..f790b3653d57 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -294,6 +294,34 @@ inline void cutlass_gemm_sm100_fp8_dispatch(
   }
 }
 
+// Batch-invariant SM100 FP8 GEMM dispatch.
+//
+// The kernel configuration is chosen from a.size(1) only, never from M, so
+// the selection does not change with the number of rows in `a`. The swap-AB
+// configuration receives the scale tensors in swapped order (b_scales first).
+template <typename InType, typename OutType, bool EnableBias,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm100_fp8_batch_invariant_dispatch(
+    torch::stable::Tensor& out, torch::stable::Tensor const& a,
+    torch::stable::Tensor const& b, torch::stable::Tensor const& a_scales,
+    torch::stable::Tensor const& b_scales, EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>::value);
+  STD_TORCH_CHECK(a.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+  STD_TORCH_CHECK(b.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+
+  using PlainGemm =
+      typename sm100_fp8_config_M64<InType, OutType, EnableBias>::Cutlass3xGemm;
+  using SwapABGemm =
+      typename sm100_fp8_config_M64_swap_ab<InType, OutType,
+                                            EnableBias>::Cutlass3xGemm;
+
+  // Only the second dimension of `a` steers the choice of configuration.
+  uint32_t const reduction_dim = a.size(1);
+  if (reduction_dim >= 4096) {
+    // Swap-AB kernel: note the swapped scale order.
+    cutlass_gemm_caller_sm100_fp8<SwapABGemm>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  } else {
+    cutlass_gemm_caller_sm100_fp8<PlainGemm>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
 template <bool EnableBias, typename... EpilogueArgs>
 void cutlass_scaled_mm_sm100_fp8_epilogue(torch::stable::Tensor& out,
                                           torch::stable::Tensor const& a,
@@ -320,4 +348,28 @@ void cutlass_scaled_mm_sm100_fp8_epilogue(torch::stable::Tensor& out,
   }
 }
 
+// Entry point for the batch-invariant SM100 FP8 scaled matmul.
+//
+// Validates that both inputs are Float8_e4m3fn, then forwards to the
+// batch-invariant dispatcher instantiated for the output dtype (BFloat16 or
+// Half; any other output dtype fails the check below).
+template <bool EnableBias, typename... EpilogueArgs>
+void cutlass_scaled_mm_sm100_fp8_batch_invariant_epilogue(
+    torch::stable::Tensor& out, torch::stable::Tensor const& a,
+    torch::stable::Tensor const& b, torch::stable::Tensor const& a_scales,
+    torch::stable::Tensor const& b_scales, EpilogueArgs&&... epilogue_args) {
+  STD_TORCH_CHECK(a.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+  STD_TORCH_CHECK(b.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+
+  if (out.scalar_type() != torch::headeronly::ScalarType::BFloat16) {
+    // Half is the only other supported output type.
+    STD_TORCH_CHECK(out.scalar_type() == torch::headeronly::ScalarType::Half);
+    cutlass_gemm_sm100_fp8_batch_invariant_dispatch<
+        cutlass::float_e4m3_t, cutlass::half_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
+    return;
+  }
+
+  cutlass_gemm_sm100_fp8_batch_invariant_dispatch<
+      cutlass::float_e4m3_t, cutlass::bfloat16_t, EnableBias>(
+      out, a, b, a_scales, b_scales,
+      std::forward<EpilogueArgs>(epilogue_args)...);
+}
+
 }  // namespace vllm
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu
index fb84faa2a41a..972d6c626062 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu
@@ -1,5 +1,6 @@
 #include "scaled_mm_kernels.hpp"
 #include "scaled_mm_sm120_fp8_dispatch.cuh"
+#include "core/batch_invariant.hpp"
 #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 
 namespace vllm {
@@ -14,9 +15,17 @@ void cutlass_scaled_mm_sm120_fp8(
     STD_TORCH_CHECK(bias->scalar_type() == out.scalar_type(),
                     "currently bias dtype must match output dtype ",
                     out.scalar_type());
+    if (vllm_is_batch_invariant()) {
+      return cutlass_scaled_mm_sm120_fp8_batch_invariant_epilogue<
+          c3x::ScaledEpilogueBias>(out, a, b, a_scales, b_scales, *bias);
+    }
     return cutlass_scaled_mm_sm120_fp8_epilogue<c3x::ScaledEpilogueBias>(
         out, a, b, a_scales, b_scales, *bias);
   } else {
+    if (vllm_is_batch_invariant()) {
+      return cutlass_scaled_mm_sm120_fp8_batch_invariant_epilogue<
+          c3x::ScaledEpilogue>(out, a, b, a_scales, b_scales);
+    }
     return cutlass_scaled_mm_sm120_fp8_epilogue<c3x::ScaledEpilogue>(
         out, a, b, a_scales, b_scales);
   }
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh
index 245f5c10fcad..226e4f7a6bdb 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh
@@ -179,6 +179,26 @@ inline void cutlass_gemm_sm120_fp8_dispatch(torch::stable::Tensor& out,
       out, a, b, std::forward<EpilogueArgs>(args)...);
 }
 
+// Batch-invariant SM120 FP8 GEMM dispatch.
+//
+// Always launches the M64 configuration, so the kernel choice does not
+// depend on M (or on any other problem dimension).
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm120_fp8_batch_invariant_dispatch(
+    torch::stable::Tensor& out, torch::stable::Tensor const& a,
+    torch::stable::Tensor const& b, EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>::value);
+  STD_TORCH_CHECK(a.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+  STD_TORCH_CHECK(b.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+
+  // Single fixed configuration used for every input shape.
+  using FixedGemm =
+      typename sm120_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+
+  cutlass_gemm_caller<FixedGemm>(out, a, b,
+                                 std::forward<EpilogueArgs>(args)...);
+}
+
 template <template <typename, typename, typename> typename Epilogue,
           typename... EpilogueArgs>
 void cutlass_scaled_mm_sm120_fp8_epilogue(torch::stable::Tensor& out,
@@ -202,4 +222,26 @@ void cutlass_scaled_mm_sm120_fp8_epilogue(torch::stable::Tensor& out,
   }
 }
 
+// Entry point for the batch-invariant SM120 FP8 scaled matmul.
+//
+// Validates that both inputs are Float8_e4m3fn, then forwards to the
+// batch-invariant dispatcher instantiated for the output dtype (BFloat16 or
+// Half; any other output dtype fails the check below).
+template <template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm120_fp8_batch_invariant_epilogue(
+    torch::stable::Tensor& out, torch::stable::Tensor const& a,
+    torch::stable::Tensor const& b, EpilogueArgs&&... epilogue_args) {
+  STD_TORCH_CHECK(a.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+  STD_TORCH_CHECK(b.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+
+  if (out.scalar_type() != torch::headeronly::ScalarType::BFloat16) {
+    // Half is the only other supported output type.
+    STD_TORCH_CHECK(out.scalar_type() == torch::headeronly::ScalarType::Half);
+    cutlass_gemm_sm120_fp8_batch_invariant_dispatch<
+        cutlass::float_e4m3_t, cutlass::half_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+    return;
+  }
+
+  cutlass_gemm_sm120_fp8_batch_invariant_dispatch<
+      cutlass::float_e4m3_t, cutlass::bfloat16_t, Epilogue>(
+      out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+}
+
 }  // namespace vllm
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu
index 0ce361123e5d..e86c9bd48d3f 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu
@@ -1,5 +1,6 @@
 #include "scaled_mm_kernels.hpp"
 #include "scaled_mm_sm90_fp8_dispatch.cuh"
+#include "core/batch_invariant.hpp"
 
 namespace vllm {
 
@@ -13,9 +14,17 @@ void cutlass_scaled_mm_sm90_fp8(
     STD_TORCH_CHECK(bias->scalar_type() == out.scalar_type(),
                     "currently bias dtype must match output dtype ",
                     out.scalar_type());
+    if (vllm_is_batch_invariant()) {
+      return cutlass_scaled_mm_sm90_fp8_batch_invariant_epilogue<true>(
+          out, a, b, a_scales, b_scales, *bias);
+    }
     return cutlass_scaled_mm_sm90_fp8_epilogue<true>(out, a, b, a_scales,
                                                      b_scales, *bias);
   } else {
+    if (vllm_is_batch_invariant()) {
+      return cutlass_scaled_mm_sm90_fp8_batch_invariant_epilogue<false>(
+          out, a, b, a_scales, b_scales);
+    }
     return cutlass_scaled_mm_sm90_fp8_epilogue<false>(out, a, b, a_scales,
                                                       b_scales);
   }
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh
index 5ce66c96f740..f78b8daea510 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh
@@ -349,6 +349,35 @@ inline void cutlass_gemm_sm90_fp8_dispatch(
   }
 }
 
+// Batch-invariant SM90 FP8 GEMM dispatch.
+//
+// The kernel configuration is chosen from b.size(1) only, never from M, so
+// the selection does not change with the number of rows in `a`. Both
+// configurations receive the scale tensors in swapped order (b_scales first).
+template <typename InType, typename OutType, bool EnableBias,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm90_fp8_batch_invariant_dispatch(
+    torch::stable::Tensor& out, torch::stable::Tensor const& a,
+    torch::stable::Tensor const& b, torch::stable::Tensor const& a_scales,
+    torch::stable::Tensor const& b_scales, EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>::value);
+  STD_TORCH_CHECK(a.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+  STD_TORCH_CHECK(b.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+
+  using SmallNGemm =
+      typename sm90_fp8_config_M64_N1280<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
+  using LargeNGemm =
+      typename sm90_fp8_config_M64_N8192<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
+
+  // Only the second dimension of `b` steers the choice of configuration.
+  uint32_t const out_features = b.size(1);
+  if (out_features > 1280) {
+    cutlass_gemm_caller_sm90_fp8<LargeNGemm>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  } else {
+    cutlass_gemm_caller_sm90_fp8<SmallNGemm>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
 template <bool EnableBias, typename... EpilogueArgs>
 void cutlass_scaled_mm_sm90_fp8_epilogue(torch::stable::Tensor& out,
                                          torch::stable::Tensor const& a,
@@ -375,4 +404,28 @@ void cutlass_scaled_mm_sm90_fp8_epilogue(torch::stable::Tensor& out,
   }
 }
 
+// Entry point for the batch-invariant SM90 FP8 scaled matmul.
+//
+// Validates that both inputs are Float8_e4m3fn, then forwards to the
+// batch-invariant dispatcher instantiated for the output dtype (BFloat16 or
+// Half; any other output dtype fails the check below).
+template <bool EnableBias, typename... EpilogueArgs>
+void cutlass_scaled_mm_sm90_fp8_batch_invariant_epilogue(
+    torch::stable::Tensor& out, torch::stable::Tensor const& a,
+    torch::stable::Tensor const& b, torch::stable::Tensor const& a_scales,
+    torch::stable::Tensor const& b_scales, EpilogueArgs&&... epilogue_args) {
+  STD_TORCH_CHECK(a.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+  STD_TORCH_CHECK(b.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+
+  if (out.scalar_type() != torch::headeronly::ScalarType::BFloat16) {
+    // Half is the only other supported output type.
+    STD_TORCH_CHECK(out.scalar_type() == torch::headeronly::ScalarType::Half);
+    cutlass_gemm_sm90_fp8_batch_invariant_dispatch<
+        cutlass::float_e4m3_t, cutlass::half_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
+    return;
+  }
+
+  cutlass_gemm_sm90_fp8_batch_invariant_dispatch<
+      cutlass::float_e4m3_t, cutlass::bfloat16_t, EnableBias>(
+      out, a, b, a_scales, b_scales,
+      std::forward<EpilogueArgs>(epilogue_args)...);
+}
+
 }  // namespace vllm
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/scaled_mm_c2x.cu b/csrc/libtorch_stable/quantization/w8a8/cutlass/scaled_mm_c2x.cu
index 5011f442798d..184a26b491f8 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/scaled_mm_c2x.cu
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/scaled_mm_c2x.cu
@@ -9,6 +9,7 @@
 #include "scaled_mm_c2x_sm89_fp8_dispatch.cuh"
 #include "scaled_mm_c2x_sm89_int8_dispatch.cuh"
 
+#include "core/batch_invariant.hpp"
 #include "libtorch_stable/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp"
 
 using namespace vllm;
@@ -191,9 +192,17 @@ void cutlass_scaled_mm_sm89(torch::stable::Tensor& out,
     STD_TORCH_CHECK(bias->scalar_type() == out.scalar_type(),
                     "currently bias dtype must match output dtype ",
                     out.scalar_type());
+    if (vllm_is_batch_invariant()) {
+      return cutlass_scaled_mm_sm89_fp8_batch_invariant_epilogue<
+          c2x::ScaledEpilogueBias>(out, a, b, a_scales, b_scales, *bias);
+    }
     return cutlass_scaled_mm_sm89_epilogue<c2x::ScaledEpilogueBias>(
         out, a, b, a_scales, b_scales, *bias);
   } else {
+    if (vllm_is_batch_invariant()) {
+      return cutlass_scaled_mm_sm89_fp8_batch_invariant_epilogue<
+          c2x::ScaledEpilogue>(out, a, b, a_scales, b_scales);
+    }
     return cutlass_scaled_mm_sm89_epilogue<c2x::ScaledEpilogue>(
         out, a, b, a_scales, b_scales);
   }
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_fp8_dispatch.cuh b/csrc/libtorch_stable/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_fp8_dispatch.cuh
index f94d3db240b0..d3424d980f9b 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_fp8_dispatch.cuh
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_fp8_dispatch.cuh
@@ -381,4 +381,43 @@ inline void cutlass_gemm_sm89_fp8_dispatch(torch::stable::Tensor& out,
   }
 }
 
+// Batch-invariant SM89 FP8 GEMM dispatch.
+//
+// Always routes through sm89_fp8_config_M64, so the configuration used here
+// is not selected from M.
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm89_fp8_batch_invariant_dispatch(
+    torch::stable::Tensor& out, torch::stable::Tensor const& a,
+    torch::stable::Tensor const& b, EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>::value);
+  STD_TORCH_CHECK(a.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+  STD_TORCH_CHECK(b.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+
+  // Single fixed configuration used for every input shape.
+  using FixedConfig = sm89_fp8_config_M64;
+  FixedConfig::dispatch<InType, OutType, Epilogue>(
+      out, a, b, std::forward<EpilogueArgs>(args)...);
+}
+
+// Entry point for the batch-invariant SM89 FP8 scaled matmul.
+//
+// Validates that both inputs are Float8_e4m3fn, then forwards to the
+// batch-invariant dispatcher instantiated for the output dtype (BFloat16 or
+// Half; any other output dtype fails the check below).
+template <template <typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm89_fp8_batch_invariant_epilogue(
+    torch::stable::Tensor& out, torch::stable::Tensor const& a,
+    torch::stable::Tensor const& b, EpilogueArgs&&... epilogue_args) {
+  STD_TORCH_CHECK(a.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+  STD_TORCH_CHECK(b.scalar_type() ==
+                  torch::headeronly::ScalarType::Float8_e4m3fn);
+
+  if (out.scalar_type() != torch::headeronly::ScalarType::BFloat16) {
+    // Half is the only other supported output type.
+    STD_TORCH_CHECK(out.scalar_type() == torch::headeronly::ScalarType::Half);
+    cutlass_gemm_sm89_fp8_batch_invariant_dispatch<
+        cutlass::float_e4m3_t, cutlass::half_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+    return;
+  }
+
+  cutlass_gemm_sm89_fp8_batch_invariant_dispatch<
+      cutlass::float_e4m3_t, cutlass::bfloat16_t, Epilogue>(
+      out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+}
+
 }  // namespace vllm
diff --git a/csrc/quantization/gguf/moe.cuh b/csrc/quantization/gguf/moe.cuh
index df9b84abcc13..a2f9f46c8f89 100644
--- a/csrc/quantization/gguf/moe.cuh
+++ b/csrc/quantization/gguf/moe.cuh
@@ -1,7 +1,7 @@
 #include <cstdint>
 
 /* Adapted from ./csrc/quantization/gguf/mmq.cuh
-   based on ./vllm/model_executor/layers/fused_moe/fused_moe.py */
+   based on ./vllm/model_executor/layers/fused_moe/experts/triton_moe.py */
 template <typename scalar_t, int qk, int qr, int qi, bool need_sum,
           typename block_q_t, int mmq_x, int mmq_y, int nwarps,
           allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles,
diff --git a/csrc/spinloop.cpp b/csrc/spinloop.cpp
new file mode 100644
index 000000000000..c29e48a5f0ec
--- /dev/null
+++ b/csrc/spinloop.cpp
@@ -0,0 +1,204 @@
+#include <Python.h>
+
+extern "C" {
+
+#include <stdbool.h>
+#include <time.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+  #include <cpuid.h>
+  #include <mwaitxintrin.h>
+#endif
+
+#if defined(CLOCK_MONOTONIC_RAW)
+  #define TIMEOUT_CLOCK CLOCK_MONOTONIC_RAW
+#else
+  #define TIMEOUT_CLOCK CLOCK_MONOTONIC
+#endif
+
+#define CPU_SUPPORT_NONE 0
+#define CPU_SUPPORT_MONITORX 1
+
+#define MWAITX_DEFAULT_TIMEOUT_CYCLES 1000000
+
+// Per-module state; populated by determine_cpu_support() during module init.
+typedef struct spinloop_state {
+  // One of the CPU_SUPPORT_* constants.
+  unsigned int cpu_support;
+  // Largest monitor-line size in bytes as read from CPUID leaf 5; stays 0
+  // when MONITORX support was not detected.
+  unsigned int max_monitor_line_size;
+} spinloop_state_t;
+
+// Probes the CPU via CPUID and records the result in `state`.
+//
+// `state` is first reset to CPU_SUPPORT_NONE / line size 0. On x86 builds,
+// cpu_support becomes CPU_SUPPORT_MONITORX when the vendor string is
+// "AuthenticAMD" and extended leaf 0x80000001 reports the MONITORX feature
+// bit (ECX bit 29). In that case max_monitor_line_size is filled from CPUID
+// leaf 5. On other architectures the reset values are left untouched.
+static void determine_cpu_support(spinloop_state_t* state) {
+  state->cpu_support = CPU_SUPPORT_NONE;
+  state->max_monitor_line_size = 0;
+
+#if defined(__i386__) || defined(__x86_64__)
+  unsigned int eax, ebx, ecx, edx;
+  if (__get_cpuid(0, &eax, &ebx, &ecx, &edx) == 1) {
+    // AMD CPU (possible monitorx/mwaitx support): EBX/EDX/ECX spell
+    // "Auth" "enti" "cAMD".
+    if (ebx == 0x68747541 && edx == 0x69746e65 && ecx == 0x444d4163) {
+      // Make sure extended leaf 0x80000001 exists before querying it.
+      if (__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) == 1 &&
+          eax >= 0x80000001 &&
+          __get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx) == 1) {
+        if ((ecx & (1 << 29)) != 0) {
+          state->cpu_support = CPU_SUPPORT_MONITORX;
+        }
+      }
+    }
+  }
+
+  if (state->cpu_support == CPU_SUPPORT_MONITORX) {
+    if (__get_cpuid(5, &eax, &ebx, &ecx, &edx) == 1) {
+      // The largest monitor-line size occupies EBX[15:0]; masking with 0xff
+      // would truncate any value >= 256 (e.g. 256 would read back as 0).
+      state->max_monitor_line_size = ebx & 0xffff;
+    }
+  }
+#endif
+}
+
+// Calls `callback` with no arguments and classifies the outcome:
+//    1 -> the callback returned the True singleton
+//    0 -> the callback returned any other object
+//   -1 -> the call failed (a Python exception is pending)
+static int poll_callback(PyObject* callback) {
+  PyObject* res = PyObject_CallNoArgs(callback);
+  if (res == NULL) {
+    return -1;
+  }
+  const int is_true = (res == Py_True) ? 1 : 0;
+  Py_DECREF(res);
+  return is_true;
+}
+
+// spinloop(buffer, callback, timeout=0.0) -> bool
+//
+// Repeatedly invokes `callback` until it returns True or the timeout expires.
+//   buffer   - bytes-like object; when it is no longer than the largest
+//              monitor line and MONITORX is available, its address is armed
+//              with monitorx and the thread waits in mwaitx between polls.
+//   callback - callable taking no arguments.
+//   timeout  - seconds; values <= 1e-9 disable the timeout entirely.
+// Returns True when the callback returned True, False when the timeout
+// elapsed, and NULL with a Python exception set on any failure.
+static PyObject* method_spinloop(PyObject* self, PyObject* args,
+                                 PyObject* kwargs) {
+  spinloop_state_t* state = (spinloop_state_t*)PyModule_GetState(self);
+  if (state == NULL) {
+    PyErr_SetString(PyExc_TypeError, "Failed to retrieve module state!");
+    return NULL;
+  }
+
+  Py_buffer buffer;
+  PyObject* callback;
+  double timeout = 0.;
+
+  static const char* keywords[] = {"buffer", "callback", "timeout", NULL};
+  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*O|d", (char**)keywords,
+                                   &buffer, &callback, &timeout)) {
+    return NULL;
+  }
+
+  // From here on the buffer view is held and must be released on every exit.
+  if (!PyCallable_Check(callback)) {
+    PyErr_SetString(PyExc_TypeError, "callback parameter must be callable!");
+    PyBuffer_Release(&buffer);
+    return NULL;
+  }
+
+  struct timespec t_start;
+  if (clock_gettime(TIMEOUT_CLOCK, &t_start) != 0) {
+    PyErr_SetString(PyExc_RuntimeError, "clock_gettime() failed!");
+    PyBuffer_Release(&buffer);
+    return NULL;
+  }
+
+  // -1: error (exception pending), 0: timed out, 1: callback returned True
+  int outcome = 0;
+  const bool have_timeout = (timeout > 1e-9);
+#if defined(__i386__) || defined(__x86_64__)
+  // monitorx + mwaitx is only used with a qualified buffer on capable CPUs
+  const bool use_monitorx =
+      (buffer.len <= state->max_monitor_line_size) &&
+      state->cpu_support == CPU_SUPPORT_MONITORX;
+#endif
+
+  for (unsigned int iteration = 0;; ++iteration) {
+    int polled = poll_callback(callback);
+    if (polled != 0) {
+      outcome = polled;
+      break;
+    }
+
+    // Check timeout at most every 16 iterations to avoid clock_gettime and
+    // comparison cost
+    if (have_timeout && (iteration & 15u) == 0) {
+      struct timespec t_now;
+      if (clock_gettime(TIMEOUT_CLOCK, &t_now) != 0) {
+        PyErr_SetString(PyExc_RuntimeError, "clock_gettime() failed!");
+        outcome = -1;
+        break;
+      }
+
+      const double elapsed = (double)(t_now.tv_sec - t_start.tv_sec) +
+                             (t_now.tv_nsec - t_start.tv_nsec) * 1e-9;
+      if (elapsed >= timeout) {
+        // outcome stays 0: report the timeout as False
+        break;
+      }
+    }
+
+#if defined(__i386__) || defined(__x86_64__)
+    if (use_monitorx) {
+      _mm_monitorx(buffer.buf, 0, 0);
+
+      // Check once more in case the buffer has been modified while we were
+      // arming the monitor hardware
+      polled = poll_callback(callback);
+      if (polled != 0) {
+        outcome = polled;
+        break;
+      }
+
+      // Run mwaitx with enabled timeout (bit 1). The actual timeout value
+      // is not very important, we just want to ensure we don't lock up
+      // here for too long.
+      Py_BEGIN_ALLOW_THREADS
+      _mm_mwaitx((1 << 1), 0, MWAITX_DEFAULT_TIMEOUT_CYCLES);
+      Py_END_ALLOW_THREADS
+      continue;
+    }
+#endif
+
+    // Fallback: busy poll, giving other threads a chance to be scheduled
+    Py_BEGIN_ALLOW_THREADS
+#if defined(__i386__) || defined(__x86_64__)
+    __builtin_ia32_pause();
+#elif defined(__aarch64__)
+    __asm__ volatile("yield" ::: "memory");
+#endif
+    Py_END_ALLOW_THREADS
+  }
+
+  PyBuffer_Release(&buffer);
+
+  if (outcome < 0) {
+    return NULL;
+  }
+  if (outcome > 0) {
+    Py_RETURN_TRUE;
+  }
+  Py_RETURN_FALSE;
+}
+
+// Functions exported by the module; the all-NULL entry terminates the table.
+static PyMethodDef spinloop_methods[] = {
+    {
+        "spinloop",
+        (PyCFunction)method_spinloop,
+        METH_VARARGS | METH_KEYWORDS,
+        "Wait for store with callback",
+    },
+    {NULL, NULL, 0, NULL},
+};
+
+// Module definition; the per-module state is a spinloop_state_t.
+static struct PyModuleDef spinloop_module = {
+    PyModuleDef_HEAD_INIT,
+    "spinloop",                                 // module name
+    "Hardware-optimized spinloops for Python",  // module docstring
+    sizeof(spinloop_state_t),                   // size of per-module state
+    spinloop_methods,                           // method table
+};
+
+// Module initializer: creates the module and, when its state block is
+// available, fills it with the detected CPU capabilities.
+PyMODINIT_FUNC PyInit_spinloop(void) {
+  PyObject* module = PyModule_Create(&spinloop_module);
+  if (module == NULL) {
+    return NULL;
+  }
+
+  spinloop_state_t* state = (spinloop_state_t*)PyModule_GetState(module);
+  if (state != NULL) {
+    determine_cpu_support(state);
+  }
+  return module;
+}
+
+}  // extern "C"
diff --git a/csrc/topk.cu b/csrc/topk.cu
index c5bffb32856d..9ca9aa8824d8 100644
--- a/csrc/topk.cu
+++ b/csrc/topk.cu
@@ -20,7 +20,7 @@ void launch_persistent_topk(const torch::Tensor& logits,
   namespace P = vllm::persistent;
 
   const int64_t num_rows = logits.size(0);
-  const int64_t stride = logits.size(1);
+  const int64_t stride = logits.stride(0);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   static int num_sms = 0;
@@ -243,7 +243,7 @@ void persistent_topk(const torch::Tensor& logits, const torch::Tensor& lengths,
   TORCH_CHECK(output.dim() == 2, "output must be 2D");
 
   const int64_t num_rows = logits.size(0);
-  const int64_t stride = logits.size(1);
+  const int64_t stride = logits.stride(0);
 
   TORCH_CHECK(lengths.numel() == num_rows, "lengths size mismatch");
   TORCH_CHECK(output.size(0) == num_rows && output.size(1) == k,
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index e695497fd88f..7562d90c0b99 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -553,7 +553,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
   // Batch swap: submit all block copies in a single driver call.
   cache_ops.def(
       "swap_blocks_batch(Tensor src_ptrs, Tensor dst_ptrs,"
-      "                  Tensor sizes) -> ()");
+      "                  Tensor sizes,"
+      "                  bool is_src_access_order_any=False) -> ()");
   cache_ops.impl("swap_blocks_batch", torch::kCPU, &swap_blocks_batch);
 
   // Reshape the key and value tensors and cache them.
diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base
index a21916d0b531..cb6d09f5bb6c 100644
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@@ -9,7 +9,7 @@ ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
 ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
 ARG FA_BRANCH="0e60e394"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="v0.1.12.post2"
+ARG AITER_BRANCH="v0.1.13"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 ARG MORI_BRANCH="v1.1.0"
 ARG MORI_REPO="https://github.com/ROCm/mori.git"
diff --git a/docs/benchmarking/cli.md b/docs/benchmarking/cli.md
index c9ceb67cce67..4f0c933c4ac8 100644
--- a/docs/benchmarking/cli.md
+++ b/docs/benchmarking/cli.md
@@ -132,7 +132,7 @@ The generated timeline is an interactive visualization in the form of an HTML fi
 
 Example output:
 
-<iframe src="../../assets/contributing/vllm_bench_serve_timeline.html" width="100%" height="600" frameborder="0"></iframe>
+<iframe src="../assets/contributing/vllm_bench_serve_timeline.html" width="100%" height="600" frameborder="0"></iframe>
 
 ##### Dataset statistics
 
diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md
index ba1f5e43d61e..dceb78f52638 100644
--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -142,7 +142,7 @@ We use "mamba-like" to refer to layers that possess a state that is updated in-p
 For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
 It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
 Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
-It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/v1/attention/backends/registry.py) when adding a new mamba backend.
+It is also worth noting that we should update `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/v1/attention/backends/registry.py) when adding a new mamba backend.
 Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
 Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
 The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs work as intended.
diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md
index 4aefeb5585fe..d2557a2281cf 100644
--- a/docs/design/custom_op.md
+++ b/docs/design/custom_op.md
@@ -138,7 +138,7 @@ For example:
 
 --8<-- "vllm/model_executor/models/transformers/moe.py:transformers_fused_moe"
 
---8<-- "vllm/model_executor/layers/fused_moe/fused_moe.py:grouped_topk"
+--8<-- "vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py:grouped_topk"
 ```
 
 **9. Norm:**
diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 54b796fde3bf..2bbadd8ff813 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -80,14 +80,14 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
 
 | Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source |
 | ------ | ----------------- | ------------ | ------------- | ------------------- | --------------------- | ------- | ------ |
-| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
+| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.experts.triton_moe.TritonExperts] |
 | triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
 | deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
 | cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassExpertsFp4] |
 | cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassBatchedExpertsFp8] |
-| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
+| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.experts.flashinfer_cutlass_moe.FlashInferExperts] |
 | gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe.OAITritonExperts] |
-| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
+| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.experts.marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.experts.marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.experts.marlin_moe.BatchedMarlinExperts] |
 | trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmMxfp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsMonolithic],</br>[`TrtLlmMxfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsModular],</br>[`TrtLlmNvFp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsMonolithic],</br>[`TrtLlmNvfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsModular] |
 | rocm aiter moe | standard | mxfp4,</br>fp8 | G(32),G(128),A,T | silu, gelu,</br>swigluoai | Y | N | `rocm_aiter_fused_experts`,</br>`AiterExperts` |
 | cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md
index e7ee3a64008c..960108b6894f 100644
--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@@ -126,9 +126,15 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
     - Set when prefiller and decoder are on different machines
     - Connection info is passed via KVTransferParams from prefiller to decoder for handshake
 
-- `VLLM_NIXL_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
+- `kv_lease_duration` (via `kv_connector_extra_config`): Lease duration (in seconds) for the prefiller's KV cache blocks. (Optional)
+    - Default: 30
+    - When a prefill request finishes, its KV blocks are held for this duration waiting for the decoder to read them. While the request is queued on the decoder, periodic heartbeats automatically extend the lease. If neither a heartbeat nor a read notification arrives before the lease expires, the blocks are freed. The heartbeat interval and extension amount are derived automatically from this value.
+    - Example: `--kv-transfer-config '{"kv_connector_extra_config": {"kv_lease_duration": 60}}'`
+
+- `decoder_kv_blocks_ttl` (via `kv_connector_extra_config`): TTL (in seconds) for KV blocks cached on the decoder in bidirectional transfer mode. (Optional)
     - Default: 480
-    - If a request is aborted and the decoder has not yet read the KV-cache blocks through the nixl channel, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
+    - In bidirectional mode, the decoder caches KV blocks for multi-turn conversations. This TTL controls how long those blocks are held before being released. Unlike the prefiller lease, this TTL is not renewed via heartbeats.
+    - Example: `--kv-transfer-config '{"kv_connector_extra_config": {"decoder_kv_blocks_ttl": 600}}'`
 
 ## Multi-Instance Setup
 
diff --git a/docs/features/speculative_decoding/README.md b/docs/features/speculative_decoding/README.md
index bef71a4f5a37..ad01ee3d74bc 100644
--- a/docs/features/speculative_decoding/README.md
+++ b/docs/features/speculative_decoding/README.md
@@ -72,6 +72,17 @@ only apply to model-based methods such as `draft_model`, `mtp`, `eagle3`, and
 | `rejection_sample_method` | `string` | `strict` | `strict`, `probabilistic`, or `synthetic`. |
 | `synthetic_acceptance_rate` | `float` | `None` | Average acceptance rate to target when `rejection_sample_method` is `synthetic`. Valid range is `[0, 1]`. |
 
+!!! note
+    Gemma 4 assistant checkpoints are handled as Gemma 4 MTP speculators, not
+    as generic draft models. Use `"method": "mtp"` with the assistant
+    checkpoint in `model`, as shown in the [MTP guide](mtp.md#gemma-4-assistant-models).
+
+    If startup logs show `SpeculativeConfig(method='draft_model', ...)` for a
+    Gemma 4 assistant checkpoint, the installed vLLM version does not include
+    Gemma 4 MTP support for that path. Upgrade to a version that includes
+    Gemma 4 MTP support instead of forcing the assistant checkpoint through
+    generic draft-model speculative decoding.
+
 ### Method-specific keys
 
 #### N-gram
diff --git a/docs/features/speculative_decoding/mtp.md b/docs/features/speculative_decoding/mtp.md
index 7e1d1ec7038b..d60f8ff27ba2 100644
--- a/docs/features/speculative_decoding/mtp.md
+++ b/docs/features/speculative_decoding/mtp.md
@@ -9,6 +9,31 @@ MTP is useful when:
 - Your model natively supports MTP.
 - You want model-based speculative decoding with minimal extra configuration.
 
+## Gemma 4 Assistant Models
+
+Gemma 4 assistant checkpoints use vLLM's Gemma 4 MTP path. They are not generic
+draft models, even though they are passed through the `model` field in
+`--speculative-config`.
+
+Use `"method": "mtp"` when serving Gemma 4 with an assistant checkpoint:
+
+```bash
+vllm serve google/gemma-4-E2B-it \
+    --tensor-parallel-size 1 \
+    --max-model-len 8192 \
+    --speculative-config '{"method":"mtp","model":"gg-hf-am/gemma-4-E2B-it-assistant","num_speculative_tokens":1}'
+```
+
+The E2B, E4B, 26B-A4B, and 31B Gemma 4 IT assistant checkpoints are supported
+when their configuration uses `model_type: gemma4_assistant`. vLLM maps those
+checkpoints to `Gemma4MTPModel` internally and wires the assistant layers to
+share KV cache with the target model.
+
+If an older vLLM release logs `SpeculativeConfig(method='draft_model', ...)`
+for a Gemma 4 assistant checkpoint, that release is treating the assistant as a
+generic draft model and may fail during initialization for multimodal Gemma 4
+targets. Upgrade to a version with Gemma 4 MTP support instead.
+
 ## Offline Example
 
 ```python
diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md
index ac3309b23414..a0eb56302056 100644
--- a/docs/getting_started/installation/README.md
+++ b/docs/getting_started/installation/README.md
@@ -3,9 +3,10 @@
 vLLM supports the following hardware platforms:
 
 - [GPU](gpu.md)
-    - [NVIDIA CUDA](gpu.md#nvidia-cuda)
-    - [AMD ROCm](gpu.md#amd-rocm)
-    - [Intel XPU](gpu.md#intel-xpu)
+    - [NVIDIA CUDA](gpu.md)
+    - [AMD ROCm](gpu.md)
+    - [Intel XPU](gpu.md)
+    - [Apple Silicon](gpu.md) (via [vLLM-Metal](https://github.com/vllm-project/vllm-metal))
 - [CPU](cpu.md)
     - [Intel/AMD x86](cpu.md#intelamd-x86)
     - [ARM AArch64](cpu.md#arm-aarch64)
diff --git a/docs/getting_started/installation/gpu.apple.inc.md b/docs/getting_started/installation/gpu.apple.inc.md
new file mode 100644
index 000000000000..30e6244e05c3
--- /dev/null
+++ b/docs/getting_started/installation/gpu.apple.inc.md
@@ -0,0 +1,125 @@
+<!-- markdownlint-disable MD041 -->
+--8<-- [start:installation]
+
+For GPU-accelerated inference on Apple Silicon, use [vLLM-Metal](https://github.com/vllm-project/vllm-metal), a community-maintained hardware plugin that uses MLX as the compute backend and provides native GPU acceleration via Apple's Metal framework.
+
+vLLM-Metal works with MLX-optimized models from the [mlx-community](https://huggingface.co/mlx-community) organization on Hugging Face, which provides quantized versions of popular models optimized for Apple Silicon.
+
+!!! tip
+    For installation and usage instructions, see the [Set up using vLLM-Metal](#set-up-using-vllm-metal) section below.
+
+--8<-- [end:installation]
+--8<-- [start:requirements]
+
+- OS: macOS Sonoma or later
+- Hardware: Apple Silicon
+- Metal support enabled
+
+!!! note
+    See the [Set up using vLLM-Metal](#set-up-using-vllm-metal) section below for installation instructions.
+
+--8<-- [end:requirements]
+--8<-- [start:set-up-using-python]
+
+## Set up using vLLM-Metal
+
+vLLM-Metal is distributed as a separate package that provides native GPU acceleration on Apple Silicon.
+
+To install vLLM-Metal, follow the installation instructions in the [vLLM-Metal documentation](https://github.com/vllm-project/vllm-metal#installation).
+
+The installation will:
+
+1. Set up the appropriate Python environment
+2. Install MLX and required dependencies
+3. Install the vLLM-Metal package
+
+After installation, you can start using vLLM with Metal GPU acceleration.
+
+!!! tip
+    When using vLLM-Metal, use models from the [mlx-community](https://huggingface.co/mlx-community) organization on Hugging Face for best performance. These models are optimized for MLX and often include quantized versions (4-bit, 8-bit) that run efficiently on Apple Silicon.
+
+    Example model: `mlx-community/Qwen2.5-0.5B-Instruct-4bit`
+
+### Using vLLM-Metal
+
+After installation, vLLM-Metal provides an easy-to-use CLI for running an OpenAI-compatible API server:
+
+```bash
+# Activate the vLLM-Metal environment
+source ~/.venv-vllm-metal/bin/activate
+
+# Start the API server (pass an mlx-community model name; if omitted, the default model is used)
+vllm serve
+```
+
+Once the server is running, you have multiple options to interact with it:
+
+#### Option 1: Interactive chat
+
+Open a new terminal and start an interactive chat session:
+
+```bash
+source ~/.venv-vllm-metal/bin/activate
+vllm chat
+```
+
+#### Option 2: API requests with curl
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [{"role": "user", "content": "Hello!"}],
+    "max_tokens": 50
+  }'
+```
+
+#### Option 3: Python with OpenAI SDK
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="http://localhost:8000/v1",
+    api_key="dummy"  # No auth required for local server
+)
+
+response = client.chat.completions.create(
+    model="mlx-community/Qwen2.5-0.5B-Instruct-4bit",
+    messages=[{"role": "user", "content": "Hello!"}]
+)
+
+print(response.choices[0].message.content)
+```
+
+For more details on the `vllm` CLI commands, see the [OpenAI-compatible server documentation](../../serving/openai_compatible_server.md).
+
+--8<-- [end:set-up-using-python]
+--8<-- [start:pre-built-wheels]
+
+vLLM-Metal is distributed as a separate package rather than as a vLLM pre-built wheel. See the [Set up using vLLM-Metal](#set-up-using-vllm-metal) section above.
+
+--8<-- [end:pre-built-wheels]
+--8<-- [start:build-wheel-from-source]
+
+For build instructions from source, refer to the [vLLM-Metal documentation](https://github.com/vllm-project/vllm-metal#installation).
+
+--8<-- [end:build-wheel-from-source]
+--8<-- [start:pre-built-images]
+
+--8<-- [end:pre-built-images]
+--8<-- [start:build-image-from-source]
+
+--8<-- [end:build-image-from-source]
+--8<-- [start:supported-features]
+
+vLLM-Metal provides:
+
+- Native GPU acceleration using Metal
+- MLX-based compute backend optimized for Apple Silicon
+- OpenAI-compatible API server
+- Support for popular model architectures
+
+For specific feature support and limitations, refer to the [vLLM-Metal documentation](https://github.com/vllm-project/vllm-metal).
+
+--8<-- [end:supported-features]
diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md
index 475c67ce9d05..91d933dd4e86 100644
--- a/docs/getting_started/installation/gpu.md
+++ b/docs/getting_started/installation/gpu.md
@@ -18,6 +18,10 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
     --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:installation"
 
+=== "Apple Silicon"
+
+    --8<-- "docs/getting_started/installation/gpu.apple.inc.md:installation"
+
 ## Requirements
 
 - OS: Linux
@@ -38,6 +42,10 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
     --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:requirements"
 
+=== "Apple Silicon"
+
+    --8<-- "docs/getting_started/installation/gpu.apple.inc.md:requirements"
+
 ## Set up using Python
 
 ### Create a new Python environment
@@ -56,6 +64,10 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
     --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python"
 
+=== "Apple Silicon"
+
+    --8<-- "docs/getting_started/installation/gpu.apple.inc.md:set-up-using-python"
+
 ### Pre-built wheels {#pre-built-wheels}
 
 === "NVIDIA CUDA"
@@ -70,6 +82,10 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
     --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-wheels"
 
+=== "Apple Silicon"
+
+    --8<-- "docs/getting_started/installation/gpu.apple.inc.md:pre-built-wheels"
+
 ### Build wheel from source
 
 === "NVIDIA CUDA"
@@ -84,6 +100,10 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
     --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:build-wheel-from-source"
 
+=== "Apple Silicon"
+
+    --8<-- "docs/getting_started/installation/gpu.apple.inc.md:build-wheel-from-source"
+
 ## Set up using Docker
 
 ### Pre-built images
@@ -102,6 +122,10 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
     --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-images"
 
+=== "Apple Silicon"
+
+    --8<-- "docs/getting_started/installation/gpu.apple.inc.md:pre-built-images"
+
 --8<-- [end:pre-built-images]
 
 ### Build image from source
@@ -120,6 +144,10 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
     --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:build-image-from-source"
 
+=== "Apple Silicon"
+
+    --8<-- "docs/getting_started/installation/gpu.apple.inc.md:build-image-from-source"
+
 --8<-- [end:build-image-from-source]
 
 ## Supported features
@@ -135,3 +163,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 === "Intel XPU"
 
     --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:supported-features"
+
+=== "Apple Silicon"
+
+    --8<-- "docs/getting_started/installation/gpu.apple.inc.md:supported-features"
diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index 015514def33f..a748ba4a9300 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -10,6 +10,9 @@ This guide will help you quickly get started with vLLM to perform:
 - OS: Linux
 - Python: 3.10 -- 3.13
 
+!!! note
+    vLLM also works on macOS with [vLLM-Metal](https://github.com/vllm-project/vllm-metal) for Apple Silicon GPU acceleration. See the [GPU installation guide](installation/gpu.md) and select the "Apple Silicon" tab.
+
 ## Installation
 
 === "NVIDIA CUDA"
@@ -73,6 +76,18 @@ This guide will help you quickly get started with vLLM to perform:
     !!! note
         For more detailed instructions, including Docker, installing from source, and troubleshooting, please refer to the [vLLM on TPU documentation](https://docs.vllm.ai/projects/tpu/en/latest/).
 
+=== "Apple Silicon (Mac)"
+
+    If you are using an Apple Silicon Mac, you can use vLLM-Metal for GPU-accelerated inference via Apple's Metal framework.
+
+    Follow the installation instructions in the [vLLM-Metal documentation](https://github.com/vllm-project/vllm-metal#installation).
+
+    !!! note
+        vLLM-Metal uses MLX instead of PyTorch as the compute backend and requires MLX-optimized models from the [mlx-community](https://huggingface.co/mlx-community) organization on Hugging Face.
+
+    !!! tip
+        For more detailed instructions, please refer to the [GPU installation guide](installation/gpu.md) and select the "Apple Silicon" tab.
+
 !!! note
     For more detail and non-CUDA platforms, please refer to the [installation guide](installation/README.md) for specific instructions on how to install vLLM.
 
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index e79fec8169f2..454ab08debc9 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -378,14 +378,14 @@ th {
 | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
 | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `thu-coai/ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
 | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ |
-| `CohereMoeForCausalLM` | Command (MoE) | (model checkpoints loaded with `trust_remote_code=True`) | ✅︎ | ✅︎ |
+| `Cohere2MoeForCausalLM` | Command (MoE) | (model checkpoints loaded with `trust_remote_code=True`) | ✅︎ | ✅︎ |
 | `CwmForCausalLM` | CWM | `facebook/cwm`, etc. | ✅︎ | ✅︎ |
 | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ |
 | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ |
 | `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ |
 | `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ |
 | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ |
-| `DeepseekV4ForCausalLM` | DeepSeek-V4 | `deepseek-ai/DeepSeek-V4-Flash`, `deepseek-ai/DeepSeek-V4-Pro`, etc. | | |
+| `DeepseekV4ForCausalLM` | DeepSeek-V4 | `deepseek-ai/DeepSeek-V4-Flash`, `deepseek-ai/DeepSeek-V4-Pro`, etc. | | ✅︎ |
 | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ |
 | `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | ✅︎ | ✅︎ |
 | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ |
@@ -598,7 +598,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ |
 | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ |
 | `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ |
-| `Molmo2ForConditionalGeneration` | Molmo2 | T + I<sup>+</sup> / V | `allenai/Molmo2-4B`, `allenai/Molmo2-8B`, `allenai/Molmo2-O-7B` | ✅︎ | ✅︎ |
+| `Molmo2ForConditionalGeneration` | Molmo2 | T + I<sup>+</sup> / V | `allenai/Molmo2-4B`, `allenai/Molmo2-8B`, `allenai/Molmo2-O-7B`, `allenai/MolmoWeb-4B`<sup>^</sup>, `allenai/MolmoWeb-8B`<sup>^</sup> | ✅︎ | ✅︎ |
 | `Moondream3ForCausalLM` | Moondream3 | T + I | `moondream/moondream3-preview` | | ✅︎ |
 | `MusicFlamingoForConditionalGeneration` | MusicFlamingo | T + A | `nvidia/music-flamingo-2601-hf`, `nvidia/music-flamingo-think-2601-hf` | ✅︎ | ✅︎ |
 | `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
@@ -659,10 +659,18 @@ Some models are supported only via the [Transformers modeling backend](#transfor
     For `Gemma4ForConditionalGeneration`:
     - audio input is only supported by the `gemma-4-E2B` and `gemma-4-E4B` variants.
     - The model does not ingest videos directly. However, vLLM’s Gemma 4 implementation supports video inputs by handling video processing internally. Users can send videos directly in the message structure to vLLM, where they are converted into text and image frames before being passed to the model.
+    - Gemma 4 assistant checkpoints for speculative decoding use vLLM's Gemma
+      4 MTP path, not generic draft-model speculative decoding. See the
+      [Gemma 4 assistant model MTP example](../features/speculative_decoding/mtp.md#gemma-4-assistant-models).
 
 !!! note
     For `InternVLChatModel`, only InternVL2.5 with Qwen2.5 text backbone (`OpenGVLab/InternVL2.5-1B` etc.), InternVL3 and InternVL3.5 have video inputs support currently.
 
+!!! note
+    To use `allenai/MolmoWeb-4B` or `allenai/MolmoWeb-8B`, serve the checkpoint
+    with the Molmo2 architecture and disable multimodal-prefix attention:
+    `--hf-overrides '{"architectures": ["Molmo2ForConditionalGeneration"], "is_mm_prefix_lm": false}'`.
+
 !!! note
     `Moondream3ForCausalLM` uses task-specific prompt templates for `query`
     and `caption`. The native `detect` and `point` skills require custom
diff --git a/docs/usage/security.md b/docs/usage/security.md
index e548899abbf1..49c4d1f0f3c3 100644
--- a/docs/usage/security.md
+++ b/docs/usage/security.md
@@ -333,6 +333,43 @@ Most cache paths default to subdirectories under a single root. Changing `VLLM_C
 - **Do not copy cache contents from untrusted sources.** If you distribute cache artifacts between environments, ensure they originate from a trusted build pipeline.
 - **Container deployments:** If mounting cache directories into containers, ensure the volume source is trusted.
 
+## FIPS Compatibility
+
+FIPS compliance depends on many factors, so a vLLM deployment is not automatically FIPS compliant. Recent changes have improved vLLM's *tolerance* of FIPS-enabled hosts — that is, avoiding crashes when non-approved algorithms are blocked — but tolerance is not the same as compliance. Whether a deployment satisfies FIPS requirements depends on the host operating system, the OpenSSL provider backing Python's `hashlib` and `ssl` modules, and which optional dependencies are installed.
+
+### FIPS-relevant configuration
+
+Operators running vLLM on FIPS-enabled hosts should select FIPS-approved algorithms via the following knobs:
+
+- **Multimodal input hashing** — `VLLM_MM_HASHER_ALGORITHM` defaults to `blake3`, which is not FIPS-approved. Set it to `sha256` or `sha512` in FIPS-enabled environments.
+- **Prefix-cache hashing** — set `--prefix-caching-hash-algo` (config field `prefix_caching_hash_algo`) to `sha256` or `sha256_cbor`. The `xxhash` and `xxhash_cbor` options are not FIPS-approved.
+- **TLS ciphers** — use `--ssl-ciphers` to restrict the API server's TLS handshake to FIPS-approved cipher suites that match your environment's policy.
+
+### Automatic fallback for non-security MD5 use
+
+vLLM uses MD5 in a few places to derive non-security cache keys (for example, configuration hashes). These call sites pass `usedforsecurity=False` and additionally fall back to SHA-256 when the underlying OpenSSL provider refuses MD5 outright (see `safe_hash()` in `vllm/utils/hashing.py`). No user action is required; this behavior is documented so that auditors and security reviewers can identify the MD5 references and understand their purpose.
+
+### Dependencies that provide non-FIPS hash implementations
+
+Some dependencies expose hash implementations that are not FIPS-approved. vLLM only invokes them when the corresponding algorithm is selected, but operators with strict cryptographic controls may want to ensure the code paths are not exercised — and, where policy requires, that the packages themselves are absent:
+
+- `blake3` — currently listed in `requirements/common.txt`, so a standard install pulls it in. It is imported lazily and only used when `VLLM_MM_HASHER_ALGORITHM=blake3` (the default). Setting `VLLM_MM_HASHER_ALGORITHM` to `sha256` or `sha512` is sufficient to keep the non-FIPS code path dormant. If your policy additionally forbids the package being present, uninstall it after `pip install` (`pip uninstall blake3`); vLLM will continue to function as long as `VLLM_MM_HASHER_ALGORITHM` is set to a non-blake3 value.
+- `xxhash` — a true optional dependency (not in `requirements/common.txt`). It is only imported when an `xxhash`-based prefix-cache algorithm is selected. Leave it uninstalled and select a `sha256`-based prefix-cache algorithm.
+
+### Beyond hashing: other FIPS considerations
+
+Hashing is the area where vLLM has explicit FIPS-aware code, but a FIPS-compliant deployment depends on several factors that sit outside vLLM itself. Operators should evaluate the following with their platform and security teams:
+
+- **Host crypto provider.** Python's `hashlib` and `ssl` modules are FIPS-aware only when Python is linked against a FIPS-validated OpenSSL (or equivalent) provider supplied by the host OS. vLLM inherits whatever provider the host configures — it does not bundle one.
+- **API server TLS.** TLS termination for the OpenAI-compatible API server uses the host's OpenSSL via Python's `ssl` module. Restrict the cipher suite with `--ssl-ciphers` to match your environment's FIPS policy, and ensure server certificates are issued with FIPS-approved algorithms and key sizes.
+- **Outbound HTTPS.** Model and asset downloads (for example, via `huggingface_hub`) use the same host TLS stack. The same provider/cipher considerations apply.
+- **Inter-node communication is unencrypted by default.** As described in [Inter-Node Communication](#inter-node-communication), PyTorch Distributed, KV-cache transfer, and data-parallel channels do not encrypt traffic. FIPS environments that require FIPS-approved cryptography for data in transit must provide that protection externally — for example, via an mTLS sidecar or IPsec terminated by a FIPS-validated module — since vLLM's internal channels cannot satisfy the requirement on their own. Network isolation alone is not cryptography and does not meet a "FIPS-approved cryptography for data in transit" requirement, though it remains a useful defense-in-depth measure.
+- **Dependencies that bundle their own OpenSSL.** Some Python wheels statically link OpenSSL builds that fail the kernel FIPS self-test on FIPS-enabled hosts (`FATAL FIPS SELFTEST FAILURE`). `opencv-python-headless` is a known example; other manylinux wheels may behave similarly. Audit your installed wheels for bundled crypto libraries when troubleshooting FIPS startup failures.
+- **Accelerator and ML libraries.** PyTorch, CUDA, cuDNN, NCCL, and similar components have their own crypto and FIPS posture independent of vLLM. NVIDIA publishes FIPS-validated builds for some libraries; vLLM does not pin to those builds, so selecting and validating them is the operator's responsibility.
+- **What is *not* a FIPS concern in vLLM.** Random number generation used for token sampling (Python/NumPy/PyTorch RNGs) is not a cryptographic use and is out of scope for FIPS. Pickled cache artifacts are a separate security concern covered under [Cache Directory Security](#cache-directory-security).
+
+In short: the configuration knobs above let vLLM avoid non-approved algorithms, and the automatic fallbacks let it run without crashing on FIPS-enabled hosts. End-to-end FIPS compliance, however, is a property of the full deployment — host OS, crypto provider, transitive dependencies, and network architecture — not of vLLM alone.
+
 ## Reporting Security Vulnerabilities
 
 If you believe you have found a security vulnerability in vLLM, please report it following the project's security policy. For more information on how to report security issues and the project's security policy, please see the [vLLM Security Policy](https://github.com/vllm-project/vllm/blob/main/SECURITY.md).
diff --git a/examples/disaggregated/kv_load_failure_recovery_offline/load_recovery_example_connector.py b/examples/disaggregated/kv_load_failure_recovery_offline/load_recovery_example_connector.py
index 7aab07f8a2c3..5bae04426746 100644
--- a/examples/disaggregated/kv_load_failure_recovery_offline/load_recovery_example_connector.py
+++ b/examples/disaggregated/kv_load_failure_recovery_offline/load_recovery_example_connector.py
@@ -20,6 +20,7 @@
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.kv_cache_interface import KVCacheConfig
 
 logger = logging.getLogger()
 logging.basicConfig(level=logging.INFO)
@@ -35,8 +36,17 @@ def from_base(cls, base: ExampleConnectorMetadata):
 
 
 class LoadRecoveryExampleConnector(ExampleConnector):
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        super().__init__(vllm_config=vllm_config, role=role)
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: "KVCacheConfig",
+    ):
+        super().__init__(
+            vllm_config=vllm_config,
+            role=role,
+            kv_cache_config=kv_cache_config,
+        )
         self._async_load = vllm_config.kv_transfer_config.get_from_extra_config(
             "async_load", False
         )
diff --git a/examples/tool_chat_template_gemma4.jinja b/examples/tool_chat_template_gemma4.jinja
index f62ca843a405..d61dd795b586 100644
--- a/examples/tool_chat_template_gemma4.jinja
+++ b/examples/tool_chat_template_gemma4.jinja
@@ -263,7 +263,7 @@
             {%- if message.get('tool_responses') -%}
                 {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
                 {%- for tool_response in message['tool_responses'] -%}
-                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
                     {%- set ns_tr_out.flag = true -%}
                     {%- set ns.prev_message_type = 'tool_response' -%}
                 {%- endfor -%}
@@ -277,7 +277,7 @@
                     {%- else -%}
                         {%- set follow = loop_messages[k] -%}
                         {#- Resolve tool_call_id to function name -#}
-                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
                         {%- for tc in message['tool_calls'] -%}
                             {%- if tc.get('id') == follow.get('tool_call_id') -%}
                                 {%- set ns_tname.name = tc['function']['name'] -%}
diff --git a/pyproject.toml b/pyproject.toml
index 6a0ae2fea6eb..0641d5aca208 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -123,7 +123,8 @@ python = "./.venv"
 extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*", "tests/tokenizers_/*",
     "benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
     "examples/pooling/token_embed/*", "tests/models/language/pooling/*",
-    "vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*", "tests/entrypoints/openai/speech_to_text/test_transcription_validation.py",
+    "vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*",
+    "tests/entrypoints/speech_to_text/transcription/test_transcription_validation.py",
     "docs/governance/process.md", "docs/assets/contributing/vllm_bench_serve_timeline.html", 
     "tests/v1/engine/test_fast_incdec_prefix_err.py", ".git/*", "csrc/cpu/sgl-kernels/*"]
 ignore-hidden = false
diff --git a/setup.py b/setup.py
index 7c226a72425f..dc963c9e7891 100644
--- a/setup.py
+++ b/setup.py
@@ -686,6 +686,7 @@ def extract_precompiled_and_patch_package(
                     "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
                     "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                     "vllm/cumem_allocator.abi3.so",
+                    "vllm/spinloop.abi3.so",
                     # ROCm-specific libraries
                     "vllm/_rocm_C.abi3.so",
                 ]
@@ -993,6 +994,8 @@ def _read_requirements(filename: str) -> list[str]:
     # copying the relevant .py files from the source repository.
     ext_modules.append(CMakeExtension(name="vllm.triton_kernels", optional=True))
 
+ext_modules.append(CMakeExtension(name="vllm.spinloop"))
+
 if _is_hip():
     ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
 
diff --git a/tests/compile/correctness_e2e/test_async_tp.py b/tests/compile/correctness_e2e/test_async_tp.py
index 3539e4d5abb4..932e513258d0 100644
--- a/tests/compile/correctness_e2e/test_async_tp.py
+++ b/tests/compile/correctness_e2e/test_async_tp.py
@@ -13,6 +13,17 @@
 from vllm.config import (
     CompilationMode,
 )
+from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer
+
+NVFP4_MODEL_ID = "nvidia/Llama-3.1-8B-Instruct-NVFP4"
+NVFP4_HF_OVERRIDES = {
+    "num_hidden_layers": 4,
+    "hidden_size": 512,
+    "intermediate_size": 800,
+    "num_attention_heads": 4,
+    "num_key_value_heads": 1,
+}
 
 
 @create_new_process_for_each_test()
@@ -82,3 +93,65 @@ def test_async_tp_pass_correctness(
     ]
 
     compare_two_settings(model_id, async_tp_args, tp_args, method="generate")
+
+
+@create_new_process_for_each_test()
+def test_async_tp_pass_nvfp4_correctness(num_gpus_available: int, monkeypatch):
+    if (
+        not current_platform.is_cuda()
+        or not current_platform.is_device_capability_family(100)
+    ):
+        pytest.skip("NVFP4 requires Blackwell")
+    if not has_flashinfer():
+        pytest.skip("FlashInfer is required for the NVFP4 AsyncTP path")
+
+    monkeypatch.setenv("VLLM_NVFP4_GEMM_BACKEND", "flashinfer-cutlass")
+
+    tp_size = 2
+    if num_gpus_available < tp_size:
+        pytest.skip(f"Need at least {tp_size} GPUs")
+
+    common_args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "8",
+        "--load-format",
+        "dummy",
+        "--hf-overrides",
+        json.dumps(NVFP4_HF_OVERRIDES),
+    ]
+
+    compilation_config = {
+        "mode": CompilationMode.VLLM_COMPILE,
+        "compile_sizes": [2, 4, 8],
+        "splitting_ops": [],
+        "pass_config": {
+            "enable_sp": True,
+            "fuse_gemm_comms": True,
+            "fuse_allreduce_rms": False,
+            "sp_min_token_num": 1,
+        },
+    }
+
+    async_tp_args = [
+        *common_args,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--distributed-executor-backend",
+        "mp",
+        "--compilation_config",
+        json.dumps(compilation_config),
+    ]
+
+    tp_args = [
+        *common_args,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--distributed-executor-backend",
+        "mp",
+    ]
+
+    compare_two_settings(NVFP4_MODEL_ID, async_tp_args, tp_args, method="generate")
diff --git a/tests/compile/correctness_e2e/test_sequence_parallel.py b/tests/compile/correctness_e2e/test_sequence_parallel.py
index 4b7cb814e74a..295277ddacf7 100644
--- a/tests/compile/correctness_e2e/test_sequence_parallel.py
+++ b/tests/compile/correctness_e2e/test_sequence_parallel.py
@@ -21,12 +21,14 @@
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
-from ...models.registry import HF_EXAMPLE_MODELS
+from ...models.registry import HF_EXAMPLE_MODELS, _HfExamplesInfo
 from ...utils import compare_two_settings, create_new_process_for_each_test
 
 logger = init_logger("test_sequence_parallel")
 
 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
+NVFP4_MODEL_ID = "nvidia/Llama-3.1-8B-Instruct-NVFP4"
+NVFP4_MODEL_INFO = _HfExamplesInfo(NVFP4_MODEL_ID)
 
 
 class ParallelSetup(NamedTuple):
@@ -41,6 +43,7 @@ class ParallelSetup(NamedTuple):
 class SPTestOptions(NamedTuple):
     multi_node_only: bool
     load_format: str | None = None
+    model_info: _HfExamplesInfo | None = None
 
 
 @dataclass
@@ -167,9 +170,11 @@ def _compare_sp(
     num_gpus_available: int,
     use_inductor_graph_partition: bool,
     fuse_gemm_comms: bool,
+    enable_prompt_embeds: bool,
     *,
     method: Literal["generate", "encode"],
     is_multimodal: bool,
+    dtype: str = "float16",
 ):
     (
         tp_size,
@@ -180,14 +185,15 @@ def _compare_sp(
         chunked_prefill,
     ) = parallel_setup
 
-    multi_node_only, load_format = test_options
+    multi_node_only = test_options.multi_node_only
+    load_format = test_options.load_format
 
-    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    model_info = test_options.model_info or HF_EXAMPLE_MODELS.find_hf_info(model_id)
     model_info.check_transformers_version(on_fail="skip")
 
     trust_remote_code = model_info.trust_remote_code
     tokenizer_mode = model_info.tokenizer_mode
-    hf_overrides = model_info.hf_overrides
+    hf_overrides = dict(model_info.hf_overrides)
     require_embed_inputs = model_info.require_embed_inputs
 
     if load_format == "dummy":
@@ -220,7 +226,7 @@ def _compare_sp(
     common_args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
-        "float16",
+        dtype,
         "--max-model-len",
         "2048",
         "--max-num-seqs",
@@ -248,6 +254,8 @@ def _compare_sp(
                 "--enable-mm-embeds",
             ]
         )
+    elif enable_prompt_embeds:
+        common_args.append("--enable-prompt-embeds")
 
     compilation_config = {
         "mode": CompilationMode.VLLM_COMPILE,
@@ -257,7 +265,9 @@ def _compare_sp(
             "fuse_gemm_comms": fuse_gemm_comms,
             "fuse_norm_quant": fuse_norm_quant,
             "fuse_act_quant": fuse_act_quant,
+            "fuse_allreduce_rms": False,
             "eliminate_noops": True,
+            "sp_min_token_num": 0,
         },
         "use_inductor_graph_partition": use_inductor_graph_partition,
     }
@@ -349,6 +359,84 @@ def test_tp_sp_generation(
         num_gpus_available,
         use_inductor_graph_partition,
         fuse_gemm_comms=fuse_gemm_comms,
+        enable_prompt_embeds=False,
         method="generate",
         is_multimodal=False,
     )
+
+
+# Focused regression test for the SP + prompt_embeds graph-rewrite path.
+# Covers pp_size=1 (SP only) and pp_size=2 (SP + PP); kept small on purpose so
+# we don't double the matrix of `test_tp_sp_generation` above.
+SP_PROMPT_EMBEDS_PARALLEL_SETUPS = [
+    ParallelSetup(
+        tp_size=2,
+        pp_size=pp_size,
+        fuse_norm_quant=False,
+        fuse_act_quant=False,
+        eager_mode=False,
+        chunked_prefill=False,
+    )
+    for pp_size in [1, 2]
+]
+
+
+@pytest.mark.parametrize("parallel_setup", SP_PROMPT_EMBEDS_PARALLEL_SETUPS)
+@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
+@create_new_process_for_each_test()
+def test_tp_sp_generation_prompt_embeds(
+    parallel_setup: ParallelSetup,
+    num_gpus_available,
+    use_inductor_graph_partition: bool,
+):
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
+
+    _compare_sp(
+        "hmellor/tiny-random-LlamaForCausalLM",
+        parallel_setup,
+        distributed_backend="mp",
+        runner="auto",
+        test_options=SPTestOptions(multi_node_only=False, load_format=None),
+        num_gpus_available=num_gpus_available,
+        use_inductor_graph_partition=use_inductor_graph_partition,
+        fuse_gemm_comms=False,
+        enable_prompt_embeds=True,
+        method="generate",
+        is_multimodal=False,
+    )
+
+
+@create_new_process_for_each_test()
+def test_tp_sp_nvfp4_generation(num_gpus_available: int):
+    if (
+        not current_platform.is_cuda()
+        or not current_platform.is_device_capability_family(100)
+    ):
+        pytest.skip("NVFP4 requires Blackwell")
+
+    _compare_sp(
+        NVFP4_MODEL_ID,
+        ParallelSetup(
+            tp_size=2,
+            pp_size=1,
+            fuse_norm_quant=True,
+            fuse_act_quant=True,
+            eager_mode=True,
+            chunked_prefill=False,
+        ),
+        "mp",
+        "auto",
+        SPTestOptions(
+            multi_node_only=False,
+            load_format="dummy",
+            model_info=NVFP4_MODEL_INFO,
+        ),
+        num_gpus_available,
+        use_inductor_graph_partition=False,
+        fuse_gemm_comms=False,
+        enable_prompt_embeds=False,
+        method="generate",
+        is_multimodal=False,
+        dtype="bfloat16",
+    )
diff --git a/tests/compile/fullgraph/test_toy_llama.py b/tests/compile/fullgraph/test_toy_llama.py
index 915fbc6ce7f3..69c758702e8a 100644
--- a/tests/compile/fullgraph/test_toy_llama.py
+++ b/tests/compile/fullgraph/test_toy_llama.py
@@ -17,7 +17,6 @@
 import torch
 from torch import nn
 
-from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CompilationConfig,
@@ -340,6 +339,8 @@ def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
 def test_toy_llama(
     backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
 ):
+    from vllm.compilation.counter import compilation_counter
+
     # We disable the vLLM compile cache into a new tmp dir for 1 reason:
     # 1. To make sure we can properly track the number of Inductor compilations.
     monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py
index baa7bdef0a7d..a22c68f4bf92 100644
--- a/tests/compile/fusions_e2e/test_tp2_async_tp.py
+++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py
@@ -13,11 +13,13 @@
     AttentionBackendCase,
     Matches,
     custom_ops_combos,
+    is_blackwell,
 )
 from .models import (
     FLASHINFER_ATTN,
     TRITON_ATTN,
     llama3_8b,
+    llama3_8b_fp4,
     llama3_8b_fp8,
     llama4_scout_fp8,
     qwen3_a3b,
@@ -90,6 +92,69 @@ def test_tp2_async_tp_fp8_fusions(
     )
 
 
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b_fp4],
+)
+@pytest.mark.parametrize("attn_backend", [FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+@pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4")
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")
+def test_tp2_async_tp_nvfp4_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+):
+    # NVFP4 currently wires the all-gather + GEMM path only.
+    matches = matches_fn(n_layers)._replace(async_tp=n_layers * 2)
+
+    # Reduce size of model and skip weight loading time
+    model_kwargs["hf_overrides"] = hf_overrides(n_layers)
+    model_kwargs["load_format"] = "dummy"
+    model_kwargs["max_model_len"] = 1024
+    model_kwargs["kernel_config"] = {"enable_flashinfer_autotune": False}
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            enable_sp=True,
+            fuse_gemm_comms=True,
+            fuse_allreduce_rms=False,
+            # Override threshold for testing (models have small hidden_size)
+            sp_min_token_num=512,
+        ),
+    )
+
+    matches_check = [
+        "act_quant_fusion",
+        "attn_quant_fusion",
+        "sequence_parallel",
+        "async_tp",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,
+    )
+
+
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
     "model_name, matches_fn, model_kwargs, hf_overrides",
diff --git a/tests/compile/passes/test_double_aiter_rms_quant_fusion.py b/tests/compile/passes/test_double_aiter_rms_quant_fusion.py
new file mode 100644
index 000000000000..161c956548a7
--- /dev/null
+++ b/tests/compile/passes/test_double_aiter_rms_quant_fusion.py
@@ -0,0 +1,166 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for the DoubleQuant fan-out variants registered by
+``RocmAiterRMSNormQuantFusionPass``.
+
+Both variants target a 1-to-2 fan-out where one ``rms_norm`` output feeds
+two distinct ``rocm_aiter_group_fp8_quant`` consumers and rewrite it into
+two independent fused ``rms_norm + group_fp8_quant`` ops:
+
+* ``DoubleAiterRMSFp8GroupQuantPattern`` matches the un-viewed shape
+  (e.g. Kimi-K2.5 / DSR1).
+* ``DoubleAiterRMSFp8GroupQuantViewPattern`` is the view-tolerant
+  sibling that additionally matches the
+  ``rms_norm -> view -> group_fp8_quant`` shape that DSv3.2's MLA indexer
+  q_c norm exposes through ``Fp8BlockScaledMMLinearKernel.apply_weights``'s
+  2D-flatten boilerplate.
+"""
+
+import pytest
+import torch
+
+import vllm.config
+from tests.compile.backend import TestBackend
+from vllm._aiter_ops import is_aiter_found_and_supported, rocm_aiter_ops
+from vllm.compilation.passes.utility.noop_elimination import NoOpEliminationPass
+from vllm.compilation.passes.utility.post_cleanup import PostCleanupPass
+from vllm.config import (
+    CompilationConfig,
+    CompilationMode,
+    ModelConfig,
+    PassConfig,
+    VllmConfig,
+)
+
+EPS = 1e-5
+HIDDEN_SIZE = 256
+GROUP_SIZE = 128
+
+
+class _NoViewDoubleQuantModel(torch.nn.Module):
+    """``rms_norm -> 2x group_fp8_quant`` fan-out (Kimi-K2.5 / DSR1 shape)."""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(HIDDEN_SIZE, dtype=torch.bfloat16))
+
+    def forward(
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        # avoid graph input being a direct arg to a matched pattern node
+        x = torch.relu(x)
+        rms = torch.ops.vllm_ir.rms_norm(x, self.weight, EPS)
+        q1, s1 = torch.ops.vllm.rocm_aiter_group_fp8_quant.default(rms, GROUP_SIZE)
+        q2, s2 = torch.ops.vllm.rocm_aiter_group_fp8_quant.default(rms, GROUP_SIZE)
+        return q1, s1, q2, s2
+
+
+class _ViewDoubleQuantModel(torch.nn.Module):
+    """``rms_norm -> view -> 2x group_fp8_quant`` fan-out (DSv3.2 shape).
+
+    Reproduces the FX-graph shape produced by ``Fp8BlockScaledMMLinearKernel``'s
+    2D-flatten before the FP8 group quant op.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(HIDDEN_SIZE, dtype=torch.bfloat16))
+
+    def forward(
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        x = torch.relu(x)
+        rms = torch.ops.vllm_ir.rms_norm(x, self.weight, EPS)
+        view = rms.view(-1, rms.shape[-1])
+        q1, s1 = torch.ops.vllm.rocm_aiter_group_fp8_quant.default(view, GROUP_SIZE)
+        q2, s2 = torch.ops.vllm.rocm_aiter_group_fp8_quant.default(view, GROUP_SIZE)
+        return q1, s1, q2, s2
+
+
+@pytest.mark.parametrize(
+    "model_cls",
+    [_NoViewDoubleQuantModel, _ViewDoubleQuantModel],
+    ids=["no_view", "with_view"],
+)
+@pytest.mark.skipif(
+    not is_aiter_found_and_supported(),
+    reason="Only test on ROCm with AITER installed and supported",
+)
+def test_double_aiter_rms_fp8_group_quant_fusion(
+    model_cls: type[torch.nn.Module],
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """
+    Both fan-out shapes (with and without an intermediate view) must fuse
+    into ``rocm_aiter_rmsnorm_fp8_group_quant``: the no-view shape via
+    ``DoubleAiterRMSFp8GroupQuantPattern`` and the viewed shape via the
+    ``DoubleAiterRMSFp8GroupQuantViewPattern`` sibling.
+
+    A failure on the ``with_view`` parametrization is a regression on the
+    DSv3.2 q_c norm path that the view-tolerant pattern is intended
+    to cover.
+    """
+    torch._dynamo.reset()
+
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(dtype=torch.bfloat16),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            custom_ops=["+rms_norm", "+quant_fp8"],
+            pass_config=PassConfig(
+                fuse_norm_quant=True,
+                eliminate_noops=True,
+            ),
+        ),
+    )
+
+    with vllm.config.set_current_vllm_config(vllm_config), monkeypatch.context() as m:
+        from vllm.compilation.passes.fusion.rocm_aiter_fusion import (
+            RocmAiterRMSNormQuantFusionPass,
+        )
+
+        torch.set_default_device("cuda")
+        torch.set_default_dtype(torch.bfloat16)
+        torch.manual_seed(0)
+
+        m.setenv("VLLM_ROCM_USE_AITER", "1")
+        rocm_aiter_ops.refresh_env_variables()
+
+        fusion_pass = RocmAiterRMSNormQuantFusionPass(vllm_config)
+        passes = [
+            NoOpEliminationPass(vllm_config),
+            fusion_pass,
+            PostCleanupPass(vllm_config),
+        ]
+        backend = TestBackend(*passes)
+        model = model_cls()
+
+        x = torch.randn(8, HIDDEN_SIZE)
+        torch._dynamo.mark_dynamic(x, 0)
+
+        outputs_unfused = model(x)
+        model_fused = torch.compile(model, backend=backend)
+        outputs_fused = model_fused(x)
+
+        # Both consumers must be rewritten into the fused op (one
+        # ``register_replacement`` rewrite covers the whole 1-to-2 fan-out).
+        assert fusion_pass.matched_count == 1, (
+            f"Expected the {model_cls.__name__} fan-out to fuse via the "
+            f"DoubleQuant pattern (matched_count == 1), got "
+            f"{fusion_pass.matched_count}"
+        )
+
+        fused_op = rocm_aiter_ops.get_rmsnorm_group_fused_quant_op()
+        backend.check_after_ops([fused_op])
+
+        # Numerical parity sanity-check: each fused output (quantized values
+        # and scales alike) must match its unfused counterpart. Exact equality
+        # is the goal; the tolerance only absorbs residual numeric noise.
+        for fused_t, unfused_t in zip(outputs_fused, outputs_unfused):
+            torch.testing.assert_close(
+                fused_t.to(torch.float32),
+                unfused_t.to(torch.float32),
+                atol=1e-2,
+                rtol=1e-2,
+            )
diff --git a/tests/compile/passes/test_mla_rope_kvcache_cat_fusion.py b/tests/compile/passes/test_mla_rope_kvcache_cat_fusion.py
new file mode 100644
index 000000000000..cc3bfb7693ba
--- /dev/null
+++ b/tests/compile/passes/test_mla_rope_kvcache_cat_fusion.py
@@ -0,0 +1,413 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+import vllm.config
+from tests.compile.backend import TestBackend
+from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
+from vllm._aiter_ops import is_aiter_found_and_supported, rocm_aiter_ops
+from vllm.compilation.passes.fusion.mla_rope_kvcache_cat_fusion import (
+    MLARoPEKVCacheCatFusionPass,
+)
+from vllm.compilation.passes.utility.fix_functionalization import (
+    FixFunctionalizationPass,
+)
+from vllm.compilation.passes.utility.noop_elimination import NoOpEliminationPass
+from vllm.compilation.passes.utility.post_cleanup import PostCleanupPass
+from vllm.config import (
+    CacheConfig,
+    CompilationConfig,
+    CompilationMode,
+    ModelConfig,
+    PassConfig,
+    VllmConfig,
+)
+from vllm.forward_context import get_forward_context, set_forward_context
+from vllm.model_executor.layers.attention import MLAAttention
+from vllm.model_executor.layers.linear import ColumnParallelLinear
+from vllm.model_executor.layers.rotary_embedding import (
+    DeepseekScalingRotaryEmbedding,
+    RotaryEmbedding,
+)
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import _encode_layer_name
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    CommonAttentionMetadata,
+)
+from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+
+INDEX_SELECT_OP = torch.ops.aten.index.Tensor
+VLLM_UNIFIED_MLA_KV_CACHE_UPDATE_OP = torch.ops.vllm.unified_mla_kv_cache_update
+FP8_DTYPE = current_platform.fp8_dtype()
+
+
+class MLARoPEKVCacheCatTestModel(torch.nn.Module):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        attn_backend: AttentionBackendEnum,
+        use_deepseek_scaling_rope: bool,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
+        is_neox: bool,
+        dtype: torch.dtype,
+        device: torch.device,
+        prefix: str = "model.layers.0.self_attn.attn",
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.dtype = dtype
+        self.device = device
+        self.layer_name = prefix
+
+        self.num_kv_heads = 1
+        self.head_size = kv_lora_rank + qk_rope_head_dim
+        self.block_size = vllm_config.cache_config.block_size
+        self.scale = self.qk_head_dim**-0.5
+
+        if use_deepseek_scaling_rope:
+            self.rotary_emb = DeepseekScalingRotaryEmbedding(
+                head_size=qk_rope_head_dim,
+                rotary_dim=qk_rope_head_dim,
+                max_position_embeddings=4096,
+                base=10000,
+                is_neox_style=is_neox,
+                scaling_factor=1.0,
+                dtype=dtype,
+            )
+        else:
+            self.rotary_emb = RotaryEmbedding(
+                head_size=qk_rope_head_dim,
+                rotary_dim=qk_rope_head_dim,
+                max_position_embeddings=4096,
+                base=10000,
+                is_neox_style=is_neox,
+                dtype=dtype,
+            )
+
+        # Initialize intermediate mm layers for unit test
+        self.q_b_proj = ColumnParallelLinear(
+            self.q_lora_rank,
+            self.num_heads * self.qk_head_dim,
+            bias=False,
+            prefix=f"{prefix}.q_b_proj",
+        ).to(device)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            prefix=f"{prefix}.kv_b_proj",
+        ).to(device)
+
+        # ColumnParallelLinear default init in bf16 with seed 0 produces
+        # near-zero weights (7/4.7M nonzero), making the GEMM output almost
+        # entirely zero and masking correctness bugs. Reinitialize to get
+        # dense outputs.
+        with torch.no_grad():
+            torch.nn.init.normal_(self.q_b_proj.weight, std=0.02)
+            torch.nn.init.normal_(self.kv_b_proj.weight, std=0.02)
+
+        # Register layer metadata for the fusion pass via MLAAttention
+        self.mla_attn = MLAAttention(
+            num_heads=self.num_heads,
+            scale=self.scale,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            v_head_dim=self.v_head_dim,
+            q_lora_rank=self.q_lora_rank,
+            kv_lora_rank=self.kv_lora_rank,
+            kv_b_proj=self.kv_b_proj,
+            cache_config=vllm_config.cache_config,
+            quant_config=vllm_config.quant_config,
+            prefix=prefix,
+            attn_backend=attn_backend.get_class(),
+        )
+        self.attn_backend: type[AttentionBackend] = self.mla_attn.get_attn_backend()
+        self.mla_attn._k_scale = self.mla_attn._k_scale.to(device)
+        self.mla_attn._v_scale = self.mla_attn._v_scale.to(device)
+
+        # Keep both the string dtype (for ops) and torch dtype (for tensors)
+        self.kv_cache_dtype_str = vllm_config.cache_config.cache_dtype
+        self.kv_cache_dtype = (
+            FP8_DTYPE if self.kv_cache_dtype_str.startswith("fp8") else self.dtype
+        )
+
+        # Initialize attn MetadataBuilder
+        self.builder = self.attn_backend.get_builder_cls()(
+            kv_cache_spec=self.mla_attn.get_kv_cache_spec(vllm_config),
+            layer_names=[self.mla_attn.layer_name],
+            vllm_config=vllm_config,
+            device=device,
+        )
+
+    def build_attn_metadata(self, batch_size: int) -> CommonAttentionMetadata:
+        """Initialize attention metadata."""
+        # Create common attn metadata
+        batch_spec = BatchSpec(seq_lens=[1] * batch_size, query_lens=[1] * batch_size)
+        common_attn_metadata = create_common_attn_metadata(
+            batch_spec, self.block_size, self.device, arange_block_indices=True
+        )
+
+        max_blocks = (max(batch_spec.seq_lens) + self.block_size - 1) // self.block_size
+        num_blocks = batch_size * max_blocks
+
+        # Fetch the attention backend and kv cache shape and stride order
+        kv_cache_shape = self.attn_backend.get_kv_cache_shape(
+            num_blocks, self.block_size, self.num_kv_heads, self.head_size
+        )
+        try:
+            kv_cache_stride_order = self.attn_backend.get_kv_cache_stride_order()
+        except (AttributeError, NotImplementedError):
+            kv_cache_stride_order = tuple(range(len(kv_cache_shape)))
+
+        kv_cache_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order)
+        inv_order = [
+            kv_cache_stride_order.index(i) for i in range(len(kv_cache_stride_order))
+        ]
+
+        raw_tensor = torch.zeros(
+            num_blocks * self.block_size * self.num_kv_heads * self.head_size,
+            dtype=self.kv_cache_dtype,
+            device=self.device,
+        )
+        raw_tensor = raw_tensor.view(kv_cache_shape)
+        kv_cache = raw_tensor.permute(*inv_order)
+
+        self.mla_attn.kv_cache = kv_cache
+
+        # Build attn metadata
+        attn_metadata = self.builder.build(
+            common_prefix_len=0, common_attn_metadata=common_attn_metadata
+        )
+
+        return attn_metadata
+
+    def forward(
+        self, qkv_lora: torch.Tensor, positions: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        qkv_lora = qkv_lora.clone()
+        q_c, kv_lora = qkv_lora.split(
+            [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
+            dim=-1,
+        )
+        q = self.q_b_proj(q_c)[0]
+        kv_c, k_pe = kv_lora.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+
+        q = q.view(-1, self.num_heads, self.qk_head_dim)
+        k_pe = k_pe.unsqueeze(1)
+
+        q[..., self.qk_nope_head_dim :], k_pe = self.rotary_emb(
+            positions, q[..., self.qk_nope_head_dim :], k_pe
+        )
+
+        dummy = torch.ops.vllm.unified_mla_kv_cache_update(
+            kv_c,
+            k_pe,
+            _encode_layer_name(self.layer_name),
+            self.kv_cache_dtype_str,
+            self.mla_attn._k_scale,
+        )
+        return q, kv_c, k_pe, dummy
+
+    def ops_in_model_before(self) -> list[torch._ops.OpOverload]:
+        ops = [
+            INDEX_SELECT_OP,
+            torch.ops.vllm.unified_mla_kv_cache_update.default,
+        ]
+        return ops
+
+    def ops_in_model_after(self) -> list[torch._ops.OpOverload]:
+        return [torch.ops.vllm.fused_rope_unified_mla_kv_cache_update.default]
+
+
+MLA_BACKENDS = [AttentionBackendEnum.TRITON_MLA]  # always parametrized
+if flash_attn_supports_mla():
+    MLA_BACKENDS.append(AttentionBackendEnum.FLASH_ATTN_MLA)
+if is_aiter_found_and_supported():
+    MLA_BACKENDS.append(AttentionBackendEnum.ROCM_AITER_MLA)
+
+
+@pytest.mark.parametrize("attn_backend", MLA_BACKENDS)
+@pytest.mark.parametrize("use_deepseek_scaling_rope", [True])
+@pytest.mark.parametrize("num_heads", [16])
+@pytest.mark.parametrize("qk_nope_head_dim", [128])
+@pytest.mark.parametrize("qk_rope_head_dim", [64])
+@pytest.mark.parametrize("v_head_dim", [128])
+@pytest.mark.parametrize("q_lora_rank", [1536])
+@pytest.mark.parametrize("kv_lora_rank", [512])
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("is_neox", [True, False])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(),
+    reason="MLA RoPE+KVCache+Cat fusion is only supported on CUDA and ROCm.",
+)
+def test_mla_rope_kvcache_cat_fusion(
+    attn_backend: AttentionBackendEnum,
+    use_deepseek_scaling_rope: bool,
+    num_heads: int,
+    qk_nope_head_dim: int,
+    qk_rope_head_dim: int,
+    v_head_dim: int,
+    q_lora_rank: int,
+    kv_lora_rank: int,
+    block_size: int,
+    is_neox: bool,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    torch.set_default_device("cuda")
+    torch.set_default_dtype(dtype)
+    torch.manual_seed(0)
+    # Config enabling the MLA RoPE+KV-cache fusion pass (fuse_rope_kvcache_cat_mla).
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(
+            model="deepseek-ai/DeepSeek-V2-Lite",
+            dtype=dtype,
+        ),
+        cache_config=CacheConfig(
+            block_size=block_size,
+            cache_dtype=kv_cache_dtype,
+        ),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            pass_config=PassConfig(
+                fuse_rope_kvcache_cat_mla=True,
+                eliminate_noops=True,
+            ),
+        ),
+    )
+
+    with vllm.config.set_current_vllm_config(vllm_config), monkeypatch.context() as m:
+        if not torch.distributed.is_initialized():
+            from vllm.distributed.parallel_state import (
+                init_distributed_environment,
+                initialize_model_parallel,
+            )
+            from vllm.utils.system_utils import update_environment_variables
+
+            update_environment_variables(
+                {
+                    "RANK": "0",
+                    "LOCAL_RANK": "0",
+                    "WORLD_SIZE": "1",
+                    "MASTER_ADDR": "localhost",
+                    "MASTER_PORT": "54321",
+                }
+            )
+            init_distributed_environment()
+            initialize_model_parallel()
+
+        if attn_backend == AttentionBackendEnum.ROCM_AITER_MLA:
+            m.setenv("VLLM_ROCM_USE_AITER", "1")
+            rocm_aiter_ops.refresh_env_variables()
+
+        model = MLARoPEKVCacheCatTestModel(
+            vllm_config=vllm_config,
+            attn_backend=attn_backend,
+            use_deepseek_scaling_rope=use_deepseek_scaling_rope,
+            num_heads=num_heads,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            q_lora_rank=q_lora_rank,
+            kv_lora_rank=kv_lora_rank,
+            is_neox=is_neox,
+            dtype=dtype,
+            device=torch.get_default_device(),
+        )
+
+        fusion_pass = MLARoPEKVCacheCatFusionPass(vllm_config)
+        # note: FixFunctionalizationPass is required to correctly lower
+        # the fused op to its inplace version with auto-functionalization v1.
+        # Without it, decompose_auto_functionalized calls clone_preserve_strides
+        # on the non-contiguous q_pe slice directly, and inductor's lowering
+        # of the resulting as_strided chain incorrectly drops the storage offset.
+        # auto-functionalization v2 avoids this: it clones the contiguous base
+        # tensor (_all_bases) and reconstructs the slice as a view, so the
+        # offset is never passed through as_strided lowering.
+        passes = [
+            NoOpEliminationPass(vllm_config),
+            fusion_pass,
+            PostCleanupPass(vllm_config),
+            FixFunctionalizationPass(vllm_config),
+        ]
+        backend = TestBackend(*passes)
+
+        T = 5  # leading dim of qkv_lora/pos; also passed to build_attn_metadata
+
+        qkv_lora = torch.randn(
+            T,
+            q_lora_rank + kv_lora_rank + qk_rope_head_dim,
+            dtype=dtype,
+        )
+        pos = torch.arange(T, dtype=torch.long)
+        # Independent copies of the inputs for the non-compiled (unfused) run.
+        qkv_unfused = qkv_lora.clone()
+        pos_unfused = pos.clone()
+
+        # Run unfused version
+        with set_forward_context(None, vllm_config):
+            forward_context = get_forward_context()
+            attn_metadata = model.build_attn_metadata(T)
+            forward_context.slot_mapping = {
+                model.layer_name: attn_metadata.slot_mapping
+            }
+            q_unfused, kv_c_unfused, k_pe_unfused, dummy = model(
+                qkv_unfused, pos_unfused
+            )
+            attn_layer = forward_context.no_compile_layers[model.layer_name]
+            kv_cache_unfused = attn_layer.kv_cache.clone()
+        del dummy
+
+        # Run fused version (compiled)
+        torch._dynamo.mark_dynamic(qkv_lora, 0)
+        torch._dynamo.mark_dynamic(pos, 0)
+        with set_forward_context(None, vllm_config):
+            model_fused = torch.compile(model, backend=backend)
+            forward_context = get_forward_context()
+            attn_metadata = model.build_attn_metadata(T)
+            forward_context.slot_mapping = {
+                model.layer_name: attn_metadata.slot_mapping
+            }
+            q_fused, kv_c_fused, k_pe_fused, dummy = model_fused(qkv_lora, pos)
+            attn_layer = forward_context.no_compile_layers[model.layer_name]
+            kv_cache_fused = attn_layer.kv_cache
+        del dummy
+
+        assert fusion_pass.matched_count == 1
+
+        backend.check_before_ops(model.ops_in_model_before())
+        backend.check_after_ops(model.ops_in_model_after())
+        # float16 gets tighter tolerances than any other dtype.
+        if dtype == torch.float16:
+            ATOL, RTOL = (2e-3, 2e-3)
+        else:
+            ATOL, RTOL = (1e-2, 1e-2)
+
+        torch.testing.assert_close(q_unfused, q_fused, atol=ATOL, rtol=RTOL)
+        torch.testing.assert_close(kv_c_unfused, kv_c_fused, atol=ATOL, rtol=RTOL)
+        torch.testing.assert_close(k_pe_unfused, k_pe_fused, atol=ATOL, rtol=RTOL)
+        # Cannot compare fp8_* directly here; .view(dtype) reinterprets the storage
+        torch.testing.assert_close(
+            kv_cache_unfused.view(dtype),
+            kv_cache_fused.view(dtype),
+            atol=ATOL,
+            rtol=RTOL,
+        )
diff --git a/tests/compile/passes/test_rope_kvcache_fusion.py b/tests/compile/passes/test_rope_kvcache_fusion.py
index bab70c12a89b..b27adfc46f51 100644
--- a/tests/compile/passes/test_rope_kvcache_fusion.py
+++ b/tests/compile/passes/test_rope_kvcache_fusion.py
@@ -34,7 +34,6 @@
     CommonAttentionMetadata,
 )
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
-from vllm.v1.kv_cache_interface import AttentionSpec
 
 INDEX_SELECT_OP = torch.ops.aten.index.Tensor
 VLLM_UNIFIED_KV_CACHE_UPDATE_OP = torch.ops.vllm.unified_kv_cache_update
@@ -102,13 +101,8 @@ def __init__(
         )
 
         # Initialize attn MetadataBuilder
-        self.builder = self.attn.attn_backend.get_builder_cls()(
-            kv_cache_spec=AttentionSpec(
-                block_size=self.block_size,
-                num_kv_heads=self.num_kv_heads,
-                head_size=head_size,
-                dtype=self.kv_cache_dtype,
-            ),
+        self.builder = self.attn_backend.get_builder_cls()(
+            kv_cache_spec=self.attn.get_kv_cache_spec(vllm_config),
             layer_names=[self.attn.layer_name],
             vllm_config=vllm_config,
             device=device,
@@ -126,12 +120,11 @@ def build_attn_metadata(self, batch_size: int) -> CommonAttentionMetadata:
         num_blocks = batch_size * max_blocks
 
         # Fetch the attention backend and kv cache shape and stride order
-        attn_backend = self.attn.attn_backend
-        kv_cache_shape = attn_backend.get_kv_cache_shape(
+        kv_cache_shape = self.attn_backend.get_kv_cache_shape(
             num_blocks, self.block_size, self.num_kv_heads, self.head_size
         )
         try:
-            kv_cache_stride_order = attn_backend.get_kv_cache_stride_order()
+            kv_cache_stride_order = self.attn_backend.get_kv_cache_stride_order()
         except (AttributeError, NotImplementedError):
             kv_cache_stride_order = tuple(range(len(kv_cache_shape)))
 
diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py
index c3a065c56142..13e988307047 100644
--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -1,9 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import functools
 import hashlib
-import multiprocessing
 import os
 import pickle
 import tempfile
@@ -15,7 +13,6 @@
 import torch
 
 import vllm.envs as envs
-import vllm.model_executor.layers.activation
 from vllm.compilation.backends import VllmBackend
 from vllm.compilation.caching import (
     StandaloneCompiledArtifacts,
@@ -476,64 +473,57 @@ def test_standalone_compile_correctness():
 @create_new_process_for_each_test("spawn")
 def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
     """
-    Test that compiling gpt2 twice results in a cache hit and
-    capture torch dynamic symbol creations to ensure make_symbol
-    not called on cache hit.
-    """
+    Test that compiling gpt2 twice results in a cache hit.
 
-    import torch.fx.experimental.symbolic_shapes as symbolic_shapes_module
-    from torch.utils._sympy.symbol import make_symbol
+    Counter values are read from the EngineCore subprocess via
+    ``LLM.collective_rpc`` so the test works under default V1
+    multiprocessing (no shared memory between test and engine).
+    """
 
     from vllm import LLM
 
-    create_symbol_counter = multiprocessing.Value("i", 0)
-    original_make_symbol = make_symbol
+    def _snap(self):
+        from vllm.compilation.counter import compilation_counter
 
-    @functools.wraps(original_make_symbol)
-    def counting_make_symbol(prefix, idx, **kwargs):
-        with create_symbol_counter.get_lock():
-            create_symbol_counter.value += 1
-        return original_make_symbol(prefix, idx, **kwargs)
-
-    symbolic_shapes_module.make_symbol = counting_make_symbol
-    try:
-        with monkeypatch.context() as m, tempfile.TemporaryDirectory() as tmpdirname:
-            m.setenv("VLLM_CACHE_ROOT", tmpdirname)
-            m.setenv("VLLM_USE_AOT_COMPILE", "1")
-            # First compilation - initialize model and generate
-            llm_model = LLM(
-                model="gpt2",
-                compilation_config=CompilationConfig(
-                    mode=CompilationMode.VLLM_COMPILE,
-                ),
-                max_model_len=256,
-            )
+        return (
+            compilation_counter.num_aot_compiles,
+            compilation_counter.num_aot_artifacts_saved,
+            compilation_counter.num_aot_artifacts_loaded,
+        )
 
-            llm_model.generate("Hello, my name is")
-            assert create_symbol_counter.value == 2
-            create_symbol_counter.value = 0
+    # collective_rpc(callable) requires pickle-based serialization.
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
-            # Clean up first model
-            del llm_model
-            disable_envs_cache()
-            vllm.model_executor.layers.activation._ACTIVATION_REGISTRY._dict.clear()
+    with monkeypatch.context() as m, tempfile.TemporaryDirectory() as tmpdirname:
+        m.setenv("VLLM_CACHE_ROOT", tmpdirname)
+        m.setenv("VLLM_USE_AOT_COMPILE", "1")
+        # First compilation - initialize model and generate
+        llm_model = LLM(
+            model="gpt2",
+            compilation_config=CompilationConfig(
+                mode=CompilationMode.VLLM_COMPILE,
+            ),
+            max_model_len=256,
+        )
 
-            # Second compilation - should hit cache
-            m.setenv("VLLM_FORCE_AOT_LOAD", "1")
-            llm_model = LLM(
-                model="gpt2",
-                compilation_config=CompilationConfig(
-                    mode=CompilationMode.VLLM_COMPILE,
-                ),
-                max_model_len=256,
-            )
-            llm_model.generate("Hello, my name is")
+        llm_model.generate("Hello, my name is")
+        assert llm_model.collective_rpc(_snap)[0] == (1, 1, 0)
 
-            assert create_symbol_counter.value == 0
+        # Clean up first model
+        del llm_model
+        disable_envs_cache()
 
-    finally:
-        # Restore original method
-        symbolic_shapes_module.make_symbol = original_make_symbol
+        # Second compilation - should hit cache
+        m.setenv("VLLM_FORCE_AOT_LOAD", "1")
+        llm_model = LLM(
+            model="gpt2",
+            compilation_config=CompilationConfig(
+                mode=CompilationMode.VLLM_COMPILE,
+            ),
+            max_model_len=256,
+        )
+        llm_model.generate("Hello, my name is")
+        assert llm_model.collective_rpc(_snap)[0] == (0, 0, 1)
 
 
 @pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 1ab4949c4003..2a2658016e24 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -9,7 +9,7 @@
 import pytest
 from pydantic import Field
 
-from vllm.config import AttentionConfig, CompilationConfig, config
+from vllm.config import AttentionConfig, CompilationConfig, ModelConfig, config
 from vllm.engine.arg_utils import (
     EngineArgs,
     _expand_json_human_readable_numbers,
@@ -116,6 +116,10 @@ class DummyConfig:
     """Regular bool with default True"""
     optional_bool: bool | None = None
     """Optional bool with default None"""
+
+    optional_bool_or_str: bool | str | None = None
+    """Optional bool-or-str with default None"""
+
     optional_literal: Literal["x", "y"] | None = None
     """Optional literal with default None"""
     tuple_n: tuple[int, ...] = Field(default_factory=lambda: (1, 2, 3))
@@ -170,6 +174,11 @@ def test_get_kwargs():
     # bools should not have their type set
     assert kwargs["regular_bool"].get("type") is None
     assert kwargs["optional_bool"].get("type") is None
+    # optional bool-or-str should accept an optional string value
+    assert kwargs["optional_bool_or_str"]["type"] is str
+    assert kwargs["optional_bool_or_str"]["nargs"] == "?"
+    assert kwargs["optional_bool_or_str"]["const"] is True
+    assert "action" not in kwargs["optional_bool_or_str"]
     # optional literals should have None as a choice
     assert kwargs["optional_literal"]["choices"] == ["x", "y", "None"]
     # tuples should have the correct nargs
@@ -197,6 +206,32 @@ def test_get_kwargs():
     assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2)  # type: ignore[call-arg]
 
 
+def test_hf_token_get_kwargs():
+    """`hf_token` must be a string option whose value may be omitted."""
+    hf_token_kwargs = get_kwargs(ModelConfig)["hf_token"]
+    assert hf_token_kwargs["type"] is str
+    assert hf_token_kwargs["nargs"] == "?"
+    assert hf_token_kwargs["const"] is True
+    assert "action" not in hf_token_kwargs
+
+
+@pytest.mark.parametrize(
+    ("cli_args", "expected"),
+    [
+        ([], None),
+        (["--hf-token"], True),
+        (["--hf-token", "hf_secret"], "hf_secret"),
+        (["--hf-token", "None"], "None"),
+    ],
+)
+def test_hf_token_cli_arg(cli_args, expected):
+    """Flag omitted -> None, bare flag -> True, flag with a value -> that string."""
+    # Parse with the full EngineArgs CLI parser rather than a hand-built one.
+    engine_parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+    namespace = engine_parser.parse_args(cli_args)
+    assert namespace.hf_token == expected
+
+
 @pytest.mark.parametrize(
     ("arg", "expected"),
     [
diff --git a/tests/entrypoints/openai/chat_completion/test_enable_force_include_usage.py b/tests/entrypoints/openai/chat_completion/test_enable_force_include_usage.py
index 1bc545e86464..75ddeb43ab77 100644
--- a/tests/entrypoints/openai/chat_completion/test_enable_force_include_usage.py
+++ b/tests/entrypoints/openai/chat_completion/test_enable_force_include_usage.py
@@ -67,58 +67,3 @@ async def test_chat_with_enable_force_include_usage(
             chunk.usage.prompt_tokens + chunk.usage.completion_tokens
         )
         last_completion_tokens = chunk.usage.completion_tokens
-
-
-@pytest.fixture(scope="module")
-def transcription_server_with_force_include_usage():
-    args = [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-num-seqs",
-        "4",
-        "--enforce-eager",
-        "--enable-force-include-usage",
-        "--gpu-memory-utilization",
-        "0.2",
-    ]
-
-    with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
-        yield remote_server
-
-
-@pytest_asyncio.fixture
-async def transcription_client_with_force_include_usage(
-    transcription_server_with_force_include_usage,
-):
-    async with (
-        transcription_server_with_force_include_usage.get_async_client() as async_client
-    ):
-        yield async_client
-
-
-@pytest.mark.asyncio
-async def test_transcription_with_enable_force_include_usage(
-    transcription_client_with_force_include_usage, winning_call
-):
-    res = (
-        await transcription_client_with_force_include_usage.audio.transcriptions.create(
-            model="openai/whisper-large-v3-turbo",
-            file=winning_call,
-            language="en",
-            temperature=0.0,
-            stream=True,
-            timeout=30,
-        )
-    )
-
-    async for chunk in res:
-        if not len(chunk.choices):
-            # final usage sent
-            usage = chunk.usage
-            assert isinstance(usage, dict)
-            assert usage["prompt_tokens"] > 0
-            assert usage["completion_tokens"] > 0
-            assert usage["total_tokens"] > 0
-        else:
-            assert not hasattr(chunk, "usage")
diff --git a/tests/entrypoints/openai/completion/test_shutdown.py b/tests/entrypoints/openai/completion/test_shutdown.py
index 966c9f869c44..d7fba364ab71 100644
--- a/tests/entrypoints/openai/completion/test_shutdown.py
+++ b/tests/entrypoints/openai/completion/test_shutdown.py
@@ -299,19 +299,22 @@ async def test_abort_timeout_exits_quickly(wait_for_engine_idle: float):
         start_time = time.time()
         proc.send_signal(signal.SIGTERM)
 
-        # abort timeout (0) should exit promptly
-        for _ in range(20):
-            if proc.poll() is not None:
-                break
-            time.sleep(0.1)
+        # abort timeout (0) should stop the server promptly. On ROCm, process
+        # exit can spend extra time in HIP/RCCL/native extension teardown after
+        # the server and engine have already shut down.
+        max_exit_time = 4.0 if _IS_ROCM else 2.1
 
-        if proc.poll() is None:
+        try:
+            proc.wait(timeout=max_exit_time)
+        except subprocess.TimeoutExpired:
             proc.kill()
             proc.wait(timeout=5)
             pytest.fail("Process did not exit after SIGTERM with abort timeout")
 
         exit_time = time.time() - start_time
-        assert exit_time < 2.1, f"Default shutdown took too long: {exit_time:.1f}s"
+        assert exit_time < max_exit_time, (
+            f"Default shutdown took too long: {exit_time:.1f}s"
+        )
         assert proc.returncode in (0, -15, None), f"Unexpected: {proc.returncode}"
 
         await _assert_children_cleaned_up(child_pids)
diff --git a/tests/entrypoints/openai/responses/test_function_call.py b/tests/entrypoints/openai/responses/test_function_call.py
index 8ca43feaca4f..9dcbd74c890b 100644
--- a/tests/entrypoints/openai/responses/test_function_call.py
+++ b/tests/entrypoints/openai/responses/test_function_call.py
@@ -325,8 +325,12 @@ async def test_function_calling_with_streaming_expected_arguments(
     "tool_choice",
     ["auto", "required", {"type": "function", "name": "get_current_weather"}],
 )
+@pytest.mark.parametrize(
+    "enable_thinking",
+    [True, False],
+)
 async def test_function_calling_with_streaming_types(
-    client: openai.AsyncOpenAI, model_name: str, tool_choice
+    client: openai.AsyncOpenAI, model_name: str, tool_choice, enable_thinking: bool
 ):
     # this links the "done" type with the "start" type
     # so every "done" type should have a corresponding "start" type
@@ -436,6 +440,7 @@ async def test_function_calling_with_streaming_types(
         input=input_list,
         tools=tools,
         tool_choice=tool_choice,
+        extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}},
         stream=True,
     )
 
diff --git a/tests/entrypoints/openai/realtime/__init__.py b/tests/entrypoints/speech_to_text/__init__.py
similarity index 100%
rename from tests/entrypoints/openai/realtime/__init__.py
rename to tests/entrypoints/speech_to_text/__init__.py
diff --git a/tests/entrypoints/openai/conftest.py b/tests/entrypoints/speech_to_text/conftest.py
similarity index 100%
rename from tests/entrypoints/openai/conftest.py
rename to tests/entrypoints/speech_to_text/conftest.py
diff --git a/tests/entrypoints/openai/speech_to_text/__init__.py b/tests/entrypoints/speech_to_text/correctness/__init__.py
similarity index 100%
rename from tests/entrypoints/openai/speech_to_text/__init__.py
rename to tests/entrypoints/speech_to_text/correctness/__init__.py
diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/speech_to_text/correctness/test_transcription_api_correctness.py
similarity index 100%
rename from tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
rename to tests/entrypoints/speech_to_text/correctness/test_transcription_api_correctness.py
diff --git a/tests/entrypoints/speech_to_text/realtime/__init__.py b/tests/entrypoints/speech_to_text/realtime/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/entrypoints/openai/realtime/test_realtime_validation.py b/tests/entrypoints/speech_to_text/realtime/test_realtime_validation.py
similarity index 97%
rename from tests/entrypoints/openai/realtime/test_realtime_validation.py
rename to tests/entrypoints/speech_to_text/realtime/test_realtime_validation.py
index e317090fa543..675922f58840 100644
--- a/tests/entrypoints/openai/realtime/test_realtime_validation.py
+++ b/tests/entrypoints/speech_to_text/realtime/test_realtime_validation.py
@@ -10,7 +10,7 @@
 import pytest
 import websockets
 
-from tests.entrypoints.openai.conftest import add_attention_backend
+from tests.entrypoints.speech_to_text.conftest import add_attention_backend
 from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
 from vllm.assets.audio import AudioAsset
 from vllm.multimodal.media.audio import load_audio
@@ -163,6 +163,11 @@ async def test_multi_chunk_streaming(
                 " A little piece of practical poetry. Mary had a little lamb,"
                 " it sleeps with quite a flow, and everywhere that Mary went,"
                 " the lamb was sure to go."
+            ) or full_text == (
+                " First words I spoke in the original phonograph."
+                " A little piece of practical poetry. Mary had a little lamb,"
+                " it squeaked with quite a flow, and everywhere that Mary went,"
+                " the lamb was sure to go."
             )
 
 
diff --git a/tests/entrypoints/speech_to_text/test_speech_to_text_cancellation.py b/tests/entrypoints/speech_to_text/test_speech_to_text_cancellation.py
new file mode 100644
index 000000000000..2c8c1229e840
--- /dev/null
+++ b/tests/entrypoints/speech_to_text/test_speech_to_text_cancellation.py
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, Mock
+
+import pytest
+
+from vllm.entrypoints.openai.speech_to_text.protocol import TranscriptionResponse
+from vllm.entrypoints.openai.speech_to_text.speech_to_text import OpenAISpeechToText
+
+
+async def _never_finishes():
+    await asyncio.Event().wait()  # fresh Event is never set: blocks until cancelled
+    yield  # unreachable; its presence makes this an async generator
+
+
+async def _records_start_then_never_finishes(started_request_ids, request_id):
+    started_request_ids.append(request_id)  # runs on the generator's first advance
+    await asyncio.Event().wait()  # fresh Event is never set: blocks until cancelled
+    yield  # unreachable; its presence makes this an async generator
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    ("engine_inputs", "expected_request_ids"),
+    [
+        ([{"prompt": "chunk"}], ["transcribe-outer-request"]),
+        (
+            [{"prompt": "chunk-0"}, {"prompt": "chunk-1"}],
+            ["transcribe-outer-request-0", "transcribe-outer-request-1"],
+        ),
+    ],
+)
+async def test_non_streaming_cancel_aborts_engine_requests(
+    engine_inputs, expected_request_ids
+):
+    engine_client = SimpleNamespace(
+        errored=False,
+        generate=Mock(side_effect=lambda *_args, **_kwargs: _never_finishes()),
+        abort=AsyncMock(),
+        is_tracing_enabled=AsyncMock(return_value=False),
+    )
+    # Bypass __init__ via __new__, then set the needed attributes by hand.
+    server = OpenAISpeechToText.__new__(OpenAISpeechToText)
+    server.engine_client = engine_client
+    server.task_type = "transcribe"
+    server.models = SimpleNamespace(model_name=lambda: "audio")
+    server.model_config = SimpleNamespace(max_model_len=1024)
+    server.model_cls = SimpleNamespace(no_space_languages=set())
+    server.default_sampling_params = {}
+    server.asr_config = SimpleNamespace(max_audio_clip_s=30)
+    server._check_model = AsyncMock(return_value=None)
+    server._maybe_get_adapters = Mock(return_value=None)
+    server._preprocess_speech_to_text = AsyncMock(return_value=(engine_inputs, 40.0))
+    server._log_inputs = Mock()
+
+    request = SimpleNamespace(
+        model="audio",
+        response_format="json",
+        stream=False,
+        use_beam_search=False,
+        max_completion_tokens=None,
+        language="en",
+        prompt="",
+        to_sampling_params=Mock(return_value=object()),
+    )
+    raw_request = SimpleNamespace(
+        headers={"X-Request-Id": "outer-request"},
+        state=SimpleNamespace(),
+    )
+    # Run the handler as a task so the test can cancel it mid-flight.
+    task = asyncio.create_task(
+        server._create_speech_to_text(
+            audio_data=b"audio",
+            request=request,
+            raw_request=raw_request,
+            response_class=TranscriptionResponse,
+            stream_generator_method=Mock(),
+        )
+    )
+    await asyncio.sleep(0)  # yield once so the task gets to run
+
+    task.cancel()
+    with pytest.raises(asyncio.CancelledError):
+        await task
+    # The request id is the third positional argument of each generate() call.
+    generated_request_ids = [
+        call.args[2] for call in engine_client.generate.call_args_list
+    ]
+    assert generated_request_ids == expected_request_ids
+    engine_client.abort.assert_awaited_once_with(expected_request_ids)
+
+
+@pytest.mark.asyncio
+async def test_non_streaming_cancel_advances_all_chunk_generators():
+    started_request_ids: list[str] = []  # appended to by each fake generator
+    engine_client = SimpleNamespace(
+        errored=False,
+        generate=Mock(
+            side_effect=lambda *_args, **_kwargs: (
+                _records_start_then_never_finishes(started_request_ids, _args[2])
+            )
+        ),
+        abort=AsyncMock(),
+        is_tracing_enabled=AsyncMock(return_value=False),
+    )
+
+    engine_inputs = [
+        {"prompt": "chunk-0"},
+        {"prompt": "chunk-1"},
+        {"prompt": "chunk-2"},
+    ]
+    server = OpenAISpeechToText.__new__(OpenAISpeechToText)
+    server.engine_client = engine_client
+    server.task_type = "transcribe"
+    server.models = SimpleNamespace(model_name=lambda: "audio")
+    server.model_config = SimpleNamespace(max_model_len=1024)
+    server.model_cls = SimpleNamespace(no_space_languages=set())
+    server.default_sampling_params = {}
+    server.asr_config = SimpleNamespace(max_audio_clip_s=30)
+    server._check_model = AsyncMock(return_value=None)
+    server._maybe_get_adapters = Mock(return_value=None)
+    server._preprocess_speech_to_text = AsyncMock(return_value=(engine_inputs, 90.0))
+    server._log_inputs = Mock()
+
+    request = SimpleNamespace(
+        model="audio",
+        response_format="json",
+        stream=False,
+        use_beam_search=False,
+        max_completion_tokens=None,
+        language="en",
+        prompt="",
+        to_sampling_params=Mock(return_value=object()),
+    )
+    raw_request = SimpleNamespace(
+        headers={"X-Request-Id": "outer-request"},
+        state=SimpleNamespace(),
+    )
+    # Run the handler as a task so the test can cancel it afterwards.
+    task = asyncio.create_task(
+        server._create_speech_to_text(
+            audio_data=b"audio",
+            request=request,
+            raw_request=raw_request,
+            response_class=TranscriptionResponse,
+            stream_generator_method=Mock(),
+        )
+    )
+    await asyncio.sleep(0.01)  # let the task run before checking what started
+
+    expected_request_ids = [
+        "transcribe-outer-request-0",
+        "transcribe-outer-request-1",
+        "transcribe-outer-request-2",
+    ]
+    assert set(started_request_ids) == set(expected_request_ids)
+    # Clean up: cancel the still-pending task and wait for it to finish.
+    task.cancel()
+    with pytest.raises(asyncio.CancelledError):
+        await task
+
+
+@pytest.mark.asyncio
+async def test_language_detection_cancel_aborts_engine_request():
+    engine_client = SimpleNamespace(
+        generate=Mock(return_value=_never_finishes()),
+        abort=AsyncMock(),
+    )
+    # Bypass __init__ via __new__; only the attributes set below exist.
+    server = OpenAISpeechToText.__new__(OpenAISpeechToText)
+    server.engine_client = engine_client
+    server.asr_config = SimpleNamespace()
+    server.tokenizer = Mock()
+    server.model_cls = SimpleNamespace(
+        get_language_detection_prompt=Mock(return_value={"prompt": "detect"}),
+        get_language_token_ids=Mock(return_value=[1]),
+        parse_language_detection_output=Mock(),
+    )
+
+    request_id = "transcribe-outer-request-lang_detect"
+    task = asyncio.create_task(server._detect_language(Mock(), request_id))
+    await asyncio.sleep(0)  # yield once so the task gets to run
+
+    task.cancel()
+    with pytest.raises(asyncio.CancelledError):
+        await task
+    # Cancellation must have triggered exactly one abort() for this request id.
+    engine_client.abort.assert_awaited_once_with(request_id)
diff --git a/tests/entrypoints/speech_to_text/transcription/__init__.py b/tests/entrypoints/speech_to_text/transcription/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/entrypoints/speech_to_text/transcription/test_enable_force_include_usage.py b/tests/entrypoints/speech_to_text/transcription/test_enable_force_include_usage.py
new file mode 100644
index 000000000000..0477e7082bc7
--- /dev/null
+++ b/tests/entrypoints/speech_to_text/transcription/test_enable_force_include_usage.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import pytest_asyncio
+
+from tests.utils import RemoteOpenAIServer
+
+
+@pytest.fixture(scope="module")
+def transcription_server_with_force_include_usage():
+    """Module-scoped Whisper server launched with --enable-force-include-usage."""
+    server_args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-num-seqs",
+        "4",
+        "--enforce-eager",
+        "--enable-force-include-usage",
+        "--gpu-memory-utilization",
+        "0.2",
+    ]
+    with RemoteOpenAIServer("openai/whisper-large-v3-turbo", server_args) as server:
+        yield server
+
+
+@pytest_asyncio.fixture
+async def transcription_client_with_force_include_usage(
+    transcription_server_with_force_include_usage,
+):
+    """Yield an async client obtained from the module-scoped server fixture."""
+    server = transcription_server_with_force_include_usage
+    async with server.get_async_client() as client:
+        yield client
+
+
+@pytest.mark.asyncio
+async def test_transcription_with_enable_force_include_usage(
+    transcription_client_with_force_include_usage, winning_call
+):
+    """Only the choice-less chunk of a streamed transcription may carry usage."""
+    client = transcription_client_with_force_include_usage
+    stream = await client.audio.transcriptions.create(
+        model="openai/whisper-large-v3-turbo",
+        file=winning_call,
+        language="en",
+        temperature=0.0,
+        stream=True,
+        timeout=30,
+    )
+
+    async for chunk in stream:
+        if len(chunk.choices):
+            assert not hasattr(chunk, "usage")
+            continue
+        # final usage sent
+        usage = chunk.usage
+        assert isinstance(usage, dict)
+        assert usage["prompt_tokens"] > 0
+        assert usage["completion_tokens"] > 0
+        assert usage["total_tokens"] > 0
diff --git a/tests/entrypoints/openai/speech_to_text/test_transcription_inter_chunk_spacing.py b/tests/entrypoints/speech_to_text/transcription/test_transcription_inter_chunk_spacing.py
similarity index 100%
rename from tests/entrypoints/openai/speech_to_text/test_transcription_inter_chunk_spacing.py
rename to tests/entrypoints/speech_to_text/transcription/test_transcription_inter_chunk_spacing.py
diff --git a/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py b/tests/entrypoints/speech_to_text/transcription/test_transcription_validation.py
similarity index 98%
rename from tests/entrypoints/openai/speech_to_text/test_transcription_validation.py
rename to tests/entrypoints/speech_to_text/transcription/test_transcription_validation.py
index 4ac48699a022..5ea218406b98 100644
--- a/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py
+++ b/tests/entrypoints/speech_to_text/transcription/test_transcription_validation.py
@@ -6,7 +6,7 @@
 
 import pytest
 
-from tests.entrypoints.openai.conftest import add_attention_backend
+from tests.entrypoints.speech_to_text.conftest import add_attention_backend
 from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
 
 MISTRAL_FORMAT_ARGS = [
diff --git a/tests/entrypoints/openai/speech_to_text/test_transcription_validation_whisper.py b/tests/entrypoints/speech_to_text/transcription/test_transcription_validation_whisper.py
similarity index 100%
rename from tests/entrypoints/openai/speech_to_text/test_transcription_validation_whisper.py
rename to tests/entrypoints/speech_to_text/transcription/test_transcription_validation_whisper.py
diff --git a/tests/entrypoints/speech_to_text/translation/__init__.py b/tests/entrypoints/speech_to_text/translation/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/entrypoints/openai/speech_to_text/test_translation_validation.py b/tests/entrypoints/speech_to_text/translation/test_translation_validation.py
similarity index 99%
rename from tests/entrypoints/openai/speech_to_text/test_translation_validation.py
rename to tests/entrypoints/speech_to_text/translation/test_translation_validation.py
index a8b17bf34324..ed3cff5f1c22 100644
--- a/tests/entrypoints/openai/speech_to_text/test_translation_validation.py
+++ b/tests/entrypoints/speech_to_text/translation/test_translation_validation.py
@@ -13,7 +13,7 @@
 import pytest_asyncio
 import soundfile as sf
 
-from tests.entrypoints.openai.conftest import add_attention_backend
+from tests.entrypoints.speech_to_text.conftest import add_attention_backend
 from tests.utils import RemoteOpenAIServer
 from vllm.logger import init_logger
 from vllm.multimodal.media.audio import load_audio
diff --git a/tests/kernels/attention/test_rocm_triton_attn_dsv4.py b/tests/kernels/attention/test_rocm_triton_attn_dsv4.py
new file mode 100644
index 000000000000..aefcfeee4d7e
--- /dev/null
+++ b/tests/kernels/attention/test_rocm_triton_attn_dsv4.py
@@ -0,0 +1,377 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+
+pytestmark = pytest.mark.skipif(
+    not current_platform.is_rocm(), reason="Only used by ROCm"
+)
+
+NOPE_HEAD_DIM = 448
+ROPE_HEAD_DIM = 64
+HEAD_DIM = NOPE_HEAD_DIM + ROPE_HEAD_DIM
+
+
+def _ref_global_topk_ragged(
+    topk_indices: torch.Tensor,
+    token_to_req_indices: torch.Tensor,
+    block_table: torch.Tensor,
+    block_size: int,
+    is_valid_token: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    topk = topk_indices.reshape(topk_indices.shape[0], -1)
+    valid = (topk >= 0) & is_valid_token[:, None]
+    lens = valid.sum(dim=1, dtype=torch.int32)
+    indptr = torch.zeros(lens.shape[0] + 1, dtype=torch.int32, device=topk.device)
+    torch.cumsum(lens, dim=0, out=indptr[1:])
+
+    safe_topk = torch.clamp(topk, min=0)
+    block_indices = safe_topk // block_size
+    block_offsets = safe_topk % block_size
+    req_indices = token_to_req_indices[:, None].expand_as(topk)
+    slot_ids = block_table[req_indices, block_indices] * block_size + block_offsets
+
+    offsets = torch.arange(topk.shape[1], dtype=torch.int32, device=topk.device)
+    positions = indptr[:-1, None] + offsets[None, :]
+    return slot_ids[valid], positions[valid].to(torch.long), indptr, lens
+
+
+def _ref_sparse_prefill_ragged(
+    q: torch.Tensor,
+    kv: torch.Tensor,
+    rows: list[list[int]],
+    scale: float,
+    attn_sink: torch.Tensor | None,
+) -> torch.Tensor:
+    q_f32 = q.float()
+    kv_f32 = kv.float()
+    out = torch.empty_like(q_f32)
+
+    for query_idx in range(q.shape[0]):
+        row_indices = rows[query_idx]
+        for head_idx in range(q.shape[1]):
+            if row_indices:
+                selected_kv = kv_f32[row_indices]
+                scores = torch.mv(selected_kv, q_f32[query_idx, head_idx]) * scale
+                if attn_sink is not None:
+                    scores_with_sink = torch.cat(
+                        [scores, attn_sink[head_idx].float().reshape(1)]
+                    )
+                    probs = torch.softmax(scores_with_sink, dim=0)[:-1]
+                else:
+                    probs = torch.softmax(scores, dim=0)
+                out[query_idx, head_idx] = torch.sum(
+                    probs[:, None] * selected_kv, dim=0
+                )
+            else:
+                out[query_idx, head_idx] = 0
+    return out.to(torch.bfloat16)
+
+
+def _pack_fp8_ds_mla_cache(kv: torch.Tensor, block_size: int) -> torch.Tensor:
+    assert kv.shape[-1] == HEAD_DIM
+    num_tokens = kv.shape[0]
+    num_blocks = (num_tokens + block_size - 1) // block_size
+    cache = torch.zeros(
+        (num_blocks, block_size, 584),
+        dtype=torch.uint8,
+        device=kv.device,
+    )
+    cache_flat = cache.view(torch.uint8).flatten()
+    kv_nope_fp8 = (
+        kv[:, :NOPE_HEAD_DIM].to(current_platform.fp8_dtype()).view(torch.uint8)
+    )
+    kv_rope_u8 = kv[:, NOPE_HEAD_DIM:].contiguous().view(torch.uint8)
+
+    for slot in range(num_tokens):
+        block_idx = slot // block_size
+        pos = slot % block_size
+        block_base = block_idx * cache.stride(0)
+        token_base = block_base + pos * 576
+        scale_base = block_base + block_size * 576 + pos * 8
+        cache_flat[token_base : token_base + NOPE_HEAD_DIM].copy_(kv_nope_fp8[slot])
+        cache_flat[
+            token_base + NOPE_HEAD_DIM : token_base + NOPE_HEAD_DIM + ROPE_HEAD_DIM * 2
+        ].copy_(kv_rope_u8[slot])
+        cache_flat[scale_base : scale_base + 7].fill_(127)
+    return cache
+
+
+def _read_fp8_ds_mla_cache(
+    cache: torch.Tensor, slot: int, block_size: int
+) -> torch.Tensor:
+    cache_flat = cache.view(torch.uint8).flatten()
+    block_idx = slot // block_size
+    pos = slot % block_size
+    block_base = block_idx * cache.stride(0)
+    token_base = block_base + pos * 576
+
+    nope_u8 = cache_flat[token_base : token_base + NOPE_HEAD_DIM]
+    nope = nope_u8.view(current_platform.fp8_dtype()).to(torch.float32)
+    rope_u8 = cache_flat[
+        token_base + NOPE_HEAD_DIM : token_base + NOPE_HEAD_DIM + ROPE_HEAD_DIM * 2
+    ]
+    rope = rope_u8.view(torch.bfloat16).to(torch.float32)
+    return torch.cat([nope, rope])
+
+
+def _ref_sparse_decode_ragged(
+    q: torch.Tensor,
+    main_cache: torch.Tensor,
+    main_rows: list[list[int]],
+    scale: float,
+    attn_sink: torch.Tensor | None,
+    block_size: int,
+    extra_cache: torch.Tensor | None = None,
+    extra_rows: list[list[int]] | None = None,
+) -> torch.Tensor:
+    q_f32 = q.float()
+    out = torch.empty_like(q_f32)
+
+    for query_idx in range(q.shape[0]):
+        row_kv = [
+            _read_fp8_ds_mla_cache(main_cache, int(slot), block_size)
+            for slot in main_rows[query_idx]
+        ]
+        if extra_cache is not None and extra_rows is not None:
+            row_kv.extend(
+                _read_fp8_ds_mla_cache(extra_cache, int(slot), block_size)
+                for slot in extra_rows[query_idx]
+            )
+
+        kv = torch.stack(row_kv).to(q.device)
+        for head_idx in range(q.shape[1]):
+            scores = torch.mv(kv, q_f32[query_idx, head_idx]) * scale
+            if attn_sink is not None:
+                scores_with_sink = torch.cat(
+                    [scores, attn_sink[head_idx].float().reshape(1)]
+                )
+                probs = torch.softmax(scores_with_sink, dim=0)[:-1]
+            else:
+                probs = torch.softmax(scores, dim=0)
+            out[query_idx, head_idx] = torch.sum(probs[:, None] * kv, dim=0)
+    return out.to(torch.bfloat16)
+
+
+def _ref_combine_topk_swa_ragged(
+    device: torch.device,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    expected_ragged = torch.tensor(
+        [
+            100,
+            101,
+            7,
+            8,
+            9,
+            110,
+            111,
+            8,
+            9,
+            10,
+            120,
+            121,
+            122,
+            9,
+            10,
+            11,
+            150,
+            27,
+            28,
+            29,
+            160,
+            161,
+            28,
+            29,
+            30,
+        ],
+        dtype=torch.int32,
+        device=device,
+    )
+    expected_lens = torch.tensor([5, 5, 6, 4, 5], dtype=torch.int32, device=device)
+    expected_indptr = torch.zeros(6, dtype=torch.int32, device=device)
+    torch.cumsum(expected_lens, dim=0, out=expected_indptr[1:])
+    return expected_ragged, expected_indptr, expected_lens
+
+
+@torch.inference_mode()
+def test_compute_global_topk_ragged_indices_and_indptr() -> None:
+    from vllm.v1.attention.backends.mla.rocm_aiter_mla_sparse_dsv4 import (
+        compute_global_topk_ragged_indices_and_indptr,
+    )
+
+    device = torch.device("cuda")
+    block_size = 4
+    topk_indices = torch.tensor(
+        [
+            [0, 3, 4, -1],
+            [5, 8, -1, -1],
+            [2, 7, 9, -1],
+        ],
+        dtype=torch.int32,
+        device=device,
+    )
+    token_to_req_indices = torch.tensor([0, 1, 1], dtype=torch.int32, device=device)
+    block_table = torch.tensor(
+        [
+            [10, 11, 12],
+            [20, 21, 22],
+        ],
+        dtype=torch.int32,
+        device=device,
+    )
+    is_valid_token = torch.tensor([True, False, True], dtype=torch.bool, device=device)
+
+    actual_ragged, actual_indptr, actual_lens = (
+        compute_global_topk_ragged_indices_and_indptr(
+            topk_indices,
+            token_to_req_indices,
+            block_table,
+            block_size,
+            is_valid_token,
+        )
+    )
+    expected_values, expected_positions, expected_indptr, expected_lens = (
+        _ref_global_topk_ragged(
+            topk_indices,
+            token_to_req_indices,
+            block_table,
+            block_size,
+            is_valid_token,
+        )
+    )
+
+    torch.testing.assert_close(actual_ragged[expected_positions], expected_values)
+    torch.testing.assert_close(actual_indptr, expected_indptr)
+    torch.testing.assert_close(actual_lens, expected_lens)
+
+
+@torch.inference_mode()
+def test_sparse_attn_prefill_ragged_kernel() -> None:
+    from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
+        _rocm_sparse_attn_prefill_ragged_triton,
+    )
+
+    device = torch.device("cuda")
+    torch.manual_seed(0)
+    q = torch.randn(3, 3, HEAD_DIM, dtype=torch.bfloat16, device=device) * 0.125
+    kv = torch.randn(5, HEAD_DIM, dtype=torch.bfloat16, device=device) * 0.125
+    indices = torch.tensor([0, 2, 1, 3, 4], dtype=torch.int32, device=device)
+    indptr = torch.tensor([0, 2, 5, 5], dtype=torch.int32, device=device)
+    attn_sink = torch.tensor([-0.25, 0.0, 0.25], dtype=torch.float32, device=device)
+    scale = HEAD_DIM**-0.5
+
+    actual = _rocm_sparse_attn_prefill_ragged_triton(
+        q=q,
+        kv=kv,
+        indices=indices,
+        indptr=indptr,
+        scale=scale,
+        attn_sink=attn_sink,
+        nope_head_dim=NOPE_HEAD_DIM,
+        rope_head_dim=ROPE_HEAD_DIM,
+    )
+    expected = _ref_sparse_prefill_ragged(
+        q, kv, [[0, 2], [1, 3, 4], []], scale, attn_sink
+    )
+
+    torch.testing.assert_close(actual, expected, atol=2e-2, rtol=2e-2)
+
+
+@torch.inference_mode()
+def test_sparse_attn_decode_ragged_kernel() -> None:
+    from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
+        _rocm_sparse_attn_decode_ragged_triton,
+    )
+
+    device = torch.device("cuda")
+    torch.manual_seed(1)
+    block_size = 4
+    q = torch.randn(2, 3, HEAD_DIM, dtype=torch.bfloat16, device=device) * 0.125
+    main_kv = torch.randn(6, HEAD_DIM, dtype=torch.bfloat16, device=device) * 0.125
+    extra_kv = torch.randn(5, HEAD_DIM, dtype=torch.bfloat16, device=device) * 0.125
+    main_cache = _pack_fp8_ds_mla_cache(main_kv, block_size)
+    extra_cache = _pack_fp8_ds_mla_cache(extra_kv, block_size)
+    main_indices = torch.tensor([0, 2, 4, 1], dtype=torch.int32, device=device)
+    main_indptr = torch.tensor([0, 2, 4], dtype=torch.int32, device=device)
+    extra_indices = torch.tensor([1, 3, 0], dtype=torch.int32, device=device)
+    extra_indptr = torch.tensor([0, 1, 3], dtype=torch.int32, device=device)
+    attn_sink = torch.tensor([-0.1, 0.0, 0.1], dtype=torch.float32, device=device)
+    scale = HEAD_DIM**-0.5
+
+    actual = _rocm_sparse_attn_decode_ragged_triton(
+        q=q,
+        main_cache=main_cache,
+        main_indices=main_indices,
+        main_indptr=main_indptr,
+        scale=scale,
+        attn_sink=attn_sink,
+        nope_head_dim=NOPE_HEAD_DIM,
+        rope_head_dim=ROPE_HEAD_DIM,
+        extra_cache=extra_cache,
+        extra_indices=extra_indices,
+        extra_indptr=extra_indptr,
+    )
+    expected = _ref_sparse_decode_ragged(
+        q=q,
+        main_cache=main_cache,
+        main_rows=[[0, 2], [4, 1]],
+        scale=scale,
+        attn_sink=attn_sink,
+        block_size=block_size,
+        extra_cache=extra_cache,
+        extra_rows=[[1], [3, 0]],
+    )
+
+    torch.testing.assert_close(actual, expected, atol=2e-2, rtol=2e-2)
+
+
+@torch.inference_mode()
+def test_combine_topk_swa_indices_ragged() -> None:
+    from vllm.v1.attention.backends.mla.rocm_aiter_mla_sparse_dsv4 import (
+        combine_topk_swa_indices_ragged,
+    )
+
+    device = torch.device("cuda")
+    topk_indices = torch.tensor(
+        [
+            [100, 101, 102, 103],
+            [110, 111, 112, 113],
+            [120, 121, 122, 123],
+            [130, 131, 132, 133],
+            [140, 141, 142, 143],
+        ],
+        dtype=torch.int32,
+        device=device,
+    )
+    query_start_loc = torch.tensor([0, 3, 5], dtype=torch.int32, device=device)
+    seq_lens = torch.tensor([6, 4], dtype=torch.int32, device=device)
+    gather_lens = torch.tensor([4, 3], dtype=torch.int32, device=device)
+    window_size = 3
+    compress_ratio = 2
+    topk = 4
+    M = 20
+    N = 8
+
+    actual_ragged, actual_indptr, actual_lens = combine_topk_swa_indices_ragged(
+        topk_indices,
+        query_start_loc,
+        seq_lens,
+        gather_lens,
+        window_size,
+        compress_ratio,
+        topk,
+        M,
+        N,
+    )
+    expected_ragged, expected_indptr, expected_lens = _ref_combine_topk_swa_ragged(
+        device
+    )
+
+    torch.testing.assert_close(
+        actual_ragged[: expected_ragged.numel()], expected_ragged
+    )
+    torch.testing.assert_close(actual_indptr, expected_indptr)
+    torch.testing.assert_close(actual_lens, expected_lens)
diff --git a/tests/kernels/mamba/test_ssu_dispatch.py b/tests/kernels/mamba/test_ssu_dispatch.py
index 887d60b27365..703a5df163e9 100644
--- a/tests/kernels/mamba/test_ssu_dispatch.py
+++ b/tests/kernels/mamba/test_ssu_dispatch.py
@@ -13,6 +13,7 @@
     selective_state_update,
 )
 from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 from vllm.v1.kv_cache_interface import (
     KVCacheConfig,
     KVCacheGroupSpec,
@@ -27,7 +28,9 @@
     HAS_FLASHINFER = False
 
 
-def _kv_cache_config_with_ssu(mamba_type: str = "mamba2") -> KVCacheConfig:
+def _kv_cache_config_with_ssu(
+    mamba_type: MambaAttentionBackendEnum = MambaAttentionBackendEnum.MAMBA2,
+) -> KVCacheConfig:
     spec = MambaSpec(
         block_size=16,
         shapes=((16, 64),),
@@ -77,7 +80,12 @@ def test_uninitialized_backend_raises():
 
 
 @pytest.mark.parametrize(
-    "mamba_type", ["linear_attention", "gdn_attention", "short_conv"]
+    "mamba_type",
+    [
+        MambaAttentionBackendEnum.LINEAR,
+        MambaAttentionBackendEnum.GDN_ATTN,
+        MambaAttentionBackendEnum.SHORT_CONV,
+    ],
 )
 def test_init_is_noop_for_non_ssu_mamba_type(mamba_type):
     import vllm.model_executor.layers.mamba.ops.ssu_dispatch as mod
diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
index 812164ea287c..111975cf6bae 100644
--- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py
+++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -237,7 +237,7 @@ def expert_info(kind) -> ExpertInfo:
     )
 
 if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
-    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+    from vllm.model_executor.layers.fused_moe.experts.flashinfer_cutlass_moe import (
         FlashInferExperts,
     )
     from vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_two_sided import (  # noqa: E501
@@ -298,7 +298,7 @@ def expert_info(kind) -> ExpertInfo:
     )
 
 if has_aiter():
-    from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+    from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import (
         AiterExperts,
     )
 
diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py
index dad25bc31959..9e35be0db801 100644
--- a/tests/kernels/moe/test_flashinfer.py
+++ b/tests/kernels/moe/test_flashinfer.py
@@ -18,12 +18,12 @@
     RoutingMethodType,
     fp8_w8a8_moe_quant_config,
 )
+from vllm.model_executor.layers.fused_moe.experts.flashinfer_cutlass_moe import (
+    FlashInferExperts,
+)
 from vllm.model_executor.layers.fused_moe.experts.trtllm_fp8_moe import (
     TrtLlmFp8ExpertsMonolithic,
 )
-from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
-    FlashInferExperts,
-)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
     rotate_weights_for_fi_trtllm_fp8_per_tensor_moe,
diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py
index d116a96f58bc..6ae67fa0e987 100644
--- a/tests/kernels/moe/test_flashinfer_moe.py
+++ b/tests/kernels/moe/test_flashinfer_moe.py
@@ -22,7 +22,7 @@
     FusedMoEParallelConfig,
     RoutingMethodType,
 )
-from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+from vllm.model_executor.layers.fused_moe.experts.flashinfer_cutlass_moe import (
     FlashInferExperts,
     is_valid_flashinfer_cutlass_fused_moe,
 )
diff --git a/tests/kernels/moe/test_marlin_vs_trtllm_mxint4.py b/tests/kernels/moe/test_marlin_vs_trtllm_mxint4.py
index 0ad2cd06ee3c..eaeca6a8a5dc 100644
--- a/tests/kernels/moe/test_marlin_vs_trtllm_mxint4.py
+++ b/tests/kernels/moe/test_marlin_vs_trtllm_mxint4.py
@@ -5,7 +5,7 @@
 import pytest
 import torch
 
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+from vllm.model_executor.layers.fused_moe.experts.marlin_moe import (
     fused_marlin_moe,
 )
 from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import (
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index ebc3256b548f..544106282585 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -32,7 +32,7 @@
     int4_w4a16_moe_quant_config,
     int8_w8a16_moe_quant_config,
 )
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+from vllm.model_executor.layers.fused_moe.experts.marlin_moe import (
     batched_fused_marlin_moe,
     fused_marlin_moe,
 )
diff --git a/tests/kernels/moe/test_rocm_aiter_topk.py b/tests/kernels/moe/test_rocm_aiter_topk.py
index b0ecc9ed71f6..11f9cf28ae87 100644
--- a/tests/kernels/moe/test_rocm_aiter_topk.py
+++ b/tests/kernels/moe/test_rocm_aiter_topk.py
@@ -20,7 +20,7 @@
     pytest.skip("This test can only run on ROCm.", allow_module_level=True)
 
 # this import statement is needed to ensure the ops are registered
-import vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe  # noqa: F401
+import vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe  # noqa: F401
 
 # need to import once to ensure the ops are registered
 # Check if aiter package is installed
diff --git a/tests/kernels/moe/test_triton_moe_no_act_mul.py b/tests/kernels/moe/test_triton_moe_no_act_mul.py
index 1dfac3cf0fdc..9d16ae5b63db 100644
--- a/tests/kernels/moe/test_triton_moe_no_act_mul.py
+++ b/tests/kernels/moe/test_triton_moe_no_act_mul.py
@@ -15,7 +15,7 @@
 from vllm.model_executor.layers.fused_moe.config import (
     FUSED_MOE_UNQUANTIZED_CONFIG,
 )
-from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts
 from vllm.platforms import current_platform
 
 # Test parameters
@@ -151,7 +151,7 @@ def test_triton_experts_no_mul_activation(
 @torch.inference_mode()
 def test_workspace_shapes_no_mul_vs_gated():
     """Test that workspace shapes differ correctly between gated and non-gated."""
-    from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+    from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts
 
     M, N, K, topk = 64, 256, 128, 2
 
@@ -192,7 +192,7 @@ def test_workspace_shapes_no_mul_vs_gated():
 @torch.inference_mode()
 def test_adjust_n_for_activation():
     """Test the adjust_N_for_activation method."""
-    from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+    from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts
 
     experts = TritonExperts(
         moe_config=make_dummy_moe_config(),
diff --git a/tests/kernels/moe/test_unquantized_backend_selection.py b/tests/kernels/moe/test_unquantized_backend_selection.py
index 2c4cd7b94e78..bc322aed3903 100644
--- a/tests/kernels/moe/test_unquantized_backend_selection.py
+++ b/tests/kernels/moe/test_unquantized_backend_selection.py
@@ -158,7 +158,7 @@ def test_select_cuda_flashinfer_trtllm_backend(mock_is_supported_trtllm, monkeyp
     return_value=(False, None),
 )
 @patch(
-    "vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts.is_supported_config",
+    "vllm.model_executor.layers.fused_moe.experts.flashinfer_cutlass_moe.FlashInferExperts.is_supported_config",
     return_value=(True, None),
 )
 @pytest.mark.skipif(
diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py
index a5d6afc489f5..de3a456fb449 100644
--- a/tests/kernels/moe/utils.py
+++ b/tests/kernels/moe/utils.py
@@ -17,12 +17,14 @@
     FusedMoEQuantConfig,
     RoutingMethodType,
 )
+from vllm.model_executor.layers.fused_moe.experts.triton_moe import (
+    TritonExperts,
+)
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
     BatchedTritonExperts,
     NaiveBatchedExperts,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import (
-    TritonExperts,
     fused_experts,
 )
 from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel
diff --git a/tests/kernels/test_compressor_kv_cache.py b/tests/kernels/test_compressor_kv_cache.py
index 122254bc3c41..ef9fda050830 100644
--- a/tests/kernels/test_compressor_kv_cache.py
+++ b/tests/kernels/test_compressor_kv_cache.py
@@ -3,11 +3,12 @@
 """
 Round-trip tests for compressor → FP8 quant + KV cache insert → gather + dequant.
 
-Four test functions cover five paths:
+These tests cover:
   A) DeepseekV4 Attention: head_dim=512 (448 FP8 nope + 64 bf16 rope), quant_block=64
-  B) Indexer:       head_dim=128 (all FP8), quant_block=128
-  C) DeepseekV4 Attention magnitude range: correctness across small/large values
-  D) Indexer fused Triton kernel: compress+norm+rope+quant+insert
+  B) Fused dequant+gather K cache
+  C) Indexer:       head_dim=128 (all FP8), quant_block=128
+  D) DeepseekV4 Attention magnitude range: correctness across small/large values
+  E) Indexer fused Triton kernel: compress+norm+rope+quant+insert
 """
 
 import math
@@ -134,7 +135,140 @@ def test_deepseek_v4_attention_quant_cache_roundtrip(num_tokens: int, block_size
     )
 
 
-# ── Test B: Indexer path ────────────────────────────────────────────────────
+# ── Test B: Fused dequant+gather K cache ────────────────────────────────────
+
+
+def _dequantize_and_gather_k_cache_reference(
+    out: torch.Tensor,
+    k_cache: torch.Tensor,
+    seq_lens: torch.Tensor,
+    gather_lens: torch.Tensor | None,
+    block_table: torch.Tensor,
+    block_size: int,
+    offset: int,
+) -> None:
+    fp8_dim = 448
+    bf16_dim = 64
+    scale_dim = 8
+    quant_block = 64
+    token_data_size = fp8_dim + bf16_dim * 2
+
+    for req_id in range(seq_lens.shape[0]):
+        seq_len = seq_lens[req_id].item()
+        gather_len = gather_lens[req_id].item() if gather_lens is not None else seq_len
+        start_pos = seq_len - gather_len
+
+        for i in range(gather_len):
+            pos = start_pos + i
+            pos_in_block = pos % block_size
+            block_idx = block_table[req_id, pos // block_size].item()
+            cache_block = k_cache[block_idx].view(-1)
+
+            token_data_start = pos_in_block * token_data_size
+            fp8_bytes = cache_block[token_data_start : token_data_start + fp8_dim]
+            fp8_vals = fp8_bytes.view(torch.float8_e4m3fn).float()
+
+            scale_start = block_size * token_data_size + pos_in_block * scale_dim
+            encoded_scales = cache_block[scale_start : scale_start + scale_dim]
+            scales = torch.exp2(encoded_scales[:7].float() - 127.0)
+            dequant = fp8_vals * scales.repeat_interleave(quant_block)
+
+            bf16_start = token_data_start + fp8_dim
+            bf16_bytes = cache_block[bf16_start : bf16_start + bf16_dim * 2]
+            bf16_tail = bf16_bytes.view(torch.bfloat16)
+
+            out[req_id, offset + i, :fp8_dim] = dequant
+            out[req_id, offset + i, fp8_dim:] = bf16_tail
+
+
+@pytest.mark.parametrize(
+    ("seq_lens_host", "gather_lens_host", "offset"),
+    [
+        ([9, 23, 7], None, 0),
+        ([19, 8, 257], [6, 8, 129], 5),
+    ],
+)
+def test_dequantize_and_gather_k_cache(
+    seq_lens_host: list[int],
+    gather_lens_host: list[int] | None,
+    offset: int,
+):
+    block_size = 64
+    head_dim = 512
+    nope_dim = 448
+    scale_dim = 8
+    head_bytes = nope_dim + (head_dim - nope_dim) * 2 + scale_dim
+    device = "cuda"
+    num_reqs = len(seq_lens_host)
+    num_tokens = sum(seq_lens_host)
+    max_gather_len = max(gather_lens_host or seq_lens_host)
+    max_blocks_per_seq = math.ceil(max(seq_lens_host) / block_size)
+    num_blocks = sum(math.ceil(seq_len / block_size) for seq_len in seq_lens_host)
+
+    compressed_kv = torch.randn(
+        num_tokens, head_dim, dtype=torch.bfloat16, device=device
+    )
+
+    # Randomize physical pages so the test covers block-table translation.
+    # Keep padded block-table entries invalid to catch accidental reads.
+    physical_blocks = torch.randperm(num_blocks, device=device)
+    block_table = torch.full(
+        (num_reqs, max_blocks_per_seq), int(-1e6), dtype=torch.int32, device=device
+    )
+    start = 0
+    for req_id, seq_len in enumerate(seq_lens_host):
+        num_req_blocks = math.ceil(seq_len / block_size)
+        req_blocks = physical_blocks[start : start + num_req_blocks]
+        block_table[req_id, :num_req_blocks] = req_blocks
+        start += num_req_blocks
+
+    # Build slot_mapping for quantize_and_insert_k_cache.
+    slot_mapping = torch.empty(num_tokens, dtype=torch.int64, device=device)
+    start = 0
+    for req_id, seq_len in enumerate(seq_lens_host):
+        logical_pos = torch.arange(seq_len, dtype=torch.int64, device=device)
+        block_idx = block_table[req_id, logical_pos // block_size].to(torch.int64)
+        token_slots = block_idx * block_size + logical_pos % block_size
+        slot_mapping[start : start + seq_len] = token_slots
+        start += seq_len
+
+    # Insert compressed K into the paged cache layout used by the gather op.
+    k_cache = torch.empty(
+        num_blocks, block_size, head_bytes, dtype=torch.uint8, device=device
+    )
+    k_cache_2d = k_cache.view(num_blocks, -1)
+    quantize_and_insert_k_cache(compressed_kv, k_cache_2d, slot_mapping, block_size)
+
+    out_shape = (num_reqs, offset + max_gather_len + 3, head_dim)
+    ref_out = torch.empty(out_shape, dtype=torch.bfloat16, device=device)
+    actual_out = torch.empty_like(ref_out)
+    seq_lens = torch.tensor(seq_lens_host, dtype=torch.int32, device=device)
+    gather_lens = (
+        torch.tensor(gather_lens_host, dtype=torch.int32, device=device)
+        if gather_lens_host is not None
+        else None
+    )
+
+    # Compare production gather against a PyTorch reference for valid output rows.
+    _dequantize_and_gather_k_cache_reference(
+        ref_out, k_cache, seq_lens, gather_lens, block_table, block_size, offset
+    )
+    dequantize_and_gather_k_cache(
+        actual_out, k_cache, seq_lens, gather_lens, block_table, block_size, offset
+    )
+    torch.accelerator.synchronize()
+
+    # Compare only rows [offset, offset + gather_len); padding rows are skipped.
+    for req_id, seq_len in enumerate(seq_lens_host):
+        gather_len = (
+            gather_lens_host[req_id] if gather_lens_host is not None else seq_len
+        )
+        actual = actual_out[req_id, offset : offset + gather_len]
+        expected = ref_out[req_id, offset : offset + gather_len]
+        torch.testing.assert_close(actual, expected, rtol=0, atol=0)
+
+
+# ── Test C: Indexer path ────────────────────────────────────────────────────
 
 
 @pytest.mark.parametrize("num_tokens", [1, 4, 8, 17])
@@ -254,7 +388,7 @@ def test_indexer_gather_accepts_upper_bound_output():
     assert torch.all(dst_scale[valid_tokens:] == sentinel)
 
 
-# ── Test C: DeepseekV4 attention with values at different magnitudes ───────────
+# ── Test D: DeepseekV4 attention with values at different magnitudes ───────────
 
 
 def test_deepseek_v4_quant_magnitude_range():
@@ -316,7 +450,7 @@ def test_deepseek_v4_quant_magnitude_range():
             )
 
 
-# ── Test D: Indexer fused K-cache insert (Triton kernels) ────────────────────
+# ── Test E: Indexer fused K-cache insert (Triton kernels) ────────────────────
 #
 # Both kernels share the same Triton signature; use_fp4 selects between them.
 # Full pipeline: state-cache gather → softmax-weighted compress → RMSNorm →
diff --git a/tests/kernels/test_fused_indexer_q_rope_quant.py b/tests/kernels/test_fused_indexer_q_rope_quant.py
index be2039ce513e..41a4d0ed0905 100644
--- a/tests/kernels/test_fused_indexer_q_rope_quant.py
+++ b/tests/kernels/test_fused_indexer_q_rope_quant.py
@@ -122,7 +122,7 @@ def _reference(
         return q_fp8, weights_out
 
 
-@pytest.mark.parametrize("num_tokens", [1, 7, 32, 257])
+@pytest.mark.parametrize("num_tokens", [1, 7, 32, 257, 1023])
 @pytest.mark.parametrize("cache_dtype", [torch.float32, torch.bfloat16])
 @pytest.mark.parametrize("use_fp4", [False, True])
 @torch.inference_mode()
diff --git a/tests/kernels/test_mhc_kernels.py b/tests/kernels/test_mhc_kernels.py
new file mode 100644
index 000000000000..24d9038f4806
--- /dev/null
+++ b/tests/kernels/test_mhc_kernels.py
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+# Imported for its side effects only (hence the F401 suppression); presumably it
+# registers the torch.ops.vllm.mhc_* custom ops used below -- TODO confirm.
+import vllm.model_executor.layers.mhc as mhc_ops  # noqa: F401
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
+
+DEVICE = current_platform.device_type
+
+
+def sinkhorn_normalize_ref(x: torch.Tensor, repeat: int, eps: float) -> torch.Tensor:
+    """Reference Sinkhorn-style normalization over the last two dims of ``x``.
+
+    Starts from ``softmax(x, -1) + eps`` and normalizes along dim ``-2`` once, then
+    alternates dim ``-1`` / dim ``-2`` normalization ``repeat - 1`` more times.
+    ``eps`` is added to every denominator.
+    """
+    x = x.softmax(-1) + eps
+    x = x / (x.sum(-2, keepdim=True) + eps)
+    for _ in range(repeat - 1):
+        x = x / (x.sum(-1, keepdim=True) + eps)
+        x = x / (x.sum(-2, keepdim=True) + eps)
+    return x
+
+
+def mhc_pre_ref(
+    residual: torch.Tensor,
+    fn: torch.Tensor,
+    hc_scale: torch.Tensor,
+    hc_base: torch.Tensor,
+    rms_eps: float,
+    hc_pre_eps: float,
+    hc_sinkhorn_eps: float,
+    hc_post_mult_value: float,
+    sinkhorn_repeat: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Pure-PyTorch reference for the mHC "pre" step.
+
+    Adapted from the tilelang repo:
+    https://github.com/tile-ai/tilelang/blob/d135bd1cd2d2eee74fbb41dd0a0831a427194c86/examples/deepseek_mhc/example_mhc_pre.py#L303
+
+    Args:
+        residual: Residual streams; dim ``-2`` is read as ``hc_mult``.
+        fn: Projection applied to the flattened residual (``residual_flat @ fn.T``).
+        hc_scale: Three scales, expanded over the pre (``hc_mult``), post
+            (``hc_mult``) and residual (``hc_mult * hc_mult``) slices of the mix.
+        hc_base: Bias added to the scaled mix.
+        rms_eps: Epsilon inside the ``rsqrt`` normalization factor.
+        hc_pre_eps: Added to the pre-mix after the sigmoid.
+        hc_sinkhorn_eps: Epsilon forwarded to ``sinkhorn_normalize_ref``.
+        hc_post_mult_value: Multiplier applied to the post-mix sigmoid.
+        sinkhorn_repeat: Iteration count forwarded to ``sinkhorn_normalize_ref``.
+
+    Returns:
+        ``(post_mix, res_mix, layer_input)``; ``layer_input`` is cast to bfloat16.
+    """
+    hc_mult = residual.shape[-2]
+
+    # Project the flattened residual and scale it by an rsqrt normalization factor.
+    residual_flat = residual.flatten(-2, -1).float()
+    sqrsum = residual_flat.square().sum(-1)
+    mixes = (
+        residual_flat @ fn.T * (sqrsum.unsqueeze(-1) / fn.shape[-1] + rms_eps).rsqrt()
+    )
+
+    hc_scale = torch.cat(
+        [
+            hc_scale[0].expand(hc_mult),
+            hc_scale[1].expand(hc_mult),
+            hc_scale[2].expand(hc_mult * hc_mult),
+        ],
+    )
+    mixes = mixes * hc_scale + hc_base
+
+    pre_mix = mixes[:, :hc_mult].sigmoid().unsqueeze(-1) + hc_pre_eps
+    post_mix = (
+        mixes[:, hc_mult : 2 * hc_mult].sigmoid() * hc_post_mult_value
+    ).unsqueeze(-1)
+    res_mix = mixes[:, 2 * hc_mult :].view(-1, hc_mult, hc_mult)
+
+    res_mix = sinkhorn_normalize_ref(
+        res_mix, repeat=sinkhorn_repeat, eps=hc_sinkhorn_eps
+    )
+
+    layer_input = (residual * pre_mix).sum(-2).bfloat16()
+
+    return post_mix, res_mix, layer_input
+
+
+def mhc_post_ref(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    post_layer_mix: torch.Tensor,
+    comb_res_mix: torch.Tensor,
+) -> torch.Tensor:
+    """Pure-PyTorch reference for the mHC "post" step.
+
+    Adapted from the tilelang repo:
+    https://github.com/tile-ai/tilelang/blob/d135bd1cd2d2eee74fbb41dd0a0831a427194c86/examples/deepseek_mhc/example_mhc_post.py#L68
+
+    Computes ``x.unsqueeze(-2) * post_layer_mix + comb_res_mix.mT @ residual``;
+    ``x`` and ``residual`` are upcast to float32 and the result is cast to bfloat16.
+    """
+    term2 = torch.bmm(comb_res_mix.mT, residual.float())
+    return (x.float().unsqueeze(-2) * post_layer_mix + term2).bfloat16()
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda(),
+    reason="CUDA required",
+)
+@pytest.mark.parametrize("num_tokens", [1, 4, 8, 128])
+@pytest.mark.parametrize("hidden_size", [4096, 7168])
+@pytest.mark.parametrize("hc_mult", [4])
+def test_mhc_fused_post_pre(num_tokens, hidden_size, hc_mult):
+    """Check the fused post+pre op against the post and pre references in sequence."""
+    set_random_seed(0)
+
+    hc_sinkhorn_eps = hc_pre_eps = rms_eps = 1e-6
+    sinkhorn_repeat = 20
+    hc_post_alpha = 1.0
+
+    # Scope the default device to this test: torch.set_default_device() would
+    # persist after the test returns and leak into whatever test runs next.
+    with torch.device(DEVICE):
+        x = torch.randn((num_tokens, hidden_size), dtype=torch.bfloat16)
+        residual = torch.randn((num_tokens, hc_mult, hidden_size), dtype=torch.bfloat16)
+        post_layer_mix = torch.randn((num_tokens, hc_mult, 1), dtype=torch.float32)
+        comb_res_mix = torch.randn((num_tokens, hc_mult, hc_mult), dtype=torch.float32)
+
+        hc_mult2 = hc_mult * hc_mult
+        hc_mult3 = hc_mult * 2 + hc_mult2
+        fn = (
+            torch.randn((hc_mult3, hc_mult, hidden_size), dtype=torch.float)
+            * 1e-4
+            * (1 + torch.arange(hc_mult).mul(0.01).view(1, -1, 1))
+        ).flatten(1, 2)
+        hc_scale = torch.randn((3,), dtype=torch.float) * 0.1
+        hc_base = torch.randn((hc_mult3,), dtype=torch.float) * 0.1
+
+        # Reference: the post step produces the residual that feeds the pre step.
+        residual_ref = mhc_post_ref(x, residual, post_layer_mix, comb_res_mix)
+        post_mix_ref, res_mix_ref, layer_input_ref = mhc_pre_ref(
+            residual_ref,
+            fn,
+            hc_scale,
+            hc_base,
+            rms_eps,
+            hc_pre_eps,
+            hc_sinkhorn_eps,
+            hc_post_alpha,
+            sinkhorn_repeat,
+        )
+
+        residual_out, post_mix, res_mix, layer_in = torch.ops.vllm.mhc_fused_post_pre(
+            x,
+            residual,
+            post_layer_mix,
+            comb_res_mix,
+            fn,
+            hc_scale,
+            hc_base,
+            rms_eps,
+            hc_pre_eps,
+            hc_sinkhorn_eps,
+            hc_post_alpha,
+            sinkhorn_repeat,
+        )
+
+    torch.testing.assert_close(residual_out, residual_ref, atol=1e-2, rtol=1e-2)
+    torch.testing.assert_close(post_mix, post_mix_ref, atol=1e-2, rtol=1e-2)
+    torch.testing.assert_close(res_mix, res_mix_ref, atol=1e-2, rtol=1e-2)
+    torch.testing.assert_close(layer_in, layer_input_ref, atol=1e-2, rtol=1e-2)
diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py
index 8f42243387d2..ace4fb5f50ef 100644
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
+
 import vllm
 import vllm.config
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 
 from ..utils import create_new_process_for_each_test, multi_gpu_test
 
@@ -50,6 +53,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts
 
 
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 @create_new_process_for_each_test()
 def test_chatglm3_lora(chatglm3_lora_files):
     llm = vllm.LLM(
@@ -70,6 +76,9 @@ def test_chatglm3_lora(chatglm3_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 @multi_gpu_test(num_gpus=4)
 def test_chatglm3_lora_tp4(chatglm3_lora_files):
     llm = vllm.LLM(
diff --git a/tests/lora/test_default_mm_loras.py b/tests/lora/test_default_mm_loras.py
index c76d3c6e798e..673e8e85555a 100644
--- a/tests/lora/test_default_mm_loras.py
+++ b/tests/lora/test_default_mm_loras.py
@@ -11,6 +11,7 @@
 from huggingface_hub import snapshot_download
 
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 
 from ..conftest import AudioTestAssets, VllmRunner
 from ..utils import create_new_process_for_each_test
@@ -76,6 +77,9 @@ def test_active_default_mm_lora(
     )
 
 
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 @create_new_process_for_each_test()
 def test_inactive_default_mm_lora(
     vllm_runner: type[VllmRunner],
@@ -92,6 +96,9 @@ def test_inactive_default_mm_lora(
     )
 
 
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 @create_new_process_for_each_test()
 def test_default_mm_lora_succeeds_with_redundant_lora_request(
     vllm_runner: type[VllmRunner],
@@ -107,6 +114,9 @@ def test_default_mm_lora_succeeds_with_redundant_lora_request(
     )
 
 
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 @create_new_process_for_each_test()
 def test_default_mm_lora_fails_with_overridden_lora_request(
     vllm_runner: type[VllmRunner],
diff --git a/tests/lora/test_gptoss_tp.py b/tests/lora/test_gptoss_tp.py
index 68dd87233ac0..648660734655 100644
--- a/tests/lora/test_gptoss_tp.py
+++ b/tests/lora/test_gptoss_tp.py
@@ -129,6 +129,7 @@ def test_gpt_oss_lora_tp2(
             tensor_parallel_size=2,
             gpu_memory_utilization=0.8,
             fully_sharded_loras=fully_sharded_loras,
+            enable_expert_parallel=not fully_sharded_loras,
             compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
                 cudagraph_specialize_lora=False,
             ),
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 99c823238ddb..42f6ddc2f690 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -10,6 +10,7 @@
 from vllm import LLM
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
+from vllm.platforms import current_platform
 
 from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test
 
@@ -139,6 +140,9 @@ def test_llama_lora(llama32_lora_files, cudagraph_specialize_lora: bool):
     generate_and_test(llm, llama32_lora_files)
 
 
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 @multi_gpu_test(num_gpus=4)
 def test_llama_lora_tp4(llama32_lora_files):
     llm = vllm.LLM(
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index 3d6484a710a6..0090f9c569b0 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -68,6 +68,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts
 
 
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 def test_minicpmv_lora(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
diff --git a/tests/lora/test_olmoe_tp.py b/tests/lora/test_olmoe_tp.py
index 0b4770622050..bbc25cb6b8e4 100644
--- a/tests/lora/test_olmoe_tp.py
+++ b/tests/lora/test_olmoe_tp.py
@@ -11,6 +11,7 @@
 
 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 
 from ..utils import multi_gpu_test
 
@@ -110,6 +111,9 @@ def generate_and_test(
         )
 
 
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 def test_olmoe_lora(olmoe_lora_files, maybe_enable_lora_dual_stream):
     # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
     # Otherwise, the lora-test will fail due to CUDA OOM.
@@ -178,6 +182,9 @@ def test_olmoe_lora_mixed_random(
     assert outputs[0].outputs[0].text.strip().startswith(EXPECTED_LORA_OUTPUT[0])
 
 
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 @pytest.mark.parametrize("fully_sharded_loras", [False, True])
 @multi_gpu_test(num_gpus=2)
 def test_olmoe_lora_tp2(olmoe_lora_files, fully_sharded_loras):
diff --git a/tests/lora/test_qwen35_densemodel_lora.py b/tests/lora/test_qwen35_densemodel_lora.py
index a9ee5fac8cb9..e926bbcef27c 100644
--- a/tests/lora/test_qwen35_densemodel_lora.py
+++ b/tests/lora/test_qwen35_densemodel_lora.py
@@ -1,12 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
 from transformers import AutoTokenizer
 
 import vllm
 import vllm.config
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 
 from ..utils import create_new_process_for_each_test, multi_gpu_test
 
@@ -311,6 +313,9 @@ def _assert_qwen35_text_vl_and_mixed_lora(
     )
 
 
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 @create_new_process_for_each_test()
 def test_qwen35_text_lora(
     qwen35_text_lora_files, qwen35_vl_lora_files, maybe_enable_lora_dual_stream
diff --git a/tests/lora/test_llm_with_multi_loras.py b/tests/lora/test_qwen3_with_multi_loras.py
similarity index 100%
rename from tests/lora/test_llm_with_multi_loras.py
rename to tests/lora/test_qwen3_with_multi_loras.py
diff --git a/tests/lora/test_qwen3moe_tp.py b/tests/lora/test_qwen3moe_tp.py
index fcac4275cc40..9af142f6f388 100644
--- a/tests/lora/test_qwen3moe_tp.py
+++ b/tests/lora/test_qwen3moe_tp.py
@@ -5,6 +5,8 @@
 # NOTE To avoid overloading the CI pipeline, this test script will not
 # be triggered on CI and is primarily intended for local testing and verification.
 
+import pytest
+
 import vllm
 from vllm.lora.request import LoRARequest
 
@@ -82,15 +84,15 @@ def test_qwen3moe_lora(qwen3moe_lora_files):
 
 
 @multi_gpu_test(num_gpus=2)
-def test_qwen3moe_lora_tp2(qwen3moe_lora_files):
+@pytest.mark.parametrize("ep", [False, True])
+def test_qwen3moe_lora_tp2(ep, qwen3moe_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
         max_model_len=1024,
         enable_lora=True,
         max_loras=4,
-        enforce_eager=True,
         trust_remote_code=True,
-        enable_chunked_prefill=True,
+        enable_expert_parallel=ep,
         tensor_parallel_size=2,
     )
 
@@ -99,15 +101,15 @@ def test_qwen3moe_lora_tp2(qwen3moe_lora_files):
 
 
 @multi_gpu_test(num_gpus=4)
-def test_qwen3moe_lora_tp4(qwen3moe_lora_files):
+@pytest.mark.parametrize("ep", [False, True])
+def test_qwen3moe_lora_tp4(ep, qwen3moe_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
         max_model_len=1024,
         enable_lora=True,
         max_loras=4,
-        enforce_eager=True,
         trust_remote_code=True,
-        enable_chunked_prefill=True,
+        enable_expert_parallel=ep,
         tensor_parallel_size=4,
     )
 
diff --git a/tests/lora/test_qwenvl.py b/tests/lora/test_qwenvl.py
index 5f8fc26c16d3..a4a32278db01 100644
--- a/tests/lora/test_qwenvl.py
+++ b/tests/lora/test_qwenvl.py
@@ -2,12 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
+import pytest
 from packaging.version import Version
 from transformers import __version__ as TRANSFORMERS_VERSION
 
 import vllm
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 from vllm.sampling_params import BeamSearchParams
 
 
@@ -206,6 +208,9 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
         )
 
 
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 def test_qwen25vl_lora(qwen25vl_lora_files):
     """Test Qwen 2.5 VL model with LoRA"""
     config = TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files)
@@ -216,6 +221,9 @@ def test_qwen25vl_lora(qwen25vl_lora_files):
         tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
 
 
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
 def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
     config = TestConfig(
         model_path=QWEN25VL_MODEL_PATH,
diff --git a/tests/lora/test_whisper.py b/tests/lora/test_whisper.py
index 83b814d49f7f..ea8179a9c661 100644
--- a/tests/lora/test_whisper.py
+++ b/tests/lora/test_whisper.py
@@ -124,30 +124,3 @@ def test_whisper_multi_lora(whisper_lora_files):
         f"Expected same outputs for same adapter with different IDs. "
         f"Got: {outputs_lora1} vs {outputs_lora2}"
     )
-
-
-@create_new_process_for_each_test()
-def test_whisper_with_and_without_lora(whisper_lora_files):
-    """Test that Whisper produces different outputs with and without LoRA.
-
-    This test verifies that the LoRA adapter actually affects the model output.
-    """
-    llm = create_whisper_llm(enable_lora=True)
-
-    # Run with LoRA
-    outputs_with_lora = run_whisper_inference(
-        llm, lora_path=whisper_lora_files, lora_id=1
-    )
-
-    # Run without LoRA (base model only)
-    outputs_without_lora = run_whisper_inference(llm, lora_path=None)
-
-    # Both should produce valid outputs
-    assert len(outputs_with_lora[0]) > 0
-    assert len(outputs_without_lora[0]) > 0
-
-    print(f"Output with LoRA: {outputs_with_lora[0]}")
-    print(f"Output without LoRA: {outputs_without_lora[0]}")
-
-    # Note: Outputs may or may not differ depending on the adapter
-    # The main verification is that both configurations work
diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py
index 186e7e054ce1..93634760a576 100644
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -41,6 +41,7 @@ def run_test(
     tensor_parallel_size: int,
     distributed_executor_backend: str | None = None,
     enforce_eager: bool = True,
+    gpu_memory_utilization: float = 0.9,
 ) -> None:
     """Inference result should be the same between hf and vllm.
 
@@ -57,6 +58,7 @@ def run_test(
         distributed_executor_backend=distributed_executor_backend,
         limit_mm_per_prompt={"audio": 2},
         enforce_eager=enforce_eager,
+        gpu_memory_utilization=gpu_memory_utilization,
         disable_custom_all_reduce=True,
     ) as vllm_model:
         vllm_outputs_per_case = [
@@ -319,6 +321,7 @@ def test_models_distributed(
         tensor_parallel_size=2,
         distributed_executor_backend=distributed_executor_backend,
         enforce_eager=False,
+        gpu_memory_utilization=0.65,
     )
 
 
diff --git a/tests/models/multimodal/processing/test_gemma4.py b/tests/models/multimodal/processing/test_gemma4.py
index 8541701ae101..3ff056d1156f 100644
--- a/tests/models/multimodal/processing/test_gemma4.py
+++ b/tests/models/multimodal/processing/test_gemma4.py
@@ -4,9 +4,12 @@
 from collections.abc import Mapping
 
 import pytest
+import torch
 from PIL import Image as PILImage
 
+from vllm.model_executor.models.gemma4_mm import Gemma4ImagePixelInputs
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import MultiModalFieldConfig
 
 from ....conftest import ImageTestAssets
 from ...utils import build_model_context
@@ -15,6 +18,32 @@
 GEMMA4_MODEL_ID = "google/gemma-4-E2B-it"
 
 
+def test_gemma4_image_schema_accepts_variable_patch_counts():
+    """The pixel-input schema must accept images with different patch counts."""
+    patch_counts = (10080, 2520)
+    Gemma4ImagePixelInputs(
+        pixel_values=[torch.randn(n, 768) for n in patch_counts],
+        pixel_position_ids=[torch.zeros(n, 2, dtype=torch.long) for n in patch_counts],
+    )
+
+
+def test_gemma4_image_batching_keeps_variable_patch_counts_unstacked():
+    """Batched-field reduction must return a list when patch counts differ."""
+    patch_counts = (10080, 2520)
+    batched_field = MultiModalFieldConfig.batched("image").field
+    field_elems = batched_field.build_elems(
+        "image",
+        "pixel_values",
+        [torch.randn(n, 768) for n in patch_counts],
+    )
+
+    merged = batched_field.reduce_data(list(field_elems))
+
+    assert isinstance(merged, list)
+    expected_shapes = [torch.Size([n, 768]) for n in patch_counts]
+    assert [t.shape for t in merged] == expected_shapes
+
+
 @pytest.mark.parametrize(
     "image_width,image_height,max_soft_tokens",
     [
diff --git a/tests/models/multimodal/processing/test_molmo2.py b/tests/models/multimodal/processing/test_molmo2.py
new file mode 100644
index 000000000000..c12f70c2765a
--- /dev/null
+++ b/tests/models/multimodal/processing/test_molmo2.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from types import SimpleNamespace
+
+import torch
+
+from vllm.model_executor.models.molmo2 import build_flat_image_bool_length
+
+
+def _make_hf_config() -> SimpleNamespace:
+    """Minimal stand-in for the HF config: only the special image token ids."""
+    return SimpleNamespace(
+        image_patch_id=151938,
+        low_res_image_start_token_id=151940,
+        image_start_token_id=151936,
+        image_col_id=151939,
+        image_end_token_id=151937,
+    )
+
+
+def test_build_flat_image_bool_length_matches_molmoweb_processor_tokens():
+    """Column tokens enabled: pin token count, first token and column-token count."""
+    config = _make_hf_config()
+    grids = torch.tensor([[14, 14, 14, 23]], dtype=torch.long)
+
+    flat_tokens, token_counts = build_flat_image_bool_length(
+        grids,
+        config,
+        image_use_col_tokens=True,
+        use_single_crop_col_tokens=None,
+        use_single_crop_start_token=False,
+    )
+
+    assert token_counts.tolist() == [550]
+    assert len(flat_tokens) == 550
+    assert flat_tokens[0].item() == config.image_start_token_id
+    num_col_tokens = (flat_tokens == config.image_col_id).sum().item()
+    assert num_col_tokens == 28
+
+
+def test_build_flat_image_bool_length_respects_disabled_col_tokens():
+    """Column tokens disabled: no column ids, low-res start token comes first."""
+    config = _make_hf_config()
+    grids = torch.tensor([[2, 3, 5, 7]], dtype=torch.long)
+
+    flat_tokens, token_counts = build_flat_image_bool_length(
+        grids,
+        config,
+        image_use_col_tokens=False,
+        use_single_crop_col_tokens=False,
+        use_single_crop_start_token=True,
+    )
+
+    assert token_counts.tolist() == [45]
+    assert len(flat_tokens) == 45
+    assert flat_tokens[0].item() == config.low_res_image_start_token_id
+    num_col_tokens = (flat_tokens == config.image_col_id).sum().item()
+    assert num_col_tokens == 0
diff --git a/tests/models/multimodal/test_nano_nemotron_vl.py b/tests/models/multimodal/test_nano_nemotron_vl.py
index 6922af79c08e..aa93ee31168d 100644
--- a/tests/models/multimodal/test_nano_nemotron_vl.py
+++ b/tests/models/multimodal/test_nano_nemotron_vl.py
@@ -53,6 +53,19 @@ def load_weights(self, weights):
         self.loaded_weights = list(weights)
 
 
+class _FakeTensor:
+    """Sentinel stand-in for ``torch.Tensor`` in the load_weights tests.
+
+    load_weights calls ``.detach().clone()`` on buffered mm weights; both methods
+    return ``self`` so identity (and equality assertions) survive the "clone"."""
+
+    def detach(self):
+        return self
+
+    def clone(self):
+        return self
+
+
 def test_nano_nemotron_vl_skips_multimodal_weights_in_text_only_mode():
     model = object.__new__(NemotronH_Nano_VL_V2)
     language_model = _LanguageModel()
@@ -86,7 +99,7 @@ def test_nano_nemotron_vl_loads_vision_weights_without_sound_encoder():
     object.__setattr__(model, "sound_encoder", None)
 
     language_weight = object()
-    vision_weight = object()
+    vision_weight = _FakeTensor()
     model.load_weights(
         [
             ("language_model.layers.0.weight", language_weight),
diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py
index 6b34262d3e9e..25a63f6bd907 100644
--- a/tests/models/quantization/test_awq.py
+++ b/tests/models/quantization/test_awq.py
@@ -93,6 +93,48 @@ def run_awq_test(
         )
 
 
+@pytest.mark.parametrize(
+    ("model", "quantization", "dtype"),
+    [
+        pytest.param(
+            "mattbucci/gemma-4-26B-AWQ",
+            "awq",
+            "float16",
+            id="gemma4-moe-standard-awq-dot-suffix",
+        ),
+        pytest.param(
+            "cyankiwi/gemma-4-26B-A4B-it-AWQ-4bit",
+            "compressed-tensors",
+            "bfloat16",
+            id="gemma4-moe-compressed-tensors-underscore-suffix",
+        ),
+    ],
+)
+@torch.inference_mode()
+def test_awq_load(
+    vllm_runner: type[VllmRunner],
+    example_prompts: list[str],
+    model: str,
+    quantization: str,
+    dtype: str,
+) -> None:
+    """Regression test: AWQ weight loading must not KeyError.
+
+    Loads each checkpoint and greedily generates for the first two example
+    prompts; only the number of returned outputs is asserted.
+    """
+    prompts = example_prompts[:2]
+    with vllm_runner(
+        model,
+        quantization=quantization,
+        dtype=dtype,
+        max_model_len=128,
+        enforce_eager=True,
+    ) as vllm_model:
+        outputs = vllm_model.generate_greedy(prompts, max_tokens=32)
+    assert len(outputs) == 2
+
+
 @pytest.mark.parametrize(
     ("source_model", "quant_model"),
     [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
diff --git a/tests/models/registry.py b/tests/models/registry.py
index e50b0a8de4d9..3465ce9b9734 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -238,7 +238,7 @@ def check_available_online(
         "CohereLabs/c4ai-command-r7b-12-2024",
         trust_remote_code=True,
     ),
-    "CohereMoeForCausalLM": _HfExamplesInfo(
+    "Cohere2MoeForCausalLM": _HfExamplesInfo(
         "/host/engines/cohere-moe",
         trust_remote_code=True,
         is_available_online=False,
@@ -1406,6 +1406,13 @@ def check_available_online(
         max_num_seqs=32,
     ),
     # [Eagle]
+    "EagleCohereForCausalLM": _HfExamplesInfo(
+        "/host/engines/cohere-moe",
+        speculative_model="/host/engines/cohere-moe/eagle",
+        tokenizer="/host/engines/cohere-moe",
+        trust_remote_code=True,
+        is_available_online=False,
+    ),
     "EagleDeepSeekMTPModel": _HfExamplesInfo(
         "eagle618/deepseek-v3-random",
         speculative_model="eagle618/eagle-deepseek-v3-random",
diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py
index 120b2cde0f35..593075e9d491 100644
--- a/tests/quantization/test_modelopt.py
+++ b/tests/quantization/test_modelopt.py
@@ -239,3 +239,49 @@ def check_model(model):
         output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
         print(f"ModelOpt FP8_PB_WO output: {output}")
+
+
+def test_modelopt_nvfp4_config_dispatches_w4a4_method():
+    """``quant_method="NVFP4"`` (the W4A4 default) must keep routing to the
+    existing ``ModelOptNvFp4LinearMethod``."""
+    from vllm.model_executor.layers.quantization.modelopt import (
+        ModelOptNvFp4Config,
+        ModelOptNvFp4LinearMethod,
+    )
+
+    nvfp4_config = ModelOptNvFp4Config(
+        quant_method="NVFP4",
+        is_checkpoint_nvfp4_serialized=True,
+        kv_cache_quant_algo=None,
+        exclude_modules=[],
+    )
+    assert nvfp4_config.LinearMethodCls is ModelOptNvFp4LinearMethod
+    assert nvfp4_config.quant_method == "NVFP4"
+
+
+def test_modelopt_nvfp4_config_dispatches_w4a16_method():
+    """``quant_method="W4A16_NVFP4"`` must select the W4A16 linear method.
+
+    The config is expected to pick ``ModelOptNvFp4W4A16LinearMethod`` rather than
+    its W4A4 sibling, following the FP8 precedent where ``ModelOptFp8Config``
+    chooses among three FP8 LinearMethods based on ``quant_method``. If this
+    regressed, a W4A16 NVFP4 checkpoint would be silently loaded under the W4A4
+    method, which would try to register an ``input_scale`` runtime parameter and
+    (more importantly) call the cutlass W4A4 NVFP4 GEMM instead of FP4 Marlin.
+    """
+    from vllm.model_executor.layers.quantization.modelopt import (
+        ModelOptNvFp4Config,
+        ModelOptNvFp4LinearMethod,
+        ModelOptNvFp4W4A16LinearMethod,
+    )
+
+    w4a16_config = ModelOptNvFp4Config(
+        quant_method="W4A16_NVFP4",
+        is_checkpoint_nvfp4_serialized=True,
+        kv_cache_quant_algo=None,
+        exclude_modules=[],
+    )
+    assert w4a16_config.LinearMethodCls is ModelOptNvFp4W4A16LinearMethod
+    assert w4a16_config.LinearMethodCls is not ModelOptNvFp4LinearMethod
+    assert w4a16_config.quant_method == "W4A16_NVFP4"
diff --git a/tests/test_envs.py b/tests/test_envs.py
index 3f3add2ab764..b7cf41d34ed0 100644
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -28,6 +28,16 @@ def test_getattr_without_cache(monkeypatch: pytest.MonkeyPatch):
     assert not hasattr(envs.__getattr__, "cache_info")
 
 
+def test_nixl_side_channel_host_is_not_compile_factor(
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Setting VLLM_NIXL_SIDE_CHANNEL_HOST must not add it to the compile factors."""
+    env_name = "VLLM_NIXL_SIDE_CHANNEL_HOST"
+    monkeypatch.setenv(env_name, "10.0.0.15")
+
+    assert env_name not in envs.compile_factors()
+
+
 def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1")
     monkeypatch.setenv("VLLM_PORT", "1234")
diff --git a/tests/test_ray_env.py b/tests/test_ray_env.py
index c08f088acd22..945b2d80b691 100644
--- a/tests/test_ray_env.py
+++ b/tests/test_ray_env.py
@@ -6,6 +6,7 @@
 from unittest.mock import patch
 
 from vllm.ray.ray_env import get_env_vars_to_copy
+from vllm.v1.executor.ray_utils import WORKER_SPECIFIC_ENV_VARS
 
 # ---------------------------------------------------------------------------
 # Default prefix matching
@@ -106,6 +107,17 @@ def test_exclude_vars(self):
         result = get_env_vars_to_copy(exclude_vars={"CUDA_VISIBLE_DEVICES"})
         assert "CUDA_VISIBLE_DEVICES" not in result
 
+    @patch.dict(
+        os.environ,
+        dict.fromkeys(("VLLM_HOST_IP", "VLLM_NIXL_SIDE_CHANNEL_HOST"), "10.0.0.1"),
+        clear=False,
+    )
+    def test_worker_specific_host_vars_are_excluded(self):
+        """Excluding WORKER_SPECIFIC_ENV_VARS must drop both host variables."""
+        copied = get_env_vars_to_copy(exclude_vars=WORKER_SPECIFIC_ENV_VARS)
+        for var_name in ("VLLM_HOST_IP", "VLLM_NIXL_SIDE_CHANNEL_HOST"):
+            assert var_name not in copied
+
     @patch.dict(os.environ, {"LMCACHE_LOCAL_CPU": "True"}, clear=False)
     @patch(
         "vllm.ray.ray_env.RAY_NON_CARRY_OVER_ENV_VARS",
diff --git a/tests/test_regression.py b/tests/test_regression.py
index 385a9286c650..c38d974119aa 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -56,9 +56,10 @@ def test_gc():
 
 
 def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
-    # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
+    # model: https://www.modelscope.ai/models/qwen/Qwen1.5-0.5B-Chat
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_MODELSCOPE", "True")
+        m.setenv("MODELSCOPE_DOMAIN", "www.modelscope.ai")
         # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail
         # with 400 Client Error: Bad Request.
         m.setenv("HF_TOKEN", "")
diff --git a/tests/tool_parsers/test_glm47_moe_tool_parser.py b/tests/tool_parsers/test_glm47_moe_tool_parser.py
index 5e5501e4abff..51696c954788 100644
--- a/tests/tool_parsers/test_glm47_moe_tool_parser.py
+++ b/tests/tool_parsers/test_glm47_moe_tool_parser.py
@@ -91,6 +91,17 @@ def test_args_with_newlines(self, glm47_tool_parser, mock_request):
         assert r.tools_called
         assert json.loads(r.tool_calls[0].function.arguments) == {"city": "Beijing"}
 
+    def test_whitespace_preserved_in_arg_values(self, glm47_tool_parser, mock_request):
+        """Leading/trailing spaces inside <arg_value> must survive extraction."""
+        out = (
+            "<tool_call>get_weather<arg_key>city</arg_key>"
+            "<arg_value>  Beijing  </arg_value></tool_call>"
+        )
+        result = glm47_tool_parser.extract_tool_calls(out, request=mock_request)
+        assert result.tools_called
+        parsed_args = json.loads(result.tool_calls[0].function.arguments)
+        assert parsed_args == {"city": "  Beijing  "}
+
     def test_content_before(self, glm47_tool_parser, mock_request):
         out = "Checking.<tool_call>get_current_date</tool_call>"
         r = glm47_tool_parser.extract_tool_calls(out, request=mock_request)
diff --git a/tests/tool_parsers/test_glm4_moe_tool_parser.py b/tests/tool_parsers/test_glm4_moe_tool_parser.py
index 9f430b7814fc..c706d0a4c257 100644
--- a/tests/tool_parsers/test_glm4_moe_tool_parser.py
+++ b/tests/tool_parsers/test_glm4_moe_tool_parser.py
@@ -801,6 +801,35 @@ def test_extract_tool_calls_numeric_deserialization(glm4_moe_tool_parser, mock_r
     assert isinstance(args["enabled"], bool)
 
 
+def test_whitespace_preserved_in_arg_values(glm4_moe_tokenizer):
+    """Test that string arguments preserve leading and trailing whitespace."""
+    string_arg_schema = {
+        "type": "object",
+        "properties": {
+            "s": {"type": "string"},
+        },
+        "required": ["s"],
+    }
+    apply_diff_tool = ChatCompletionToolsParam(
+        function=FunctionDefinition(name="apply_diff", parameters=string_arg_schema),
+    )
+    tools = [apply_diff_tool]
+    parser = Glm4MoeModelToolParser(glm4_moe_tokenizer, tools=tools)
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
+
+    model_output = (
+        "<tool_call>apply_diff\n"
+        "<arg_key>s</arg_key>\n"
+        "<arg_value>    indented code    </arg_value>\n"
+        "</tool_call>"
+    )
+
+    result = parser.extract_tool_calls(model_output, request=request)
+    parsed_args = json.loads(result.tool_calls[0].function.arguments)
+
+    assert parsed_args["s"] == "    indented code    "
+
+
 def test_zero_argument_tool_call(glm4_moe_tool_parser, mock_request):
     """Regression: zero-argument tool call crash (PR #32321)."""
     model_output = """<tool_call>get_time
diff --git a/tests/utils.py b/tests/utils.py
index cff601374b04..68a9031a2c4e 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -20,9 +20,9 @@
 import warnings
 from collections.abc import Callable, Iterable, Sequence
 from contextlib import ExitStack, contextmanager
-from multiprocessing import Process
+from multiprocessing import Process, get_context
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, cast
 from unittest.mock import patch
 
 import anthropic
@@ -127,6 +127,21 @@ def _nvml():
 )
 
 
+def requires_spawn_multiprocessing() -> bool:
+    """Return True on ROCm or XPU, where test processes use spawn instead of fork."""
+    return current_platform.is_rocm() or current_platform.is_xpu()
+
+
+def _run_in_new_process_group(
+    child_process_fxn: Callable[[dict[str, str] | None, str, list[str]], None],
+    env_dict: dict[str, str] | None,
+    model: str,
+    vllm_serve_args: list[str],
+) -> None:
+    os.setsid()  # become session/group leader so os.killpg can reach the tree
+    child_process_fxn(env_dict, model, vllm_serve_args)
+
+
 class RemoteVLLMServer:
     """Base class for launching vLLM server subprocesses for testing.
 
@@ -738,8 +753,11 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
     def _start_server(
         self, model: str, vllm_serve_args: list[str], env_dict: dict[str, str] | None
     ) -> None:
-        self.proc: Process = Process(
-            target=self.child_process_fxn, args=(env_dict, model, vllm_serve_args)
+        method = "spawn" if requires_spawn_multiprocessing() else "fork"
+        ctx = get_context(method)
+        self.proc: Process = cast(Any, ctx).Process(
+            target=_run_in_new_process_group,
+            args=(self.child_process_fxn, env_dict, model, vllm_serve_args),
         )  # type: ignore[assignment]
         self.proc.start()
 
@@ -769,12 +787,40 @@ def __init__(
     def _poll(self) -> int | None:
         return self.proc.exitcode
 
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.proc.terminate()
-        self.proc.join(8)
+    def _terminate_process_tree(self) -> None:
+        pid = self.proc.pid
+        if pid is None:
+            return
+
+        pgid: int | None
+        try:
+            pgid = os.getpgid(pid)
+            # _run_in_new_process_group should make the child the group
+            # leader. Avoid signaling pytest's process group if startup failed
+            # before os.setsid() ran.
+            if pgid != pid:
+                pgid = None
+        except (ProcessLookupError, OSError):
+            pgid = None
+
+        with contextlib.suppress(ProcessLookupError, OSError):
+            self.proc.terminate()
+            print(f"[RemoteOpenAIServerCustom] Sent SIGTERM to process {pid}")
+
+        self.proc.join(15)
         if self.proc.is_alive():
-            # force kill if needed
-            self.proc.kill()
+            print(
+                f"[RemoteOpenAIServerCustom] Server {pid} did not respond "
+                "to SIGTERM, sending SIGKILL to process group"
+            )
+            if pgid is not None:
+                with contextlib.suppress(ProcessLookupError, OSError):
+                    os.killpg(pgid, signal.SIGKILL)
+            else:
+                self.proc.kill()
+            self.proc.join(10)
+
+        self._kill_process_group_survivors(pgid)
 
 
 def _test_completion(
@@ -1633,8 +1679,7 @@ def create_new_process_for_each_test(
         A decorator to run test functions in separate processes.
     """
     if method is None:
-        use_spawn = current_platform.is_rocm() or current_platform.is_xpu()
-        method = "spawn" if use_spawn else "fork"
+        method = "spawn" if requires_spawn_multiprocessing() else "fork"
 
     assert method in ["spawn", "fork"], "Method must be either 'spawn' or 'fork'"
 
diff --git a/tests/v1/attention/test_attention_backends_selection.py b/tests/v1/attention/test_attention_backends_selection.py
index 9d8d5d3ebb19..4242cc5ff2e2 100644
--- a/tests/v1/attention/test_attention_backends_selection.py
+++ b/tests/v1/attention/test_attention_backends_selection.py
@@ -13,6 +13,7 @@
 from vllm.v1.attention.backends.linear_attn import LinearAttentionBackend
 from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend
 from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionBackend
 
 
@@ -32,7 +33,7 @@
                 use_rms_norm=True,
             ),
             Mamba1AttentionBackend,
-            "mamba1",
+            MambaAttentionBackendEnum.MAMBA1,
         ),
         (
             MambaMixer2,
@@ -48,7 +49,7 @@
                 head_dim=32,
             ),
             Mamba2AttentionBackend,
-            "mamba2",
+            MambaAttentionBackendEnum.MAMBA2,
         ),
         (
             MiniMaxText01LinearAttention,
@@ -64,7 +65,7 @@
                 linear_layer_idx=0,
             ),
             LinearAttentionBackend,
-            "linear_attention",
+            MambaAttentionBackendEnum.LINEAR,
         ),
         (
             ShortConv,
@@ -74,7 +75,7 @@
                 layer_idx=0,
             ),
             ShortConvAttentionBackend,
-            "short_conv",
+            MambaAttentionBackendEnum.SHORT_CONV,
         ),
     ],
 )
@@ -97,10 +98,14 @@ def test_mamba_layers_get_attn_backend(
 @pytest.mark.parametrize(
     "layer_class,expected_backend,expected_mamba_type",
     [
-        (MambaMixer, Mamba1AttentionBackend, "mamba1"),
-        (MambaMixer2, Mamba2AttentionBackend, "mamba2"),
-        (MiniMaxText01LinearAttention, LinearAttentionBackend, "linear_attention"),
-        (ShortConv, ShortConvAttentionBackend, "short_conv"),
+        (MambaMixer, Mamba1AttentionBackend, MambaAttentionBackendEnum.MAMBA1),
+        (MambaMixer2, Mamba2AttentionBackend, MambaAttentionBackendEnum.MAMBA2),
+        (
+            MiniMaxText01LinearAttention,
+            LinearAttentionBackend,
+            MambaAttentionBackendEnum.LINEAR,
+        ),
+        (ShortConv, ShortConvAttentionBackend, MambaAttentionBackendEnum.SHORT_CONV),
     ],
 )
 def test_mamba_layers_have_unified_interface(
diff --git a/tests/v1/determinism/test_cutlass_batch_invariance.py b/tests/v1/determinism/test_cutlass_batch_invariance.py
new file mode 100644
index 000000000000..6df3f0a5f089
--- /dev/null
+++ b/tests/v1/determinism/test_cutlass_batch_invariance.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+import vllm.envs as envs
+from tests.utils import TestFP8Layer, requires_fp8
+from vllm.model_executor.kernels.linear.scaled_mm.cutlass import (
+    CutlassFP8ScaledMMLinearKernel,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    kFp8DynamicTokenSym,
+    kFp8StaticTensorSym,
+)
+from vllm.platforms import current_platform
+
+pytest.importorskip("torch.cuda")
+
+
+@pytest.fixture(autouse=True)
+def setup_cuda():
+    if not current_platform.is_cuda():
+        pytest.skip("CUTLASS FP8 kernels require CUDA.")
+    torch.set_default_device("cuda")  # NOTE(review): not reset on teardown
+
+
+@requires_fp8
+@pytest.mark.parametrize("weight_shape", [(1024, 2048), (4608, 4096)])
+@pytest.mark.parametrize("batch_size", [1, 16, 17, 32, 64, 65, 256, 257])
+@torch.inference_mode()
+def test_cutlass_fp8_batch_invariant_fixed_config(
+    weight_shape: tuple[int, int],
+    batch_size: int,
+    default_vllm_config,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """One row's output must be identical alone and at either end of a batch."""
+    monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1")
+    monkeypatch.setattr(envs, "VLLM_BATCH_INVARIANT", True)
+    torch.manual_seed(0)
+
+    layer = TestFP8Layer(
+        weight_shape=weight_shape,
+        activation_quant_key=kFp8DynamicTokenSym,
+        weight_quant_key=kFp8StaticTensorSym,
+        input_dtype=torch.bfloat16,
+        out_dtype=torch.bfloat16,
+        device=torch.device("cuda"),
+        force_kernel=CutlassFP8ScaledMMLinearKernel,
+    )
+    assert isinstance(layer.kernel, CutlassFP8ScaledMMLinearKernel)
+
+    _, in_features = weight_shape
+    needle = torch.randn((1, in_features), device="cuda", dtype=torch.bfloat16)
+    baseline = layer(needle)[0]
+
+    # batch_size - 1 filler rows; an empty (0-row) tensor when batch_size is 1.
+    filler_shape = (max(batch_size - 1, 0), in_features)
+    filler = torch.randn(filler_shape, device="cuda", dtype=torch.bfloat16)
+
+    # Place the needle first, then last, and read back only its output row.
+    front_output = layer(torch.cat([needle, filler], dim=0))[0]
+    back_output = layer(torch.cat([filler, needle], dim=0))[-1]
+
+    # rtol=0 and atol=0 demand exact equality with the single-row result.
+    for batched_output in (front_output, back_output):
+        torch.testing.assert_close(batched_output, baseline, rtol=0, atol=0)
diff --git a/tests/v1/ec_connector/integration/README.md b/tests/v1/ec_connector/integration/README.md
index 2e122680ca40..a7dab5d5d9d1 100644
--- a/tests/v1/ec_connector/integration/README.md
+++ b/tests/v1/ec_connector/integration/README.md
@@ -13,7 +13,7 @@ The test ensures that disaggregated encoding produces **identical** outputs to t
 
 Note that currently PD disaggregation set up may give slightly different results from a single instance. Therefore, we need the result from 1P+1D as the baseline for 1E+1P+1D
 
-Please refer to [Disaggregated Encoder Feature](../../../docs/features/disagg_encoder.md) for the detailed explanation for the EPD features.
+Please refer to [Disaggregated Encoder Feature](../../../../docs/features/disagg_encoder.md) for a detailed explanation of the EPD features.
 
 ## Files
 
diff --git a/tests/v1/engine/test_abort_final_step.py b/tests/v1/engine/test_abort_final_step.py
index 81a120d151d6..8f1e8029955f 100644
--- a/tests/v1/engine/test_abort_final_step.py
+++ b/tests/v1/engine/test_abort_final_step.py
@@ -66,7 +66,7 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         role: KVConnectorRole,
-        kv_cache_config: KVCacheConfig | None = None,
+        kv_cache_config: KVCacheConfig,
     ):
         super().__init__(vllm_config, role, kv_cache_config)
         # Get the status file path from extra config
diff --git a/tests/v1/engine/test_core_engine_actor_manager.py b/tests/v1/engine/test_core_engine_actor_manager.py
new file mode 100644
index 000000000000..195ddda05a6a
--- /dev/null
+++ b/tests/v1/engine/test_core_engine_actor_manager.py
@@ -0,0 +1,136 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import uuid
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any
+
+import pytest
+import ray
+
+from vllm.v1.engine.core import EngineCoreActorMixin
+from vllm.v1.engine.utils import CoreEngineActorManager, EngineZmqAddresses
+
+
+class _StubEngineCoreActor(EngineCoreActorMixin):
+    """Model-free actor that still runs the production mixin's __init__."""
+
+    def __init__(
+        self,
+        vllm_config: Any,
+        local_client: bool,
+        addresses: EngineZmqAddresses,
+        executor_class: type[Any],
+        log_stats: bool,
+        dp_rank: int = 0,
+        local_dp_rank: int = 0,
+    ):
+        # local_client, executor_class and log_stats are accepted but unused.
+        super().__init__(vllm_config, addresses, dp_rank, local_dp_rank)
+
+    def _set_visible_devices(self, vllm_config: Any, local_dp_rank: int) -> None:
+        """Intentionally does nothing in the stub."""
+
+    def wait_for_init(self) -> None:
+        """Returns immediately."""
+
+    def run(self) -> None:
+        """Returns immediately."""
+
+    def get_nixl_side_channel_host(self) -> str | None:
+        return os.environ.get("VLLM_NIXL_SIDE_CHANNEL_HOST")
+
+
+class _DummyExecutor:
+    """Empty placeholder passed as executor_class."""
+
+
+def _make_vllm_config() -> SimpleNamespace:
+    """Minimal config stand-in: data-parallel size 1, non-MoE, no KV transfer."""
+    parallel = SimpleNamespace(
+        data_parallel_size=1,
+        data_parallel_size_local=1,
+        enable_elastic_ep=False,
+        world_size=1,
+    )
+    model = SimpleNamespace(is_moe=False)
+    sections = {"parallel_config": parallel, "model_config": model}
+    return SimpleNamespace(**sections, kv_transfer_config=None)
+
+
+def _make_addresses() -> EngineZmqAddresses:
+    """Return fixed loopback ZMQ input and output addresses."""
+    inputs = ["tcp://127.0.0.1:12345"]
+    outputs = ["tcp://127.0.0.1:12346"]
+    return EngineZmqAddresses(inputs=inputs, outputs=outputs)
+
+
+def _make_cpu_placement_group():
+    """Create a two-bundle, CPU-only PACK placement group and wait for it."""
+    bundles = [{"CPU": 0.001}, {"CPU": 1.0}]
+    pg = ray.util.placement_group(bundles, strategy="PACK")
+    # Block until Ray reports the group ready.
+    ray.get(pg.ready())
+    return pg
+
+
+@pytest.fixture
+def ray_context():
+    """Start a 2-CPU local Ray for the test unless one is already running."""
+    started_ray = not ray.is_initialized()
+    if started_ray:
+        # parents[3] of tests/v1/engine/<this file> is the repository root.
+        project_root = str(Path(__file__).resolve().parents[3])
+        ray.init(
+            num_cpus=2,
+            runtime_env={"env_vars": {"PYTHONPATH": project_root}},
+            log_to_driver=False,
+        )
+
+    yield
+    if started_ray:  # only shut down a Ray instance this fixture started
+        ray.shutdown()
+
+
+@pytest.mark.usefixtures("ray_context")
+def test_driver_nixl_side_channel_host_does_not_leak_to_engine_core_actor(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """The actor must see the node IP, not the driver's side-channel host."""
+    driver_marker = f"driver-only-nixl-host-{uuid.uuid4()}"
+    created_placement_groups: list[Any] = []
+
+    def create_dp_placement_groups(vllm_config: Any):
+        created_placement_groups.append(_make_cpu_placement_group())
+        return [created_placement_groups[-1]], [0]
+
+    monkeypatch.setenv("VLLM_NIXL_SIDE_CHANNEL_HOST", driver_marker)
+    monkeypatch.setattr("vllm.v1.engine.core.EngineCoreActor", _StubEngineCoreActor)
+    monkeypatch.setattr(
+        CoreEngineActorManager,
+        "create_dp_placement_groups",
+        staticmethod(create_dp_placement_groups),
+    )
+
+    manager: CoreEngineActorManager | None = None
+    try:
+        manager = CoreEngineActorManager(
+            vllm_config=_make_vllm_config(),
+            addresses=_make_addresses(),
+            executor_class=_DummyExecutor,
+            log_stats=False,
+        )
+        actor = manager.local_engine_actors[0]
+        actor_host = ray.get(actor.get_nixl_side_channel_host.remote())
+
+        assert actor_host != driver_marker
+        assert actor_host == ray.util.get_node_ip_address()
+    finally:
+        if manager is None:
+            # Construction failed: release the placement groups ourselves.
+            for pg in created_placement_groups:
+                ray.util.remove_placement_group(pg)
+        else:
+            manager.shutdown()
diff --git a/tests/v1/engine/test_logprobs_processor.py b/tests/v1/engine/test_logprobs_processor.py
new file mode 100644
index 000000000000..edb8cef518ca
--- /dev/null
+++ b/tests/v1/engine/test_logprobs_processor.py
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for LogprobsProcessor.
+
+These tests exercise the truncation invariant that the MRV2 sampler relies
+on: when the sampler returns a row wider than a request's own
+`num_logprobs + 1` (because another request in the batch needed a wider
+row), the trailing positions are populated with sentinel values
+(`token_id=0`, `logprob=-inf`). LogprobsProcessor must read only the first
+`num_logprobs + 1` entries so those sentinels never reach the user.
+"""
+
+import numpy as np
+
+from vllm.logprobs import create_sample_logprobs
+from vllm.v1.engine.logprobs import LogprobsProcessor
+from vllm.v1.outputs import LogprobsLists
+
+
+def _make_processor(num_logprobs: int) -> LogprobsProcessor:
+    return LogprobsProcessor(
+        tokenizer=None,
+        logprobs=create_sample_logprobs(flat_logprobs=False),
+        prompt_logprobs=None,  # prompt-side fields are left unset (None)
+        cumulative_logprob=0.0,  # initial value; asserted on after an update
+        num_logprobs=num_logprobs,  # the only value that varies per test
+        num_prompt_logprobs=None,
+    )
+
+
+def test_drops_trailing_sentinel_columns():
+    """With num_logprobs=3, a width-6 row yields only its first four entries;
+    the two trailing (token_id=0, -inf) sentinel columns are never surfaced."""
+    sampled, requested = 42, [100, 200, 300]
+    processor = _make_processor(num_logprobs=len(requested))
+
+    # Row layout: sampled token, the requested tokens, then two sentinels.
+    # The finite logprobs are exactly representable in float32.
+    token_ids = np.array([[sampled, *requested, 0, 0]], dtype=np.int32)
+    logprobs = np.array([[-0.5, -1.0, -2.0, -3.0, -np.inf, -np.inf]], dtype=np.float32)
+    ranks = np.array([1], dtype=np.int32)
+    processor._update_sample_logprobs(LogprobsLists(token_ids, logprobs, ranks))
+
+    assert len(processor.logprobs) == 1
+    pos = processor.logprobs[0]
+    # Only the sampled token and the three requested tokens remain.
+    assert set(pos.keys()) == {sampled, *requested}
+    assert 0 not in pos
+    for entry in pos.values():
+        assert np.isfinite(entry.logprob)
+    # Only the sampled token's logprob feeds cumulative_logprob.
+    assert processor.cumulative_logprob == -0.5
+
+
+def test_accepts_exactly_sized_row():
+    """A row of width num_logprobs + 1 is consumed whole, nothing dropped."""
+    row_tokens = [7, 11, 13]
+    processor = _make_processor(num_logprobs=len(row_tokens) - 1)
+
+    token_ids = np.array([row_tokens], dtype=np.int32)
+    logprobs = np.array([[-0.5, -1.5, -2.5]], dtype=np.float32)
+    ranks = np.array([1], dtype=np.int32)
+    lists = LogprobsLists(token_ids, logprobs, ranks)
+    processor._update_sample_logprobs(lists)
+
+    assert set(processor.logprobs[0].keys()) == set(row_tokens)
diff --git a/tests/v1/kv_connector/nixl_integration/nixl_side_channel_probe.py b/tests/v1/kv_connector/nixl_integration/nixl_side_channel_probe.py
new file mode 100644
index 000000000000..24ecbd795e41
--- /dev/null
+++ b/tests/v1/kv_connector/nixl_integration/nixl_side_channel_probe.py
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Probe a NIXL side-channel socket for handshake metadata readiness."""
+
+import argparse
+import ipaddress
+
+import msgspec
+import zmq
+
+GET_META_MSG = b"get_meta_msg"
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", required=True)
+    parser.add_argument("--port", required=True, type=int)
+    parser.add_argument("--rank", default=0, type=int)  # sent with the request
+    parser.add_argument("--timeout-ms", default=1000, type=int)  # recv timeout
+    return parser.parse_args()
+
+
+def make_zmq_path(host: str, port: int) -> str:
+    """Return a tcp:// endpoint, bracketing the host if it is an IPv6 literal."""
+    try:
+        is_v6 = ipaddress.ip_address(host).version == 6
+    except ValueError:  # a hostname rather than an IP literal
+        is_v6 = False
+    return f"tcp://[{host}]:{port}" if is_v6 else f"tcp://{host}:{port}"
+
+
+def main() -> None:
+    """Send one metadata request and wait for any reply.
+
+    If no reply arrives within --timeout-ms, recv() raises and the process
+    exits non-zero; the calling script treats exit status 0 as "ready".
+    """
+    args = parse_args()
+    with zmq.Context() as ctx, ctx.socket(zmq.REQ) as sock:
+        sock.setsockopt(zmq.LINGER, 0)  # never block teardown on unsent data
+        sock.setsockopt(zmq.RCVTIMEO, args.timeout_ms)
+        sock.connect(make_zmq_path(args.host, args.port))
+        sock.send(msgspec.msgpack.encode((GET_META_MSG, args.rank)))
+        sock.recv()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh b/tests/v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
index a82dae2d5109..fdec5f40cbb0 100755
--- a/tests/v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
@@ -51,6 +51,8 @@ PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
 DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
 GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.7}
 BLOCK_SIZE=${BLOCK_SIZE:-16}
+SERVER_HOST="${SERVER_HOST:-127.0.0.1}"
+NIXL_SIDE_CHANNEL_HOST="${NIXL_SIDE_CHANNEL_HOST:-$SERVER_HOST}"
 
 # Resolve the repository root from the script location instead of `.git`.
 SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)"
@@ -96,18 +98,57 @@ trap 'echo " Interrupted."; exit 130' INT TERM
 
 wait_for_server() {
   local port=$1
-  local deadline=600
+  local server_pid=$2
+  local server_name=$3
+  local endpoint=${4:-/v1/completions}
+  local deadline=${5:-600}
   local elapsed=0
-  echo "Waiting for server on port ${port}..."
+  echo "Waiting for ${server_name} on port ${port}..."
   while [ $elapsed -lt $deadline ]; do
-    if curl -s "localhost:${port}/v1/completions" > /dev/null 2>&1; then
-      echo "Server on port ${port} ready"
+    if ! ps -p "$server_pid" > /dev/null 2>&1; then
+      local status=0
+      wait "$server_pid" || status=$?
+      echo "FAIL: ${server_name} process ${server_pid} exited with status ${status} before port ${port} became ready"
+      exit 1
+    fi
+    if curl -s "http://${SERVER_HOST}:${port}${endpoint}" > /dev/null 2>&1; then
+      echo "${server_name} on port ${port} ready"
       return 0
     fi
     sleep 2
     elapsed=$((elapsed + 2))
   done
-  echo "FAIL: Server on port ${port} did not start within ${deadline}s"
+  echo "FAIL: ${server_name} on port ${port} did not start within ${deadline}s"
+  exit 1
+}
+
+wait_for_nixl_side_channel() {
+  local host=$1
+  local port=$2
+  local server_pid=$3
+  local server_name=$4
+  local deadline=120  # seconds
+  local elapsed=0
+  echo "Waiting for ${server_name} NIXL side channel on ${host}:${port}..."
+  while [ $elapsed -lt $deadline ]; do
+    if ! ps -p "$server_pid" > /dev/null 2>&1; then  # server already exited
+      local status=0
+      wait "$server_pid" || status=$?  # collect its exit status for the log
+      echo "FAIL: ${server_name} server process ${server_pid} exited with status ${status} before NIXL side channel ${host}:${port} became ready"
+      exit 1
+    fi
+    if python3 "${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/nixl_side_channel_probe.py" \
+      --host "$host" \
+      --port "$port" \
+      --timeout-ms 1000 > /dev/null 2>&1
+    then
+      echo "${server_name} NIXL side channel on ${host}:${port} ready"
+      return 0
+    fi
+    sleep 2
+    elapsed=$((elapsed + 2))  # counts sleep time only, not probe time
+  done
+  echo "FAIL: ${server_name} NIXL side channel ${host}:${port} did not start within ${deadline}s"
   exit 1
 }
 
@@ -158,6 +199,8 @@ run_test_for_device() {
   echo "KV buffer device:   ${kv_device}"
   echo "Attention backend:  ${ATTENTION_BACKEND}"
   echo "GPU platform:       ${GPU_PLATFORM}"
+  echo "Server host:        ${SERVER_HOST}"
+  echo "NIXL side channel:  ${NIXL_SIDE_CHANNEL_HOST}"
   echo "GPUs available:     ${ALL_GPUS[*]}"
   echo "================================================================"
 
@@ -167,7 +210,8 @@ run_test_for_device() {
   local DECODE_PORTS=()
   local GPU_IDX=0
 
-  # Start prefill instances
+  # Start prefill instances and wait for each one before allocating the next
+  # server. This keeps failures from leaving extra model servers spinning.
   for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
     local GPU_ID="${ALL_GPUS[$GPU_IDX]}"
     GPU_IDX=$((GPU_IDX + 1))
@@ -184,6 +228,7 @@ run_test_for_device() {
     ${GPU_DEVICE_VAR}=$GPU_ID \
     VLLM_KV_CACHE_LAYOUT='HND' \
     UCX_NET_DEVICES=all \
+    VLLM_NIXL_SIDE_CHANNEL_HOST=$NIXL_SIDE_CHANNEL_HOST \
     VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT \
     vllm serve $MODEL_NAME \
       --port $PORT \
@@ -195,12 +240,15 @@ run_test_for_device() {
       --kv-transfer-config "$kv_config" \
       --speculative-config "$PREFILL_SPEC_CONFIG" \
       --attention-backend $ATTENTION_BACKEND &
+    local SERVER_PID=$!
 
-    PREFILL_HOSTS+=("localhost")
+    PREFILL_HOSTS+=("$SERVER_HOST")
     PREFILL_PORTS+=("$PORT")
+    wait_for_server "$PORT" "$SERVER_PID" "prefill"
+    wait_for_nixl_side_channel "$NIXL_SIDE_CHANNEL_HOST" "$SIDE_CHANNEL_PORT" "$SERVER_PID" "prefill"
   done
 
-  # Start decode instances
+  # Start decode instances after prefill is ready.
   for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
     local GPU_ID="${ALL_GPUS[$GPU_IDX]}"
     GPU_IDX=$((GPU_IDX + 1))
@@ -217,6 +265,7 @@ run_test_for_device() {
     ${GPU_DEVICE_VAR}=$GPU_ID \
     VLLM_KV_CACHE_LAYOUT='HND' \
     UCX_NET_DEVICES=all \
+    VLLM_NIXL_SIDE_CHANNEL_HOST=$NIXL_SIDE_CHANNEL_HOST \
     VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT \
     vllm serve $MODEL_NAME \
       --port $PORT \
@@ -228,17 +277,12 @@ run_test_for_device() {
       --kv-transfer-config "$kv_config" \
       --speculative-config "$DECODE_SPEC_CONFIG" \
       --attention-backend $ATTENTION_BACKEND &
+    local SERVER_PID=$!
 
-    DECODE_HOSTS+=("localhost")
+    DECODE_HOSTS+=("$SERVER_HOST")
     DECODE_PORTS+=("$PORT")
-  done
-
-  # Wait for servers
-  for PORT in "${PREFILL_PORTS[@]}"; do
-    wait_for_server "$PORT"
-  done
-  for PORT in "${DECODE_PORTS[@]}"; do
-    wait_for_server "$PORT"
+    wait_for_server "$PORT" "$SERVER_PID" "decode"
+    wait_for_nixl_side_channel "$NIXL_SIDE_CHANNEL_HOST" "$SIDE_CHANNEL_PORT" "$SERVER_PID" "decode"
   done
 
   # Start proxy
@@ -250,12 +294,14 @@ run_test_for_device() {
     --prefiller-ports ${PREFILL_PORTS[*]} \
     --decoder-hosts ${DECODE_HOSTS[*]} \
     --decoder-ports ${DECODE_PORTS[*]} &
+  local PROXY_PID=$!
 
-  sleep 5
+  wait_for_server "$PROXY_PORT" "$PROXY_PID" "proxy" "/healthcheck" 60
 
   # Run test
   echo "Running spec decode acceptance test (kv_buffer_device=${kv_device}, backend=${ATTENTION_BACKEND})..."
   DECODE_PORT=${DECODE_PORTS[0]} \
+  SERVER_HOST=$SERVER_HOST \
   TEST_MODEL=$MODEL_NAME \
   python3 -m pytest -s -x "${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/test_spec_decode_acceptance.py"
 
diff --git a/tests/v1/kv_connector/nixl_integration/test_spec_decode_acceptance.py b/tests/v1/kv_connector/nixl_integration/test_spec_decode_acceptance.py
index b747f953a220..67cb753a9887 100644
--- a/tests/v1/kv_connector/nixl_integration/test_spec_decode_acceptance.py
+++ b/tests/v1/kv_connector/nixl_integration/test_spec_decode_acceptance.py
@@ -27,7 +27,8 @@
 
 from vllm.benchmarks.datasets import get_samples
 
-PROXY_BASE_URL = "http://localhost:8192/v1"
+SERVER_HOST = os.environ.get("SERVER_HOST", "127.0.0.1")
+PROXY_BASE_URL = f"http://{SERVER_HOST}:8192/v1"
 DECODE_PORT = os.environ.get("DECODE_PORT", "8200")
 MODEL_NAME = os.environ.get("TEST_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
 
@@ -101,7 +102,7 @@ def _get_mt_bench_prompts() -> list[str]:
 
 def _fetch_metric(metric_name: str) -> float:
     """Fetch a single counter metric from the decode server's /metrics."""
-    url = f"http://localhost:{DECODE_PORT}/metrics"
+    url = f"http://{SERVER_HOST}:{DECODE_PORT}/metrics"
     body = urlopen(url).read().decode()
     for line in body.split("\n"):
         if line.startswith(metric_name + "{") or line.startswith(metric_name + " "):
@@ -111,7 +112,7 @@ def _fetch_metric(metric_name: str) -> float:
 
 def _fetch_per_position_acceptance() -> dict[int, float]:
     """Fetch per-position acceptance counts from decode /metrics."""
-    url = f"http://localhost:{DECODE_PORT}/metrics"
+    url = f"http://{SERVER_HOST}:{DECODE_PORT}/metrics"
     body = urlopen(url).read().decode()
     counts: dict[int, float] = {}
     for line in body.split("\n"):
diff --git a/tests/v1/kv_connector/unit/test_backwards_compatibility.py b/tests/v1/kv_connector/unit/test_backwards_compatibility.py
deleted file mode 100644
index da6a5aadbc6d..000000000000
--- a/tests/v1/kv_connector/unit/test_backwards_compatibility.py
+++ /dev/null
@@ -1,275 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Unit tests for backwards compatibility with external KV connector implementations.
-
-This test ensures that external connectors (loaded via kv_connector_module_path)
-implemented with the old signature continue to work:
-- Old signature: __init__(self, vllm_config, role)
-- New signature: __init__(self, vllm_config, role, kv_cache_config)
-"""
-
-from typing import TYPE_CHECKING
-from unittest.mock import patch
-
-import pytest
-
-from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
-from vllm.distributed.kv_transfer.kv_connector.v1 import (
-    KVConnectorBase_V1,
-    KVConnectorRole,
-)
-from vllm.v1.attention.backend import AttentionMetadata
-from vllm.v1.core.sched.output import SchedulerOutput
-
-from .utils import create_scheduler, create_vllm_config
-
-if TYPE_CHECKING:
-    from vllm.config import VllmConfig
-    from vllm.forward_context import ForwardContext
-    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
-    from vllm.v1.kv_cache_interface import KVCacheConfig
-    from vllm.v1.request import Request
-
-
-class OldStyleTestConnector(KVConnectorBase_V1):
-    """
-    Test connector using the old signature with 2 required arguments.
-    This simulates external connectors that haven't been updated yet.
-    """
-
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        # Old-style call to super().__init__ with only 2 arguments
-        super().__init__(vllm_config=vllm_config, role=role)
-
-    def get_num_new_matched_tokens(
-        self, request: "Request", num_computed_tokens: int
-    ) -> tuple[int | None, bool]:
-        return 0, False
-
-    def update_state_after_alloc(
-        self,
-        request: "Request",
-        blocks: "KVCacheBlocks",
-        num_external_tokens: int,
-    ):
-        pass
-
-    def build_connector_meta(self, scheduler_output: SchedulerOutput):
-        return None
-
-    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
-        pass
-
-    def wait_for_layer_load(self, layer_name: str) -> None:
-        pass
-
-    def save_kv_layer(
-        self,
-        layer_name: str,
-        kv_layer,
-        attn_metadata: AttentionMetadata,
-        **kwargs,
-    ) -> None:
-        pass
-
-    def wait_for_save(self):
-        pass
-
-
-class NewStyleTestConnector(KVConnectorBase_V1):
-    """
-    Test connector using the new signature with 3 required arguments.
-    """
-
-    def __init__(
-        self,
-        vllm_config: "VllmConfig",
-        role: KVConnectorRole,
-        kv_cache_config: "KVCacheConfig",
-    ):
-        # New-style call to super().__init__ with all 3 arguments
-        super().__init__(
-            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
-        )
-
-    def get_num_new_matched_tokens(
-        self, request: "Request", num_computed_tokens: int
-    ) -> tuple[int | None, bool]:
-        return 0, False
-
-    def update_state_after_alloc(
-        self,
-        request: "Request",
-        blocks: "KVCacheBlocks",
-        num_external_tokens: int,
-    ):
-        pass
-
-    def build_connector_meta(self, scheduler_output: SchedulerOutput):
-        return None
-
-    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
-        pass
-
-    def wait_for_layer_load(self, layer_name: str) -> None:
-        pass
-
-    def save_kv_layer(
-        self,
-        layer_name: str,
-        kv_layer,
-        attn_metadata: AttentionMetadata,
-        **kwargs,
-    ) -> None:
-        pass
-
-    def wait_for_save(self):
-        pass
-
-
-@pytest.mark.parametrize("role", [KVConnectorRole.SCHEDULER, KVConnectorRole.WORKER])
-def test_external_old_signature_factory_instantiation(role):
-    """
-    Test that external connectors with old signature (2 required args) loaded
-    via kv_connector_module_path are correctly instantiated with backwards
-    compatibility support.
-    """
-    vllm_config = create_vllm_config()
-    vllm_config.kv_transfer_config.kv_connector = "OldStyleTestConnector"
-    vllm_config.kv_transfer_config.kv_connector_module_path = (
-        "tests.v1.kv_connector.unit.test_backwards_compatibility"
-    )
-
-    scheduler = create_scheduler(vllm_config)
-    kv_cache_config = scheduler.kv_cache_config
-
-    connector = KVConnectorFactory.create_connector(vllm_config, role, kv_cache_config)
-
-    assert connector is not None
-    assert isinstance(connector, OldStyleTestConnector)
-    assert connector.role == role
-    assert connector._kv_cache_config is None
-
-
-@pytest.mark.parametrize("role", [KVConnectorRole.SCHEDULER, KVConnectorRole.WORKER])
-def test_external_new_signature_factory_instantiation(role):
-    """
-    Test that external connectors with new signature (3 required args) loaded
-    via kv_connector_module_path are correctly instantiated.
-    """
-    vllm_config = create_vllm_config()
-    vllm_config.kv_transfer_config.kv_connector = "NewStyleTestConnector"
-    vllm_config.kv_transfer_config.kv_connector_module_path = (
-        "tests.v1.kv_connector.unit.test_backwards_compatibility"
-    )
-
-    scheduler = create_scheduler(vllm_config)
-    kv_cache_config = scheduler.kv_cache_config
-
-    connector = KVConnectorFactory.create_connector(vllm_config, role, kv_cache_config)
-
-    assert connector is not None
-    assert isinstance(connector, NewStyleTestConnector)
-    assert connector.role == role
-    assert connector._kv_cache_config is not None
-    assert connector._kv_cache_config == kv_cache_config
-
-
-@pytest.mark.parametrize("role", [KVConnectorRole.SCHEDULER, KVConnectorRole.WORKER])
-def test_old_signature_super_init(role):
-    """
-    Test that old-style connectors can call super().__init__() without
-    kv_cache_config parameter.
-    """
-    vllm_config = create_vllm_config()
-
-    connector = OldStyleTestConnector(vllm_config, role)
-
-    assert connector is not None
-    assert connector.role == role
-    assert connector._kv_cache_config is None
-
-
-def test_old_signature_super_init_with_kwargs():
-    """
-    Test that old-style connectors can call super().__init__() with keyword
-    arguments in different orders.
-    """
-    vllm_config = create_vllm_config()
-
-    # Test with vllm_config= and role= kwargs
-    connector1 = OldStyleTestConnector(
-        vllm_config=vllm_config, role=KVConnectorRole.SCHEDULER
-    )
-    assert connector1 is not None
-    assert connector1._kv_cache_config is None
-
-    # Test with role= and vllm_config= in reversed order
-    connector2 = OldStyleTestConnector(
-        role=KVConnectorRole.WORKER, vllm_config=vllm_config
-    )
-    assert connector2 is not None
-    assert connector2._kv_cache_config is None
-
-
-def test_internal_connector_uses_new_signature():
-    """
-    Test that internal connectors (registered in factory) always use the new
-    signature and get kv_cache_config.
-    """
-    from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import (
-        ExampleConnector,
-    )
-
-    vllm_config = create_vllm_config()
-    vllm_config.kv_transfer_config.kv_connector = "ExampleConnector"
-
-    scheduler = create_scheduler(vllm_config)
-    kv_cache_config = scheduler.kv_cache_config
-
-    connector = KVConnectorFactory.create_connector(
-        vllm_config, KVConnectorRole.SCHEDULER, kv_cache_config
-    )
-
-    assert connector is not None
-    assert isinstance(connector, ExampleConnector)
-    assert connector._kv_cache_config is not None
-    assert connector._kv_cache_config == kv_cache_config
-
-
-def test_signature_detection_with_mocking():
-    """
-    Test that the factory correctly applies compat_sig flag returned from
-    _get_connector_class_with_compat.
-    """
-    vllm_config = create_vllm_config()
-    scheduler = create_scheduler(vllm_config)
-    kv_cache_config = scheduler.kv_cache_config
-
-    # Mock _get_connector_class_with_compat to return old-style connector
-    with patch.object(
-        KVConnectorFactory,
-        "_get_connector_class_with_compat",
-        return_value=(OldStyleTestConnector, True),
-    ):
-        old_connector = KVConnectorFactory.create_connector(
-            vllm_config, KVConnectorRole.SCHEDULER, kv_cache_config
-        )
-        assert old_connector is not None
-        assert isinstance(old_connector, OldStyleTestConnector)
-        assert old_connector._kv_cache_config is None
-
-    # Mock _get_connector_class_with_compat to return new-style connector
-    with patch.object(
-        KVConnectorFactory,
-        "_get_connector_class_with_compat",
-        return_value=(NewStyleTestConnector, False),
-    ):
-        new_connector = KVConnectorFactory.create_connector(
-            vllm_config, KVConnectorRole.SCHEDULER, kv_cache_config
-        )
-        assert new_connector is not None
-        assert isinstance(new_connector, NewStyleTestConnector)
-        assert new_connector._kv_cache_config is not None
-        assert new_connector._kv_cache_config == kv_cache_config
diff --git a/tests/v1/kv_connector/unit/test_decode_bench_connector.py b/tests/v1/kv_connector/unit/test_decode_bench_connector.py
index 30652b3d5c51..3af58d63c9a1 100644
--- a/tests/v1/kv_connector/unit/test_decode_bench_connector.py
+++ b/tests/v1/kv_connector/unit/test_decode_bench_connector.py
@@ -58,7 +58,9 @@ def __init__(self, block_size: int, num_gpu_blocks: int):
 
         # Create worker-side connector
         self.worker_connector = DecodeBenchConnector(
-            vllm_config, KVConnectorRole.WORKER
+            vllm_config,
+            KVConnectorRole.WORKER,
+            self.scheduler.kv_cache_config,
         )
 
         # Create dummy KV caches for testing
diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecycle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecycle.py
index 5121dcf90c13..a9a38a17b949 100644
--- a/tests/v1/kv_connector/unit/test_kv_connector_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecycle.py
@@ -10,6 +10,7 @@
     get_kv_transfer_group,
 )
 from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
+from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin
 
 # Importing utils registers TestExampleConnector with the factory
@@ -38,7 +39,10 @@ def test_kv_connector_mixin_clears_metadata():
     vllm_config.kv_transfer_config.kv_connector_extra_config["name"] = "unit"
 
     # Initialize the global connector instance
-    ensure_kv_transfer_initialized(vllm_config)
+    kv_cache_config = KVCacheConfig(
+        num_blocks=0, kv_cache_tensors=[], kv_cache_groups=[]
+    )
+    ensure_kv_transfer_initialized(vllm_config, kv_cache_config)
 
     try:
         # Minimal scheduler output with empty metadata; mixin should still
diff --git a/tests/v1/kv_connector/unit/test_mooncake_connector.py b/tests/v1/kv_connector/unit/test_mooncake_connector.py
index c3ce836423fa..44292d94e147 100644
--- a/tests/v1/kv_connector/unit/test_mooncake_connector.py
+++ b/tests/v1/kv_connector/unit/test_mooncake_connector.py
@@ -26,11 +26,16 @@
 )
 from vllm.utils.network_utils import get_open_port
 from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
+from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.request import RequestStatus
 
 from .utils import create_request, create_scheduler, create_vllm_config
 
 
+def _make_test_kv_cache_config() -> KVCacheConfig:
+    return KVCacheConfig(num_blocks=0, kv_cache_tensors=[], kv_cache_groups=[])
+
+
 class FakeMooncakeWrapper:
     """Mock Mooncake TransferEngine for unit testing environments."""
 
@@ -321,7 +326,11 @@ async def test_kv_producer(monkeypatch):
     )
 
     with set_current_vllm_config(vllm_config), patch_worker_dependencies():
-        prefill_connector = MooncakeConnector(vllm_config, KVConnectorRole.WORKER)
+        prefill_connector = MooncakeConnector(
+            vllm_config,
+            KVConnectorRole.WORKER,
+            _make_test_kv_cache_config(),
+        )
         prefill_worker = prefill_connector.connector_worker
         prefill_worker.kv_caches_base_addr = [0x1000]
         block_len = 4096
@@ -473,7 +482,11 @@ async def test_kv_consumuer(monkeypatch):
     )
 
     with set_current_vllm_config(vllm_config), patch_worker_dependencies() as mocks:
-        decode_connector = MooncakeConnector(vllm_config, KVConnectorRole.WORKER)
+        decode_connector = MooncakeConnector(
+            vllm_config,
+            KVConnectorRole.WORKER,
+            _make_test_kv_cache_config(),
+        )
         decode_worker = decode_connector.connector_worker
         decode_worker.kv_caches_base_addr = [0x1000]
         decode_worker.rpc_port = 54321
@@ -533,7 +546,11 @@ async def test_worker_get_finished_timeout(monkeypatch):
         kv_connector="MooncakeConnector", kv_role="kv_producer"
     )
     with set_current_vllm_config(vllm_config), patch_worker_dependencies():
-        prefill_connector = MooncakeConnector(vllm_config, KVConnectorRole.WORKER)
+        prefill_connector = MooncakeConnector(
+            vllm_config,
+            KVConnectorRole.WORKER,
+            _make_test_kv_cache_config(),
+        )
         prefill_worker = prefill_connector.connector_worker
 
         # Add an expired request (expire_time is in the past).
@@ -579,7 +596,11 @@ def test_register_kv_caches():
             "vllm.distributed.kv_transfer.kv_connector.v1.mooncake.mooncake_connector.threading.Thread"
         ) as mock_thread,
     ):
-        connector = MooncakeConnector(vllm_config, KVConnectorRole.WORKER)
+        connector = MooncakeConnector(
+            vllm_config,
+            KVConnectorRole.WORKER,
+            _make_test_kv_cache_config(),
+        )
         worker = connector.connector_worker
         mock_thread.return_value.is_alive.return_value = False
 
@@ -628,7 +649,11 @@ def test_register_kv_caches_supports_mixed_mla_and_eagle_shapes():
             "vllm.distributed.kv_transfer.kv_connector.v1.mooncake.mooncake_connector.threading.Thread"
         ) as mock_thread,
     ):
-        connector = MooncakeConnector(vllm_config, KVConnectorRole.WORKER)
+        connector = MooncakeConnector(
+            vllm_config,
+            KVConnectorRole.WORKER,
+            _make_test_kv_cache_config(),
+        )
         worker = connector.connector_worker
         mock_thread.return_value.is_alive.return_value = False
 
@@ -688,7 +713,11 @@ async def test_kv_producer_heterogeneous_tp(monkeypatch, d_tp_size):
     )
 
     with set_current_vllm_config(vllm_config), patch_worker_dependencies():
-        prefill_connector = MooncakeConnector(vllm_config, KVConnectorRole.WORKER)
+        prefill_connector = MooncakeConnector(
+            vllm_config,
+            KVConnectorRole.WORKER,
+            _make_test_kv_cache_config(),
+        )
         prefill_worker = prefill_connector.connector_worker
 
         # Override TP rank/size to simulate P TP=2
diff --git a/tests/v1/kv_connector/unit/test_mooncake_connector_hma.py b/tests/v1/kv_connector/unit/test_mooncake_connector_hma.py
index dbcfda6309c5..8e25df7ca837 100644
--- a/tests/v1/kv_connector/unit/test_mooncake_connector_hma.py
+++ b/tests/v1/kv_connector/unit/test_mooncake_connector_hma.py
@@ -221,9 +221,14 @@ async def test_build_transfer_params_multi_group_trimming(monkeypatch):
     vllm_config = create_vllm_config(
         kv_connector="MooncakeConnector", kv_role="kv_producer"
     )
+    kv_cache_config = make_kv_cache_config(
+        block_size=vllm_config.cache_config.block_size, swa_enabled=True
+    )
 
     with set_current_vllm_config(vllm_config), patch_worker_dependencies():
-        connector = MooncakeConnector(vllm_config, KVConnectorRole.WORKER)
+        connector = MooncakeConnector(
+            vllm_config, KVConnectorRole.WORKER, kv_cache_config
+        )
         worker = connector.connector_worker
 
         block_len = 4096
@@ -304,9 +309,14 @@ async def test_build_transfer_params_group_count_mismatch(monkeypatch):
     vllm_config = create_vllm_config(
         kv_connector="MooncakeConnector", kv_role="kv_producer"
     )
+    kv_cache_config = make_kv_cache_config(
+        block_size=vllm_config.cache_config.block_size, swa_enabled=True
+    )
 
     with set_current_vllm_config(vllm_config), patch_worker_dependencies():
-        connector = MooncakeConnector(vllm_config, KVConnectorRole.WORKER)
+        connector = MooncakeConnector(
+            vllm_config, KVConnectorRole.WORKER, kv_cache_config
+        )
         worker = connector.connector_worker
 
         block_len = 4096
diff --git a/tests/v1/kv_connector/unit/test_moriio_connector.py b/tests/v1/kv_connector/unit/test_moriio_connector.py
index 16d34d90896b..da78b62b9a03 100644
--- a/tests/v1/kv_connector/unit/test_moriio_connector.py
+++ b/tests/v1/kv_connector/unit/test_moriio_connector.py
@@ -37,9 +37,15 @@
     get_ip,
     make_zmq_path,
 )
+from vllm.v1.kv_cache_interface import KVCacheConfig
 
 from .utils import create_request, create_scheduler
 
+
+def _make_test_kv_cache_config() -> KVCacheConfig:
+    return KVCacheConfig(num_blocks=0, kv_cache_tensors=[], kv_cache_groups=[])
+
+
 aiter_available = importlib.util.find_spec("aiter") is not None
 mori_available = importlib.util.find_spec("mori") is not None
 
@@ -462,7 +468,11 @@ def test_register_kv_caches(mock_parallel_groups):
         )
 
         with set_current_vllm_config(vllm_config):
-            connector = MoRIIOConnector(vllm_config, KVConnectorRole.WORKER)
+            connector = MoRIIOConnector(
+                vllm_config,
+                KVConnectorRole.WORKER,
+                _make_test_kv_cache_config(),
+            )
             connector.connector_worker = FakeMoRIIOConnectorWorker(
                 vllm_config, connector.engine_id, hand_shake_latency=0
             )
@@ -554,7 +564,11 @@ def test_moriio_handshake_returns_metadata(mock_parallel_groups):
             }
         )
         with set_current_vllm_config(vllm_config):
-            connector = MoRIIOConnector(vllm_config, KVConnectorRole.WORKER)
+            connector = MoRIIOConnector(
+                vllm_config,
+                KVConnectorRole.WORKER,
+                _make_test_kv_cache_config(),
+            )
 
         # Execute register_kv_caches
         connector.register_kv_caches(kv_caches)
diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index 51024fb92171..f0aa4e260f60 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -262,9 +262,11 @@ def test_multi_example_connector_consistency():
 
     events = get_connector_events()
     # First event is set_xfer_handshake_metadata from initialization, then
+    # on_new_request when the request is enqueued, then
     # get_num_new_matched_tokens and update_state_after_alloc from generate().
-    assert events["storage1-SCHEDULER"][:4] == [
+    assert events["storage1-SCHEDULER"][:5] == [
         "set_xfer_handshake_metadata",
+        "on_new_request",
         "get_num_new_matched_tokens 0",
         "update_state_after_alloc num_blocks=[0] 0",
         "build_connector_meta",
@@ -281,8 +283,9 @@ def test_multi_example_connector_consistency():
         "wait_for_layer_load",
         "save_kv_layer",
     ]
-    assert events["storage2-SCHEDULER"][:4] == [
+    assert events["storage2-SCHEDULER"][:5] == [
         "set_xfer_handshake_metadata",
+        "on_new_request",
         "get_num_new_matched_tokens 0",
         "update_state_after_alloc num_blocks=[0] 0",
         "build_connector_meta",
@@ -310,12 +313,14 @@ def test_multi_example_connector_consistency():
     # connector so update_state_after_alloc will be with allocated blocks
     # on that one but with zero blocks for others (first nonzero match is
     # chosen).
-    assert events["storage1-SCHEDULER"][:3] == [
+    assert events["storage1-SCHEDULER"][:4] == [
+        "on_new_request",
         "get_num_new_matched_tokens 0",
         "update_state_after_alloc num_blocks=[7] 96",
         "build_connector_meta",
     ]
-    assert events["storage2-SCHEDULER"][:3] == [
+    assert events["storage2-SCHEDULER"][:4] == [
+        "on_new_request",
         "get_num_new_matched_tokens 0",
         "update_state_after_alloc num_blocks=[0] 0",
         "build_connector_meta",
@@ -336,12 +341,14 @@ def test_multi_example_connector_consistency():
     # return 0 from the first connector, but the second connector should have
     # a hit, so update_state_after_alloc will only be called with allocated
     # blocks for the second connector.
-    assert events["storage1-SCHEDULER"][:3] == [
+    assert events["storage1-SCHEDULER"][:4] == [
+        "on_new_request",
         "get_num_new_matched_tokens 0",
         "update_state_after_alloc num_blocks=[0] 0",
         "build_connector_meta",
     ]
-    assert events["storage2-SCHEDULER"][:3] == [
+    assert events["storage2-SCHEDULER"][:4] == [
+        "on_new_request",
         "get_num_new_matched_tokens 0",
         "update_state_after_alloc num_blocks=[7] 96",
         "build_connector_meta",
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index 3803e4fd3869..3f92b183dca7 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -321,6 +321,34 @@ def test_prompt_less_than_block_size():
     assert len(scheduler_output.scheduled_new_reqs) == 0
 
 
+def test_abort_immediately_remote_prefill_enqueues_empty_recv():
+    """A remote-prefill request added with abort_immediately=True should
+    be added to the scheduler's waiting queue then immediately aborted, so the
+    NIXL connector's request_finished hook enqueues an empty recv to notify
+    the prefill instance to free its blocks."""
+    from vllm.v1.request import RequestStatus
+
+    scheduler = create_scheduler(create_vllm_config())
+
+    request = create_request(request_id=42, num_tokens=10, do_remote_prefill=True)
+    assert request.kv_transfer_params is not None
+    assert request.kv_transfer_params["do_remote_prefill"] is True
+
+    # Mimic the EngineCore.add_request path for an abort-immediately req.
+    scheduler.add_request(request)
+    scheduler.finish_requests([request.request_id], RequestStatus.FINISHED_ABORTED)
+
+    scheduler_output = scheduler.schedule()
+    meta = scheduler_output.kv_connector_metadata
+    assert isinstance(meta, NixlConnectorMetadata)
+    assert set(meta.reqs_to_recv) == {request.request_id}
+    req_meta = meta.reqs_to_recv[request.request_id]
+    assert req_meta.local_block_ids == []
+    assert req_meta.remote.request_id == "prefill-42"  # remote id for request_id=42
+    # do_remote_prefill is consumed by request_finished to prevent re-issuing.
+    assert request.kv_transfer_params["do_remote_prefill"] is False
+
+
 @patch(
     "vllm.distributed.kv_transfer.kv_connector.v1.nixl.worker.NixlWrapper",
     FakeNixlWrapper,
@@ -1346,9 +1374,11 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
             |  {eventually free blocks}
     """
     model_name = "Qwen/Qwen3-0.6B"
+    timeout = 6
     kv_transfer_config = KVTransferConfig(
         kv_connector="NixlConnector",
         kv_role="kv_both",
+        kv_connector_extra_config={"kv_lease_duration": timeout},
     )
     llm_kwargs = {
         "model": model_name,
@@ -1358,9 +1388,7 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
         "distributed_executor_backend": distributed_executor_backend,
     }
 
-    timeout = 6
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
-    monkeypatch.setenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", str(timeout))
 
     def run_test_and_cleanup():
         llm = LLM(**llm_kwargs)
@@ -1375,8 +1403,6 @@ def run_test_and_cleanup():
             runtime_env = {
                 "working_dir": working_dir,  # ship fake nixl package
                 "env_vars": {
-                    "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": str(timeout),
-                    # TODO: for ray to carry over, remove once we set
                     "NIXL_TELEMETRY_ENABLE": "1",
                 },
             }
@@ -2241,6 +2267,127 @@ def test_transfer_setup_failure_returns_finished(default_vllm_config, dist_init)
     assert request_id in done_recving
 
 
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl.worker.NixlWrapper",
+    FailingNixlWrapper,
+)
+@pytest.mark.parametrize(
+    "failure_mode",
+    [
+        "handshake",
+        "transfer_setup",
+        "transfer_failed",
+        "transfer_exception",
+    ],
+)
+def test_failed_request_skips_kv_postprocessing(
+    default_vllm_config, dist_init, failure_mode
+):
+    """Test that failed requests skip KV sync and post-processing in
+    get_finished().
+
+    This is the core safety behavior: when a KV transfer fails at any stage,
+    the request must still appear in done_recving (so the scheduler can apply
+    kv_load_failure_policy), but sync_recved_kv_to_device and post-processing
+    must NOT be called since no valid KV data was received.
+
+    Covers all failure paths that involve an actual (attempted) KV transfer:
+    - handshake: add_remote_agent raises during async handshake
+    - transfer_setup: make_prepped_xfer raises before handle is in transfers
+    - transfer_failed: check_xfer_state returns bad state ("ERR") in
+      _pop_done_transfers — this is the path that previously had the bug
+      where post-processing was NOT skipped
+    - transfer_exception: check_xfer_state raises in _pop_done_transfers
+
+    Note: notification_failed (send_notif raises on the full-cache-hit path)
+    is intentionally excluded. That path is a best-effort D→P courtesy
+    notification; the blocks are already in D's cache, so no KV transfer
+    was attempted and done_recving is correctly empty.
+    """
+    # Map each failure mode to the FailingNixlWrapper flag set to True below.
+    _WRAPPER_CONFIG: dict[str, str] = {
+        "handshake": "fail_handshake",
+        "transfer_setup": "fail_transfer_setup",
+        "transfer_failed": "fail_transfer_state",
+        "transfer_exception": "fail_transfer_exception",
+    }
+
+    # Use enable_permute_local_kv=True so that
+    # post_process_device_kv_on_receive would be called on the success path,
+    # making the assertion meaningful (not trivially true).
+    vllm_config = create_vllm_config(enable_permute_local_kv=True)
+
+    connector = NixlConnector(
+        vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16)
+    )
+    connector.connector_worker = FakeNixlConnectorWorker(
+        vllm_config,
+        connector.engine_id,
+        hand_shake_latency=0.1 if failure_mode == "handshake" else 0,
+    )
+    worker = connector.connector_worker
+    setattr(worker.nixl_wrapper, _WRAPPER_CONFIG[failure_mode], True)
+
+    request_id = f"test_{failure_mode}_skip_postprocess"
+    metadata = NixlConnectorMetadata()
+    metadata.add_new_req_to_recv(
+        request_id=request_id,
+        local_block_ids=([1, 2, 3],),
+        kv_transfer_params={
+            "remote_block_ids": ([4, 5, 6],),
+            "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+            "remote_request_id": f"prefill-{request_id}",
+            "remote_host": "localhost",
+            "remote_port": 1234,
+            "remote_tp_size": 1,
+        },
+    )
+    connector.bind_connector_metadata(metadata)
+
+    dummy_ctx = ForwardContext(
+        no_compile_layers={},
+        attn_metadata={},
+        slot_mapping={},
+    )
+    connector.start_load_kv(dummy_ctx)
+
+    if failure_mode == "handshake":
+        # Wait past the 0.1s hand_shake_latency for the async handshake to fail.
+        time.sleep(0.3)
+    else:
+        # All other modes: let the handshake complete, then process the
+        # ready_requests queue. For transfer_failed / transfer_exception the
+        # handle ends up in _recving_transfers; the failure surfaces in
+        # get_finished() via _pop_done_transfers below.
+        connector.bind_connector_metadata(NixlConnectorMetadata())
+        time.sleep(0.1)
+        connector.start_load_kv(dummy_ctx)
+
+    # Spy on sync_recved_kv_to_device and post_process_device_kv_on_receive
+    # to verify they are NOT called for the failed request.
+    with (
+        patch.object(worker, "sync_recved_kv_to_device") as mock_sync,
+        patch.object(worker, "post_process_device_kv_on_receive") as mock_postprocess,
+    ):
+        _, done_recving = connector.get_finished(finished_req_ids=set())
+
+    # The failed request must appear in done_recving so the scheduler
+    # can handle it (e.g., trigger recompute via kv_load_failure_policy).
+    assert request_id in done_recving
+
+    # Critical: KV sync and post-processing must NOT have been called
+    # since no valid KV data was received for the failed request.
+    mock_sync.assert_not_called()
+    mock_postprocess.assert_not_called()
+
+    # Metadata for the request should have been cleaned up.
+    assert request_id not in worker._recving_metadata
+
+    # The request's local blocks (1, 2, 3) should have been marked as invalid.
+    invalid_blocks = connector.get_block_ids_with_load_errors()
+    assert invalid_blocks == {1, 2, 3}
+
+
 @pytest.mark.parametrize(
     "mismatch_type,config_overrides,version_override,should_fail,enforce_handshake_compat",
     [
diff --git a/tests/v1/kv_connector/unit/test_nixl_heartbeat.py b/tests/v1/kv_connector/unit/test_nixl_heartbeat.py
new file mode 100644
index 000000000000..345e10621f0b
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_nixl_heartbeat.py
@@ -0,0 +1,165 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for the scheduler-driven heartbeat / lease-renewal system."""
+
+import time
+from unittest.mock import MagicMock
+
+import pytest
+
+from vllm.v1.outputs import KVConnectorOutput
+
+from .utils import create_request, make_nixl_scheduler
+
+_ENGINE_A = "my-engine-id"
+
+
+def _sched(kv_lease_duration: int = 30):
+    return make_nixl_scheduler(heartbeat=True, kv_lease_duration=kv_lease_duration)
+
+
+def _req(request_id: int = 1):
+    return create_request(request_id=request_id, do_remote_prefill=True)
+
+
+def _worker_stub():
+    """Return a NixlConnectorWorker allocated without running __init__."""
+    from vllm.distributed.kv_transfer.kv_connector.v1.nixl.worker import (
+        NixlConnectorWorker,
+    )
+    stub = object.__new__(NixlConnectorWorker)
+    stub._lease_extension = 20
+    stub._reqs_to_send = {}
+    return stub
+
+
+# ===================================================================
+# Scheduler: on_new_request
+# ===================================================================
+
+
+def test_on_new_request_tracks_and_groups():
+    """Add two reqs to same engine, one to another; verify grouping."""
+    s = _sched()
+    s.on_new_request(_req(1))
+    s.on_new_request(_req(2))
+
+    info = s._heartbeat_by_engine[_ENGINE_A]
+    assert info.req_ids == {"prefill-1", "prefill-2"}
+    assert (info.host, info.port, info.tp_size) == ("my-host", 1234, 1)
+    assert s._heartbeat_req_engine["id-1"] == (_ENGINE_A, "prefill-1")
+
+    # Different engine: it must get its own entry, keyed by its engine id.
+    r3 = _req(3)
+    r3.kv_transfer_params["remote_engine_id"] = "engine-b"
+    s.on_new_request(r3)
+    assert set(s._heartbeat_by_engine) == {_ENGINE_A, "engine-b"}
+
+
+@pytest.mark.parametrize(
+    "make_req",
+    [
+        lambda: create_request(request_id=2, do_remote_decode=True),
+        lambda: create_request(request_id=3),  # carries no kv_transfer_params
+    ],
+    ids=["decode", "plain"],
+)
+def test_on_new_request_ignores_non_prefill(make_req):
+    scheduler = _sched()
+    scheduler.on_new_request(make_req())
+    assert len(scheduler._heartbeat_by_engine) == 0
+
+
+# ===================================================================
+# Scheduler: _stop_heartbeat
+# ===================================================================
+
+
+def test_stop_heartbeat_partial_and_full():
+    """Dropping one request keeps the engine entry; dropping both clears it."""
+    sched = _sched()
+    for req_num in (1, 2):
+        sched.on_new_request(_req(req_num))
+
+    sched._stop_heartbeat("id-1")
+    assert "id-1" not in sched._heartbeat_req_engine
+    assert sched._heartbeat_by_engine[_ENGINE_A].req_ids == {"prefill-2"}
+
+    sched._stop_heartbeat("id-2")
+    assert len(sched._heartbeat_req_engine) == 0
+    assert len(sched._heartbeat_by_engine) == 0
+
+
+# ===================================================================
+# Scheduler: build_connector_meta throttling
+# ===================================================================
+
+
+def test_build_connector_meta_heartbeat_throttling():
+    """A second build right after a heartbeat carries no heartbeat entries."""
+    # kv_lease_duration=30 => _heartbeat_interval = 30 // 6 = 5
+    sched = _sched(kv_lease_duration=30)
+    sched.on_new_request(_req(1))
+    # Backdate the last heartbeat so the first build is not throttled.
+    sched._last_heartbeat_time = time.perf_counter() - 10
+    first_meta = sched.build_connector_meta(MagicMock())
+    assert _ENGINE_A in first_meta.heartbeat_by_engine
+
+    # An immediate second build is throttled (< 5s since the last one).
+    second_meta = sched.build_connector_meta(MagicMock())
+    assert len(second_meta.heartbeat_by_engine) == 0
+
+
+# ===================================================================
+# Scheduler: cleanup paths (update_connector_output / request_finished)
+# ===================================================================
+
+
+def test_update_connector_output_stops_heartbeat():
+    """A request reported in finished_recving is dropped from heartbeat state."""
+    sched = _sched()
+    sched.on_new_request(_req(1))
+
+    worker_output = KVConnectorOutput(
+        finished_sending=None,
+        finished_recving={"id-1"},
+        invalid_block_ids=set(),
+    )
+    sched.update_connector_output(worker_output)
+
+    assert len(sched._heartbeat_req_engine) == 0
+    assert len(sched._heartbeat_by_engine) == 0
+
+
+def test_request_finished_stops_heartbeat():
+    """request_finished drops the request from heartbeat tracking."""
+    sched, request = _sched(), _req(1)
+    sched.on_new_request(request)
+
+    # Pretend update_state_after_alloc already consumed do_remote_prefill.
+    request.kv_transfer_params["do_remote_prefill"] = False
+    sched.request_finished(request, block_ids=())
+
+    assert len(sched._heartbeat_req_engine) == 0
+    assert len(sched._heartbeat_by_engine) == 0
+
+
+# ===================================================================
+# Worker: _handle_heartbeat
+# ===================================================================
+
+
+def test_handle_heartbeat():
+    """_handle_heartbeat extends known leases and ignores unknown request ids."""
+    w = _worker_stub()
+    far_future = time.perf_counter() + 99999
+    w._reqs_to_send = {"req-a": 100.0, "req-b": far_future}
+    before = time.perf_counter()
+    w._handle_heartbeat("req-a,req-b,req-unknown")
+
+    # req-a: pushed forward to at least now + the stub's lease extension.
+    assert w._reqs_to_send["req-a"] >= before + w._lease_extension
+    # req-b: already far out, max() keeps it.
+    assert w._reqs_to_send["req-b"] >= far_future
+    # req-unknown: not added.
+    assert "req-unknown" not in w._reqs_to_send
diff --git a/tests/v1/kv_connector/unit/test_scheduler_kv_connector_override.py b/tests/v1/kv_connector/unit/test_scheduler_kv_connector_override.py
index 2834647fe1ff..164d9025dfef 100644
--- a/tests/v1/kv_connector/unit/test_scheduler_kv_connector_override.py
+++ b/tests/v1/kv_connector/unit/test_scheduler_kv_connector_override.py
@@ -17,6 +17,7 @@
 from vllm.v1.core.kv_cache_utils import BlockHash
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler
+from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.request import Request
 
 
@@ -26,7 +27,7 @@ def __init__(self, block_hashes_by_req: dict[str, list[BlockHash]]):
 
 
 class DummyKVConnector(KVConnectorBase_V1):
-    def __init__(self, vllm_config, role, kv_cache_config=None):
+    def __init__(self, vllm_config, role, kv_cache_config: KVCacheConfig):
         super().__init__(vllm_config, role, kv_cache_config)
 
     def get_num_new_matched_tokens(
diff --git a/tests/v1/kv_connector/unit/test_tp_mapping.py b/tests/v1/kv_connector/unit/test_tp_mapping.py
index e57244a31f79..95d49faf042f 100644
--- a/tests/v1/kv_connector/unit/test_tp_mapping.py
+++ b/tests/v1/kv_connector/unit/test_tp_mapping.py
@@ -9,6 +9,8 @@
 
 from __future__ import annotations
 
+from types import SimpleNamespace
+
 import pytest
 
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl.tp_mapping import (
@@ -33,12 +35,15 @@ def _compute_mapping(
     num_kv_heads: int = 8,
     group_spec_types: tuple[type, ...] = (FullAttentionSpec,),
 ) -> TPMapping:
-    return compute_tp_mapping(
+    transfer_topology = SimpleNamespace(
         tp_rank=tp_rank,
         tp_size=tp_size,
-        remote_tp_size=remote_tp_size,
         is_mla=is_mla,
         total_num_kv_heads=num_kv_heads,
+    )
+    return compute_tp_mapping(
+        transfer_topology=transfer_topology,
+        remote_tp_size=remote_tp_size,
         group_spec_types=group_spec_types,
     )
 
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 8e4e1cae0676..5db2f7e7a919 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -223,6 +223,7 @@ def create_request(
             remote_block_ids=list(range(num_remote_blocks)),
             remote_host="my-host",
             remote_port=1234,
+            tp_size=1,
         )
 
     max_tokens = 1 if do_remote_decode else max_tokens
@@ -293,9 +294,14 @@ def create_model_runner_output(
 
 
 class TestExampleConnector(ExampleConnector):
-    def __init__(self, config: VllmConfig, role, kv_cache_config):
+    def __init__(
+        self,
+        config: VllmConfig,
+        role: KVConnectorRole,
+        kv_cache_config: KVCacheConfig,
+    ):
         self.name = config.kv_transfer_config.kv_connector_extra_config["name"]
-        self._connector = ExampleConnector(config, role)
+        self._connector = ExampleConnector(config, role, kv_cache_config)
         self.call_record: dict[str, int] = defaultdict(int)
         # Use a unique temp file per connector
         self._event_file = (
@@ -368,7 +374,7 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         role: KVConnectorRole,
-        kv_cache_config: KVCacheConfig | None = None,
+        kv_cache_config: KVCacheConfig,
     ):
         super().__init__(vllm_config, role, kv_cache_config)
         extra_config = self._kv_transfer_config.kv_connector_extra_config
@@ -477,10 +483,16 @@ def make_kv_cache_config(
     )
 
 
-def make_nixl_scheduler(has_mamba: bool = False, is_hma_required: bool = False):
+def make_nixl_scheduler(
+    has_mamba: bool = False,
+    is_hma_required: bool = False,
+    heartbeat: bool = False,
+    kv_lease_duration: int = 30,
+):
     """Create a NixlConnectorScheduler via __new__ (skipping __init__).
 
-    Only sets the two flags needed by the N-1 prefill logic.
+    Only sets the flags needed by the tests.  When *heartbeat=True* the
+    scheduler-side heartbeat bookkeeping fields are also initialised.
     """
     from vllm.distributed.kv_transfer.kv_connector.v1.nixl.scheduler import (
         NixlConnectorScheduler,
@@ -489,4 +501,23 @@ def make_nixl_scheduler(has_mamba: bool = False, is_hma_required: bool = False):
     sched = object.__new__(NixlConnectorScheduler)
     sched._has_mamba = has_mamba
     sched._is_hma_required = is_hma_required
+
+    if heartbeat:
+        sched._heartbeat_by_engine = {}
+        sched._heartbeat_req_engine = {}
+        sched._last_heartbeat_time = 0.0
+        sched._kv_lease_duration = kv_lease_duration
+        sched._heartbeat_interval = kv_lease_duration // 6
+        # Fields touched by build_connector_meta / request_finished:
+        sched._reqs_need_recv = {}
+        sched._reqs_need_send = {}
+        sched._reqs_in_batch = set()
+        sched._reqs_not_processed = set()
+        sched._reqs_need_save = {}
+        sched.use_host_buffer = False
+        sched.engine_id = "test-engine"
+        sched.side_channel_host = "localhost"
+        sched.side_channel_port = 5555
+        sched.blocks_per_sw = []
+        sched.is_bidirectional_kv_xfer_enabled = False
     return sched
diff --git a/tests/v1/kv_offload/cpu/test_manager.py b/tests/v1/kv_offload/cpu/test_manager.py
index e043590a4184..4641a3fe9f4a 100644
--- a/tests/v1/kv_offload/cpu/test_manager.py
+++ b/tests/v1/kv_offload/cpu/test_manager.py
@@ -17,7 +17,6 @@
 from vllm.v1.kv_offload.cpu.common import CPULoadStoreSpec
 from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager
 from vllm.v1.kv_offload.cpu.policies.arc import ARCCachePolicy
-from vllm.v1.kv_offload.reuse_manager import FilterReusedOffloadingManager
 
 
 def make_req_context(kv_transfer_params: dict | None = None) -> ReqContext:
@@ -117,10 +116,10 @@ def test_already_stored_block_not_evicted_during_prepare_store(eviction_policy):
 
     # store [1, 2] and complete
     manager.prepare_store(to_keys([1, 2]), _EMPTY_REQ_CTX)
-    manager.complete_store(to_keys([1, 2]))
+    manager.complete_store(to_keys([1, 2]), _EMPTY_REQ_CTX)
 
     # touch [1] to make block 2 the LRU candidate
-    manager.touch(to_keys([1]))
+    manager.touch(to_keys([1]), _EMPTY_REQ_CTX)
 
     # prepare_store([2, 3, 4, 5]):
     #   - block 2 is already stored -> filtered out of keys_to_store
@@ -137,7 +136,7 @@ def test_already_stored_block_not_evicted_during_prepare_store(eviction_policy):
     )
 
     # complete_store must not silently drop block 2
-    manager.complete_store(to_keys([2, 3, 4, 5]))
+    manager.complete_store(to_keys([2, 3, 4, 5]), _EMPTY_REQ_CTX)
 
     # block 2 must still be present in the cache
     assert manager.lookup(to_key(2), _EMPTY_REQ_CTX) is True
@@ -171,7 +170,7 @@ def test_cpu_manager():
     assert list(cpu_manager.take_events()) == []
 
     # complete store [1, 2]
-    cpu_manager.complete_store(to_keys([1, 2]))
+    cpu_manager.complete_store(to_keys([1, 2]), _EMPTY_REQ_CTX)
     verify_events(cpu_manager.take_events(), expected_stores=({1, 2},))
 
     # lookup [1, 2]
@@ -199,7 +198,7 @@ def test_cpu_manager():
     assert cpu_manager.prepare_store(to_keys([1, 6]), _EMPTY_REQ_CTX) is None
 
     # complete store [2, 3, 4, 5]
-    cpu_manager.complete_store(to_keys([2, 3, 4, 5]))
+    cpu_manager.complete_store(to_keys([2, 3, 4, 5]), _EMPTY_REQ_CTX)
 
     # lookup (now that we have [2, 3, 4, 5])
     assert cpu_manager.lookup(to_key(1), _EMPTY_REQ_CTX) is False
@@ -217,7 +216,7 @@ def test_cpu_manager():
     assert cpu_manager.prepare_store(to_keys([6, 7, 8]), _EMPTY_REQ_CTX) is None
 
     # complete load [2, 3]
-    cpu_manager.complete_load(to_keys([2, 3]))
+    cpu_manager.complete_load(to_keys([2, 3]), _EMPTY_REQ_CTX)
 
     # prepare store [6, 7, 8] -> evicts [2, 3, 4] (oldest)
     prepare_store_output = cpu_manager.prepare_store(to_keys([6, 7, 8]), _EMPTY_REQ_CTX)
@@ -231,10 +230,10 @@ def test_cpu_manager():
     )
 
     # complete store [6, 7, 8]
-    cpu_manager.complete_store(to_keys([6, 7, 8]))
+    cpu_manager.complete_store(to_keys([6, 7, 8]), _EMPTY_REQ_CTX)
 
     # touch [5, 6, 7] (move to end of LRU order)
-    cpu_manager.touch(to_keys([5, 6, 7]))
+    cpu_manager.touch(to_keys([5, 6, 7]), _EMPTY_REQ_CTX)
 
     # prepare store [7, 9] -> evicts [8] (oldest following previous touch)
     prepare_store_output = cpu_manager.prepare_store(to_keys([9]), _EMPTY_REQ_CTX)
@@ -248,7 +247,7 @@ def test_cpu_manager():
     )
 
     # complete store [7, 9] with failure
-    cpu_manager.complete_store(to_keys([7, 9]), success=False)
+    cpu_manager.complete_store(to_keys([7, 9]), _EMPTY_REQ_CTX, success=False)
 
     # assert [7] is still stored, but [9] is not
     assert cpu_manager.lookup(to_key(7), _EMPTY_REQ_CTX) is True
@@ -304,7 +303,7 @@ def test_basic(self):
         assert list(cpu_manager.take_events()) == []
 
         # complete store [1, 2]
-        cpu_manager.complete_store(to_keys([1, 2]))
+        cpu_manager.complete_store(to_keys([1, 2]), _EMPTY_REQ_CTX)
         verify_events(cpu_manager.take_events(), expected_stores=({1, 2},))
 
         # lookup [1, 2]
@@ -325,14 +324,14 @@ def test_t1_to_t2_promotion(self):
 
         # store and complete block 1
         cpu_manager.prepare_store(to_keys([1]), _EMPTY_REQ_CTX)
-        cpu_manager.complete_store(to_keys([1]))
+        cpu_manager.complete_store(to_keys([1]), _EMPTY_REQ_CTX)
 
         # block 1 starts in T1 (recent)
         assert to_keys([1])[0] in arc_policy.t1
         assert to_keys([1])[0] not in arc_policy.t2
 
         # touch block 1 (simulate second access)
-        cpu_manager.touch(to_keys([1]))
+        cpu_manager.touch(to_keys([1]), _EMPTY_REQ_CTX)
 
         # block 1 should now be in T2 (frequent)
         assert to_keys([1])[0] not in arc_policy.t1
@@ -357,7 +356,7 @@ def test_eviction_with_load(self):
                 evicted_keys=[],
             ),
         )
-        cpu_manager.complete_store(to_keys([1, 2, 3, 4]))
+        cpu_manager.complete_store(to_keys([1, 2, 3, 4]), _EMPTY_REQ_CTX)
 
         # prepare load [2, 3] (increases ref_cnt)
         prepare_load_output = cpu_manager.prepare_load(to_keys([2, 3]), _EMPTY_REQ_CTX)
@@ -368,7 +367,7 @@ def test_eviction_with_load(self):
         assert cpu_manager.prepare_store(to_keys([5, 6, 7]), _EMPTY_REQ_CTX) is None
 
         # complete load [2, 3]
-        cpu_manager.complete_load(to_keys([2, 3]))
+        cpu_manager.complete_load(to_keys([2, 3]), _EMPTY_REQ_CTX)
 
         # now prepare store [5, 6, 7] should succeed
         # ARC will evict blocks one at a time from T1 as needed
@@ -389,20 +388,20 @@ def test_adaptive_target(self):
 
         # store blocks 1, 2 (fills cache)
         cpu_manager.prepare_store(to_keys([1, 2]), _EMPTY_REQ_CTX)
-        cpu_manager.complete_store(to_keys([1, 2]))
+        cpu_manager.complete_store(to_keys([1, 2]), _EMPTY_REQ_CTX)
 
         initial_target = arc_policy.target_t1_size
 
         # store block 3, evicting block 1 (moves to B1 ghost list)
         cpu_manager.prepare_store(to_keys([3]), _EMPTY_REQ_CTX)
-        cpu_manager.complete_store(to_keys([3]))
+        cpu_manager.complete_store(to_keys([3]), _EMPTY_REQ_CTX)
 
         # block 1 should be in B1 (ghost list)
         assert to_keys([1])[0] in arc_policy.b1
 
         # touch block 1 (cache miss, but in B1)
         # this should increase target_t1_size (favor recency)
-        cpu_manager.touch(to_keys([1]))
+        cpu_manager.touch(to_keys([1]), _EMPTY_REQ_CTX)
 
         # target should have increased
         assert arc_policy.target_t1_size > initial_target
@@ -416,10 +415,10 @@ def test_t1_t2_eviction_policy(self):
 
         # store blocks 1, 2, 3, 4
         cpu_manager.prepare_store(to_keys([1, 2, 3, 4]), _EMPTY_REQ_CTX)
-        cpu_manager.complete_store(to_keys([1, 2, 3, 4]))
+        cpu_manager.complete_store(to_keys([1, 2, 3, 4]), _EMPTY_REQ_CTX)
 
         # promote blocks 3, 4 to T2 by touching them
-        cpu_manager.touch(to_keys([3, 4]))
+        cpu_manager.touch(to_keys([3, 4]), _EMPTY_REQ_CTX)
 
         # now: T1 = {1, 2}, T2 = {3, 4}
         assert len(arc_policy.t1) == 2
@@ -434,7 +433,7 @@ def test_t1_t2_eviction_policy(self):
         assert output is not None
         assert to_keys([1]) == output.evicted_keys
 
-        cpu_manager.complete_store(to_keys([5]))
+        cpu_manager.complete_store(to_keys([5]), _EMPTY_REQ_CTX)
 
         # block 1 should be in B1 (ghost list)
         assert to_keys([1])[0] in arc_policy.b1
@@ -450,12 +449,12 @@ def test_ghost_list_bounds(self):
 
         # fill cache with blocks 1, 2
         cpu_manager.prepare_store(to_keys([1, 2]), _EMPTY_REQ_CTX)
-        cpu_manager.complete_store(to_keys([1, 2]))
+        cpu_manager.complete_store(to_keys([1, 2]), _EMPTY_REQ_CTX)
 
         # store many blocks to fill ghost lists
         for i in range(3, 20):
             cpu_manager.prepare_store(to_keys([i]), _EMPTY_REQ_CTX)
-            cpu_manager.complete_store(to_keys([i]))
+            cpu_manager.complete_store(to_keys([i]), _EMPTY_REQ_CTX)
 
         # ghost lists should not exceed cache_capacity
         assert len(arc_policy.b1) <= arc_policy.cache_capacity
@@ -470,14 +469,14 @@ def test_touch_ordering(self):
 
         # store blocks 1, 2, 3, 4
         cpu_manager.prepare_store(to_keys([1, 2, 3, 4]), _EMPTY_REQ_CTX)
-        cpu_manager.complete_store(to_keys([1, 2, 3, 4]))
+        cpu_manager.complete_store(to_keys([1, 2, 3, 4]), _EMPTY_REQ_CTX)
 
         # promote 3, 4 to T2
-        cpu_manager.touch(to_keys([3, 4]))
+        cpu_manager.touch(to_keys([3, 4]), _EMPTY_REQ_CTX)
 
         # T1 = {1, 2}, T2 = {3, 4}
         # touch [1, 3, 4] - should promote 1 to T2, and move 3,4 to end of T2
-        cpu_manager.touch(to_keys([1, 3, 4]))
+        cpu_manager.touch(to_keys([1, 3, 4]), _EMPTY_REQ_CTX)
 
         # T1 = {2}, T2 = {1, 3, 4} (in that order, with 4 most recent)
         assert len(arc_policy.t1) == 1
@@ -503,7 +502,7 @@ def test_failed_store(self):
 
         # store blocks 1, 2, 3, 4
         cpu_manager.prepare_store(to_keys([1, 2, 3, 4]), _EMPTY_REQ_CTX)
-        cpu_manager.complete_store(to_keys([1, 2, 3, 4]))
+        cpu_manager.complete_store(to_keys([1, 2, 3, 4]), _EMPTY_REQ_CTX)
 
         # prepare store block 5 (will evict block 1)
         prepare_store_output = cpu_manager.prepare_store(to_keys([5]), _EMPTY_REQ_CTX)
@@ -511,7 +510,7 @@ def test_failed_store(self):
         assert len(prepare_store_output.evicted_keys) == 1
 
         # complete store with failure
-        cpu_manager.complete_store(to_keys([5]), success=False)
+        cpu_manager.complete_store(to_keys([5]), _EMPTY_REQ_CTX, success=False)
 
         # block 5 should not be in cache
         assert cpu_manager.lookup(to_key(5), _EMPTY_REQ_CTX) is False
@@ -532,7 +531,7 @@ def test_full_scenario(self):
 
         # store [1, 2]
         cpu_manager.prepare_store(to_keys([1, 2]), _EMPTY_REQ_CTX)
-        cpu_manager.complete_store(to_keys([1, 2]))
+        cpu_manager.complete_store(to_keys([1, 2]), _EMPTY_REQ_CTX)
 
         # store [3, 4, 5] -> evicts [1]
         prepare_store_output = cpu_manager.prepare_store(
@@ -540,10 +539,10 @@ def test_full_scenario(self):
         )
         assert prepare_store_output is not None
         assert len(prepare_store_output.evicted_keys) == 1
-        cpu_manager.complete_store(to_keys([3, 4, 5]))
+        cpu_manager.complete_store(to_keys([3, 4, 5]), _EMPTY_REQ_CTX)
 
         # promote some blocks to T2
-        cpu_manager.touch(to_keys([2, 3]))
+        cpu_manager.touch(to_keys([2, 3]), _EMPTY_REQ_CTX)
 
         # T1 has {4, 5}, T2 has {2, 3}
         assert len(arc_policy.t1) == 2
@@ -552,7 +551,7 @@ def test_full_scenario(self):
         # store [6] -> should evict from T1 (4 is oldest in T1)
         prepare_store_output = cpu_manager.prepare_store(to_keys([6]), _EMPTY_REQ_CTX)
         assert prepare_store_output is not None
-        cpu_manager.complete_store(to_keys([6]))
+        cpu_manager.complete_store(to_keys([6]), _EMPTY_REQ_CTX)
 
         # verify blocks 2, 3 (in T2) are still present
         assert cpu_manager.lookup(to_key(2), _EMPTY_REQ_CTX) is True
@@ -565,14 +564,14 @@ def test_full_scenario(self):
 
 def test_filter_reused_manager():
     """
-    Tests FilterReusedOffloadingManager with a CPUOffloadingManager.
+    Tests CPUOffloadingManager reuse filtering (store_threshold=2).
     """
-    lru_manager = CPUOffloadingManager(
-        num_blocks=4, cache_policy="lru", enable_events=True
-    )
-
-    manager = FilterReusedOffloadingManager(
-        backing=lru_manager, store_threshold=2, max_tracker_size=3
+    manager = CPUOffloadingManager(
+        num_blocks=4,
+        cache_policy="lru",
+        enable_events=True,
+        store_threshold=2,
+        max_tracker_size=3,
     )
 
     # Lookup [1, 2] -> 1st time, added to tracker but not eligible for store yet
@@ -609,4 +608,4 @@ def test_filter_reused_manager():
     assert prepare_store_output is not None
     assert prepare_store_output.keys_to_store == []
 
-    manager.complete_store(to_keys([1]))
+    manager.complete_store(to_keys([1]), _EMPTY_REQ_CTX)
diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py
index 29ec72186b8d..325ca48b597e 100644
--- a/tests/v1/logits_processors/test_custom_offline.py
+++ b/tests/v1/logits_processors/test_custom_offline.py
@@ -16,8 +16,8 @@
     DummyLogitsProcessor,
     WrappedPerReqLogitsProcessor,
     prompts,
+    setup_fake_entrypoint,
 )
-from tests.v1.logits_processors.utils import entry_points as fake_entry_points
 from vllm import LLM, SamplingParams
 from vllm.v1.sample.logits_processor import (
     STR_POOLING_REJECTS_LOGITSPROCS,
@@ -145,13 +145,9 @@ def test_custom_logitsprocs(monkeypatch, logitproc_source: CustomLogitprocSource
 
     if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT:
         # Scenario: vLLM loads a logitproc from a preconfigured entrypoint
-        # To that end, mock a dummy logitproc entrypoint
-        import importlib.metadata
-
-        importlib.metadata.entry_points = fake_entry_points  # type: ignore
-
-        # fork is required for workers to see entrypoint patch
-        monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
+        # To that end, register a real dist-info package so spawned
+        # workers can discover the entrypoint via PYTHONPATH
+        setup_fake_entrypoint(monkeypatch)
         _run_test({}, logitproc_loaded=True)
         return
 
@@ -266,14 +262,9 @@ def test_rejects_custom_logitsprocs(
         # Scenario: vLLM loads a model and ignores a logitproc that is
         # available at a preconfigured entrypoint
 
-        # Patch in dummy logitproc entrypoint
-        import importlib.metadata
-
-        importlib.metadata.entry_points = fake_entry_points  # type: ignore
-
-        # fork is required for entrypoint patch to be visible to workers,
-        # although they should ignore the entrypoint patch anyway
-        monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
+        # Register real dist-info package so spawned workers can
+        # discover the entrypoint via PYTHONPATH (spawn-compatible)
+        setup_fake_entrypoint(monkeypatch)
 
         llm = LLM(**llm_kwargs)
         # Require that no custom logitsprocs have been loaded
diff --git a/tests/v1/logits_processors/test_custom_online.py b/tests/v1/logits_processors/test_custom_online.py
index 05ac70349736..3b7add3b80f4 100644
--- a/tests/v1/logits_processors/test_custom_online.py
+++ b/tests/v1/logits_processors/test_custom_online.py
@@ -18,8 +18,8 @@
     MODEL_NAME,
     TEMP_GREEDY,
     prompts,
+    setup_fake_entrypoint,
 )
-from tests.v1.logits_processors.utils import entry_points as fake_entry_points
 
 
 def _server_with_logitproc_entrypoint(
@@ -27,16 +27,9 @@ def _server_with_logitproc_entrypoint(
     model: str,
     vllm_serve_args: list[str],
 ) -> None:
-    """Start vLLM server, inject dummy logitproc entrypoint"""
-
-    # Patch `entry_points` to inject logitproc entrypoint
-    import importlib.metadata
-
-    importlib.metadata.entry_points = fake_entry_points  # type: ignore
+    """Start vLLM server with dummy logitproc entrypoint."""
     from vllm.entrypoints.cli import main
 
-    # fork is required for workers to see entrypoint patch
-    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "fork"
     if env_dict is not None:
         os.environ.update(env_dict)
 
@@ -50,7 +43,7 @@ def _server_with_logitproc_fqcn(
     model: str,
     vllm_serve_args: list[str],
 ) -> None:
-    """Start vLLM server, inject module with dummy logitproc"""
+    """Start vLLM server with dummy logitproc specified by FQCN."""
     from vllm.entrypoints.cli import main
 
     if env_dict is not None:
@@ -80,8 +73,8 @@ def default_server_args():
 def server(default_server_args, request, monkeypatch):
     """Consider two server configurations:
     (1) --logits-processors cli arg specifies dummy logits processor via fully-
-    qualified class name (FQCN); patch in a dummy logits processor module
-    (2) No --logits-processors cli arg; patch in a dummy logits processor
+    qualified class name (FQCN)
+    (2) No --logits-processors cli arg; inject a dummy logits processor
     entrypoint
     """
 
@@ -94,6 +87,7 @@ def server(default_server_args, request, monkeypatch):
         _server_fxn = _server_with_logitproc_fqcn
     else:
         # Launch server, inject dummy logitproc entrypoint
+        setup_fake_entrypoint(monkeypatch)
         args = default_server_args
         _server_fxn = _server_with_logitproc_entrypoint
 
@@ -119,7 +113,6 @@ async def client(server):
 }
 
 
-@pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
     [MODEL_NAME],
diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py
index e54da72e5e2e..fc8ce50c05fa 100644
--- a/tests/v1/logits_processors/utils.py
+++ b/tests/v1/logits_processors/utils.py
@@ -1,12 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import types
+import os
+import tempfile
 from enum import Enum, auto
+from pathlib import Path
 from typing import Any
 
 import torch
 
+from tests.utils import requires_spawn_multiprocessing
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
@@ -102,11 +105,6 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor:
         return logits
 
 
-"""Dummy module with dummy logitproc class"""
-dummy_module = types.ModuleType(DUMMY_LOGITPROC_MODULE)
-dummy_module.DummyLogitsProcessor = DummyLogitsProcessor  # type: ignore
-
-
 class EntryPoint:
     """Dummy entrypoint class for logitsprocs testing"""
 
@@ -187,5 +185,70 @@ def new_req_logits_processor(
         return DummyPerReqLogitsProcessor(target_token)
 
 
-"""Fake version of importlib.metadata.entry_points"""
-entry_points = lambda group: EntryPoints(group)
+def register_fake_entrypoint(monkeypatch) -> str:
+    """Register the dummy logitsproc entrypoint in a way that is visible
+    to spawned subprocesses by creating a real dist-info directory on disk.
+
+    Unlike monkey-patching importlib.metadata.entry_points (which only works
+    with fork), this approach writes a real dist-info package that
+    importlib.metadata can discover in any subprocess via PYTHONPATH.
+
+    The PYTHONPATH and sys.path changes are made through ``monkeypatch``, so
+    they are undone when that fixture is torn down; the directory itself is
+    removed at interpreter exit.
+
+    Returns the temp directory path.
+    """
+    import atexit
+    import shutil
+
+    tmpdir = Path(tempfile.mkdtemp(prefix="dummy-logitproc-"))
+    # mkdtemp() never cleans up after itself. Spawned workers may read the
+    # dist-info at any point during the test, so defer removal to interpreter
+    # exit rather than leaking one directory per call.
+    atexit.register(shutil.rmtree, tmpdir, ignore_errors=True)
+    dist_info = tmpdir / "dummy_logitproc-0.1.dist-info"
+    dist_info.mkdir()
+
+    # Write METADATA file (required by importlib.metadata)
+    (dist_info / "METADATA").write_text(
+        "Metadata-Version: 2.1\nName: dummy-logitproc\nVersion: 0.1\n",
+        encoding="utf-8",
+    )
+
+    # Write entry_points.txt
+    (dist_info / "entry_points.txt").write_text(
+        f"[{LOGITSPROCS_GROUP}]\n"
+        f"{DUMMY_LOGITPROC_ENTRYPOINT} = {DUMMY_LOGITPROC_FQCN}\n",
+        encoding="utf-8",
+    )
+
+    # Add to PYTHONPATH so spawned subprocesses can discover it
+    existing = os.environ.get("PYTHONPATH", "")
+    monkeypatch.setenv(
+        "PYTHONPATH", str(tmpdir) + (os.pathsep + existing if existing else "")
+    )
+
+    # Also update sys.path for the current process so the driver can
+    # discover the entrypoint.
+    monkeypatch.syspath_prepend(str(tmpdir))
+
+    return str(tmpdir)
+
+
+def fake_entry_points(group: str) -> EntryPoints:
+    """Fake version of importlib.metadata.entry_points."""
+    return EntryPoints(group)
+
+
+def setup_fake_entrypoint(monkeypatch) -> None:
+    """Expose the dummy logitproc entrypoint for the current platform."""
+    if requires_spawn_multiprocessing():
+        register_fake_entrypoint(monkeypatch)
+        monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+        return
+
+    import importlib.metadata
+
+    monkeypatch.setattr(importlib.metadata, "entry_points", fake_entry_points)
+    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 564276adaaf1..553513de6c04 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -2805,6 +2805,7 @@ def swap_blocks_batch(
     src_ptrs: torch.Tensor,
     dst_ptrs: torch.Tensor,
     sizes: torch.Tensor,
+    is_src_access_order_any: bool = False,
 ) -> None:
     """
     Batch version of swap_blocks: submit all copies in a single driver call.
@@ -2813,8 +2814,16 @@ def swap_blocks_batch(
     of sizes[i] bytes. All three tensors must be int64 CPU tensors.
     On CUDA 12.8+ this uses cuMemcpyBatchAsync for minimal submission
     overhead; on older CUDA it falls back to a loop of cudaMemcpyAsync.
+
+    is_src_access_order_any: if True, pass CU_MEMCPY_SRC_ACCESS_ORDER_ANY to
+        cuMemcpyBatchAsync, letting the DMA engine prefetch source bytes
+        out of stream order. Only safe when no GPU stream is concurrently
+        writing to the source. Defaults to False (STREAM ordering), which
+        is always safe.
     """
-    torch.ops._C_cache_ops.swap_blocks_batch(src_ptrs, dst_ptrs, sizes)
+    torch.ops._C_cache_ops.swap_blocks_batch(
+        src_ptrs, dst_ptrs, sizes, is_src_access_order_any
+    )
 
 
 def convert_fp8(
diff --git a/vllm/compilation/passes/fusion/collective_fusion.py b/vllm/compilation/passes/fusion/collective_fusion.py
index 2b74eae8dd32..29d79c9b92ce 100644
--- a/vllm/compilation/passes/fusion/collective_fusion.py
+++ b/vllm/compilation/passes/fusion/collective_fusion.py
@@ -74,6 +74,36 @@ def _flashinfer_scaled_mm_out(
     )
 
 
+def _flashinfer_fp4_mm_out(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    *,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out: torch.Tensor,
+    alpha: torch.Tensor,
+    out_dtype: torch.dtype | None = None,
+    use_8x4_sf_layout: bool = False,
+    backend: str = "cutlass",
+) -> None:
+    from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm_out
+
+    assert A.ndim == 2 and B.ndim == 2 and out.ndim == 2, (
+        "FlashInfer FP4 symm_mem adapter expects 2D inputs and output"
+    )
+    flashinfer_scaled_fp4_mm_out(  # keyword-style adapter -> positional helper
+        A,
+        B,
+        scale_a,
+        scale_b,
+        alpha,
+        out=out,  # result goes into the caller's buffer; nothing is returned
+        out_dtype=out_dtype or out.dtype,  # None -> reuse out's own dtype
+        use_8x4_sf_layout=use_8x4_sf_layout,
+        backend=backend,
+    )
+
+
 def fused_flashinfer_scaled_matmul_reduce_scatter_fake(
     A: torch.Tensor,
     B: torch.Tensor,
@@ -197,6 +227,90 @@ def fused_all_gather_flashinfer_scaled_matmul(
     return outputs[0]
 
 
+def fused_all_gather_flashinfer_fp4_matmul_fake(
+    A_shard: torch.Tensor,
+    B: torch.Tensor,
+    A_scale_shard: torch.Tensor,
+    B_scale: torch.Tensor,
+    alpha: torch.Tensor,
+    gather_dim: int,
+    group_name: str,
+    out_dtype: torch.dtype | None = None,
+    view_a_scale_as_fp8: bool = False,
+    use_8x4_sf_layout: bool = False,
+    backend: str = "cutlass",
+) -> torch.Tensor:
+    world_size = c10d._resolve_process_group(group_name).size()
+    output_shape = list(A_shard.shape)
+    output_shape[gather_dim] *= world_size  # gathered dim grows by the group size
+    output_shape[-1] = B.shape[1]  # last dim comes from B's second dim
+    return torch.empty(  # uninitialised; only shape/dtype/device are meaningful
+        output_shape,
+        dtype=out_dtype or torch.bfloat16,  # None -> bfloat16
+        device=A_shard.device,
+    )
+
+
+def fused_all_gather_flashinfer_fp4_matmul(
+    A_shard: torch.Tensor,
+    B: torch.Tensor,
+    A_scale_shard: torch.Tensor,
+    B_scale: torch.Tensor,
+    alpha: torch.Tensor,
+    gather_dim: int,
+    group_name: str,
+    out_dtype: torch.dtype | None = None,
+    view_a_scale_as_fp8: bool = False,
+    use_8x4_sf_layout: bool = False,
+    backend: str = "cutlass",
+) -> torch.Tensor:
+    assert gather_dim == 0, (
+        "FlashInfer FP4 symm_mem adapter currently only supports gather_dim=0"
+    )
+    assert A_shard.ndim == 2 and A_scale_shard.ndim == 2 and B.ndim == 2, (
+        "FlashInfer FP4 symm_mem adapter expects 2D inputs"
+    )
+    if view_a_scale_as_fp8:  # reinterpret the scale bytes as fp8 before gathering
+        A_scale_shard = A_scale_shard.view(torch.float8_e4m3fn)
+
+    group = c10d._resolve_process_group(group_name)
+    world_size = group.size()
+    output = A_shard.new_empty(
+        A_shard.shape[0] * world_size,
+        B.shape[1],
+        dtype=out_dtype or torch.bfloat16,  # None -> bfloat16
+    )
+    output_shards = output.chunk(world_size)  # per-rank row slices (views of output)
+
+    A = A_shard.new_empty(A_shard.shape[0] * world_size, A_shard.shape[1])
+    A_scale = A_scale_shard.new_empty(
+        A_scale_shard.shape[0] * world_size,
+        A_scale_shard.shape[1],
+    )
+
+    def fp4_shard_consumer(shards: list[torch.Tensor], rank: int) -> None:
+        _flashinfer_fp4_mm_out(
+            shards[0],  # first gathered input: an A shard
+            B,
+            scale_a=shards[1],  # second gathered input: the matching scales
+            scale_b=B_scale,
+            alpha=alpha,
+            out=output_shards[rank],  # fills only this rank's slice of output
+            out_dtype=out_dtype,
+            use_8x4_sf_layout=use_8x4_sf_layout,
+            backend=backend,
+        )
+
+    torch.distributed._symmetric_memory._pipelined_multi_all_gather_and_consume(
+        [A_shard, A_scale_shard],  # local shards, in the order the consumer indexes
+        fp4_shard_consumer,
+        [A, A_scale],  # preallocated world_size-times-larger buffers, same order
+        group_name,
+        False,  # presumably ag_out_needed=False; TODO confirm vs torch signature
+    )
+    return output
+
+
 direct_register_custom_op(
     op_name="fused_flashinfer_scaled_matmul_reduce_scatter",
     op_func=fused_flashinfer_scaled_matmul_reduce_scatter,
@@ -209,6 +323,12 @@ def fused_all_gather_flashinfer_scaled_matmul(
     fake_impl=fused_all_gather_flashinfer_scaled_matmul_fake,
 )
 
+direct_register_custom_op(
+    op_name="fused_all_gather_flashinfer_fp4_matmul",
+    op_func=fused_all_gather_flashinfer_fp4_matmul,
+    fake_impl=fused_all_gather_flashinfer_fp4_matmul_fake,
+)
+
 
 class BasePattern:
     def __init__(self, dtype: torch.dtype, device: str | None) -> None:
@@ -682,6 +802,101 @@ def _replacement(
         return _replacement
 
 
+class FlashInferAllGatherFP4Pattern(
+    BasePattern, VllmPatternReplacement[..., torch.Tensor]
+):
+    def __init__(
+        self,
+        dtype: torch.dtype,
+        device: str | None,
+        backend: str,
+        use_8x4_sf_layout: bool,
+        a_scale_view: str,
+    ) -> None:
+        super().__init__(dtype, device)
+        self.backend = backend
+        self.use_8x4_sf_layout = use_8x4_sf_layout
+        self.a_scale_view = a_scale_view  # "float8", "uint8" or "float8_uint8"
+
+    def get_inputs(self) -> list[torch.Tensor]:
+        a_shard_2d = torch.empty([8, 8], device=self.device, dtype=torch.uint8)
+        b_2d = torch.empty([8, 16], device=self.device, dtype=torch.uint8)
+        a_scale_shard = torch.empty([128, 4], device=self.device, dtype=torch.int32)
+        b_scale = torch.empty([4, 128], device=self.device, dtype=torch.uint8)
+        alpha = torch.empty([], device=self.device, dtype=torch.float32)
+        return [  # same order as the _pattern/_replacement parameters
+            a_shard_2d,
+            b_2d,
+            a_scale_shard,
+            b_scale,
+            alpha,
+        ]
+
+    @property
+    def pattern(self) -> Callable[..., torch.Tensor]:
+        def _pattern(
+            a_shard_2d: torch.Tensor,
+            b_2d: torch.Tensor,
+            a_scale_shard: torch.Tensor,
+            b_scale: torch.Tensor,
+            alpha: torch.Tensor,
+        ) -> torch.Tensor:
+            all_gather_a = torch.ops.vllm.all_gather.default(
+                a_shard_2d,
+                dim=0,
+                world_size=self.tp_size,
+                group_name=self.tp.unique_name,
+            )
+            all_gather_a_scale = torch.ops.vllm.all_gather.default(
+                a_scale_shard,
+                dim=0,
+                world_size=self.tp_size,
+                group_name=self.tp.unique_name,
+            )
+            a_scale = all_gather_a_scale
+            if self.a_scale_view in ("float8", "float8_uint8"):  # reinterpret as fp8
+                a_scale = torch.ops.aten.view.dtype(a_scale, torch.float8_e4m3fn)
+            if self.a_scale_view in ("uint8", "float8_uint8"):  # and/or as uint8
+                a_scale = torch.ops.aten.view.dtype(a_scale, torch.uint8)
+            return torch.ops.vllm.flashinfer_mm_fp4.default(
+                all_gather_a,
+                b_2d,
+                a_scale,
+                b_scale,
+                alpha,
+                self.dtype,
+                self.use_8x4_sf_layout,
+                self.backend,
+            )
+
+        return _pattern
+
+    @property
+    def replacement(self) -> Callable[..., torch.Tensor]:
+        def _replacement(
+            a_shard_2d: torch.Tensor,
+            b_2d: torch.Tensor,
+            a_scale_shard: torch.Tensor,
+            b_scale: torch.Tensor,
+            alpha: torch.Tensor,
+        ) -> torch.Tensor:
+            return torch.ops.vllm.fused_all_gather_flashinfer_fp4_matmul.default(
+                a_shard_2d,
+                b_2d,
+                a_scale_shard,
+                b_scale,
+                alpha,
+                0,  # gather_dim
+                self.tp.device_group.group_name,  # group_name
+                self.dtype,  # out_dtype
+                self.a_scale_view in ("float8", "float8_uint8"),  # view_a_scale_as_fp8
+                self.use_8x4_sf_layout,
+                self.backend,
+            )
+
+        return _replacement
+
+
 class AsyncTPPass(VllmFusionPatternMatcherPass):
     @enable_fake_mode
     def __init__(self, config: VllmConfig) -> None:
@@ -718,6 +933,34 @@ def __init__(self, config: VllmConfig) -> None:
                 self.register(
                     FlashInferBMMFP8ReduceScatterPattern(self.model_dtype, self.device)
                 )
+            if hasattr(torch.ops.vllm, "flashinfer_mm_fp4"):
+                for backend in ("cutlass", "cudnn"):
+                    for a_scale_view in ("float8_uint8", "uint8"):
+                        self.register(
+                            FlashInferAllGatherFP4Pattern(
+                                self.model_dtype,
+                                self.device,
+                                backend,
+                                use_8x4_sf_layout=False,
+                                a_scale_view=a_scale_view,
+                            )
+                        )
+                for use_8x4_sf_layout in (False, True):
+                    for a_scale_view in ("float8",):
+                        self.register(
+                            FlashInferAllGatherFP4Pattern(
+                                self.model_dtype,
+                                self.device,
+                                "trtllm",
+                                use_8x4_sf_layout=use_8x4_sf_layout,
+                                a_scale_view=a_scale_view,
+                            )
+                        )
+                # NVFP4 reduce-scatter does not need scale communication: FP4
+                # scales are consumed by the local GEMM and only BF16 partial
+                # outputs are reduced. Keep this PR scoped to the all-gather
+                # path; reduce-scatter needs a dedicated FP4 producer rather
+                # than the existing FP8-style helper.
 
         self.dump_patterns(config, self.pm_pass)
 
diff --git a/vllm/compilation/passes/fusion/matcher_utils.py b/vllm/compilation/passes/fusion/matcher_utils.py
index e5130c19c392..94f0e62204c3 100644
--- a/vllm/compilation/passes/fusion/matcher_utils.py
+++ b/vllm/compilation/passes/fusion/matcher_utils.py
@@ -23,6 +23,9 @@
     kNvfp4Dynamic,
 )
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+from vllm.model_executor.layers.rotary_embedding.deepseek_scaling_rope import (
+    DeepseekScalingRotaryEmbedding,
+)
 from vllm.platforms import current_platform
 
 RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default
@@ -158,6 +161,87 @@ def forward_native(
         return result
 
 
+class MatcherDeepseekScalingRotaryEmbedding(MatcherCustomOp):
+    def __init__(
+        self,
+        is_neox: bool,
+        head_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        use_flashinfer: bool = False,
+        enabled: bool | None = None,
+    ) -> None:
+        if enabled is None:
+            enabled = DeepseekScalingRotaryEmbedding.enabled()
+
+        super().__init__(enabled)
+        self.is_neox = is_neox
+        self.head_size = head_size
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.q_size = self.num_heads * self.head_size
+        self.kv_size = self.num_kv_heads * self.head_size
+        self.rotary_dim = head_size
+        self.use_flashinfer = use_flashinfer
+
+    def inputs(self) -> list[torch.Tensor]:
+        positions = self.empty_int64(5)
+        query = self.empty(5, self.num_heads, self.head_size)
+        key = self.empty(5, self.num_kv_heads, self.head_size)
+        cos_sin_cache = self.empty(4096, self.rotary_dim)
+        return [positions, query, key, cos_sin_cache]
+
+    def forward_custom(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None,
+        cos_sin_cache: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        if self.use_flashinfer:
+            torch.ops.vllm.flashinfer_rotary_embedding(
+                positions,
+                query,
+                key,
+                self.head_size,
+                cos_sin_cache,
+                self.is_neox,
+            )
+            return query, key
+        result: tuple[torch.Tensor, torch.Tensor | None] = (
+            DeepseekScalingRotaryEmbedding.forward_static(
+                positions,
+                query,
+                key,
+                self.head_size,
+                self.rotary_dim,
+                cos_sin_cache,
+                self.is_neox,
+            )
+        )
+        return result
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None,
+        cos_sin_cache: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        result: tuple[torch.Tensor, torch.Tensor | None] = (
+            DeepseekScalingRotaryEmbedding.forward_static(
+                positions,
+                query,
+                key,
+                self.head_size,
+                self.rotary_dim,
+                cos_sin_cache,
+                self.is_neox,
+            )
+        )
+        return result
+
+
 class MatcherQuantFP8(MatcherCustomOp):
     def __init__(
         self,
@@ -189,12 +273,7 @@ def __init__(
                     "ROCm aiter fusion pass currently supports "
                     "quantization operation with group_size 128"
                 )
-                if current_platform.is_fp8_fnuz():
-                    self.QUANT_OP = rocm_aiter_ops.get_group_quant_op()
-                else:
-                    self.QUANT_OP = (
-                        torch.ops.vllm.triton_per_token_group_quant_fp8.default
-                    )
+                self.QUANT_OP = rocm_aiter_ops.get_group_quant_op()
 
         else:
             assert quant_key in QUANT_OPS, (
diff --git a/vllm/compilation/passes/fusion/mla_rope_kvcache_cat_fusion.py b/vllm/compilation/passes/fusion/mla_rope_kvcache_cat_fusion.py
new file mode 100644
index 000000000000..5a493149a9d5
--- /dev/null
+++ b/vllm/compilation/passes/fusion/mla_rope_kvcache_cat_fusion.py
@@ -0,0 +1,271 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from torch._higher_order_ops.auto_functionalize import auto_functionalized
+
+import vllm._custom_ops as ops
+from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.logger import init_logger
+from vllm.model_executor.layers.attention import MLAAttention
+from vllm.model_executor.layers.attention.attention import get_attention_context
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+from vllm.utils.torch_utils import (
+    _USE_LAYERNAME,
+    LayerNameType,
+    _encode_layer_name,
+    _resolve_layer_name,
+    direct_register_custom_op,
+)
+
+from ..vllm_inductor_pass import VllmFusionPatternMatcherPass, VllmPatternReplacement
+from .matcher_utils import MatcherDeepseekScalingRotaryEmbedding, MatcherRotaryEmbedding
+
+logger = init_logger(__name__)
+
+
+def fused_rope_unified_mla_kv_cache_update_impl(
+    positions: torch.Tensor,
+    q_pe: torch.Tensor,
+    k_pe: torch.Tensor,
+    kv_c: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    is_neox: bool,
+    kv_cache_dtype: str,
+    kv_cache_scale: torch.Tensor,
+    layer_name: LayerNameType,
+) -> torch.Tensor:
+    layer_name = _resolve_layer_name(layer_name)
+    attn_metadata, _, kv_cache, layer_slot_mapping = get_attention_context(layer_name)
+    if layer_slot_mapping is not None:
+        ops.concat_and_cache_mla_rope_fused(
+            positions,
+            q_pe,
+            k_pe,
+            kv_c,
+            cos_sin_cache,
+            is_neox,
+            layer_slot_mapping,
+            kv_cache,
+            kv_cache_dtype,
+            kv_cache_scale,
+        )
+    return torch.empty(0, device=kv_c.device, dtype=kv_c.dtype)
+
+
+def fused_rope_unified_mla_kv_cache_update_fake(
+    positions: torch.Tensor,
+    q_pe: torch.Tensor,
+    k_pe: torch.Tensor,
+    kv_c: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    is_neox: bool,
+    kv_cache_dtype: str,
+    kv_cache_scale: torch.Tensor,
+    layer_name: LayerNameType,
+) -> torch.Tensor:
+    return torch.empty(0, dtype=kv_c.dtype, device=kv_c.device)
+
+
+# Register the fused implementation and its fake counterpart as a custom op;
+# this file later refers to it as
+# ``torch.ops.vllm.fused_rope_unified_mla_kv_cache_update``. Only ``q_pe`` and
+# ``k_pe`` are declared as mutated; the KV cache is not an argument, it is
+# looked up from the attention context inside the implementation.
+direct_register_custom_op(
+    op_name="fused_rope_unified_mla_kv_cache_update",
+    op_func=fused_rope_unified_mla_kv_cache_update_impl,
+    fake_impl=fused_rope_unified_mla_kv_cache_update_fake,
+    mutates_args=["q_pe", "k_pe"],
+)
+
+
+class MLARoPEKVCacheCatPattern(VllmPatternReplacement):
+    """Pattern/replacement pair fusing RoPE with the MLA KV-cache update.
+
+    The pattern is a rope matcher call followed by
+    ``unified_mla_kv_cache_update``; the replacement is one call to
+    ``fused_rope_unified_mla_kv_cache_update``.
+    """
+
+    FUSED_OP = torch.ops.vllm.fused_rope_unified_mla_kv_cache_update.default
+
+    def __init__(
+        self,
+        layer: MLAAttention,
+        is_neox: bool,
+        use_flashinfer: bool = False,
+        use_deepseek_scaling: bool = False,
+    ) -> None:
+        """Capture the layer's attributes and pick the rope matcher.
+
+        Args:
+            layer: MLA attention layer whose name, KV-cache dtype, head counts
+                and rope/LoRA dimensions are copied onto the pattern.
+            is_neox: Forwarded to the rope matcher and to the fused op.
+            use_flashinfer: Forwarded to the rope matcher.
+            use_deepseek_scaling: Selects
+                ``MatcherDeepseekScalingRotaryEmbedding`` instead of
+                ``MatcherRotaryEmbedding``.
+        """
+        self.layer_name = layer.layer_name
+        self.kv_cache_dtype = layer.kv_cache_dtype
+        self.num_heads = layer.num_heads
+        self.num_kv_heads = layer.num_kv_heads
+        self.kv_lora_rank = layer.kv_lora_rank
+        self.qk_rope_head_dim = layer.qk_rope_head_dim
+        self.is_neox = is_neox
+        self.use_flashinfer = use_flashinfer
+        # Encoded form of the layer name, as passed to the custom ops.
+        self._ln = _encode_layer_name(self.layer_name)
+
+        # Both matchers receive the same geometry; the rope head dim is used
+        # as the matcher's head size.
+        if use_deepseek_scaling:
+            self.rope_matcher = MatcherDeepseekScalingRotaryEmbedding(
+                is_neox=self.is_neox,
+                head_size=self.qk_rope_head_dim,
+                num_heads=self.num_heads,
+                num_kv_heads=self.num_kv_heads,
+                use_flashinfer=self.use_flashinfer,
+            )
+        else:
+            self.rope_matcher = MatcherRotaryEmbedding(  # type: ignore
+                is_neox=self.is_neox,
+                head_size=self.qk_rope_head_dim,
+                num_heads=self.num_heads,
+                num_kv_heads=self.num_kv_heads,
+                use_flashinfer=self.use_flashinfer,
+            )
+
+    def get_inputs(self) -> list[torch.Tensor]:
+        """Return example inputs in the pattern's parameter order.
+
+        The encoded layer name is appended as a trailing input only when
+        ``_USE_LAYERNAME`` is set.
+        """
+        # T: example leading dimension of the per-token tensors;
+        # L: example number of rows of the cos/sin cache.
+        T = 5
+        L = 4096
+        q_pe = self.empty_bf16(T, self.num_heads, self.qk_rope_head_dim)
+        # k_pe has no head dimension here; the pattern unsqueezes it.
+        k_pe = self.empty_bf16(T, self.qk_rope_head_dim)
+        kv_c_normed = self.empty_bf16(T, self.kv_lora_rank)
+        cos_sin_cache = self.empty_bf16(L, self.qk_rope_head_dim)
+        positions = self.empty(T, dtype=torch.int64)
+        k_scale = self.empty(0, dtype=torch.float32)
+        inputs = [
+            q_pe,
+            k_pe,
+            kv_c_normed,
+            positions,
+            cos_sin_cache,
+            k_scale,
+        ]
+        if _USE_LAYERNAME:
+            inputs.append(self._ln)
+        return inputs
+
+    @property
+    def pattern(self):
+        """Return the pattern callable: rope matcher, then KV-cache update.
+
+        Two variants exist: with ``_USE_LAYERNAME`` the layer name is a
+        parameter of the pattern, otherwise the encoded name is closed over.
+        """
+        # Closed over by the variant that does not take the name as input.
+        _ln = self._ln
+
+        if _USE_LAYERNAME:
+
+            def _pattern_with_ln(
+                q_pe: torch.Tensor,
+                k_pe: torch.Tensor,
+                kv_c_normed: torch.Tensor,
+                positions: torch.Tensor,
+                cos_sin_cache: torch.Tensor,
+                k_scale: torch.Tensor,
+                layer_name: LayerNameType,
+            ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+                # The rope matcher is given k_pe with an extra dim at index 1.
+                k_pe_unsqueezed = k_pe.unsqueeze(1)
+                q_pe, k_pe = self.rope_matcher(
+                    positions, q_pe, k_pe_unsqueezed, cos_sin_cache
+                )
+                dummy = torch.ops.vllm.unified_mla_kv_cache_update(
+                    kv_c_normed, k_pe, layer_name, self.kv_cache_dtype, k_scale
+                )
+                return dummy, q_pe, k_pe
+
+            return _pattern_with_ln
+
+        def _pattern(
+            q_pe: torch.Tensor,
+            k_pe: torch.Tensor,
+            kv_c_normed: torch.Tensor,
+            positions: torch.Tensor,
+            cos_sin_cache: torch.Tensor,
+            k_scale: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            # The rope matcher is given k_pe with an extra dim at index 1.
+            k_pe_unsqueezed = k_pe.unsqueeze(1)
+            q_pe, k_pe = self.rope_matcher(
+                positions, q_pe, k_pe_unsqueezed, cos_sin_cache
+            )
+            dummy = torch.ops.vllm.unified_mla_kv_cache_update(
+                kv_c_normed, k_pe, _ln, self.kv_cache_dtype, k_scale
+            )
+            return dummy, q_pe, k_pe
+
+        return _pattern
+
+    @property
+    def replacement(self):
+        """Return the replacement callable: one functionalized fused-op call.
+
+        The variants mirror ``pattern``: the layer name is either a parameter
+        or the closed-over encoded name.
+        """
+        # Closed over by the variant that does not take the name as input.
+        _ln = self._ln
+
+        if _USE_LAYERNAME:
+
+            def _replacement_with_ln(
+                q_pe: torch.Tensor,
+                k_pe: torch.Tensor,
+                kv_c_normed: torch.Tensor,
+                positions: torch.Tensor,
+                cos_sin_cache: torch.Tensor,
+                k_scale: torch.Tensor,
+                layer_name: LayerNameType,
+            ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+                at = auto_functionalized(
+                    self.FUSED_OP,
+                    positions=positions,
+                    q_pe=q_pe,
+                    k_pe=k_pe,
+                    kv_c=kv_c_normed,
+                    cos_sin_cache=cos_sin_cache,
+                    is_neox=self.is_neox,
+                    kv_cache_dtype=self.kv_cache_dtype,
+                    kv_cache_scale=k_scale,
+                    layer_name=layer_name,
+                )
+                # Unpacked as the op's return value followed by the two
+                # arguments registered as mutated (q_pe, k_pe).
+                dummy, q_pe, k_pe_squeezed = at
+                # Restore the extra dim so the output matches the pattern,
+                # whose k_pe comes from the unsqueezed key.
+                k_pe = k_pe_squeezed.unsqueeze(1)
+                return dummy, q_pe, k_pe
+
+            return _replacement_with_ln
+
+        def _replacement(
+            q_pe: torch.Tensor,
+            k_pe: torch.Tensor,
+            kv_c_normed: torch.Tensor,
+            positions: torch.Tensor,
+            cos_sin_cache: torch.Tensor,
+            k_scale: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            at = auto_functionalized(
+                self.FUSED_OP,
+                positions=positions,
+                q_pe=q_pe,
+                k_pe=k_pe,
+                kv_c=kv_c_normed,
+                cos_sin_cache=cos_sin_cache,
+                is_neox=self.is_neox,
+                kv_cache_dtype=self.kv_cache_dtype,
+                kv_cache_scale=k_scale,
+                layer_name=_ln,
+            )
+            # Unpacked as the op's return value followed by the two arguments
+            # registered as mutated (q_pe, k_pe).
+            dummy, q_pe, k_pe_squeezed = at
+            # Restore the extra dim so the output matches the pattern, whose
+            # k_pe comes from the unsqueezed key.
+            k_pe = k_pe_squeezed.unsqueeze(1)
+            return dummy, q_pe, k_pe
+
+        return _replacement
+
+
+class MLARoPEKVCacheCatFusionPass(VllmFusionPatternMatcherPass):
+    def __init__(self, config: VllmConfig) -> None:
+        super().__init__(config, "mla_rope_kv_cache_fusion_pass")
+
+        attn_layers = get_layers_from_vllm_config(config, MLAAttention)
+
+        for _, layer in attn_layers.items():
+            for is_neox in [False, True]:
+                for use_deepseek_scaling in [False, True]:
+                    if RotaryEmbedding.enabled():
+                        for use_flashinfer in [False, True]:
+                            self.register(
+                                MLARoPEKVCacheCatPattern(
+                                    layer,
+                                    is_neox,
+                                    use_flashinfer,
+                                    use_deepseek_scaling,
+                                )
+                            )
+                    else:
+                        self.register(
+                            MLARoPEKVCacheCatPattern(
+                                layer,
+                                is_neox,
+                                use_deepseek_scaling=use_deepseek_scaling,
+                            )
+                        )
+
+            if _USE_LAYERNAME:
+                break
+
+        self.dump_patterns(config, self.pm_pass)
diff --git a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py
index 83abcdeb5abd..9a975c5fed4d 100644
--- a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py
+++ b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py
@@ -6,6 +6,7 @@
 import torch
 import torch._inductor.pattern_matcher as pm
 from torch import fx
+from torch._inductor.fx_passes.post_grad import view_to_reshape
 from torch._inductor.pattern_matcher import PatternMatcherPass
 
 import vllm.ir.ops
@@ -293,6 +294,161 @@ def replacement(
         pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass)
 
 
+class DoubleAiterRMSFp8GroupQuantPattern(AiterRMSNormQuantPattern):
+    """
+    Pattern matching ``rms_norm`` whose output feeds *two* distinct
+    ``rocm_aiter_group_fp8_quant`` consumers, replacing it with two
+    independent fused ``rms_norm_group_fp8_quant`` ops.
+
+    Repeating the rms_norm in the replacement is preferable to leaving
+    the fused 16-bit rms output materialized for two unfused quant
+    consumers, and matches what the previous manual graph surgery
+    achieved by cloning the rms_norm node.
+    """
+
+    # Resolved once, when the class body is executed.
+    FUSED_OP = rocm_aiter_ops.get_rmsnorm_group_fused_quant_op()
+
+    def __init__(
+        self,
+        epsilon: float,
+        quant_dtype: torch.dtype,
+        group_shape: GroupShape,
+        match_aiter_quant: bool = True,
+        symmetric: bool = True,
+    ) -> None:
+        """Build the fused RMS/quant key and hand it to the base pattern.
+
+        The key describes a quantization without fused add, with a float32
+        scale over ``group_shape`` (the second ``ScaleDesc`` argument is
+        ``False``).
+        """
+        scale = ScaleDesc(torch.float32, False, group_shape)
+        key = FusedRMSQuantKey(
+            fused_add=False,
+            quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
+        )
+
+        super().__init__(epsilon, key, match_aiter_quant)
+
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        """Register the pattern/replacement pair on ``pm_pass``."""
+
+        def pattern(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+            # One rms_norm result consumed by two separate quant calls.
+            result_rms = torch.ops.vllm_ir.rms_norm(input, weight, self.epsilon)
+            result1, scale1 = self.quant_matcher(result_rms)
+            result2, scale2 = self.quant_matcher(result_rms)
+            return result1, scale1, result2, scale2
+
+        def replacement(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+            # Two identical fused calls, one per original quant consumer.
+            # NOTE(review): group_size is the literal 128 rather than being
+            # derived from the group_shape given to __init__ -- confirm that
+            # only 128-wide groups are intended.
+            at1 = self.FUSED_OP(
+                x=input,
+                weight=weight,
+                variance_epsilon=self.epsilon,
+                group_size=128,
+            )
+            at2 = self.FUSED_OP(
+                x=input,
+                weight=weight,
+                variance_epsilon=self.epsilon,
+                group_size=128,
+            )
+
+            # Same output order as the pattern: (quant, scale) for each
+            # consumer.
+            return at1[0], at1[1], at2[0], at2[1]
+
+        pm.register_replacement(
+            pattern,
+            replacement,
+            # input, weight
+            [self.empty(5, 16), self.empty(16)],
+            pm.fwd_only,
+            pm_pass,
+        )
+
+
+class DoubleAiterRMSFp8GroupQuantViewPattern(AiterRMSNormQuantPattern):
+    """
+    View-tolerant variant of ``DoubleAiterRMSFp8GroupQuantPattern``.
+
+    Matches the same 1-to-2 fan-out, but with a ``view``/``reshape`` between
+    the ``rms_norm`` output and the two ``rocm_aiter_group_fp8_quant``
+    consumers::
+
+        rms_norm -> view -> rocm_aiter_group_fp8_quant
+                \\-> view -> rocm_aiter_group_fp8_quant
+
+    This shape arises in DeepSeek-V3.2's MLA indexer q_c norm, where the
+    FP8 linear path's 2D-flatten boilerplate
+    (``Fp8BlockScaledMMLinearKernel.apply_weights``) inserts a view between
+    the rms_norm output and each FP8 group quant op. The non-view sibling
+    pattern silently no-ops on this graph because the pattern matcher
+    requires the in-graph and in-pattern node shapes to align.
+
+    The trace_fn runs Inductor's ``view_to_reshape`` post-grad pass to
+    normalize ``view`` to ``reshape`` in both the pattern and the input
+    graph, widening the match without touching the no-view sibling.
+    """
+
+    # Resolved once, when the class body is executed.
+    FUSED_OP = rocm_aiter_ops.get_rmsnorm_group_fused_quant_op()
+
+    def __init__(
+        self,
+        epsilon: float,
+        quant_dtype: torch.dtype,
+        group_shape: GroupShape,
+        match_aiter_quant: bool = True,
+        symmetric: bool = True,
+    ) -> None:
+        """Build the fused RMS/quant key and hand it to the base pattern.
+
+        The key describes a quantization without fused add, with a float32
+        scale over ``group_shape`` (the second ``ScaleDesc`` argument is
+        ``False``).
+        """
+        scale = ScaleDesc(torch.float32, False, group_shape)
+        key = FusedRMSQuantKey(
+            fused_add=False,
+            quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
+        )
+
+        super().__init__(epsilon, key, match_aiter_quant)
+
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        """Register the pattern/replacement pair on ``pm_pass``.
+
+        Tracing goes through a wrapper that applies ``view_to_reshape`` to
+        the traced graph instead of using ``pm.fwd_only`` directly.
+        """
+
+        def pattern(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+            result_rms = torch.ops.vllm_ir.rms_norm(input, weight, self.epsilon)
+            # Collapse all leading dims, keeping the last one, before the two
+            # quant consumers.
+            view_rms = result_rms.view(-1, result_rms.shape[-1])
+            result1, scale1 = self.quant_matcher(view_rms)
+            result2, scale2 = self.quant_matcher(view_rms)
+            return result1, scale1, result2, scale2
+
+        def replacement(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+            # NOTE(review): the fused op receives ``input`` without the
+            # flattening view used in the pattern -- confirm its outputs have
+            # the same shape as the pattern's when ``input`` has more than
+            # two dims.
+            # NOTE(review): group_size is the literal 128 rather than being
+            # derived from the group_shape given to __init__ -- confirm.
+            at1 = self.FUSED_OP(
+                x=input,
+                weight=weight,
+                variance_epsilon=self.epsilon,
+                group_size=128,
+            )
+            at2 = self.FUSED_OP(
+                x=input,
+                weight=weight,
+                variance_epsilon=self.epsilon,
+                group_size=128,
+            )
+
+            # Same output order as the pattern: (quant, scale) for each
+            # consumer.
+            return at1[0], at1[1], at2[0], at2[1]
+
+        # NOTE(review): relies on ``Any`` being in scope in this module;
+        # the import is not part of the visible change -- confirm.
+        def trace_with_view_to_reshape(*args: Any, **kwargs: Any) -> fx.GraphModule:
+            # Trace exactly like pm.fwd_only, then run view_to_reshape on the
+            # traced module; its return value is not used.
+            gm = pm.fwd_only(*args, **kwargs)
+            view_to_reshape(gm)
+            return gm
+
+        pm.register_replacement(
+            pattern,
+            replacement,
+            # input, weight
+            [self.empty(5, 16), self.empty(16)],
+            trace_with_view_to_reshape,
+            pm_pass,
+        )
+
+
 class RocmAiterRMSNormQuantFusionPass(VllmPatternMatcherPass):
     """
     This pass fuses aiter rms_norm & vllm/aiter quant custom ops
@@ -309,8 +465,24 @@ def __init__(self, config: VllmConfig) -> None:
         )
 
         # Make sure fused add patterns are before simple rms norm,
-        # as the latter is a subset of the former in torch ops
+        # as the latter is a subset of the former in torch ops.
+        # The DoubleQuant patterns handle 1 rms_norm -> 2 group_fp8_quant
+        # fan-out (e.g. DSv3.2) and must be registered before the single
+        # group-quant pattern so they match first. The view-tolerant variant
+        # additionally covers the rms_norm -> view -> 2x quant shape that
+        # appears when the FP8 linear path inserts a 2D-flatten boilerplate
+        # (DSv3.2 MLA indexer q_c norm).
         for epsilon in [1e-5, 1e-6]:
+            # Fuse aiter rms_norm + 2x aiter group fp8 quant
+            DoubleAiterRMSFp8GroupQuantPattern(
+                epsilon, FP8_DTYPE, GroupShape(1, 128)
+            ).register(self.patterns)
+
+            # View-tolerant sibling for DSv3.2 q_c norm fan-out
+            DoubleAiterRMSFp8GroupQuantViewPattern(
+                epsilon, FP8_DTYPE, GroupShape(1, 128)
+            ).register(self.patterns)
+
             #  Fuse aiter rms_norm + aiter dynamic group fp8 quant
             AiterRMSFp8GroupQuantPattern(
                 epsilon, FP8_DTYPE, GroupShape(1, 128)
@@ -360,6 +532,8 @@ def uuid(self) -> str:
             AiterFusedAddRMSNormDynamicQuantPattern,
             AiterRMSFp8GroupQuantPattern,
             AiterFusedAddRMSFp8GroupQuantPattern,
+            DoubleAiterRMSFp8GroupQuantPattern,
+            DoubleAiterRMSFp8GroupQuantViewPattern,
         ]
         return self.hash_source(self, *fusion_patterns)
 
diff --git a/vllm/compilation/passes/fusion/sequence_parallelism.py b/vllm/compilation/passes/fusion/sequence_parallelism.py
index 2c7a1390bdb8..8d0f40e2c775 100644
--- a/vllm/compilation/passes/fusion/sequence_parallelism.py
+++ b/vllm/compilation/passes/fusion/sequence_parallelism.py
@@ -8,13 +8,17 @@
 import torch
 import torch._inductor.pattern_matcher as pm
 import torch.fx as fx
+from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor.pattern_matcher import PatternMatcherPass
 
 import vllm.ir.ops
 from vllm.config import VllmConfig
 from vllm.config.utils import Range
 from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce
-from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
+from vllm.distributed.parallel_state import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     kFp8StaticTensorSym,
@@ -27,6 +31,10 @@
 
 logger = init_logger(__name__)
 
+# Look up the NVFP4 quant overloads once at import time. Both names are bound
+# only when ``torch.ops._C`` exposes ``scaled_fp4_quant``; referencing them
+# when the op is absent raises NameError.
+if hasattr(torch.ops._C, "scaled_fp4_quant"):
+    SCALED_FP4_QUANT_OUT_OVERLOAD = torch.ops._C.scaled_fp4_quant.out
+    SCALED_FP4_QUANT_DEFAULT_OVERLOAD = torch.ops._C.scaled_fp4_quant.default
+
 # Min hidden size per device capability for sequence parallelism
 # Only apply sequence parallelism for models with hidden_size >= threshold
 SP_MIN_HIDDEN_SIZE: dict[int, int] = {
@@ -117,6 +125,7 @@ def __init__(
         self.device = device
         self.tp_group = get_tp_group()
         self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
 
     def _all_reduce(self, x: torch.Tensor) -> torch.Tensor:
         return tensor_model_parallel_all_reduce(x)
@@ -204,17 +213,35 @@ def replacement(
             mm_1: torch.Tensor,
             rms_norm_weights: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor]:
-            # pattern matcher replaces from top-to-bottom,
-            # so residual is still the full size here.
-            # once the seqpar pattern with the previous rmsnorm is replaced
+            # The pattern matcher replaces from the end of the graph
+            # (last layer first). At the time each match is replaced,
+            # the preceding layer has NOT been replaced yet, so
+            # `residual` is still full-size and the slice below is
+            # correct. Once the preceding layer IS replaced, its
+            # residual output shrinks to [local_len, H], and this
+            # slice becomes semantically incorrect (e.g. for rank > 0,
+            # the indices would be out of bounds). However, since the
+            # symbolic output shape equals the input shape,
+            # NoOpEliminationPass (called at the end of
+            # SequenceParallelismPass.__call__) removes these slices
+            # before the graph is ever executed or compiled.
             reduce_scatter = self._reduce_scatter(mm_1)
-            residual = residual[0 : reduce_scatter.size(0), ...]
+            local_len = reduce_scatter.size(0)
+            # when the preceding VocabParallelEmbedding is excluded
+            # from the FX graph (e.g., passing `inputs_embeds` directly in VLMs),
+            # the FirstAllReduceRMSNorm pattern is never matched. we must
+            # perform a proper TP-aware slice here. simply using `[0:local_len]`
+            # would incorrectly cause all ranks to process rank 0's chunk.
+            residual = residual[
+                self.tp_rank * local_len : self.tp_rank * local_len + local_len, ...
+            ]
             rmsnorm = vllm.ir.ops.fused_add_rms_norm(
                 reduce_scatter, residual, rms_norm_weights, self.epsilon
             )
             all_gather = self._all_gather(rmsnorm[0])
-            # shape of residual changes but that's fine,
-            # next node is already slicing it, now becomes a noop
+            # residual output is now [local_len, H]; the next layer's
+            # slice on it is semantically incorrect until
+            # NoOpEliminationPass removes it.
             return all_gather, rmsnorm[1]
 
         pm.register_replacement(
@@ -304,19 +331,29 @@ def replacement(
             rms_norm_weights: torch.Tensor,
             scale: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor]:
-            # pattern matcher replaces from top-to-bottom,
-            # so residual is still the full size here.
-            # add a temporary slice which will become a noop
-            # once the seqpar pattern with the previous rmsnorm is replaced
+            # See MiddleAllReduceRMSNormPattern.replacement for a
+            # detailed explanation of the temporary slice below:
+            # it is correct when first inserted, becomes semantically
+            # incorrect after the preceding layer is replaced, and is
+            # removed by NoOpEliminationPass before the graph is compiled.
             reduce_scatter = self._reduce_scatter(mm_1)
-            residual = residual[0 : reduce_scatter.size(0), ...]
+            local_len = reduce_scatter.size(0)
+            # when the preceding VocabParallelEmbedding is excluded
+            # from the FX graph (e.g., passing `inputs_embeds` directly in VLMs),
+            # the FirstAllReduceRMSNorm pattern is never matched. we must
+            # perform a proper TP-aware slice here. simply using `[0:local_len]`
+            # would incorrectly cause all ranks to process rank 0's chunk.
+            residual = residual[
+                self.tp_rank * local_len : self.tp_rank * local_len + local_len, ...
+            ]
             rms, residual_out = vllm.ir.ops.fused_add_rms_norm(
                 reduce_scatter, residual, rms_norm_weights, self.epsilon
             )
             quant, _ = self.quant_matcher(rms, scale)
             all_gather = self._all_gather(quant)
-            # shape of residual changes but that's fine,
-            # next node is already slicing it, now becomes a noop
+            # residual output is now [local_len, H]; the next layer's
+            # slice on it is semantically incorrect until
+            # NoOpEliminationPass removes it.
             return all_gather, residual_out
 
         pm.register_replacement(
@@ -332,6 +369,129 @@ def replacement(
         )
 
 
+class FirstAllReduceRMSNormStaticNVFP4Pattern(_SequenceParallelPatternHelper):
+    def get_inputs(self) -> list[torch.Tensor]:
+        input = self.empty([8, 16])
+        weight = self.empty([16])
+        input_global_scale = self.empty_f32([1, 1])
+        quant_output = torch.empty([8, 8], device=self.device, dtype=torch.uint8)
+        output_scale = torch.empty([128, 4], device=self.device, dtype=torch.int32)
+        return [input, weight, input_global_scale, quant_output, output_scale]
+
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        def pattern(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+            input_global_scale: torch.Tensor,
+            quant_output: torch.Tensor,
+            output_scale: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            all_reduce = self._all_reduce(input)
+            rms = vllm.ir.ops.rms_norm(all_reduce, weight, self.epsilon)
+            quant = auto_functionalized(
+                SCALED_FP4_QUANT_OUT_OVERLOAD,
+                input=rms,
+                input_scale=input_global_scale,
+                is_sf_swizzled_layout=True,
+                output=quant_output,
+                output_scale=output_scale,
+            )
+            return quant[1], all_reduce, quant[2]
+
+        def replacement(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+            input_global_scale: torch.Tensor,
+            quant_output: torch.Tensor,
+            output_scale: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            reduce_scatter = self._reduce_scatter(input)
+            rms = vllm.ir.ops.rms_norm(reduce_scatter, weight, self.epsilon)
+            rms = torch.ops.aten.view.default(rms, [-1, rms.shape[-1]])
+            quant = SCALED_FP4_QUANT_DEFAULT_OVERLOAD(
+                rms,
+                input_global_scale,
+                True,
+            )
+            return (
+                self._all_gather(quant[0]),
+                reduce_scatter,
+                self._all_gather(quant[1]),
+            )
+
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
+
+
+class MiddleAllReduceRMSNormStaticNVFP4Pattern(_SequenceParallelPatternHelper):
+    def get_inputs(self) -> list[torch.Tensor]:
+        mm_1 = self.empty([8, 16])
+        residual = self.empty([8, 16])
+        rms_norm_weights = self.empty([16])
+        input_global_scale = self.empty_f32([1, 1])
+        quant_output = torch.empty([8, 8], device=self.device, dtype=torch.uint8)
+        output_scale = torch.empty([128, 4], device=self.device, dtype=torch.int32)
+        return [
+            residual,
+            mm_1,
+            rms_norm_weights,
+            input_global_scale,
+            quant_output,
+            output_scale,
+        ]
+
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        def pattern(
+            residual: torch.Tensor,
+            mm_1: torch.Tensor,
+            rms_norm_weights: torch.Tensor,
+            input_global_scale: torch.Tensor,
+            quant_output: torch.Tensor,
+            output_scale: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            all_reduce = self._all_reduce(mm_1)
+            rms, residual_out = vllm.ir.ops.fused_add_rms_norm(
+                all_reduce, residual, rms_norm_weights, self.epsilon
+            )
+            quant = auto_functionalized(
+                SCALED_FP4_QUANT_OUT_OVERLOAD,
+                input=rms,
+                input_scale=input_global_scale,
+                is_sf_swizzled_layout=True,
+                output=quant_output,
+                output_scale=output_scale,
+            )
+            return quant[1], residual_out, quant[2]
+
+        def replacement(
+            residual: torch.Tensor,
+            mm_1: torch.Tensor,
+            rms_norm_weights: torch.Tensor,
+            input_global_scale: torch.Tensor,
+            quant_output: torch.Tensor,
+            output_scale: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            # TP-aware residual slice, matching the non-quantized SP replacement.
+            reduce_scatter = self._reduce_scatter(mm_1)
+            lo = self.tp_rank * reduce_scatter.size(0)
+            residual = residual[lo : lo + reduce_scatter.size(0), ...]
+            rms, residual_out = vllm.ir.ops.fused_add_rms_norm(
+                reduce_scatter, residual, rms_norm_weights, self.epsilon
+            )
+            rms = torch.ops.aten.view.default(rms, [-1, rms.shape[-1]])
+            quant = SCALED_FP4_QUANT_DEFAULT_OVERLOAD(
+                rms,
+                input_global_scale,
+                True,
+            )
+            return self._all_gather(quant[0]), residual_out, self._all_gather(quant[1])
+
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
+
+
 class SequenceParallelismPass(VllmPatternMatcherPass):
     """
     This pass enables sequence parallelism for models.
@@ -357,12 +517,15 @@ class SequenceParallelismPass(VllmPatternMatcherPass):
     gets split across TP ranks, causing size mismatches at subgraph
     boundaries.
 
-    This pass splits up the residual tensor across TP ranks and hence
-    divides its size. Because the pattern matcher starts at the end of
-    the graph, the replacement contains a slice that temporarily conforms
-    the input residual to the correct size. After all patterns have been
-    matched, we use a NoOpEliminationPass to clean up what have now
-    become no-op slices.
+    This pass splits up the residual tensor across TP ranks and hence divides
+    its size. The pattern matcher starts at the end of the graph (last layer
+    first), so when each replacement inserts a residual slice, the preceding
+    layer has not been replaced yet and the slice is correct. Once the
+    preceding layer IS replaced, its residual output shrinks and the slice
+    becomes semantically incorrect (out-of-bounds indices for rank > 0).
+    The graph is never executed in this intermediate state —
+    NoOpEliminationPass removes these slices based on symbolic shape equality
+    (input shape == output shape) before the graph is compiled.
     """
 
     @enable_fake_mode
@@ -404,6 +567,14 @@ def __init__(self, config: VllmConfig) -> None:
                 epsilon, self.model_dtype, self.device
             ).register(self.patterns)
 
+            if "SCALED_FP4_QUANT_OUT_OVERLOAD" in globals():
+                FirstAllReduceRMSNormStaticNVFP4Pattern(
+                    epsilon, self.model_dtype, self.device
+                ).register(self.patterns)
+                MiddleAllReduceRMSNormStaticNVFP4Pattern(
+                    epsilon, self.model_dtype, self.device
+                ).register(self.patterns)
+
             # Normal RMSNorm patterns
             FirstAllReduceRMSNormPattern(
                 epsilon, self.model_dtype, self.device
diff --git a/vllm/compilation/passes/pass_manager.py b/vllm/compilation/passes/pass_manager.py
index 5d4355a5b2b4..4cc6bc9e5f90 100644
--- a/vllm/compilation/passes/pass_manager.py
+++ b/vllm/compilation/passes/pass_manager.py
@@ -33,6 +33,7 @@
     from .fusion.act_quant_fusion import ActivationQuantFusionPass
     from .fusion.attn_quant_fusion import AttnQuantFusionPass
     from .fusion.mla_attn_quant_fusion import MLAAttnQuantFusionPass
+    from .fusion.mla_rope_kvcache_cat_fusion import MLARoPEKVCacheCatFusionPass
     from .fusion.qk_norm_rope_fusion import QKNormRoPEFusionPass
     from .fusion.rms_quant_fusion import RMSNormQuantFusionPass
     from .fusion.rope_kvcache_fusion import RopeKVCacheFusionPass
@@ -174,6 +175,9 @@ def configure(self, config: VllmConfig) -> None:
                 self.passes += [ScatterSplitReplacementPass(config)]
                 self.passes += [RopeKVCacheFusionPass(config)]
 
+            if self.pass_config.fuse_rope_kvcache_cat_mla:
+                self.passes += [MLARoPEKVCacheCatFusionPass(config)]
+
             if self.pass_config.fuse_attn_quant:
                 self.passes += [AttnQuantFusionPass(config)]
                 self.passes += [MLAAttnQuantFusionPass(config)]
diff --git a/vllm/compilation/passes/utility/fix_functionalization.py b/vllm/compilation/passes/utility/fix_functionalization.py
index 15eb23e6f949..2887c19ad4a2 100644
--- a/vllm/compilation/passes/utility/fix_functionalization.py
+++ b/vllm/compilation/passes/utility/fix_functionalization.py
@@ -181,6 +181,45 @@ def __call__(self, graph: torch.fx.Graph) -> None:
                     2: "key",
                 }
                 self.defunctionalize(graph, node, mutated_args=mutated_args)
+            elif (
+                hasattr(torch.ops.vllm, "fused_rope_unified_mla_kv_cache_update")
+                and at_target
+                == torch.ops.vllm.fused_rope_unified_mla_kv_cache_update.default
+            ):
+                # AOTAutograd functionalizes `q[..., nope_dim:] = rope_result` into
+                # a sequence of aten ops on q: view+slice+copy+slice_scatter.
+                # Since the fused MLA RoPE op mutates q_pe in-place, we can remove
+                # the redundant copy and slice_scatter ops during defunctionalization.
+                getitem_nodes = self.getitem_users(node)
+                q_pe_out = getitem_nodes[1]
+
+                for user in list(q_pe_out.users):
+                    if is_func(user, torch.ops.aten.copy.default):
+                        copy_temp = user
+                slice_temp = copy_temp.args[0]
+                for user in list(copy_temp.users):
+                    if is_func(user, torch.ops.aten.slice_scatter.default):
+                        slice_scatter_temp = user
+                view_temp = slice_scatter_temp.args[0]
+
+                view_orig = slice_temp.args[0]
+                slice_scatter_temp.replace_all_uses_with(view_orig)
+                self._remove(slice_scatter_temp)
+                self._remove(copy_temp)
+                self._remove(slice_temp)
+                self._remove(view_temp)
+                self._remove(q_pe_out)
+
+                # defunctionalize k_pe manually; self.replace_users_with_mutated_args
+                # does not support only replacing specific kwargs
+                k_pe_in = node.kwargs["k_pe"]
+                k_pe_out = getitem_nodes[2]
+                k_pe_out.replace_all_uses_with(k_pe_in)
+                self._remove(k_pe_out)
+
+                self.insert_defunctionalized(graph, node)
+                self._remove(node)
+
             # only used for test_functionalization::TestFunctionWithMutatedArgsAndReturn
             elif (
                 hasattr(torch.ops.vllm, "function_with_mutated_args_and_return")
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index d5fa087a329a..efcb736e88f7 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -136,8 +136,10 @@ class PassConfig:
     """Enable flashinfer allreduce fusion."""
     fuse_minimax_qk_norm: bool = None  # type: ignore[assignment]
     """Enable fused allreduce+RMSNorm for MiniMax QK norm."""
-    enable_qk_norm_rope_fusion: bool = False
+    enable_qk_norm_rope_fusion: bool = None  # type: ignore[assignment]
     """Enable fused Q/K RMSNorm + RoPE pass."""
+    fuse_rope_kvcache_cat_mla: bool = None  # type: ignore[assignment]
+    """Enable fused MLA KV cache update with RoPE."""
 
     # ROCm/AITER specific fusions
     fuse_act_padding: bool = None  # type: ignore[assignment]
@@ -228,6 +230,7 @@ def compute_hash(self) -> str:
         "fuse_act_padding",
         "fuse_mla_dual_rms_norm",
         "fuse_rope_kvcache",
+        "fuse_rope_kvcache_cat_mla",
         mode="wrap",
     )
     @classmethod
@@ -285,6 +288,12 @@ def __post_init__(self) -> None:
                 "The fusion will be disabled."
             )
             self.fuse_rope_kvcache = False
+        if self.fuse_rope_kvcache_cat_mla and not current_platform.is_cuda_alike():
+            logger.warning_once(
+                "MLA KV cache update with RoPE fusion enabled but the "
+                "current platform is not CUDA or ROCm. The fusion will be disabled."
+            )
+            self.fuse_rope_kvcache_cat_mla = False
 
     def log_enabled_passes(self) -> None:
         """
diff --git a/vllm/config/load.py b/vllm/config/load.py
index 93240ec5fc0f..07feaeb961ab 100644
--- a/vllm/config/load.py
+++ b/vllm/config/load.py
@@ -9,6 +9,9 @@
 from vllm.logger import init_logger
 from vllm.utils.hashing import safe_hash
 
+DEFAULT_SAFETENSORS_PREFETCH_NUM_THREADS = 8
+DEFAULT_SAFETENSORS_PREFETCH_BLOCK_SIZE = 16 * 1024 * 1024
+
 if TYPE_CHECKING:
     from vllm.model_executor.model_loader import LoadFormats
     from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@@ -79,6 +82,15 @@ class LoadConfig:
       was quantized using torchao and saved using safetensors.
       Needs `torchao >= 0.14.0`.
     """
+    safetensors_prefetch_num_threads: int = Field(
+        default=DEFAULT_SAFETENSORS_PREFETCH_NUM_THREADS, ge=1
+    )
+    """Number of worker threads used to prefetch safetensors checkpoint files
+    into the OS page cache when safetensors prefetching is enabled."""
+    safetensors_prefetch_block_size: int = Field(
+        default=DEFAULT_SAFETENSORS_PREFETCH_BLOCK_SIZE, ge=1
+    )
+    """Read size in bytes for each safetensors checkpoint file prefetch."""
     model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
     """Extra config for model loader. This will be passed to the model loader
     corresponding to the chosen load_format."""
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 53f91e1b650d..d220aa65035d 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -155,6 +155,15 @@ def enable_rope_kvcache_fusion(cfg: "VllmConfig") -> bool:
     )
 
 
+def enable_rope_kvcache_mla_fusion(cfg: "VllmConfig") -> bool:
+    """Enable if graph partition is on or kv-cache update is not a splitting op."""
+
+    return (
+        cfg.compilation_config.use_inductor_graph_partition
+        or not cfg.compilation_config.splitting_ops_contain_kv_cache_update()
+    )
+
+
 def enable_norm_pad_fusion(cfg: "VllmConfig") -> bool:
     """Enable if using AITER RMSNorm and hidden size is 2880 i.e. gpt-oss."""
 
@@ -184,6 +193,7 @@ def enable_mla_dual_rms_norm_fusion(cfg: "VllmConfig") -> bool:
             "fuse_act_padding": False,
             "fuse_mla_dual_rms_norm": False,
             "fuse_rope_kvcache": False,
+            "fuse_rope_kvcache_cat_mla": False,
         },
         "cudagraph_mode": CUDAGraphMode.NONE,
         "use_inductor_graph_partition": False,
@@ -204,6 +214,7 @@ def enable_mla_dual_rms_norm_fusion(cfg: "VllmConfig") -> bool:
             "fuse_act_padding": enable_norm_pad_fusion,
             "fuse_mla_dual_rms_norm": enable_mla_dual_rms_norm_fusion,
             "fuse_rope_kvcache": False,
+            "fuse_rope_kvcache_cat_mla": False,
         },
         "cudagraph_mode": CUDAGraphMode.PIECEWISE,
         "use_inductor_graph_partition": False,
@@ -226,6 +237,7 @@ def enable_mla_dual_rms_norm_fusion(cfg: "VllmConfig") -> bool:
             "fuse_act_padding": enable_norm_pad_fusion,
             "fuse_mla_dual_rms_norm": enable_mla_dual_rms_norm_fusion,
             "fuse_rope_kvcache": enable_rope_kvcache_fusion,
+            "fuse_rope_kvcache_cat_mla": enable_rope_kvcache_mla_fusion,
         },
         "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
         "use_inductor_graph_partition": False,
@@ -248,6 +260,7 @@ def enable_mla_dual_rms_norm_fusion(cfg: "VllmConfig") -> bool:
             "fuse_act_padding": enable_norm_pad_fusion,
             "fuse_mla_dual_rms_norm": enable_mla_dual_rms_norm_fusion,
             "fuse_rope_kvcache": enable_rope_kvcache_fusion,
+            "fuse_rope_kvcache_cat_mla": enable_rope_kvcache_mla_fusion,
         },
         "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
         "use_inductor_graph_partition": False,
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 990c808a9831..9f305c718f9d 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -3,6 +3,8 @@
 
 
 # ===================== import region =====================
+import threading
+
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup, ReduceOp
@@ -145,8 +147,19 @@ def __init__(
 
     def destroy(self):
         if self.available and not self.disabled:
-            with torch.accelerator.device_index(self.device.index):
-                self.nccl.ncclCommDestroy(self.comm)
+            # ncclCommAbort can block until all CUDA graphs that
+            # captured NCCL ops on this comm are destroyed — and
+            # those graphs are released later in this same
+            # main-thread teardown, so a direct call here self-deadlocks.
+            # Run it in a daemon thread with a timeout: the main
+            # thread proceeds, the graphs drop, and the abort returns.
+            def _abort():
+                with torch.accelerator.device_index(self.device.index):
+                    self.nccl.ncclCommAbort(self.comm)
+
+            abort_thread = threading.Thread(target=_abort, daemon=True)
+            abort_thread.start()
+            abort_thread.join(timeout=5.0)
             self.available = False
             self.disabled = True
 
diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py
index 57c7397e01b6..5ca8cc7c77f4 100644
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -290,6 +290,12 @@ class NCCLLibrary:
         # it is better not to call it at all.
         # ncclResult_t  ncclCommDestroy(ncclComm_t comm);
         Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
+        # ncclCommAbort frees resources associated with the communicator
+        # without requiring a collective synchronization. Unlike
+        # ncclCommDestroy, it is safe to call during an uncoordinated
+        # shutdown when peer ranks may already be gone.
+        # ncclResult_t  ncclCommAbort(ncclComm_t comm);
+        Function("ncclCommAbort", ncclResult_t, [ncclComm_t]),
         # ncclResult_t ncclGroupStart();
         Function("ncclGroupStart", ncclResult_t, []),
         # ncclResult_t ncclGroupEnd();
@@ -548,6 +554,9 @@ def ncclBroadcast(
     def ncclCommDestroy(self, comm: ncclComm_t) -> None:
         self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
 
+    def ncclCommAbort(self, comm: ncclComm_t) -> None:
+        self.NCCL_CHECK(self._funcs["ncclCommAbort"](comm))
+
     def ncclGroupStart(self) -> None:
         self.NCCL_CHECK(self._funcs["ncclGroupStart"]())
 
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 9c8bf3ad165c..dc7e6d151a48 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -38,6 +38,11 @@
     is_valid_ipv6_address,
 )
 
+if envs.VLLM_USE_SPINLOOP_EXT:
+    from vllm.spinloop import spinloop
+
+SPINLOOP_TIMEOUT_SECONDS = 0.1
+
 if TYPE_CHECKING:
     from _typeshed import SizedBuffer
 
@@ -540,13 +545,17 @@ def acquire_write(self, timeout: float | None = None):
         n_warning = 1
         while True:
             with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
-                # Memory fence ensures we see the latest read flags from readers.
-                # Without this, we may read stale flags from our CPU cache and
-                # spin indefinitely even though readers have completed.
-                memory_fence()
-                read_count = sum(metadata_buffer[1:])
-                written_flag = metadata_buffer[0]
-                if written_flag and read_count != self.buffer.n_reader:
+
+                def check():
+                    memory_fence()
+                    read_count = sum(metadata_buffer[1:])
+                    written_flag = metadata_buffer[0]
+                    return not (written_flag and read_count != self.buffer.n_reader)
+
+                if envs.VLLM_USE_SPINLOOP_EXT and not check():
+                    spinloop(metadata_buffer, check, timeout=SPINLOOP_TIMEOUT_SECONDS)
+
+                if not check():
                     # this block is written and not read by all readers
                     # for writers, `self.current_idx` is the next block to write
                     # if this block is not ready to write,
@@ -657,13 +666,21 @@ def acquire_read(
         )
         with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
             while True:
-                # Memory fence ensures we see the latest writes from the writer.
-                # Without this, we may read stale flags from our CPU cache
-                # and spin indefinitely even though writer has updated them.
-                memory_fence()
-                read_flag = metadata_buffer[self.local_reader_rank + 1]
-                written_flag = metadata_buffer[0]
-                if not written_flag or read_flag:
+
+                def check():
+                    memory_fence()
+                    read_flag = metadata_buffer[self.local_reader_rank + 1]
+                    written_flag = metadata_buffer[0]
+                    return not (not written_flag or read_flag)
+
+                if envs.VLLM_USE_SPINLOOP_EXT and not check():
+                    spinloop(
+                        metadata_buffer[0 : self.local_reader_rank + 1],
+                        check,
+                        timeout=SPINLOOP_TIMEOUT_SECONDS,
+                    )
+
+                if not check():
                     # this block is either
                     # (1) not written
                     # (2) already read by this reader
diff --git a/vllm/distributed/eplb/eplb_communicator.py b/vllm/distributed/eplb/eplb_communicator.py
index 95a5ae5ff45d..f8ee90b934fb 100644
--- a/vllm/distributed/eplb/eplb_communicator.py
+++ b/vllm/distributed/eplb/eplb_communicator.py
@@ -19,6 +19,7 @@
     batch_isend_irecv,
 )
 
+import vllm.distributed.nixl_utils as nixl_utils
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.pynccl_wrapper import (
     ncclDataTypeEnum,
@@ -37,9 +38,7 @@
 
 def has_nixl() -> bool:
     """Whether the optional NIXL / RIXL package is available."""
-    from vllm.distributed.nixl_utils import NixlWrapper
-
-    return NixlWrapper is not None
+    return nixl_utils.NixlWrapper is not None
 
 
 class EplbCommunicator(ABC):
@@ -233,10 +232,9 @@ def __init__(
         expert_weights: Sequence[torch.Tensor],
         cuda_stream: torch.cuda.Stream | None = None,
     ) -> None:
-        from vllm.distributed.nixl_utils import NixlWrapper, nixl_agent_config
-
         assert expert_weights, "NixlEplbCommunicator requires non-empty expert_weights."
-        if NixlWrapper is None:
+        nixl_wrapper_cls = nixl_utils.NixlWrapper
+        if nixl_wrapper_cls is None:
             raise RuntimeError("NIXL/ RIXL is unavailable.")
         self._cpu_group = cpu_group
         self._cuda_stream = cuda_stream
@@ -254,12 +252,13 @@ def __init__(
                 f"expected={self._device}, got={tensor.device}"
             )
 
+        nixl_agent_config = nixl_utils.nixl_agent_config
         config = (
             nixl_agent_config(capture_telemetry=False)
             if nixl_agent_config is not None
             else None
         )
-        self._nixl_wrapper = NixlWrapper(self._make_agent_name(), config)
+        self._nixl_wrapper = nixl_wrapper_cls(self._make_agent_name(), config)
         self._nixl_memory_type = "VRAM"
         self._registered_desc: object | None = None
         self._remote_agents: dict[int, str] = {}
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index f691b9f18e92..b1ecf3c56ade 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -44,14 +44,12 @@ def create_connector(
         cls,
         config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: "KVCacheConfig | None" = None,
+        kv_cache_config: "KVCacheConfig",
     ) -> KVConnectorBase:
         kv_transfer_config = config.kv_transfer_config
         if kv_transfer_config is None:
             raise ValueError("kv_transfer_config must be set to create a connector")
-        connector_cls, compat_sig = cls._get_connector_class_with_compat(
-            kv_transfer_config
-        )
+        connector_cls = cls.get_connector_class(kv_transfer_config)
 
         # check if the connector supports HMA
         hma_enabled = not config.scheduler_config.disable_hybrid_kv_cache_manager
@@ -74,12 +72,7 @@ def create_connector(
         # - Co-locate with worker process
         # - Should only be used inside the forward context & attention layer
         # We build separately to enforce strict separation
-        if compat_sig:
-            # Old signature: __init__(self, vllm_config, role)
-            return connector_cls(config, role)
-        else:
-            # New signature: __init__(self, vllm_config, role, kv_cache_config)
-            return connector_cls(config, role, kv_cache_config)
+        return connector_cls(config, role, kv_cache_config)
 
     @classmethod
     def get_connector_class_by_name(
@@ -100,13 +93,12 @@ def get_connector_class_by_name(
         return cls._registry[connector_name]()
 
     @classmethod
-    def _get_connector_class_with_compat(
+    def get_connector_class(
         cls, kv_transfer_config: "KVTransferConfig"
-    ) -> tuple[type[KVConnectorBaseType], bool]:
+    ) -> type[KVConnectorBaseType]:
         connector_name = kv_transfer_config.kv_connector
         if connector_name is None:
             raise ValueError("Connector name is not set in KVTransferConfig")
-        compat_sig = False
         connector_module_path = kv_transfer_config.kv_connector_module_path
         if connector_module_path is not None and not connector_module_path:
             raise ValueError("kv_connector_module_path cannot be an empty string.")
@@ -121,24 +113,18 @@ def _get_connector_class_with_compat(
                 ) from e
             connector_cls = cast(type[KVConnectorBaseType], connector_cls)
             if not supports_kw(connector_cls, "kv_cache_config"):
-                compat_sig = True
-                logger.warning(
-                    "Connector %s uses deprecated signature with 2 required arguments. "
-                    "Please update to include kv_cache_config as the second argument.",
-                    connector_cls.__name__,
+                msg = (
+                    f"Connector {connector_cls.__name__} uses deprecated "
+                    "2-argument constructor signature. External v1 KV "
+                    "connectors must accept kv_cache_config as the third "
+                    "constructor argument and pass it to super().__init__()."
                 )
+                logger.error(msg)
+                raise ValueError(msg)
         elif connector_name in cls._registry:
             connector_cls = cls._registry[connector_name]()
         else:
             raise ValueError(f"Unsupported connector type: {connector_name}")
-        return connector_cls, compat_sig
-
-    @classmethod
-    def get_connector_class(
-        cls, kv_transfer_config: "KVTransferConfig"
-    ) -> type[KVConnectorBaseType]:
-        """Get the connector class by name."""
-        connector_cls, _ = cls._get_connector_class_with_compat(kv_transfer_config)
         return connector_cls
 
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index b85416ab3071..a86b25c75a4f 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -593,7 +593,7 @@ def describe(self, remote_engine_id: EngineId) -> str:
         return (
             f"TransferTopology("
             f"tp_ratio={self.tp_ratio(info.remote_tp_size)}, "
-            f"K={self.total_num_kv_heads}, "
+            f"num_kv_heads={self.total_num_kv_heads if not self.is_mla else 1}, "
             f"local_tp={self.tp_size}, "
             f"remote_tp={info.remote_tp_size}, "
             f"local_rank={self.tp_rank}, "
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index ef143cba7fb5..17eb591dad88 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -184,7 +184,7 @@ def __init__(
         self,
         vllm_config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: "KVCacheConfig | None" = None,
+        kv_cache_config: "KVCacheConfig",
     ):
         logger.warning(
             "Initializing KVConnectorBase_V1. This API is experimental and "
@@ -197,13 +197,6 @@ def __init__(
         else:
             raise ValueError("kv_transfer_config must be set for KVConnectorBase_V1")
         self._kv_cache_config = kv_cache_config
-        if self._kv_cache_config is None:
-            logger.warning(
-                "KVConnectorBase_V1 initialized without kv_cache_config. "
-                "This is deprecated - please update your connector to accept "
-                "kv_cache_config as the third constructor argument and pass it "
-                "to super().__init__()."
-            )
         self._role = role
 
     @property
@@ -517,6 +510,14 @@ def build_connector_meta(
         """
         pass
 
+    def on_new_request(self, request: "Request") -> None:
+        """Called by the scheduler when a new request is added.
+
+        Connectors can override this to inspect the request and perform
+        bookkeeping. The default implementation is a no-op.
+        """
+        return
+
     def update_connector_output(self, connector_output: KVConnectorOutput):
         """
         Update KVConnector state from worker-side connectors output.
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
index 9a39ec658fff..0f835b1eebba 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
@@ -87,7 +87,7 @@ def __init__(
         self,
         vllm_config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: "KVCacheConfig | None" = None,
+        kv_cache_config: "KVCacheConfig",
     ):
         super().__init__(vllm_config, role, kv_cache_config)
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
index 24e156561dfb..a0279120ef03 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
@@ -92,7 +92,7 @@ def __init__(
         self,
         vllm_config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: "KVCacheConfig | None" = None,
+        kv_cache_config: "KVCacheConfig",
     ):
         super().__init__(
             vllm_config=vllm_config,
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/example_hidden_states_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/example_hidden_states_connector.py
index fcd1f365a715..bf56db32e4f8 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/example_hidden_states_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/example_hidden_states_connector.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import safetensors
 import torch
@@ -120,7 +120,7 @@ def __init__(
         self,
         vllm_config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: Optional["KVCacheConfig"] = None,
+        kv_cache_config: "KVCacheConfig",
     ):
         super().__init__(
             vllm_config=vllm_config,
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
index f55f04a08252..d7d2a55376c7 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
@@ -476,7 +476,7 @@ def __init__(
         self,
         vllm_config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: "KVCacheConfig | None" = None,
+        kv_cache_config: "KVCacheConfig",
     ):
         super().__init__(vllm_config, role, kv_cache_config)
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py
index 608fd8784778..c693902e6132 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py
@@ -342,7 +342,7 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         role: KVConnectorRole,
-        kv_cache_config: "KVCacheConfig | None" = None,
+        kv_cache_config: "KVCacheConfig",
     ):
         super().__init__(vllm_config, role, kv_cache_config)
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
index baedea775cdf..c6f41bf6670c 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
@@ -93,9 +93,9 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         role: KVConnectorRole,
-        kv_cache_config: "KVCacheConfig | None" = None,
+        kv_cache_config: "KVCacheConfig",
     ):
-        super().__init__(vllm_config, role)
+        super().__init__(vllm_config, role, kv_cache_config)
         assert vllm_config.kv_transfer_config is not None, (
             "kv_transfer_config must be set for MoRIIOConnector"
         )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index a340f313e0a2..aaf8f5fe6b93 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -389,6 +389,10 @@ def update_state_after_alloc(
                 # Call with empty blocks for other connectors.
                 c.update_state_after_alloc(request, empty_blocks, 0)
 
+    def on_new_request(self, request: "Request") -> None:
+        for c in self._connectors:
+            c.on_new_request(request)
+
     def build_connector_meta(
         self, scheduler_output: SchedulerOutput
     ) -> MultiKVConnectorMetadata:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/connector.py
index 53ad031a4c50..187322b4ae4e 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/connector.py
@@ -44,6 +44,7 @@
 from vllm.v1.attention.backends.utils import get_kv_cache_layout
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import MambaSpec
+from vllm.v1.outputs import KVConnectorOutput
 
 if TYPE_CHECKING:
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
@@ -156,6 +157,14 @@ def build_connector_meta(
         assert self.connector_scheduler is not None
         return self.connector_scheduler.build_connector_meta(scheduler_output)
 
+    def on_new_request(self, request: "Request") -> None:
+        assert self.connector_scheduler is not None
+        self.connector_scheduler.on_new_request(request)
+
+    def update_connector_output(self, connector_output: KVConnectorOutput):
+        assert self.connector_scheduler is not None
+        self.connector_scheduler.update_connector_output(connector_output)
+
     def request_finished(
         self,
         request: "Request",
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/metadata.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/metadata.py
index c56e373ba99d..b9e3436f5019 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/metadata.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/metadata.py
@@ -6,7 +6,7 @@
 from typing import Any
 
 from vllm.config import VllmConfig
-from vllm.distributed.kv_transfer.kv_connector.utils import BlockIds
+from vllm.distributed.kv_transfer.kv_connector.utils import BlockIds, EngineId
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorHandshakeMetadata,
     KVConnectorMetadata,
@@ -33,8 +33,9 @@
 #   1: Initial version with compatibility checking
 #   2: Add remote_request_id to kv_transfer_params
 #   3: Add physical_blocks_per_logical_kv_block to NixlAgentMetadata
+#   4: Add KV block lease renewal through heartbeats
 #
-NIXL_CONNECTOR_VERSION: int = 3
+NIXL_CONNECTOR_VERSION: int = 4
 
 
 @dataclass
@@ -133,6 +134,16 @@ def compute_nixl_compatibility_hash(
     return compat_hash
 
 
+@dataclass
+class HeartbeatInfo:
+    """Heartbeat data for a single remote engine, sent from D worker to P."""
+
+    req_ids: set[ReqId]  # Remote request ids whose leases are being renewed.
+    host: str  # Remote engine host (scheduler fills it from ``remote_host``).
+    port: int  # Remote engine port (scheduler fills it from ``remote_port``).
+    tp_size: int  # TP size from the request's ``tp_size`` transfer param.
+
+
 @dataclass
 class RemoteMeta:
     block_ids: BlockIds
@@ -158,6 +169,8 @@ def __init__(self):
         self.reqs_to_send: dict[ReqId, float] = {}
         self.reqs_in_batch: set[ReqId] = set()
         self.reqs_not_processed: set[ReqId] = set()
+        # Heartbeat data grouped by remote engine, sent by D worker to P.
+        self.heartbeat_by_engine: dict[EngineId, HeartbeatInfo] = {}
 
     def _add_new_req(
         self,
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/scheduler.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/scheduler.py
index 02c418ebd8d7..b2122ed0d30b 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/scheduler.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/scheduler.py
@@ -21,6 +21,7 @@
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl.metadata import (
     GET_META_MSG,
+    HeartbeatInfo,
     NixlConnectorMetadata,
     NixlHandshakePayload,
     ReqId,
@@ -41,6 +42,7 @@
     from vllm.config import VllmConfig
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
     from vllm.v1.kv_cache_interface import KVCacheConfig
+    from vllm.v1.outputs import KVConnectorOutput
     from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -65,6 +67,13 @@ def __init__(
             + vllm_config.parallel_config.data_parallel_index
         )
         assert vllm_config.kv_transfer_config is not None
+        self._kv_lease_duration: int = (
+            vllm_config.kv_transfer_config.get_from_extra_config(
+                "kv_lease_duration", 30
+            )
+        )
+        # NOTE (NickLucche): For now we use a hardcoded value for a simpler interface.
+        self._heartbeat_interval = self._kv_lease_duration // 6
         if current_platform.device_type == "cpu":
             self.use_host_buffer = False
         else:
@@ -104,6 +113,13 @@ def __init__(
         # remote prefill or aborted.
         self._reqs_not_processed: set[ReqId] = set()
 
+        # Heartbeat tracking: requests needing periodic lease-renewal heartbeats to
+        # remote P-side, stored as ready-to-send HeartbeatInfo grouped by remote engine
+        self._heartbeat_by_engine: dict[EngineId, HeartbeatInfo] = {}
+        # Reverse lookup: local req_id -> (engine_id, remote_req_id) for O(1) removal
+        self._heartbeat_req_engine: dict[ReqId, tuple[EngineId, ReqId]] = {}
+        self._last_heartbeat_time: float = 0.0
+
         # Gather Sliding Window sizes for each kv cache group (if any) in number of
         # blocks per KV cache group. This is used to clip the local attention window.
         sw_sizes_tokens: list[tuple[int, int]] = [
@@ -135,12 +151,19 @@ def __init__(
                 "bidirectional_kv_xfer", False
             )
         )
+        self.decoder_kv_blocks_ttl = (
+            vllm_config.kv_transfer_config.get_from_extra_config(
+                "decoder_kv_blocks_ttl", 480
+            )
+        )
 
         if self.is_bidirectional_kv_xfer_enabled and self.kv_recompute_threshold > 0:
             logger.info(
                 "Bidirectional KV transfer is enabled and the kv "
-                "recompute threshold is set to %d tokens",
+                "recompute threshold is set to %d tokens. "
+                "KV blocks on D are released after a TTL of %d seconds.",
                 self.kv_recompute_threshold,
+                self.decoder_kv_blocks_ttl,
             )
 
     def shutdown(self):
@@ -149,6 +172,50 @@ def shutdown(self):
             self._nixl_handshake_listener_t.join()
             self._nixl_handshake_listener_t = None
 
+    def on_new_request(self, request: "Request") -> None:
+        """Register *request* for lease-renewal heartbeats, if applicable.
+
+        Only requests flagged ``do_remote_prefill`` are tracked, and only when every
+        remote field (engine id, request id, host, port, TP size) is present in
+        their ``kv_transfer_params``. Anything else is silently ignored.
+
+        Args:
+            request: Newly added request whose ``kv_transfer_params`` are inspected.
+        """
+        params = request.kv_transfer_params
+        # NOTE (NickLucche) This excludes requests meant for P, i.e. heartbeats are
+        # effectively disabled for Bidirectional KV transfer.
+        if params is None or not params.get("do_remote_prefill"):
+            return
+        engine_id = params.get("remote_engine_id")
+        remote_req_id = params.get("remote_request_id")
+        host = params.get("remote_host")
+        port = params.get("remote_port")
+        tp_size = params.get("tp_size")
+        # Only track if all required remote fields are present.
+        required = (engine_id, remote_req_id, host, port, tp_size)
+        if any(value is None for value in required):
+            return
+
+        info = self._heartbeat_by_engine.get(engine_id)
+        if info is None:
+            # First tracked request for this engine: remember how to reach it.
+            info = HeartbeatInfo(req_ids=set(), host=host, port=port, tp_size=tp_size)
+            self._heartbeat_by_engine[engine_id] = info
+        info.req_ids.add(remote_req_id)
+        # Reverse entry so the request can be untracked later by its local id.
+        self._heartbeat_req_engine[request.request_id] = (engine_id, remote_req_id)
+
+    def _stop_heartbeat(self, req_id: ReqId) -> None:
+        """Untrack local *req_id* from heartbeats; a no-op when it is not tracked."""
+        if key := self._heartbeat_req_engine.pop(req_id, None):
+            engine_id, remote_id = key
+            if info := self._heartbeat_by_engine.get(engine_id):
+                info.req_ids.discard(remote_id)  # the set holds remote ids
+                if not info.req_ids:
+                    # Drop engines left with no requests so a dead remote leaks no key.
+                    del self._heartbeat_by_engine[engine_id]
+
     def get_sw_clipped_blocks(self, block_ids: BlockIds) -> BlockIds:
         """
         Clip the number of blocks to the sliding window size for each kv cache group
@@ -209,6 +276,7 @@ def set_xfer_handshake_metadata(
                     encoded_data,
                     ready_event,
                     self._stop_event,
+                    self.side_channel_host,
                     self.side_channel_port,
                 ),
                 daemon=True,
@@ -222,6 +290,7 @@ def _nixl_handshake_listener(
         encoded_data: dict[int, Any],
         ready_event: threading.Event,
         stop_event: threading.Event,
+        host: str,
         port: int,
     ):
         """Background thread for getting new NIXL handshakes."""
@@ -229,7 +298,6 @@ def _nixl_handshake_listener(
         # to a better approach via HTTP endpoint soon.
 
         # Listen for new requests for metadata.
-        host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
         path = make_zmq_path("tcp", host, port)
         logger.debug("Starting listening on path: %s", path)
         with zmq_ctx(zmq.ROUTER, path) as sock:
@@ -489,6 +557,13 @@ def build_connector_meta(
         meta.reqs_in_batch = self._reqs_in_batch
         meta.reqs_not_processed = self._reqs_not_processed
 
+        # Package heartbeats, throttled by heartbeat_interval.
+        if self._heartbeat_by_engine:
+            now = time.perf_counter()
+            if now - self._last_heartbeat_time >= self._heartbeat_interval:
+                self._last_heartbeat_time = now
+                meta.heartbeat_by_engine = self._heartbeat_by_engine
+
         # Clear the list once workers start the transfers
         self._reqs_need_recv.clear()
         self._reqs_in_batch = set()
@@ -497,6 +572,11 @@ def build_connector_meta(
 
         return meta
 
+    def update_connector_output(self, connector_output: "KVConnectorOutput") -> None:
+        """Stop heartbeating for requests the workers reported in finished_recving."""
+        for req_id in connector_output.finished_recving or ():  # tolerate None/empty
+            self._stop_heartbeat(req_id)
+
     def request_finished(
         self,
         request: "Request",
@@ -522,10 +602,16 @@ def request_finished(
         is_p_node = bool(params.get("do_remote_decode"))
         is_d_node = not is_p_node
 
+        # Stop heartbeating for aborted requests that never reached finished_recving:
+        # normal path cleans up in update_connector_output.
+        self._stop_heartbeat(request.request_id)
+
         if params.get("do_remote_prefill"):
             # If do_remote_prefill is still True when the request is finished,
             # update_state_after_alloc must not have been called (the request
-            # must have been aborted before it was scheduled).
+            # must have been aborted before it was scheduled, e.g. via the
+            # abort_immediately path used to clean up KV-transfer requests
+            # rejected at the D-side serving layer).
             # To avoid stranding the prefill blocks in the prefill instance,
             # we must add empty block_ids to _reqs_need_recv so that our
             # worker side will notify and free blocks in the prefill instance.
@@ -553,14 +639,19 @@ def request_finished(
         remote_num_tokens = 0
         if delay_free_blocks:
             # Prefill request on remote. It will be read from D upon completion
+            request_kv_blocks_ttl = self._kv_lease_duration
+            if is_d_node:
+                # For blocks pinned on D, use a simpler timeout for now instead of a
+                # lease mechanism as turn2 request is client-driven.
+                request_kv_blocks_ttl = self.decoder_kv_blocks_ttl
             logger.debug(
                 "NIXLConnector request_finished(%s) waiting for %d seconds "
-                "for remote decode to fetch blocks",
+                "before releasing blocks",
                 request.request_id,
-                envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT,
+                request_kv_blocks_ttl,
             )
             self._reqs_need_send[request.request_id] = (
-                time.perf_counter() + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT
+                time.perf_counter() + request_kv_blocks_ttl
             )
             # NOTE HMA will "mark" empty/null blocks in groups with 0s (eg SWA ones),
             # trimming down after allocating for the whole sequence length. Empty
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/stats.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/stats.py
index 65c553cfec30..1e4f5c48e0f7 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/stats.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/stats.py
@@ -4,7 +4,7 @@
 
 import copy
 from dataclasses import dataclass
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 
@@ -15,9 +15,11 @@
     PromMetric,
     PromMetricT,
 )
-from vllm.distributed.nixl_utils import nixlXferTelemetry
 from vllm.v1.metrics.utils import create_metric_per_engine
 
+if TYPE_CHECKING:
+    from vllm.distributed.nixl_utils import nixlXferTelemetry
+
 
 @dataclass
 class NixlKVConnectorStats(KVConnectorStats):
@@ -40,7 +42,7 @@ def reset(self):
             "num_kv_expired_reqs": [],
         }
 
-    def record_transfer(self, res: nixlXferTelemetry):
+    def record_transfer(self, res: "nixlXferTelemetry"):
         # Keep metrics units consistent with rest of the code: time us->s
         self.data["transfer_duration"].append(res.xferDuration / 1e6)
         self.data["post_duration"].append(res.postDuration / 1e6)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/tp_mapping.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/tp_mapping.py
index 7115b8bed543..b034b7605087 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/tp_mapping.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/tp_mapping.py
@@ -10,6 +10,7 @@
 
 from vllm.distributed.kv_transfer.kv_connector.utils import (
     BlockIds,
+    TransferTopology,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheSpec, MambaSpec
 
@@ -62,11 +63,8 @@ class TPMapping:
 
 
 def compute_tp_mapping(
-    tp_rank: int,
-    tp_size: int,
+    transfer_topology: TransferTopology,
     remote_tp_size: int,
-    is_mla: bool,
-    total_num_kv_heads: int,
     group_spec_types: tuple[type[KVCacheSpec], ...],
 ) -> TPMapping:
     """Build the complete local-to-remote TP mapping.
@@ -74,13 +72,15 @@ def compute_tp_mapping(
     Computes source ranks, head slot assignments, and the rank offset
     factor in a single pass.
     """
+    tp_rank = transfer_topology.tp_rank
+    tp_size = transfer_topology.tp_size
+    total_num_kv_heads = transfer_topology.total_num_kv_heads
     # --- Attention source ranks ---
-    if is_mla:
-        # All heads replicated across all ranks.
-        attn_ranks = [0]
-    elif tp_size >= remote_tp_size:
+    if transfer_topology.is_mla or tp_size >= remote_tp_size:
         # D (local TP) > P (remote TP): multiple local ranks read different chunks from
         # *one* remote rank, corresponding to different kv heads.
+        # For MLA, one remote rank suffices since the cache is duplicated. When P TP
+        # is k * D TP, this spreads local MLA ranks to read from remote rank k*tp_rank.
         attn_ranks = [tp_rank * remote_tp_size // tp_size]
     else:
         # P (remote TP) > D (local TP): one local rank
@@ -123,7 +123,7 @@ def compute_tp_mapping(
     }
 
     # --- Rank offset factor ---
-    if is_mla or tp_size <= remote_tp_size:
+    if transfer_topology.is_mla or tp_size <= remote_tp_size:
         # We don't index into remote for reading, no offset needed.
         rank_offset_factor = 0
     elif tp_size > total_num_kv_heads:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/utils.py
index d0b72464a27b..2fa3829eaecb 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/utils.py
@@ -10,6 +10,7 @@
 
 from vllm.platforms import current_platform
 from vllm.utils.network_utils import make_zmq_socket
+from vllm.v1.kv_cache_interface import KVCacheSpec, UniformTypeKVCacheSpecs
 
 # Supported platforms and types of kv transfer buffer.
 # {device: tuple of supported kv buffer types}
@@ -46,3 +47,11 @@ def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]:
     finally:
         if ctx is not None:
             ctx.destroy(linger=0)
+
+
+def get_representative_spec_type(spec: KVCacheSpec) -> type[KVCacheSpec]:
+    """Return the spec's type, unwrapping uniform-type wrappers to the inner type."""
+    if not isinstance(spec, UniformTypeKVCacheSpecs):
+        return type(spec)
+    # All inner specs are the same type; pick any.
+    return type(next(iter(spec.kv_cache_specs.values())))
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py
index caa5f432c5e0..f7d46af20e01 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py
@@ -18,7 +18,6 @@
 import torch
 import zmq
 
-from vllm import envs
 from vllm.distributed.kv_transfer.kv_connector.utils import (
     BlockIds,
     EngineId,
@@ -53,6 +52,7 @@
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl.utils import (
     _NIXL_SUPPORTED_DEVICE,
+    get_representative_spec_type,
     zmq_ctx,
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.ssm_conv_transfer_utils import (
@@ -100,24 +100,24 @@ def _compute_desc_ids(
         num_blocks = dst_num_blocks
         if block_size_ratio is not None:
             num_blocks = int(num_blocks * block_size_ratio)
-        ratio = physical_blocks_per_logical
-        logical_blocks = num_blocks // ratio
-
         num_fa_descs = num_fa_regions * num_blocks
 
         # All-attention fast path: single vectorized broadcast.
         if num_ssm_regions == 0:
+            # NOTE (NickLucche) With HMA, every kv group has the same number of layers
+            # and layers from different groups share the same kv tensor.
+            # eg block_ids=[[1, 2], [3]]->blocks [1, 2] need to be
+            # read across all regions, same for [3], but group0-group1 blocks will
+            # always differ (different areas). Therefore we can just flatten the
+            # block_ids and compute the descs ids for all groups at once.
             block_arr = np.concatenate(block_ids)[None, :]
             region_ids = np.arange(num_fa_regions)[:, None]
             return (region_ids * num_blocks + block_arr).flatten()
 
-        # NOTE (NickLucche) With HMA, every kv group has the same number
-        # of layers and layers from different groups share the same kv
-        # tensor.  Therefore we compute desc IDs per group using the
-        # right stride:
-        # FA descs have num_blocks entries per region (kernel granularity),
-        # SSM descs have logical_blocks entries per region (no kernel
-        # splitting).
+        # Compute desc ids per group using the right stride: FA descs have
+        # num_blocks entries per region (kernel granularity), SSM descs have
+        # logical_blocks entries per region (no kernel splitting).
+        logical_blocks = num_blocks // physical_blocks_per_logical
         all_descs: list[np.ndarray] = []
         for i, group in enumerate(block_ids):
             group_arr = np.asarray(group)
@@ -197,7 +197,8 @@ def __init__(
         engine_id: str,
         kv_cache_config: "KVCacheConfig",
     ):
-        if NixlWrapper is None:
+        nixl_wrapper_cls = NixlWrapper
+        if nixl_wrapper_cls is None:
             logger.error("NIXL is not available")
             raise RuntimeError("NIXL is not available")
         logger.info("Initializing NIXL wrapper")
@@ -215,6 +216,12 @@ def __init__(
         self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config(
             "backends", ["UCX"]
         )
+        kv_lease_duration: int = vllm_config.kv_transfer_config.get_from_extra_config(
+            "kv_lease_duration", 30
+        )
+        # NOTE (NickLucche): For now we use a hardcoded value for a simpler interface.
+        self._lease_extension = kv_lease_duration * 2 // 3
+
         self._is_hma_required = (
             not vllm_config.scheduler_config.disable_hybrid_kv_cache_manager
             and any(
@@ -283,7 +290,7 @@ def __init__(
                 else nixl_agent_config(num_threads=num_threads, capture_telemetry=True)
             )
 
-        self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config)
+        self.nixl_wrapper = nixl_wrapper_cls(str(uuid.uuid4()), config)
         # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}.
         self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict)
 
@@ -381,10 +388,12 @@ def __init__(
         # Set of requests that have been part of a batch, regardless of status.
         self._reqs_to_process: set[ReqId] = set()
 
-        # invalid blocks from failed NIXL operations
-        self._invalid_block_ids: set[int] = set()
+        # Invalid blocks from failed NIXL operations (thread-safe queue of block ids)
+        self._invalid_block_ids: queue.Queue[set[int]] = queue.Queue()
         # requests that skipped transfer (handshake or transfer failures)
-        self._failed_recv_reqs: set[ReqId] = set()
+        # Uses Queue for thread-safe cross-thread coordination with the
+        # background handshake thread, matching the _ready_requests pattern.
+        self._failed_recv_reqs: queue.Queue[ReqId] = queue.Queue()
 
         # Handshake metadata of this worker for NIXL transfers.
         self.xfer_handshake_metadata: NixlHandshakePayload | None = None
@@ -411,7 +420,10 @@ def __init__(
 
         self.kv_cache_layout = get_kv_cache_layout()
         self.host_buffer_kv_cache_layout = self.kv_cache_layout
-        logger.info("Detected attention backend %s", self.backend_name)
+        logger.info(
+            "Detected attention backend(s) %s",
+            [backend.get_name() for backend in self.attn_backends],
+        )
         logger.info("Detected kv cache layout %s", self.kv_cache_layout)
 
         # lazy initialized in register_kv_caches
@@ -426,8 +438,10 @@ def __init__(
         self._physical_blocks_per_logical_kv_block = 1
         self._sync_block_size_with_kernel()
 
+        # Unwrap UniformTypeKVCacheSpecs to get the representative spec type
         self._group_spec_types = tuple(
-            type(g.kv_cache_spec) for g in self.kv_cache_config.kv_cache_groups
+            get_representative_spec_type(g.kv_cache_spec)
+            for g in self.kv_cache_config.kv_cache_groups
         )
 
         # Per-engine TP mappings. Generated during handshake.
@@ -681,23 +695,38 @@ def _log_failure(
             stacklevel=2,
         )
 
-    def _background_nixl_handshake(
-        self, req_id: str, remote_engine_id: EngineId, meta: ReqMeta
-    ):
-        # Do NIXL handshake in background and add to _ready_requests when done.
-        fut = self._handshake_futures.get(remote_engine_id)
-        if fut is None:
-            assert meta.remote is not None
+    def _ensure_handshake(
+        self,
+        engine_id: EngineId,
+        host: str,
+        port: int,
+        tp_size: int,
+    ) -> Future[dict[int, str]] | None:
+        """
+        Ensure a handshake is in-flight (or already done) for *engine_id*.
+
+        Returns the ``Future`` if a handshake is pending (or was just
+        started), or ``None`` if the handshake already completed
+        successfully.  Callers can attach per-request callbacks to the
+        returned future.
+        Failures to handshake are logged and the request is marked as failed.
+        """
+        with self._handshake_lock:
+            if engine_id in self._remote_agents:
+                return None
+            fut = self._handshake_futures.get(engine_id)
+            if fut is not None:
+                return fut
             fut = self._handshake_initiation_executor.submit(
                 self._nixl_handshake,
-                meta.remote.host,
-                meta.remote.port,
-                meta.tp_size,
-                remote_engine_id,
+                host,
+                port,
+                tp_size,
+                engine_id,
             )
-            self._handshake_futures[remote_engine_id] = fut
+            self._handshake_futures[engine_id] = fut
 
-            def done_callback(f: Future[dict[int, str]], eid=remote_engine_id):
+            def done_callback(f: Future[dict[int, str]], eid=engine_id):
                 with self._handshake_lock:
                     del self._handshake_futures[eid]
                     try:
@@ -711,26 +740,37 @@ def done_callback(f: Future[dict[int, str]], eid=remote_engine_id):
                         )
 
             fut.add_done_callback(done_callback)
+            return fut
+
+    def _background_nixl_handshake(
+        self, req_id: str, remote_engine_id: EngineId, meta: ReqMeta
+    ):
+        # Do NIXL handshake in background and add to _ready_requests when done.
+        assert meta.remote is not None
+        fut = self._ensure_handshake(
+            remote_engine_id,
+            meta.remote.host,
+            meta.remote.port,
+            meta.tp_size,
+        )
+        if fut is None:
+            # Already handshaked — only happens if caller does not pre-check.
+            self._ready_requests.put((req_id, meta))
+            return
 
-        # check handshake success before proceeding with request
+        # Check handshake success before proceeding with request.
         def request_ready(f: Future[Any], entry=(req_id, meta)):
             try:
-                # check if handshake succeeded
                 f.result()
                 self._ready_requests.put(entry)
             except Exception as e:
-                # handshake failed - mark blocks as invalid
                 self._log_failure(
                     failure_type="handshake_failed",
                     req_id=req_id,
                     error=e,
                     meta=meta,
                 )
-                if (
-                    req_meta := self._recving_metadata.get(req_id)
-                ) and not self._is_hma_required:
-                    self._invalid_block_ids.update(req_meta.local_block_ids[0])
-                self._failed_recv_reqs.add(req_id)
+                self._handle_failed_transfer(req_id, None)
 
         fut.add_done_callback(request_ready)
 
@@ -866,9 +906,21 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
                 else:
                     self.block_len_per_layer.append(physical_page_size)
 
-                assert cache.shape[0] == num_blocks, (
-                    "All kv cache tensors must have the same number of blocks"
-                )
+                if cache.shape[0] != num_blocks:
+                    raise AssertionError(
+                        "All kv cache tensors must have the same number of "
+                        f"blocks; layer={layer_name}, "
+                        f"expected_num_blocks={num_blocks}, "
+                        f"cache_shape={tuple(cache.shape)}, "
+                        f"cache_stride={tuple(cache.stride())}, "
+                        f"layer_spec={type(layer_spec).__name__}, "
+                        f"backend={self.backend_name}, "
+                        "all_backends="
+                        f"{[backend.get_name() for backend in self.attn_backends]}, "
+                        f"kv_cache_layout={self.kv_cache_layout}, "
+                        "blocks_first="
+                        f"{self.transfer_topo.is_kv_layout_blocks_first}"
+                    )
 
                 if not self.use_mla:
                     # Different kv cache shape is not supported by HeteroTP.
@@ -1259,12 +1311,9 @@ def add_remote_agent(
         logger.info("Transfer plan: %s", transfer_topo.describe(engine_id))
 
         self.tp_mappings[engine_id] = compute_tp_mapping(
-            transfer_topo.tp_rank,
-            transfer_topo.tp_size,
-            transfer_info.remote_tp_size,
-            transfer_topo.is_mla,
-            transfer_topo.total_num_kv_heads,
-            self._group_spec_types,
+            transfer_topology=transfer_topo,
+            remote_tp_size=remote_tp_size,
+            group_spec_types=self._group_spec_types,
         )
 
         remote_agent_name = self.nixl_wrapper.add_remote_agent(
@@ -1391,7 +1440,8 @@ def _validate_remote_agent_handshake(
         )
         # num_kv_heads > tp_size with P_TP > D_TP not supported for non-mamba.
         # Mamba models can have replicated FA KV with tp_ratio < 0.
-        if not self._has_mamba:
+        # MLA models do not need to handle kv replication.
+        if not self.use_mla and not self._has_mamba:
             assert not (
                 tp_ratio < 0 and self.transfer_topo.is_kv_replicated(remote_engine_id)
             )
@@ -1644,17 +1694,26 @@ def get_finished(self) -> tuple[set[str], set[str]]:
         done_sending = self._get_new_notifs()
         done_recving = self._pop_done_transfers(self._recving_transfers)
 
-        # add requests that skipped transfer to done_recving
-        done_recving.update(self._failed_recv_reqs)
-        self._failed_recv_reqs.clear()
+        # Drain queue of requests where handshake or transfer setup failed.
+        failed_recv_reqs = set[ReqId]()
+        while not self._failed_recv_reqs.empty():
+            try:
+                failed_recv_reqs.add(self._failed_recv_reqs.get_nowait())
+            except queue.Empty:
+                break
+
+        # Add failed requests to done_recving for scheduler tracking
+        # (blocks are already marked invalid, scheduler will handle recompute)
+        done_recving.update(failed_recv_reqs)
 
         if len(done_sending) > 0 or len(done_recving) > 0:
             logger.debug(
                 "Rank %s, get_finished: %s requests done sending "
-                "and %s requests done recving",
+                "and %s requests done recving (%s failed)",
                 self.tp_rank,
                 len(done_sending),
                 len(done_recving),
+                len(failed_recv_reqs),
             )
 
         block_ids_for_blocksize_post_process = defaultdict(list)
@@ -1663,6 +1722,15 @@ def get_finished(self) -> tuple[set[str], set[str]]:
             # clean up metadata for completed requests
             meta = self._recving_metadata.pop(req_id, None)
             assert meta is not None, f"{req_id} not found in recving_metadata list"
+
+            # Skip KV sync and post-processing for failed requests
+            if req_id in failed_recv_reqs:
+                logger.warning(
+                    "Skipping KV post-processing for failed request %s",
+                    req_id,
+                )
+                continue
+
             assert meta.remote is not None
             if self.use_host_buffer:
                 self.sync_recved_kv_to_device(req_id, meta)
@@ -1704,10 +1772,9 @@ def get_finished(self) -> tuple[set[str], set[str]]:
             self.xfer_stats.record_kv_expired_req()
             logger.warning(
                 "Releasing expired KV blocks for request %s which were "
-                "retrieved by %d decode worker(s) within %d seconds.",
+                "retrieved by %d remote worker(s) before lease expired.",
                 req_id,
                 count,
-                envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT,
             )
             self._reqs_to_process.remove(req_id)
             del self._reqs_to_send[req_id]
@@ -1720,12 +1787,22 @@ def _get_new_notifs(self) -> set[str]:
         Get req_ids which got a remote xfer message. When multiple consumers
         are reading from the same producer (heterogeneous TP scenario), wait
         for all consumers to be done pulling.
+
+        Also handles heartbeat notifications ("HB:req1,req2,...") by
+        extending the lease on the referenced requests.
         """
         assert self.transfer_topo is not None
         notified_req_ids: set[str] = set()
         for notifs in self.nixl_wrapper.get_new_notifs().values():
             for notif in notifs:
-                req_id, tp_size = notif.decode("utf-8").rsplit(":", 1)
+                msg = notif.decode("utf-8")
+
+                # Handle heartbeat messages from D-side.
+                if msg.startswith("HB:"):
+                    self._handle_heartbeat(msg[3:])
+                    continue
+
+                req_id, tp_size = msg.rsplit(":", 1)
                 if (
                     req_id not in self._reqs_to_send
                     and req_id not in self._reqs_to_process
@@ -1760,6 +1837,27 @@ def _get_new_notifs(self) -> set[str]:
                     self._reqs_to_send.pop(req_id, None)
         return notified_req_ids
 
+    def _handle_heartbeat(self, payload: str) -> None:
+        """Extend leases for requests referenced in a heartbeat.
+
+        Args:
+            payload: comma-separated P-side request IDs ("req_abc,req_def").
+        """
+        new_expiry = time.perf_counter() + self._lease_extension
+        for req_id in payload.split(","):
+            old = self._reqs_to_send.get(req_id)
+            if old is None or old >= new_expiry:
+                continue  # unknown request, or its lease already lasts longer
+            self._reqs_to_send[req_id] = new_expiry
+            logger.debug(
+                "Heartbeat extended lease for request %s "
+                "by %ds (old_expiry=%.1f, new_expiry=%.1f)",
+                req_id,
+                self._lease_extension,
+                old,
+                new_expiry,
+            )
+
     def _pop_done_transfers(self, transfers: dict[str, list[int]]) -> set[str]:
         """
         Pop completed xfers by checking for DONE state.
@@ -1807,7 +1905,7 @@ def _pop_done_transfers(self, transfers: dict[str, list[int]]) -> set[str]:
                 transfers[req_id] = in_progress
         return done_req_ids
 
-    def _handle_failed_transfer(self, req_id: str, handle: int):
+    def _handle_failed_transfer(self, req_id: str, handle: int | None):
         """
         Handle a failed transfer by marking all (logical) blocks as invalid and
         recording the failure.
@@ -1819,8 +1917,10 @@ def _handle_failed_transfer(self, req_id: str, handle: int):
         # Use .get() here as the metadata cleanup is handled by get_finished()
         # TODO (NickLucche) handle failed transfer for HMA.
         if (meta := self._recving_metadata.get(req_id)) and not self._is_hma_required:
-            self._invalid_block_ids.update(meta.local_block_ids[0])
-        self.nixl_wrapper.release_xfer_handle(handle)
+            self._invalid_block_ids.put(set(meta.local_block_ids[0]))
+        self._failed_recv_reqs.put(req_id)
+        if handle is not None:
+            self.nixl_wrapper.release_xfer_handle(handle)
         self.xfer_stats.record_failed_transfer()
 
     def start_load_kv(self, metadata: NixlConnectorMetadata):
@@ -1880,6 +1980,37 @@ def start_load_kv(self, metadata: NixlConnectorMetadata):
             if req_id in self._reqs_to_process:
                 self._reqs_to_send[req_id] = expiration_time
 
+        # Send heartbeats to P-side engines to keep KV blocks alive while
+        # requests sit in the D scheduler WAITING queue.
+        self._send_heartbeats(metadata)
+
+    def _send_heartbeats(self, metadata: NixlConnectorMetadata) -> None:
+        """Notify remote engines that their KV blocks are still needed.
+
+        For each entry of ``metadata.heartbeat_by_engine`` a single
+        "HB:req1,req2,..." notification is sent to every agent registered
+        for that engine. Engines whose handshake has not completed are
+        skipped for this call; send failures are logged and ignored.
+        """
+        for engine_id, hb_info in metadata.heartbeat_by_engine.items():
+            # Kick off the handshake early (the request may still be waiting)
+            # so that a later heartbeat to this engine can be delivered.
+            pending = self._ensure_handshake(
+                engine_id, hb_info.host, hb_info.port, hb_info.tp_size
+            )
+            if pending is not None:
+                continue  # handshake has not finished yet
+
+            payload = f"HB:{','.join(hb_info.req_ids)}".encode()
+            for agent in self._remote_agents[engine_id].values():
+                try:
+                    self.nixl_wrapper.send_notif(agent, notif_msg=payload)
+                except Exception:
+                    logger.debug(
+                        "Failed to send heartbeat to engine %s",
+                        engine_id, exc_info=True,
+                    )
+
     def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
         assert meta.remote is not None and self.transfer_topo is not None
         engine_id = meta.remote.engine_id
@@ -1915,9 +2046,9 @@ def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
 
         # D may have to perform multiple reads from different remote ranks.
         # MLA opt: when P TP > D TP, only a single read is executed for
-        # the first remote rank (cache is duplicated).
+        # the first remote rank (cache is duplicated)..
         if self.use_mla and tp_ratio < 0:
-            read_specs = read_specs[:1]
+            assert len(read_specs) == 1
 
         for i, spec in enumerate(read_specs):
             remote_block_size = remote_info.remote_block_size
@@ -1959,11 +2090,10 @@ def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
         if self.use_mla and tp_ratio < 0 and read_specs:
             # ..but we still need to notify the other remote ranks that we
             # have the blocks we need so they can update the request state.
-            notif_id = f"{req_id}:{self.world_size}".encode()
+            notif_id = f"{meta.remote.request_id}:{self.world_size}".encode()
             remote_agents = self._remote_agents[meta.remote.engine_id]
-            read_ranks = {s.remote_rank for s in read_specs}
             for rank_to_notify, agent in remote_agents.items():
-                if rank_to_notify not in read_ranks:
+                if rank_to_notify != read_specs[0].remote_rank:
                     self.nixl_wrapper.send_notif(agent, notif_msg=notif_id)
 
     def _read_blocks(
@@ -2109,14 +2239,7 @@ def _read_blocks(
                 dst_engine_id=dst_engine_id,
                 remote_rank=remote_rank,
             )
-            if (
-                meta := self._recving_metadata.get(request_id)
-            ) and not self._is_hma_required:
-                self._invalid_block_ids.update(meta.local_block_ids[0])
-            self.xfer_stats.record_failed_transfer()
-            if handle is not None:
-                self.nixl_wrapper.release_xfer_handle(handle)
-            self._failed_recv_reqs.add(request_id)
+            self._handle_failed_transfer(request_id, handle)
 
     def get_mapped_blocks(
         self, block_ids: np.ndarray, block_size_ratio: int
@@ -2254,8 +2377,13 @@ def get_block_ids_with_load_errors(self) -> set[int]:
         This is called by the scheduler to identify blocks that need
         to be retried after a NIXL transfer failure.
         """
-        result = self._invalid_block_ids
-        self._invalid_block_ids = set()
+        # Drain the queue (thread-safe, no lock needed).
+        result: set[int] = set()
+        while not self._invalid_block_ids.empty():
+            try:
+                result.update(self._invalid_block_ids.get_nowait())
+            except queue.Empty:
+                break
         return result
 
     def __del__(self):
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
index 773fe8f056ac..137eaef9788c 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
@@ -291,7 +291,7 @@ def _touch(self, req_status: RequestOffloadState):
             self.config.kv_group_configs, req_status.group_states
         ):
             if group_config.sliding_window_size_in_blocks is None:
-                self.manager.touch(group_state.offload_keys)
+                self.manager.touch(group_state.offload_keys, req_status.req_context)
             else:
                 # we aim to keep just blocks that are necessary to hit
                 # the original request (+ decoded blocks)
@@ -300,7 +300,10 @@ def _touch(self, req_status: RequestOffloadState):
                     group_state.num_hit_blocks
                     - group_config.sliding_window_size_in_blocks,
                 )
-                self.manager.touch(group_state.offload_keys[blocks_to_skip:])
+                self.manager.touch(
+                    group_state.offload_keys[blocks_to_skip:],
+                    req_status.req_context,
+                )
 
     def _lookup(self, req_status: RequestOffloadState) -> int | None:
         """
@@ -802,14 +805,13 @@ def update_connector_output(self, connector_output: KVConnectorOutput):
                 continue
             assert job_status.pending_count == 0
 
+            req_status = self._req_status[job_status.req_id]
             if job_status.is_store:
-                self.manager.complete_store(job_status.keys)
+                self.manager.complete_store(job_status.keys, req_status.req_context)
             else:
-                self.manager.complete_load(job_status.keys)
+                self.manager.complete_load(job_status.keys, req_status.req_context)
                 if self._blocks_being_loaded:
                     self._blocks_being_loaded.difference_update(job_status.keys)
-
-            req_status = self._req_status[job_status.req_id]
             if self._block_id_to_pending_jobs:
                 # Sliding window blocks are tracked from store creation
                 # and must be cleaned up unconditionally.
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
index 8b264dd726e4..daa8042c8ba1 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -52,11 +52,10 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         role: KVConnectorRole,
-        kv_cache_config: KVCacheConfig | None = None,
+        kv_cache_config: KVCacheConfig,
     ):
         super().__init__(vllm_config, role, kv_cache_config)
 
-        assert kv_cache_config is not None
         spec = OffloadingSpecFactory.create_spec(vllm_config, kv_cache_config)
 
         self.connector_scheduler: OffloadingConnectorScheduler | None = None
@@ -164,6 +163,10 @@ def take_events(self) -> Iterable[KVCacheEvent]:
         assert self.connector_scheduler is not None
         return self.connector_scheduler.take_events()
 
+    @classmethod
+    def get_required_kvcache_layout(cls, vllm_config: VllmConfig) -> str | None:
+        return "HND"  # fixed layout name; vllm_config is not consulted
+
     def get_kv_connector_stats(self) -> KVConnectorStats | None:
         if self.connector_worker is None:
             return None  # We only emit stats from the worker-side
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
index ce228b3c6f23..aa791921be1a 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
@@ -76,7 +76,7 @@ def __init__(
         self,
         vllm_config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: "KVCacheConfig | None" = None,
+        kv_cache_config: "KVCacheConfig",
     ):
         super().__init__(
             vllm_config=vllm_config,
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/simple_cpu_offload_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/simple_cpu_offload_connector.py
index 6475b941ba59..c7fca1664bc6 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/simple_cpu_offload_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/simple_cpu_offload_connector.py
@@ -49,7 +49,7 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         role: KVConnectorRole,
-        kv_cache_config: "KVCacheConfig | None" = None,
+        kv_cache_config: "KVCacheConfig",
     ):
         super().__init__(vllm_config, role, kv_cache_config)
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/ssm_conv_transfer_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/ssm_conv_transfer_utils.py
index 00b8e2bb7275..4ec561335527 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/ssm_conv_transfer_utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/ssm_conv_transfer_utils.py
@@ -13,6 +13,7 @@
 import torch
 
 from vllm.model_executor.layers.mamba.mamba_utils import is_conv_state_dim_first
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 from vllm.v1.kv_cache_interface import MambaSpec
 
 
@@ -103,7 +104,7 @@ def derive_mamba_conv_split(
         MambaConvSplitInfo with per-rank x_local, b_local, conv_rows,
         conv_dtype_size, and ssm_sizes (conv_state_bytes, ssm_state_bytes).
     """
-    if mamba_spec.mamba_type != "mamba2":
+    if mamba_spec.mamba_type != MambaAttentionBackendEnum.MAMBA2:
         raise NotImplementedError(
             f"3-read conv transfer only supports Mamba2 models, "
             f"got mamba_type={mamba_spec.mamba_type!r}.  "
diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py
index 2cc074bded6f..4392d6520779 100644
--- a/vllm/distributed/kv_transfer/kv_transfer_state.py
+++ b/vllm/distributed/kv_transfer/kv_transfer_state.py
@@ -49,7 +49,7 @@ def is_v1_kv_transfer_group(connector: KVConnectorBaseType | None = None) -> boo
 
 
 def ensure_kv_transfer_initialized(
-    vllm_config: "VllmConfig", kv_cache_config: "KVCacheConfig | None" = None
+    vllm_config: "VllmConfig", kv_cache_config: "KVCacheConfig"
 ) -> None:
     """
     Initialize KV cache transfer parallel group.
diff --git a/vllm/distributed/nixl_utils.py b/vllm/distributed/nixl_utils.py
index 2da37017a37f..d7d262672d39 100644
--- a/vllm/distributed/nixl_utils.py
+++ b/vllm/distributed/nixl_utils.py
@@ -1,54 +1,82 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import importlib
 import os
 import sys
+from typing import Any
 
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
-if "UCX_RCACHE_MAX_UNRELEASED" not in os.environ:
+# Declarations for static analyzers; values are bound lazily via __getattr__.
+NixlWrapper: Any
+nixl_agent_config: Any
+nixlXferTelemetry: Any
+
+
+def _maybe_set_ucx_rcache_limit() -> None:
+    if "UCX_RCACHE_MAX_UNRELEASED" in os.environ:
+        return  # an explicit user setting always wins
+
     if "nixl" in sys.modules or "rixl" in sys.modules:
         logger.warning_once(
             "NIXL was already imported, we can't reset "
             "UCX_RCACHE_MAX_UNRELEASED. "
             "Please set it to '1024' manually."
         )
-    else:
-        logger.info_once(
-            "Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare "
-            "memory leak in UCX when using NIXL."
-        )
-        os.environ["UCX_RCACHE_MAX_UNRELEASED"] = "1024"
+        return  # NIXL/RIXL already imported: only warn, leave the env untouched
 
-try:
-    if not current_platform.is_rocm():
-        from nixl._api import nixl_agent as NixlWrapper
-    else:
-        from rixl._api import nixl_agent as NixlWrapper
+    logger.info_once(
+        "Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare "
+        "memory leak in UCX when using NIXL."
+    )
+    os.environ["UCX_RCACHE_MAX_UNRELEASED"] = "1024"
 
-    logger.info_once("NIXL is available")
-except ImportError:
-    logger.warning_once("NIXL is not available")
-    NixlWrapper = None  # type: ignore[assignment, misc]
 
-try:
-    if not current_platform.is_rocm():
-        from nixl._api import nixl_agent_config
-    else:
-        from rixl._api import nixl_agent_config
-except ImportError:
-    nixl_agent_config = None  # type: ignore[assignment]
-    logger.warning_once("NIXL agent config is not available")
-
-try:
-    if not current_platform.is_rocm():
-        from nixl._bindings import nixlXferTelemetry
+def _get_nixl_module_name(name: str) -> str:
+    # ROCm uses the "rixl" package; the telemetry type lives in "_bindings".
+    root = "rixl" if current_platform.is_rocm() else "nixl"
+    leaf = "_bindings" if name == "nixlXferTelemetry" else "_api"
+    return f"{root}.{leaf}"
+
+
+def _load_nixl_attr(name: str) -> Any:
+    attr_name = {  # public name in this module -> attribute name in NIXL
+        "NixlWrapper": "nixl_agent",
+        "nixl_agent_config": "nixl_agent_config",
+        "nixlXferTelemetry": "nixlXferTelemetry",
+    }[name]  # KeyError for names outside this table
+
+    _maybe_set_ucx_rcache_limit()  # runs before the import attempt below
+    try:
+        module = importlib.import_module(_get_nixl_module_name(name))
+    except ImportError:
+        if name == "NixlWrapper":
+            logger.warning_once("NIXL is not available")
+        elif name == "nixl_agent_config":
+            logger.warning_once("NIXL agent config is not available")
+        value = None  # import failed: expose None instead of raising
     else:
-        from rixl._bindings import nixlXferTelemetry
-except ImportError:
-    nixlXferTelemetry = None  # type: ignore[assignment, misc]
+        value = getattr(module, attr_name, None)
+        if name == "NixlWrapper":
+            if value is None:
+                logger.warning_once("NIXL is not available")
+            else:
+                logger.info_once("NIXL is available")
+        elif name == "nixl_agent_config" and value is None:
+            logger.warning_once("NIXL agent config is not available")
+
+    globals()[name] = value  # cache so module __getattr__ is not hit again
+    return value
+
+
+def __getattr__(name: str) -> Any:
+    if name not in __all__:  # PEP 562 hook: only the lazy NIXL names resolve
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    return _load_nixl_attr(name)
+
 
 __all__ = ["NixlWrapper", "nixl_agent_config", "nixlXferTelemetry"]
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 1b3803139217..e65ef6e425bb 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -335,6 +335,11 @@ def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
             kwargs[name]["type"] = parse_dataclass
             kwargs[name]["help"] += _maybe_add_docs_url(dataclass_cls)
             kwargs[name]["help"] += f"\n\n{json_tip}"
+        elif type_hints == {bool, str, type(None)}:
+            # Optional-valued flag: bare flag -> True, value -> str.
+            kwargs[name]["type"] = str
+            kwargs[name]["nargs"] = "?"
+            kwargs[name]["const"] = True
         elif contains_type(type_hints, bool):
             # Creates --no-<name> and --<name> flags
             kwargs[name]["action"] = argparse.BooleanOptionalAction
@@ -350,7 +355,11 @@ def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
             if name == "max_model_len":
                 kwargs[name]["type"] = human_readable_int_or_auto
                 kwargs[name]["help"] += f"\n\n{human_readable_int_or_auto.__doc__}"
-            elif name in ("max_num_batched_tokens", "kv_cache_memory_bytes"):
+            elif name in (
+                "max_num_batched_tokens",
+                "kv_cache_memory_bytes",
+                "safetensors_prefetch_block_size",
+            ):
                 kwargs[name]["type"] = human_readable_int
                 kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
             else:
@@ -419,6 +428,8 @@ class EngineArgs:
     allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains
     download_dir: str | None = LoadConfig.download_dir
     safetensors_load_strategy: str | None = LoadConfig.safetensors_load_strategy
+    safetensors_prefetch_num_threads: int = LoadConfig.safetensors_prefetch_num_threads
+    safetensors_prefetch_block_size: int = LoadConfig.safetensors_prefetch_block_size
     load_format: str | LoadFormats = LoadConfig.load_format
     config_format: str = ModelConfig.config_format
     dtype: ModelDType = ModelConfig.dtype
@@ -801,16 +812,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--served-model-name", **model_kwargs["served_model_name"]
         )
         model_group.add_argument("--config-format", **model_kwargs["config_format"])
-        # This one is a special case because it can bool
-        # or str. TODO: Handle this in get_kwargs
-        model_group.add_argument(
-            "--hf-token",
-            type=str,
-            nargs="?",
-            const=True,
-            default=model_kwargs["hf_token"]["default"],
-            help=model_kwargs["hf_token"]["help"],
-        )
+        model_group.add_argument("--hf-token", **model_kwargs["hf_token"])
         model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"])
         model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"])
         model_group.add_argument(
@@ -848,6 +850,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         load_group.add_argument(
             "--safetensors-load-strategy", **load_kwargs["safetensors_load_strategy"]
         )
+        load_group.add_argument(
+            "--safetensors-prefetch-num-threads",
+            **load_kwargs["safetensors_prefetch_num_threads"],
+        )
+        load_group.add_argument(
+            "--safetensors-prefetch-block-size",
+            **load_kwargs["safetensors_prefetch_block_size"],
+        )
         load_group.add_argument(
             "--model-loader-extra-config", **load_kwargs["model_loader_extra_config"]
         )
@@ -1588,6 +1598,8 @@ def create_load_config(self) -> LoadConfig:
             load_format=self.load_format,
             download_dir=self.download_dir,
             safetensors_load_strategy=self.safetensors_load_strategy,
+            safetensors_prefetch_num_threads=self.safetensors_prefetch_num_threads,
+            safetensors_prefetch_block_size=self.safetensors_prefetch_block_size,
             model_loader_extra_config=self.model_loader_extra_config,
             ignore_patterns=self.ignore_patterns,
             use_tqdm_on_load=self.use_tqdm_on_load,
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 1471dd1c566d..3f83734a5b78 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -108,6 +108,20 @@ async def abort(self, request_id: str | Iterable[str]) -> None:
         """
         ...
 
+    @abstractmethod
+    async def notify_kv_transfer_request_rejected(
+        self,
+        request_id: str,  # ID of the request that was rejected
+        kv_transfer_params: dict[str, Any],  # KV-transfer params of that request
+        *,
+        data_parallel_rank: int | None = None,  # presumably target DP rank; confirm
+    ) -> None:
+        """Notify the engine that a KV-transfer request was rejected before
+        engine admission, so connector-side cleanup can run (e.g. free
+        prefill blocks pinned on the P node).
+        """
+        ...
+
     @abstractmethod
     async def is_tracing_enabled(self) -> bool: ...
 
diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
index 742f9cced6f9..03e4678c8214 100644
--- a/vllm/entrypoints/openai/chat_completion/protocol.py
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
@@ -111,6 +111,9 @@ class ChatCompletionResponse(OpenAIBaseModel):
     # vLLM-specific fields that are not in OpenAI spec
     prompt_logprobs: list[dict[int, Logprob] | None] | None = None
     prompt_token_ids: list[int] | None = None
+    # Rendered prompt text from chat templating (only set when
+    # ``return_prompt_text=True`` on the request).
+    prompt_text: str | None = None
     kv_transfer_params: dict[str, Any] | None = Field(
         default=None, description="KVTransfer parameters."
     )
@@ -138,6 +141,9 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
     system_fingerprint: str | None = None
     # not part of the OpenAI spec but for tracing the tokens
     prompt_token_ids: list[int] | None = None
+    # Rendered prompt text from chat templating (only set when
+    # ``return_prompt_text=True`` on the request); only sent on the first chunk.
+    prompt_text: str | None = None
 
 
 class ChatCompletionToolsParam(OpenAIBaseModel):
@@ -352,6 +358,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "need to map generated text back to input tokens."
         ),
     )
+    return_prompt_text: bool | None = Field(
+        default=None,
+        description=(
+            "If true, the response will include ``prompt_text`` containing the "
+            "prompt string produced by chat templating. In streaming mode it "
+            "is sent only on the first chunk. This is useful for inspecting "
+            "exactly what was fed into the model."
+        ),
+    )
 
     cache_salt: str | None = Field(
         default=None,
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
index 1026e0a1e3f7..10bde6847891 100644
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -234,6 +234,15 @@ async def create_chat_completion(
         for the API specification. This API mimics the OpenAI
         Chat Completion API.
         """
+        return await self._with_kv_transfer_rejection_cleanup(
+            self._create_chat_completion(request, raw_request), request, raw_request
+        )
+
+    async def _create_chat_completion(
+        self,
+        request: ChatCompletionRequest,
+        raw_request: Request | None = None,
+    ) -> AsyncGenerator[str, None] | ChatCompletionResponse | ErrorResponse:
         # Streaming response
         tokenizer = self.renderer.tokenizer
         assert tokenizer is not None
@@ -499,6 +508,9 @@ async def chat_completion_stream_generator(
                     # the role
                     role = self.get_chat_request_role(request)
 
+                    # ``res.prompt`` is the rendered chat-templated prompt
+                    prompt_text = res.prompt if request.return_prompt_text else None
+
                     # NOTE num_choices defaults to 1 so this usually executes
                     # once per request
                     for i in range(num_choices):
@@ -524,6 +536,7 @@ async def chat_completion_stream_generator(
                                 if request.return_token_ids
                                 else None
                             ),
+                            prompt_text=prompt_text,
                         )
 
                         # if continuous usage stats are requested, add it
@@ -1362,6 +1375,9 @@ async def chat_completion_full_generator(
         if final_res.prompt_routed_experts is not None:
             prompt_routed_experts = final_res.prompt_routed_experts.tolist()
 
+        # ``final_res.prompt`` is the rendered chat-templated prompt text
+        prompt_text = final_res.prompt if request.return_prompt_text else None
+
         response = ChatCompletionResponse(
             id=request_id,
             created=created_time,
@@ -1373,6 +1389,7 @@ async def chat_completion_full_generator(
             prompt_token_ids=(
                 final_res.prompt_token_ids if request.return_token_ids else None
             ),
+            prompt_text=prompt_text,
             kv_transfer_params=final_res.kv_transfer_params,
             prompt_routed_experts=prompt_routed_experts,
         )
diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
index ee4ca9f3ada3..05efe86466d4 100644
--- a/vllm/entrypoints/openai/completion/serving.py
+++ b/vllm/entrypoints/openai/completion/serving.py
@@ -118,6 +118,15 @@ async def create_completion(
             - suffix (the language models we currently support do not support
             suffix)
         """
+        return await self._with_kv_transfer_rejection_cleanup(
+            self._create_completion(request, raw_request), request, raw_request
+        )
+
+    async def _create_completion(
+        self,
+        request: CompletionRequest,
+        raw_request: Request | None = None,
+    ) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
         if request.stream and request.use_beam_search:
             return self.create_error_response(
                 "Streaming is not currently supported with beam search"
diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py
index f0f84a82204c..2a51cc0bfac0 100644
--- a/vllm/entrypoints/openai/engine/serving.py
+++ b/vllm/entrypoints/openai/engine/serving.py
@@ -4,7 +4,7 @@
 import contextlib
 import json
 import time
-from collections.abc import AsyncGenerator, Mapping
+from collections.abc import AsyncGenerator, Awaitable, Mapping
 from dataclasses import dataclass, field
 from http import HTTPStatus
 from typing import Any, ClassVar, Generic, Protocol, TypeAlias, TypeVar
@@ -118,6 +118,7 @@ def build_chat_params(
 )
 
 RequestT = TypeVar("RequestT", bound=AnyRequest)
+_T = TypeVar("_T")
 
 
 @dataclass(kw_only=True)
@@ -156,6 +157,9 @@ def __init__(
         self.model_config = engine_client.model_config
         self.renderer = engine_client.renderer
         self.input_processor = engine_client.input_processor
+        vllm_config = getattr(engine_client, "vllm_config", None)
+        kv_transfer_config = getattr(vllm_config, "kv_transfer_config", None)
+        self.has_kv_connector = kv_transfer_config is not None
 
         # Computed once at startup (cached by ``vllm_config`` identity) and
         # stamped on non-streaming responses. Streaming chunks deliberately
@@ -616,6 +620,40 @@ def _get_data_parallel_rank(raw_request: Request | None) -> int | None:
         except ValueError:
             return None
 
+    async def _with_kv_transfer_rejection_cleanup(
+        self,
+        awaitable: Awaitable[_T],
+        request: ChatCompletionRequest | CompletionRequest | ResponsesRequest,
+        raw_request: Request | None,
+    ) -> _T:
+        """Await a `create_*` coroutine. When a KV connector is configured and
+        the request sets ``do_remote_prefill``, a raised exception or an
+        ErrorResponse result makes the engine client notify the connector of
+        the rejection; an Exception from that notification is only logged."""
+        params = self.has_kv_connector and request.kv_transfer_params
+        if not (params and params.get("do_remote_prefill")):
+            return await awaitable
+
+        rejected = True  # stays True if the await below raises
+        try:
+            outcome = await awaitable
+            rejected = isinstance(outcome, ErrorResponse)
+            return outcome
+        finally:
+            if rejected:
+                try:
+                    await self.engine_client.notify_kv_transfer_request_rejected(
+                        request.request_id,
+                        params,
+                        data_parallel_rank=self._get_data_parallel_rank(raw_request),
+                    )
+                except Exception:
+                    logger.warning(
+                        "Failed to notify KV connector about rejected request %s",
+                        request.request_id,
+                        exc_info=True,
+                    )
+
     @staticmethod
     def _parse_tool_calls_from_content(
         request: ResponsesRequest | ChatCompletionRequest,
diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/openai/generate/api_router.py
index 4386baa14e10..84a7fddeabe3 100644
--- a/vllm/entrypoints/openai/generate/api_router.py
+++ b/vllm/entrypoints/openai/generate/api_router.py
@@ -103,6 +103,7 @@ async def init_generate_state(
             enable_prompt_tokens_details=args.enable_prompt_tokens_details,
             enable_force_include_usage=args.enable_force_include_usage,
             enable_log_outputs=args.enable_log_outputs,
+            default_chat_template_kwargs=args.default_chat_template_kwargs,
         )
         if "generate" in supported_tasks
         else None
diff --git a/vllm/entrypoints/openai/responses/protocol.py b/vllm/entrypoints/openai/responses/protocol.py
index b5d69ea1cccc..10aa5bde392b 100644
--- a/vllm/entrypoints/openai/responses/protocol.py
+++ b/vllm/entrypoints/openai/responses/protocol.py
@@ -276,6 +276,13 @@ class ResponsesRequest(OpenAIBaseModel):
         default=None,
         description="KVTransfer parameters used for disaggregated serving.",
     )
+    chat_template_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "Additional keyword args to pass to the chat template renderer. "
+            "Will be accessible by the template."
+        ),
+    )
     # --8<-- [end:responses-extra-params]
 
     def build_chat_params(
@@ -296,7 +303,7 @@ def build_chat_params(
             chat_template=default_template,
             chat_template_content_format=default_template_content_format,
             chat_template_kwargs=merge_kwargs(  # To remove unset values
-                {},
+                self.chat_template_kwargs,
                 dict(
                     add_generation_prompt=not continue_final,
                     continue_final_message=continue_final,
diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py
index 9c4dc48589ff..92b19f175ced 100644
--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -167,6 +167,7 @@ def __init__(
         enable_prompt_tokens_details: bool = False,
         enable_force_include_usage: bool = False,
         enable_log_outputs: bool = False,
+        default_chat_template_kwargs: dict[str, Any] | None = None,
     ) -> None:
         super().__init__(
             engine_client=engine_client,
@@ -178,6 +179,7 @@ def __init__(
         self.openai_serving_render = openai_serving_render
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
+        self.chat_template_kwargs = default_chat_template_kwargs or {}
         self.enable_log_outputs = enable_log_outputs
 
         # Set up the unified parser - either a unified parser or fall back to
@@ -254,10 +256,14 @@ def __init__(
     def _effective_chat_template_kwargs(
         self, request: ResponsesRequest
     ) -> dict[str, Any]:
-        return request.build_chat_params(
-            self.chat_template,
-            self.chat_template_content_format,
-        ).chat_template_kwargs
+        return (
+            request.build_chat_params(
+                self.chat_template,
+                self.chat_template_content_format,
+            )
+            .with_defaults(self.chat_template_kwargs)
+            .chat_template_kwargs
+        )
 
     def _validate_generator_input(
         self,
@@ -323,6 +329,17 @@ async def create_responses(
         AsyncGenerator[StreamingResponsesResponse, None]
         | ResponsesResponse
         | ErrorResponse
+    ):
+        return await self._with_kv_transfer_rejection_cleanup(
+            self._create_responses(request, raw_request), request, raw_request
+        )
+
+    async def _create_responses(
+        self, request: ResponsesRequest, raw_request: Request | None = None
+    ) -> (
+        AsyncGenerator[StreamingResponsesResponse, None]
+        | ResponsesResponse
+        | ErrorResponse
     ):
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
@@ -590,13 +607,13 @@ async def _make_request(
             prev_msg=self.msg_store.get(prev_response.id) if prev_response else None,
             prev_response_output=prev_response.output if prev_response else None,
         )
-
+        chat_template_kwargs = self._effective_chat_template_kwargs(request)
         _, engine_inputs = await self.openai_serving_render.preprocess_chat(
             request,
             messages,
             default_template=self.chat_template,
             default_template_content_format=self.chat_template_content_format,
-            default_template_kwargs=None,
+            default_template_kwargs=chat_template_kwargs,
             tool_dicts=tool_dicts,
             tool_parser=self.parser.tool_parser_cls if self.parser else None,
             reasoning_parser=self.parser.reasoning_parser_cls if self.parser else None,
@@ -615,13 +632,13 @@ async def _render_next_turn(
         new_messages = construct_input_messages(
             request_input=messages,
         )
-
+        chat_template_kwargs = self._effective_chat_template_kwargs(request)
         _, engine_inputs = await self.openai_serving_render.preprocess_chat(
             request,
             new_messages,
             default_template=chat_template,
             default_template_content_format=chat_template_content_format,
-            default_template_kwargs=None,
+            default_template_kwargs=chat_template_kwargs,
             tool_dicts=tool_dicts,
             tool_parser=tool_parser,
             reasoning_parser=self.parser.reasoning_parser_cls if self.parser else None,
diff --git a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
index c4c10c35f3cc..8c3aed4531d4 100644
--- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
@@ -49,6 +49,7 @@
 from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.tokenizers import get_tokenizer
+from vllm.utils.async_utils import merge_async_iterators
 
 SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
 SpeechToTextResponseVerbose: TypeAlias = (
@@ -163,10 +164,17 @@ async def _detect_language(
             request_id,
         )
 
-        final_output: RequestOutput
-        async for final_output in result_generator:
-            if final_output.finished:
-                break
+        try:
+            final_output: RequestOutput
+            async for final_output in result_generator:
+                if final_output.finished:
+                    break
+        except asyncio.CancelledError:
+            await asyncio.gather(
+                self.engine_client.abort(request_id),
+                return_exceptions=True,
+            )
+            raise
 
         token_ids = list(final_output.outputs[0].token_ids)
         lang = self.model_cls.parse_language_detection_output(
@@ -458,41 +466,55 @@ async def _create_speech_to_text(
         if request.response_format == "verbose_json":
             sampling_params.logprobs = 1
 
+        engine_request_ids = [
+            request_id if len(engine_inputs) == 1 else f"{request_id}-{idx}"
+            for idx in range(len(engine_inputs))
+        ]
         list_result_generator = []
-        for i, engine_input in enumerate(engine_inputs):
-            request_id_item = f"{request_id}_{i}"
-
-            self._log_inputs(
-                request_id_item,
-                engine_input,
-                params=sampling_params,
-                lora_request=lora_request,
-            )
-
-            trace_headers = (
-                None
-                if raw_request is None
-                else await self._get_trace_headers(raw_request.headers)
-            )
-
-            if isinstance(sampling_params, BeamSearchParams):
-                generator = self.beam_search(
-                    prompt=engine_input,
+        try:
+            for request_id_item, engine_input in zip(engine_request_ids, engine_inputs):
+                self._log_inputs(
+                    request_id_item,
+                    engine_input,
                     params=sampling_params,
-                    request_id=request_id_item,
                     lora_request=lora_request,
-                    trace_headers=trace_headers,
                 )
-            else:
-                generator = self.engine_client.generate(
-                    engine_input,
-                    sampling_params,
-                    request_id_item,
-                    lora_request=lora_request,
-                    trace_headers=trace_headers,
+
+                trace_headers = (
+                    None
+                    if raw_request is None
+                    else await self._get_trace_headers(raw_request.headers)
                 )
 
-            list_result_generator.append(generator)
+                if isinstance(sampling_params, BeamSearchParams):
+                    generator = self.beam_search(
+                        prompt=engine_input,
+                        params=sampling_params,
+                        request_id=request_id_item,
+                        lora_request=lora_request,
+                        trace_headers=trace_headers,
+                    )
+                else:
+                    generator = self.engine_client.generate(
+                        engine_input,
+                        sampling_params,
+                        request_id_item,
+                        lora_request=lora_request,
+                        trace_headers=trace_headers,
+                    )
+
+                list_result_generator.append(generator)
+        except asyncio.CancelledError:
+            logger.info(
+                "Request %s cancelled; aborting %d transcription engine request(s).",
+                request_id,
+                len(engine_request_ids),
+            )
+            await asyncio.gather(
+                self.engine_client.abort(engine_request_ids),
+                return_exceptions=True,
+            )
+            raise
 
         separator = asr_inter_chunk_separator(
             request.language, self.model_cls.no_space_languages
@@ -508,10 +530,12 @@ async def _create_speech_to_text(
                 separator,
             )
         # Non-streaming response.
-        total_segments = []
-        text_parts = []
         try:
             assert list_result_generator is not None
+            chunk_segment_parts: list[list[SpeechToTextSegment]] = [
+                [] for _ in list_result_generator
+            ]
+            chunk_text_parts: list[list[str]] = [[] for _ in list_result_generator]
             segments_types: dict[str, type[SpeechToTextSegment]] = {
                 "transcribe": TranscriptionSegment,
                 "translate": TranslationSegment,
@@ -522,28 +546,34 @@ async def _create_speech_to_text(
                 assert len(list_result_generator) == 1, (
                     "`max_audio_clip_s` is set to None, audio cannot be chunked"
                 )
-            for idx, result_generator in enumerate(list_result_generator):
+            result_generator = merge_async_iterators(*list_result_generator)
+            async for idx, op in result_generator:
                 start_time = (
                     float(idx * chunk_size_in_s) if chunk_size_in_s is not None else 0.0
                 )
-                async for op in result_generator:
-                    if request.response_format == "verbose_json":
-                        assert op.outputs[0].logprobs
-                        segments: list[SpeechToTextSegment] = (
-                            self._get_verbose_segments(
-                                tokens=tuple(op.outputs[0].token_ids),
-                                segment_class=segment_class,
-                                request=request,
-                                start_time=start_time,
-                                log_probs=op.outputs[0].logprobs,
-                            )
-                        )
+                if request.response_format == "verbose_json":
+                    assert op.outputs[0].logprobs
+                    segments: list[SpeechToTextSegment] = self._get_verbose_segments(
+                        tokens=tuple(op.outputs[0].token_ids),
+                        segment_class=segment_class,
+                        request=request,
+                        start_time=start_time,
+                        log_probs=op.outputs[0].logprobs,
+                    )
 
-                        total_segments.extend(segments)
-                        text_parts.extend([seg.text for seg in segments])
-                    else:
-                        raw_text = op.outputs[0].text
-                        text_parts.append(self.model_cls.post_process_output(raw_text))
+                    chunk_segment_parts[idx].extend(segments)
+                    chunk_text_parts[idx].extend([seg.text for seg in segments])
+                else:
+                    raw_text = op.outputs[0].text
+                    chunk_text_parts[idx].append(
+                        self.model_cls.post_process_output(raw_text)
+                    )
+            total_segments = [
+                segment
+                for segment_parts in chunk_segment_parts
+                for segment in segment_parts
+            ]
+            text_parts = [text for text_part in chunk_text_parts for text in text_part]
             text = separator.join(text_parts)
             if self.task_type == "transcribe":
                 final_response: ResponseType
@@ -583,7 +613,16 @@ async def _create_speech_to_text(
                     )
             return final_response
         except asyncio.CancelledError:
-            return self.create_error_response("Client disconnected")
+            logger.info(
+                "Request %s cancelled; aborting %d transcription engine request(s).",
+                request_id,
+                len(engine_request_ids),
+            )
+            await asyncio.gather(
+                self.engine_client.abort(engine_request_ids),
+                return_exceptions=True,
+            )
+            raise
 
     async def _speech_to_text_stream_generator(
         self,
diff --git a/vllm/envs.py b/vllm/envs.py
index 73e4b147f88b..ddd6254b3b92 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -203,7 +203,6 @@
     ] = "NONE"
     VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
     VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: int | None = None
-    VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 480
     VLLM_MORIIO_CONNECTOR_READ_MODE: bool = False
     VLLM_MORIIO_QP_PER_TRANSFER: int = 1
     VLLM_MORIIO_POST_BATCH_SIZE: int = -1
@@ -1078,29 +1077,34 @@ def _get_or_set_default() -> str:
     "VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT": lambda: (
         os.getenv("VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT", "False").lower() in ("true", "1")
     ),
-    # Master switch for the pre-rebase ROCm-native code paths used by
+    # Master switch for the ROCm-native code paths used by
     # DeepSeek-V4 (DSv4-Flash-FP8). When True (default on ROCm) the model
-    # selects the validated pre-rebase implementations at four call sites:
+    # selects the triton/torch fallbacks at three call sites:
     #
     #   1. SWA K-cache writer: torch reference
     #      (``_deepseek_v4_qnorm_rope_kv_insert_reference``) instead of
     #      upstream's HIPified ``fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert``
     #      C++ kernel, whose FP8 dtype is selected at compile time
     #      (``HIP_FP8_TYPE_OCP``) and silently corrupts every K byte on
-    #      MI300X (FNUZ-only). This is the regression fix; the other three
-    #      below are kept for defense in depth and bisection.
-    #   2. MLA decode: ``flash_mla_with_kvcache_rocm`` Triton kernel
-    #      (95% GSM8K validated) instead of upstream's
-    #      ``rocm_forward_decode_fallback``.
-    #   3. MLA sparse prefill: ``flash_mla_sparse_fwd_rocm`` Triton kernel
-    #      instead of upstream's ``rocm_sparse_attn_prefill``.
-    #   4. Sparse indexer: recovered ``rocm_sparse_attn_indexer_no_insert``
+    #      MI300X (FNUZ-only). This is the regression fix.
+    #   2. Sparse indexer: ``rocm_sparse_attn_indexer_no_insert``
     #      orchestration instead of upstream's
     #      ``rocm_aiter_sparse_attn_indexer_native``.
+    #   3. MLA sparse backend dispatch: route through the unified
+    #      ``DeepseekV4FlashMLASparseBackend`` (whose ROCm kernels are
+    #      supplied by ``flash_mla_with_kvcache_rocm`` /
+    #      ``flash_mla_sparse_fwd_rocm`` via ``flashmla.py``) instead of
+    #      ``DeepseekV4ROCMAiterMLASparseBackend`` and its ``Impl``
+    #      (whose ``_sparse_attn_decode_ragged_kernel`` Triton kernel
+    #      currently hard-codes the SM89 ``tl.float8e4b15`` dtype in the
+    #      ``IS_FNUZ`` branch and fails to JIT-compile on gfx942 — see
+    #      logs/0512/server_log2.txt).
     #
-    # Set to "0" to opt back into the upstream paths for bisection / perf
-    # comparison (note: requires the SWA writer fix below to also be in place
-    # — flipping this alone reproduces the deterministic-garbage regression).
+    # Set to "0" to opt back into the upstream AITER + native paths for
+    # bisection (note: the SWA-writer C++ kernel still produces
+    # deterministic garbage on MI300X, and the AITER Triton kernel has the
+    # ``tl.float8e4b15`` bug described above, so "0" is currently only
+    # useful for kernel debugging).
     "VLLM_ROCM_USE_V4_TRITON_FALLBACK": lambda: (
         os.getenv("VLLM_ROCM_USE_V4_TRITON_FALLBACK", "True").lower() in ("true", "1")
     ),
@@ -1492,13 +1496,6 @@ def _get_or_set_default() -> str:
     "VLLM_USE_NVFP4_CT_EMULATIONS": lambda: bool(
         int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))
     ),
-    # Time (in seconds) after which the KV cache on the producer side is
-    # automatically cleared if no READ notification is received from the
-    # consumer. This is only applicable when using NixlConnector in a
-    # disaggregated decode-prefill setup.
-    "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": lambda: int(
-        os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "480")
-    ),
     # Controls the read mode for the Mori-IO connector
     "VLLM_MORIIO_CONNECTOR_READ_MODE": lambda: (
         os.getenv("VLLM_MORIIO_CONNECTOR_READ_MODE", "False").lower() in ("true", "1")
@@ -1815,6 +1812,9 @@ def _get_or_set_default() -> str:
     "VLLM_LORA_ENABLE_DUAL_STREAM": lambda: bool(
         int(os.getenv("VLLM_LORA_ENABLE_DUAL_STREAM", "0"))
     ),
+    # If set to 1, use Python spinloop extension to poll in a more efficient
+    # way when using the mp backend.
+    "VLLM_USE_SPINLOOP_EXT": lambda: bool(int(os.getenv("VLLM_USE_SPINLOOP_EXT", "0"))),
 }
 
 
@@ -1911,6 +1911,7 @@ def compile_factors() -> dict[str, object]:
         "VLLM_SERVER_DEV_MODE",
         "VLLM_DP_MASTER_IP",
         "VLLM_DP_MASTER_PORT",
+        "VLLM_NIXL_SIDE_CHANNEL_HOST",
         "VLLM_RANDOMIZE_DP_DUMMY_INPUTS",
         "VLLM_CI_USE_S3",
         "VLLM_MODEL_REDIRECT_PATH",
diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py
index aed6b5ba891e..01bbc9366d8d 100644
--- a/vllm/lora/layers/column_parallel_linear.py
+++ b/vllm/lora/layers/column_parallel_linear.py
@@ -195,9 +195,9 @@ def __init__(
         # There are two LoRA layers
         # the output_sizes in MergedColumnParallelLinear is not sharded by tp
         # we need to divide it by the tp_size to get correct slices size
-        output_sizes = self.base_layer.output_sizes
+        self.output_sizes = self.base_layer.output_sizes
         self.output_slices = tuple(
-            divide(output_size, self.tp_size) for output_size in output_sizes
+            divide(output_size, self.tp_size) for output_size in self.output_sizes
         )
         self.n_slices = len(self.output_slices)
         self.output_ids = (self.tp_rank,) * self.n_slices
@@ -261,6 +261,42 @@ def slice_lora_b(
                 ]
         return sliced_lora_b
 
+    def expand_packed_lora(
+        self,
+        lora_a: list[torch.Tensor],
+        lora_b: list[torch.Tensor],
+    ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
+        """Split packed LoRA groups into one (A, B) pair per output slice.
+
+        Each B must span a run of consecutive slices (row counts come from
+        ``self.output_sizes``); its A is reused for every slice it covers,
+        e.g. in_proj_qkv (Q+K+V) + in_proj_z. Raises ValueError when a B's
+        row count matches no such run.
+        """
+        per_slice_a: list[torch.Tensor] = []
+        per_slice_b: list[torch.Tensor] = []
+        first = 0  # first slice not yet claimed by a group
+        for a_i, b_i in zip(lora_a, lora_b):
+            b_rows, seen_rows = b_i.shape[0], 0
+            for last in range(first, self.n_slices):
+                seen_rows += self.output_sizes[last]
+                if seen_rows == b_rows:
+                    break
+            else:
+                raise ValueError(
+                    f"Cannot determine how to split lora_b with {b_rows} rows "
+                    f"into {self.n_slices} slices with output sizes "
+                    f"{self.output_sizes} starting from index {first}."
+                )
+            row = 0
+            for idx in range(first, last + 1):
+                size = self.output_sizes[idx]
+                per_slice_b.append(b_i[row : row + size, :])
+                per_slice_a.append(a_i)
+                row += size
+            first = last + 1
+        return per_slice_a, per_slice_b
+
     def set_lora(
         self,
         index: int,
@@ -269,6 +305,12 @@ def set_lora(
     ):
         self.reset_lora(index)
 
+        # Expand packed adapter groups when they don't match n_slices.
+        # E.g. in_proj_qkv (covers Q+K+V) + in_proj_z as 2 groups for a
+        # 4-slice layer: split b_qkv by output_sizes and replicate a_qkv.
+        if isinstance(lora_b, list) and len(lora_b) != self.n_slices:
+            lora_a, lora_b = self.expand_packed_lora(lora_a, lora_b)
+
         if self.tp_size > 1:
             lora_a = self.slice_lora_a(lora_a)
             lora_b = self.slice_lora_b(lora_b)
@@ -497,18 +539,14 @@ class MergedColumnParallelLinearWithShardedLoRA(MergedColumnParallelLinearWithLo
     def slice_lora_a(
         self, lora_a: list[torch.Tensor | None]
     ) -> list[torch.Tensor | None]:
-        # NOTE: lora_a contains 2 subloras, and each sublora could be None.
         output_shard_size = self.lora_a_stacked[0].shape[2]
         output_start_idx = self.tp_rank * output_shard_size
-        lora_a = [
-            lora_a[0][output_start_idx : output_start_idx + output_shard_size, :]
-            if lora_a[0] is not None
-            else None,
-            lora_a[1][output_start_idx : output_start_idx + output_shard_size, :]
-            if lora_a[1] is not None
-            else None,
+        return [
+            lora_a_i[output_start_idx : output_start_idx + output_shard_size, :]
+            if (lora_a_i := lora_a[i]) is not None
+            else None
+            for i in range(len(lora_a))
         ]
-        return lora_a
 
     def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         return _mcp_apply(x, bias, self)
diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py
index 2f9a4701b0d1..8cb32f07965b 100644
--- a/vllm/lora/layers/fused_moe.py
+++ b/vllm/lora/layers/fused_moe.py
@@ -7,10 +7,6 @@
 
 from vllm import envs
 from vllm.config.lora import LoRAConfig
-from vllm.distributed.parallel_state import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
 from vllm.distributed.utils import divide
 from vllm.lora.layers.base import BaseLayerWithLoRA
 from vllm.model_executor.layers.fused_moe import FusedMoE
@@ -30,15 +26,12 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
     def __init__(self, base_layer: FusedMoE) -> None:
         super().__init__()
         self.base_layer = base_layer
-
-        assert not self.base_layer.use_ep, (
-            "EP support for Fused MoE LoRA is not implemented yet."
-        )
-        assert not self.base_layer.quant_method.is_monolithic, (
-            "Monolithic kernels are not supported for Fused MoE LoRA."
-        )
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.tp_rank = get_tensor_model_parallel_rank()
+        self._ep_check()
+        # Use the MoE-aware TP rank/size: when EP is active, FusedMoE collapses
+        # moe_parallel_config.tp_size to 1 (experts are sharded across the
+        # TP group instead).
+        self.tp_size = self.base_layer.tp_size
+        self.tp_rank = self.base_layer.tp_rank
         self.device = _get_lora_device(base_layer)
         # For non-gated MoE (is_act_and_mul=False), only 1 slice is needed
         # since there's only up_proj (w1), not gate_proj + up_proj (w1 + w3)
@@ -65,7 +58,7 @@ def __init__(self, base_layer: FusedMoE) -> None:
             "For quantized MoE, mix LoRAExpertsMixin into the experts class "
             "and consume self._lora_context in apply()."
         )
-        self._fused_experts = moe_kernel.fused_experts
+        self._moe_kernel = moe_kernel
         self.base_layer._replace_quant_method(
             FusedMoEModularMethod(self.base_layer.quant_method, moe_kernel)
         )
@@ -150,6 +143,26 @@ def _create_lora_b_weights(self, max_loras: int, lora_config: LoRAConfig):
             ),
         )
 
+    def _ep_check(self):
+        if self.base_layer.use_ep:  # the checks below apply only under EP
+            parallel_config = self.base_layer.moe_config.moe_parallel_config
+            backend = parallel_config.all2all_backend
+            assert backend == "allgather_reducescatter", (
+                "Fused MoE LoRA with EP currently only supports "
+                f"all2all_backend='allgather_reducescatter', got '{backend}'."
+            )
+            assert not parallel_config.is_sequence_parallel
+
+    def _verify_ep_fs(self, lora_config: LoRAConfig):
+        """Reject EP combined with fully sharded LoRA: both partition along
+        the same TP group (EP on the expert dim, fully sharded on the LoRA
+        rank dim) with contradictory rank-to-shard assumptions."""
+        ep_with_fs = self.base_layer.use_ep and lora_config.fully_sharded_loras
+        assert not ep_with_fs, (
+            "Fused MoE LoRA does not support enable_expert_parallel=True "
+            "together with fully_sharded_loras=True. Disable one of them."
+        )
+
     def create_lora_weights(
         self,
         max_loras: int,
@@ -157,6 +170,8 @@ def create_lora_weights(
         model_config: PretrainedConfig | None = None,
     ) -> None:
         """Initializes lora matrices."""
+
+        self._verify_ep_fs(lora_config)
         self.max_loras = lora_config.max_loras
         self.fully_sharded = lora_config.fully_sharded_loras
 
@@ -282,6 +297,10 @@ def set_lora(
 
         w1_lora_a, w2_lora_a, w3_lora_a = lora_a
         w1_lora_b, w2_lora_b, w3_lora_b = lora_b
+
+        # EP slicing is done once at add time in
+        # LoRAModelManager._slice_moe_lora_ep, so by here the cached
+        # tensors already match the local-expert dim of the stacked buffers.
         assert (
             num_experts
             == w1_lora_a.shape[0]
@@ -326,7 +345,11 @@ def set_lora(
 
     def set_mapping(self, punica_wrapper):
         super().set_mapping(punica_wrapper)
-        self._fused_experts.set_lora_context(self._build_lora_context())
+        lora_context = self._build_lora_context()
+        self._moe_kernel.fused_experts.set_lora_context(lora_context)
+        prepare_finalize = self._moe_kernel.prepare_finalize
+        if hasattr(prepare_finalize, "set_lora_context"):
+            prepare_finalize.set_lora_context(lora_context)
 
     def forward(self, *args, **kwargs):
         return self.base_layer.forward(*args, **kwargs)
@@ -400,6 +423,7 @@ def create_lora_weights(
         """Initializes lora matrices."""
 
         assert isinstance(model_config, PretrainedConfig)
+        self._verify_ep_fs(lora_config)
         self._base_model = model_config.architectures[0]
         self.max_loras = lora_config.max_loras
         self.fully_sharded = lora_config.fully_sharded_loras
diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py
index 52ff8ebc91f3..d68d0c4fbb74 100644
--- a/vllm/lora/model_manager.py
+++ b/vllm/lora/model_manager.py
@@ -14,6 +14,7 @@
 from vllm.lora.layers import (
     BaseLayerWithLoRA,
     FusedMoE3DWithLoRA,
+    FusedMoEWithLoRA,
     LoRAMapping,
     LoRAMappingType,
 )
@@ -562,7 +563,16 @@ def create_dummy_lora(
             else:
                 parts = module_name.split(".")
                 replacements = self.packed_modules_mapping[parts[-1]]
+                n_slices = getattr(module, "n_slices", len(replacements))
+                if module.__class__.__name__ == "FusedMoEWithLoRA":
+                    replacements = replacements[
+                        : len(module.lora_a_stacked) // self.lora_slots
+                    ]
                 subloras: list[LoRALayerWeights | None] = []
+                # HACK: override replacements for the qkvz = qkv + z case.
+                # TODO: find a cleaner way to handle this case.
+                if n_slices != len(replacements):
+                    replacements = [f"slice_{i}" for i in range(n_slices)]
                 for i, r in enumerate(replacements):
                     lora = LoRALayerWeights.create_dummy_lora_weights(
                         module_name + "." + r,
@@ -716,6 +726,8 @@ def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None:
         for module_name, module in self.modules.items():
             if isinstance(module, FusedMoE3DWithLoRA):
                 self._stack_moe_lora_weights(lora_model, module, module_name)
+            elif isinstance(module, FusedMoEWithLoRA):
+                self._slice_moe_lora_ep(lora_model, module, module_name)
 
         first_lora: LoRALayerWeights = next(iter(lora_model.loras.values()))
         assert first_lora.lora_a is not None
@@ -762,23 +774,33 @@ def _stack_moe_lora_weights(
             assert gate_up_proj_lora is not None
             assert down_proj_lora is not None
             if self._is_3d_moe_model:
-                num_experts = module.w13_lora_a_stacked[0].shape[1]
+                local_num_experts = module.w13_lora_a_stacked[0].shape[1]
+                # The checkpoint holds weights for all global experts, but
+                # each EP rank owns only local_num_experts. Reshape against
+                # the adapter's actual expert count, then slice this rank's
+                # owned expert range before it gets copied into the local
+                # stacked buffer. For non-EP (local == global) this is a
+                # no-op slice.
+                global_num_experts = module.base_layer.global_num_experts
+                ep_rank = module.base_layer.ep_rank
+                expert_start = ep_rank * local_num_experts
+                expert_end = expert_start + local_num_experts
 
                 # (num_experts,rank,input_size)
                 gate_up_proj_lora.lora_a = gate_up_proj_lora.lora_a.reshape(
-                    num_experts, -1, gate_up_proj_lora.lora_a.shape[-1]
-                )
+                    global_num_experts, -1, gate_up_proj_lora.lora_a.shape[-1]
+                )[expert_start:expert_end].contiguous()
                 down_proj_lora.lora_a = down_proj_lora.lora_a.reshape(
-                    num_experts, -1, down_proj_lora.lora_a.shape[-1]
-                )
+                    global_num_experts, -1, down_proj_lora.lora_a.shape[-1]
+                )[expert_start:expert_end].contiguous()
 
                 # (output_size,rank,num_experts)
                 gate_up_proj_lora.lora_b = gate_up_proj_lora.lora_b.reshape(
-                    gate_up_proj_lora.lora_b.shape[0], -1, num_experts
-                )
+                    gate_up_proj_lora.lora_b.shape[0], -1, global_num_experts
+                )[..., expert_start:expert_end]
                 down_proj_lora.lora_b = down_proj_lora.lora_b.reshape(
-                    down_proj_lora.lora_b.shape[0], -1, num_experts
-                )
+                    down_proj_lora.lora_b.shape[0], -1, global_num_experts
+                )[..., expert_start:expert_end]
 
                 # (num_experts,output_size,rank)
                 gate_up_proj_lora.lora_b = gate_up_proj_lora.lora_b.permute(
@@ -828,6 +850,68 @@ def _stack_moe_lora_weights(
                 module_lora.lora_a = lora_a
                 module_lora.lora_b = lora_b
 
+    def _slice_moe_lora_ep(
+        self,
+        lora_model: LoRAModel,
+        module: FusedMoEWithLoRA,
+        module_name: str,
+    ) -> None:
+        """Trim a 2D MoE adapter's cached tensors to this EP rank's experts.
+
+        The 2D MoE checkpoint enters as a list of per-(w1/w2/w3) tensors of
+        shape (num_experts, rank, in) / (num_experts, out, rank). When EP
+        is active each rank only owns local_num_experts, so the global
+        tensors are sliced once here and written back to the cached
+        LoRAModel entry instead of being re-sliced on every activation.
+
+        Nothing is changed when EP is disabled, when no weights are found
+        for module_name, or when the weights are not stored as lists. A
+        (lora_a, lora_b) pair is sliced only if both tensors still carry
+        the global expert count in dim 0; any other pair is kept as-is.
+
+        Args:
+            lora_model: Adapter whose cached weights are updated in place.
+            module: LoRA-wrapped MoE layer; its base_layer supplies the EP
+                rank and the local/global expert counts.
+            module_name: Name used to look up this module's weights.
+        """
+        base_layer = module.base_layer
+        if not base_layer.use_ep:
+            return
+        module_lora = self._get_lora_layer_weights(lora_model, module_name)
+        if module_lora is None:
+            return
+        if not isinstance(module_lora.lora_a, list) or not isinstance(
+            module_lora.lora_b, list
+        ):
+            return
+
+        local_num_experts = base_layer.local_num_experts
+        global_num_experts = base_layer.global_num_experts
+        # NOTE(review): assumes every rank owns an equal-sized, contiguous
+        # block of experts in rank order -- confirm against the expert
+        # placement actually used by base_layer.
+        expert_start = base_layer.ep_rank * local_num_experts
+        expert_end = expert_start + local_num_experts
+
+        new_lora_a: list[torch.Tensor | None] = []
+        new_lora_b: list[torch.Tensor | None] = []
+        for a, b in zip(module_lora.lora_a, module_lora.lora_b):
+            # Slice a and b together or not at all, so the pair can never
+            # end up with mismatched expert dims.
+            if (
+                a is not None
+                and b is not None
+                and a.shape[0] == global_num_experts
+                and b.shape[0] == global_num_experts
+            ):
+                a = a[expert_start:expert_end].contiguous()
+                b = b[expert_start:expert_end].contiguous()
+            new_lora_a.append(a)
+            new_lora_b.append(b)
+        module_lora.lora_a = new_lora_a
+        module_lora.lora_b = new_lora_b
+
     def _get_lora_layer_weights(
         self, lora_model: LoRAModel, module_name: str
     ) -> LoRALayerWeights | None:
diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py
index 4ab66dccdc29..0448a6d00cda 100644
--- a/vllm/lora/punica_wrapper/punica_base.py
+++ b/vllm/lora/punica_wrapper/punica_base.py
@@ -514,6 +514,7 @@ def add_lora_w13(
         num_slices: int,
         fully_sharded: bool,
         use_tuned_config: bool,
+        token_lora_mapping: torch.Tensor | None = None,
     ) -> tuple[
         torch.Tensor | None,
         torch.Tensor | None,
@@ -522,6 +523,10 @@ def add_lora_w13(
     ]:
         """Apply w13 LoRA to y (intermediate_cache1) in-place before activation.
 
+        When `token_lora_mapping` is provided it overrides the punica_wrapper's
+        global mapping — used by EP+LoRA to pass the per-rank-local mapping
+        after all-to-all dispatch.
+
         Returns (sorted_token_ids_lora, expert_ids_lora,
                  num_tokens_post_padded_lora, token_lora_mapping)
         for reuse by add_lora_w2.
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index 44d1dbd50728..bf951e074949 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -335,25 +335,49 @@ def moe_lora_align_block_size(
         expert_map: torch.Tensor | None = None,
         pad_sorted_ids: bool = False,
         naive_block_assignment: bool = False,
+        token_lora_mapping: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Aligns tokens and experts into block-sized chunks for LoRA-based
         mixture-of-experts (MoE) execution.
+
+        When `token_lora_mapping` is provided, it overrides the global mapping
+        read from `self.token_mapping_meta`. This is how EP+LoRA injects the
+        per-rank-local token→LoRA map after all-to-all dispatch.
         """
-        (token_lora_mapping, _, _, _, lora_ids, _, _) = (
-            self.token_mapping_meta.meta_args(
-                num_tokens, self.lora_config.specialize_active_lora
-            )
+        (
+            token_lora_mapping_meta,
+            _,
+            _,
+            _,
+            lora_ids,
+            _,
+            _,
+        ) = self.token_mapping_meta.meta_args(
+            num_tokens, self.lora_config.specialize_active_lora
+        )
+        if token_lora_mapping is None:
+            token_lora_mapping = token_lora_mapping_meta
+        # Under EP the caller passes local_num_experts but topk_ids carries
+        # GLOBAL expert indices. The CUDA kernel uses num_experts to size
+        # its bucketing table; with EP we must size by global_num_experts
+        # so global topk_ids don't overflow. expert_map inside the kernel
+        # then translates global→local so the output expert_ids are local
+        # (mirrors the non-LoRA moe_align_block_size behavior).
+        kernel_num_experts = (
+            expert_map.numel() if expert_map is not None else num_experts
         )
         if naive_block_assignment:
             expert_ids = topk_ids.reshape(-1)
             sorted_ids = None
             num_tokens_post_pad = None
         else:
-            max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+            max_num_tokens_padded = topk_ids.numel() + kernel_num_experts * (
+                block_size - 1
+            )
             if pad_sorted_ids:
                 max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
-            if topk_ids.numel() < num_experts:
+            if topk_ids.numel() < kernel_num_experts:
                 max_num_tokens_padded = topk_ids.numel() * block_size
             sorted_ids = torch.empty(
                 (max_loras * max_num_tokens_padded,),
@@ -361,9 +385,12 @@ def moe_lora_align_block_size(
                 device=topk_ids.device,
             )
             max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
-            # Expert ids must be set default to -1 to prevent a blank block
-            expert_ids = torch.empty(
+            # Expert ids are initialized to -1 so unused (lora, expert)
+            # slots don't drive the LoRA Triton kernel into the wrong bucket.
+            # The kernel overwrites only active slots.
+            expert_ids = torch.full(
                 (max_loras * max_num_m_blocks,),
+                -1,
                 dtype=torch.int32,
                 device=topk_ids.device,
             )
@@ -374,7 +401,7 @@ def moe_lora_align_block_size(
             ops.moe_lora_align_block_size(
                 topk_ids,
                 token_lora_mapping,
-                num_experts,
+                kernel_num_experts,
                 block_size,
                 max_loras,
                 max_num_tokens_padded,
@@ -384,11 +411,10 @@ def moe_lora_align_block_size(
                 num_tokens_post_pad,
                 adapter_enabled,
                 lora_ids,
+                expert_map,
             )
-            if expert_map is not None:
-                expert_ids = expert_map[expert_ids]
 
-        return None, sorted_ids, expert_ids, num_tokens_post_pad
+        return token_lora_mapping, sorted_ids, expert_ids, num_tokens_post_pad
 
     def add_lora_fused_moe(
         self,
@@ -480,6 +506,7 @@ def add_lora_w13(
         num_slices: int,
         fully_sharded: bool,
         use_tuned_config: bool,
+        token_lora_mapping: torch.Tensor | None = None,
     ) -> tuple[
         torch.Tensor | None,
         torch.Tensor | None,
@@ -558,6 +585,7 @@ def add_lora_w13(
             adapter_enabled,
             expert_map,
             naive_block_assignment=naive_block_assignment,
+            token_lora_mapping=token_lora_mapping,
         )
 
         _sorted = sorted_token_ids_lora
diff --git a/vllm/lora/punica_wrapper/punica_xpu.py b/vllm/lora/punica_wrapper/punica_xpu.py
index 20dde67b068d..58316cb75970 100755
--- a/vllm/lora/punica_wrapper/punica_xpu.py
+++ b/vllm/lora/punica_wrapper/punica_xpu.py
@@ -461,6 +461,7 @@ def add_lora_w13(
         num_slices: int,
         fully_sharded: bool,
         use_tuned_config: bool,
+        token_lora_mapping: torch.Tensor | None = None,
     ) -> tuple[
         torch.Tensor | None,
         torch.Tensor | None,
diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py
index 9cf7ebf5cdfc..9b4395f41d20 100644
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -345,6 +345,7 @@ def __init__(
         cache_config: CacheConfig | None = None,
         quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        attn_backend: type[AttentionBackend] | None = None,
         use_sparse: bool = False,
         indexer: object | None = None,
         **extra_impl_args,
@@ -374,14 +375,21 @@ def __init__(
         self.quant_config = quant_config
 
         dtype = torch.get_default_dtype()
-        self.attn_backend = get_attn_backend(
-            self.head_size,
-            dtype,
-            kv_cache_dtype,
-            use_mla=True,
-            use_sparse=use_sparse,
-            num_heads=self.num_heads,
-        )
+        if attn_backend is not None:
+            assert attn_backend.is_mla(), (
+                f"MLAAttention: attn_backend must be an MLA backend, "
+                f"got {attn_backend.get_name()} instead"
+            )
+            self.attn_backend = attn_backend
+        else:
+            self.attn_backend = get_attn_backend(
+                self.head_size,
+                dtype,
+                kv_cache_dtype,
+                use_mla=True,
+                use_sparse=use_sparse,
+                num_heads=self.num_heads,
+            )
 
         # FlashMLA Sparse Attention fp8 backend uses "fp8_ds_mla" kv-cache format
         # Automatically convert fp8 kv-cache format to "fp8_ds_mla"
@@ -1008,23 +1016,9 @@ def unified_mla_kv_cache_update(
     the data dependency between them to ensure torch.compile preserves ordering.
     """
     layer_name = _resolve_layer_name(layer_name)
-    forward_context = get_forward_context()
-    attn_layer = forward_context.no_compile_layers[layer_name]
-    kv_cache = attn_layer.kv_cache
-
-    # This needs to run even when we don't have metadata yet, so that the op
-    # is correctly captured.
-    if kv_cache.numel() == 0:
-        # Can't update an empty KV cache.
-        return torch.empty(0, device=kv_c_normed.device, dtype=kv_c_normed.dtype)
-
-    slot_mapping = forward_context.slot_mapping
-    assert isinstance(slot_mapping, dict), (
-        f"Expected slot_mapping to be a dict, got {type(slot_mapping)}. "
-    )
-    layer_slot_mapping = slot_mapping.get(layer_name)
+    _, attn_layer, kv_cache, layer_slot_mapping = get_attention_context(layer_name)
     if layer_slot_mapping is not None:
-        attn_layer.impl.do_kv_cache_update(
+        attn_layer.impl.do_kv_cache_update(  # type: ignore[attr-defined]
             kv_c_normed,
             k_pe,
             kv_cache,
@@ -1113,6 +1107,7 @@ def unified_mla_attention_with_output_fake(
     mutates_args=["output", "output_block_scale"],
     fake_impl=unified_mla_attention_with_output_fake,
     dispatch_key=current_platform.dispatch_key,
+    tags=(torch.Tag.flexible_layout,),
 )
 
 
diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py
index bcdd30500329..2e1beeec1b71 100644
--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -8,15 +8,12 @@
 import torch
 
 import vllm.envs as envs
-from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.mem_utils import get_max_shared_memory_bytes
 from vllm.utils.platform_utils import num_compute_units
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
-logger = init_logger(__name__)
-
 
 def _matmul_launch_metadata(
     grid: Callable[..., Any], kernel: Any, args: dict[str, Any]
@@ -24,15 +21,8 @@ def _matmul_launch_metadata(
     ret = {}
     m, n, k = args["M"], args["N"], args["K"]
     ret["name"] = f"{kernel.name} [M={m}, N={n}, K={k}]"
-    if "tiles_per_update" in args:
-        ret["name"] = (
-            f"{kernel.name} [M={m}, N={n}, K={k}, "
-            f"tiles_per_update={args['tiles_per_update']:02}]"
-        )
-    if "c_ptr" in args:
-        bytes_per_elem = args["c_ptr"].element_size()
-    else:
-        bytes_per_elem = 1 if args["FP8_OUTPUT"] else 2
+
+    bytes_per_elem = args["c_ptr"].element_size()
     ret[f"flops{bytes_per_elem * 8}"] = 2.0 * m * n * k
     ret["bytes"] = bytes_per_elem * (m * k + n * k + m * n)
     return ret
@@ -191,7 +181,6 @@ def grid(META):
             "num_warps": 8,
         },
     }
-    # print(a.device, b.device, c.device)
     matmul_kernel_persistent[grid](
         a,
         b,
@@ -420,7 +409,7 @@ def log_softmax(input: torch.Tensor, dim: int = -1) -> torch.Tensor:
         input: Input tensor
         dim: Dimension along which to compute log_softmax
              (only -1 or last dim supported)
-    >> Stashed changes
+
     Returns:
         Tensor with log_softmax applied along the specified dimension
     """
@@ -910,18 +899,11 @@ def linear_batch_invariant(input, weight, bias=None):
 
 _batch_invariant_MODE = False
 _batch_invariant_LIB = None
-_original_torch_bmm = None
-_original_fp16_reduction_precision = None
-_original_bf16_reduction_precision = None
-_original_cublas_workspace_cfg = None
-_original_cublaslt_workspace_size = None
 _fp16_block_size_n = 256
 
 
 def enable_batch_invariant_mode():
-    global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm
-    global _original_fp16_reduction_precision, _original_bf16_reduction_precision
-    global _original_cublas_workspace_cfg, _original_cublaslt_workspace_size
+    global _batch_invariant_MODE, _batch_invariant_LIB
     global _fp16_block_size_n
 
     if _batch_invariant_MODE:
@@ -941,10 +923,6 @@ def enable_batch_invariant_mode():
         # Hopper (SM90) and Blackwell (SM100): the only source of batch
         # variance is split-k, which we disable via the cuBLAS workspace
         # config.
-        _original_cublas_workspace_cfg = os.environ.get("CUBLAS_WORKSPACE_CONFIG", None)
-        _original_cublaslt_workspace_size = os.environ.get(
-            "CUBLASLT_WORKSPACE_SIZE", None
-        )
         os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
         os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1"
 
@@ -966,16 +944,8 @@ def enable_batch_invariant_mode():
     _batch_invariant_LIB.impl(
         "aten::bmm", bmm_batch_invariant, "CUDA", allow_override=True
     )
-    _original_torch_bmm = torch.bmm
     torch.bmm = bmm_batch_invariant
 
-    _original_bf16_reduction_precision = (
-        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction
-    )
-    _original_fp16_reduction_precision = (
-        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction
-    )
-
     reduced_precision_val = (
         (False, False) if is_torch_equal_or_newer("2.10.0") else False
     )
diff --git a/vllm/model_executor/layers/deepseek_v4_attention.py b/vllm/model_executor/layers/deepseek_v4_attention.py
index bf044ba5e7d6..6e5c04817b3b 100644
--- a/vllm/model_executor/layers/deepseek_v4_attention.py
+++ b/vllm/model_executor/layers/deepseek_v4_attention.py
@@ -29,11 +29,7 @@
     fused_q_kv_rmsnorm,
     quantize_and_insert_k_cache,
 )
-from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
-    rocm_forward_decode_fallback,
-    rocm_inv_rope_einsum,
-    rocm_sparse_attn_prefill,
-)
+from vllm.v1.attention.ops.rocm_aiter_mla_sparse import rocm_inv_rope_einsum
 
 if TYPE_CHECKING:
     from vllm.v1.attention.backends.mla.sparse_swa import (
@@ -835,6 +831,21 @@ def __init__(
         self.kv_cache = torch.tensor([])
 
     def get_attn_backend(self) -> type[AttentionBackend]:
+        if (
+            current_platform.is_rocm()
+            and not envs.VLLM_ROCM_USE_V4_TRITON_FALLBACK
+        ):
+            # Opt-in (env=0) routes ROCm through the new AITER sparse
+            # MLA backend. Default (env=1) falls through to the unified
+            # FlashMLASparse backend; the actual kernels are then
+            # supplied by ``vllm.v1.attention.ops.flashmla`` which on
+            # ROCm hands off to our Triton fallbacks
+            # (``flash_mla_with_kvcache_rocm`` / ``flash_mla_sparse_fwd_rocm``).
+            from vllm.v1.attention.backends.mla.rocm_aiter_mla_sparse_dsv4 import (
+                DeepseekV4ROCMAiterMLASparseBackend,
+            )
+
+            return DeepseekV4ROCMAiterMLASparseBackend
         return DeepseekV4FlashMLASparseBackend
 
     def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec | None:
@@ -867,6 +878,21 @@ def forward(
             f"output buffer dtype {output.dtype} must match q dtype {q.dtype}"
         )
 
+        if (
+            current_platform.is_rocm()
+            and not envs.VLLM_ROCM_USE_V4_TRITON_FALLBACK
+        ):
+            # See the matching gate in ``get_attn_backend``: env=0 opts
+            # into the AITER sparse MLA impl. Default (env=1) falls
+            # through to the unified path below, which routes ROCm to our
+            # Triton kernels via ``flashmla.py``.
+            from vllm.v1.attention.backends.mla.rocm_aiter_mla_sparse_dsv4 import (
+                DeepseekV4ROCMAiterMLASparseImpl,
+            )
+
+            DeepseekV4ROCMAiterMLASparseImpl.forward(self, q, kv, positions, output)
+            return
+
         # Get SWA and indexer metadata from forward context
         forward_context = get_forward_context()
         attn_metadata = forward_context.attn_metadata
@@ -949,34 +975,6 @@ def _forward_decode(
         swa_indices = swa_metadata.decode_swa_indices
         swa_lens = swa_metadata.decode_swa_lens
 
-        # When VLLM_ROCM_USE_V4_TRITON_FALLBACK is enabled (default on ROCm),
-        # we deliberately skip the upstream `rocm_forward_decode_fallback` and
-        # let the standard `flash_mla_with_kvcache` call below run. That call
-        # is mapped by `vllm.v1.attention.ops.flashmla` to our pre-rebase
-        # `flash_mla_with_kvcache_rocm` Triton/online-softmax fallback, which
-        # is the path that produced 95% GSM8K accuracy. The upstream torch
-        # reference (`rocm_ref_sparse_attn_decode`) has its own bugs that
-        # collapse generation to the base-model prior, so we keep it gated as
-        # an opt-in fallback for bisection only.
-        if current_platform.is_rocm() and not envs.VLLM_ROCM_USE_V4_TRITON_FALLBACK:
-            rocm_forward_decode_fallback(
-                q=q,
-                kv_cache=kv_cache,
-                swa_k_cache=self.swa_cache_layer.kv_cache,
-                swa_only=swa_only,
-                topk_indices=topk_indices,
-                topk_lens=topk_lens,
-                swa_indices=swa_indices,
-                swa_lens=swa_lens,
-                attn_sink=self.attn_sink,
-                scale=self.scale,
-                head_dim=self.head_dim,
-                nope_head_dim=self.nope_head_dim,
-                rope_head_dim=self.rope_head_dim,
-                output=output,
-            )
-            return
-
         # We treat queries in the same seq as different queries
         # and later we only attend by generated indices.
         # q arrives pre-padded to self.padded_heads by the outer wrapper.
@@ -1147,38 +1145,15 @@ def _forward_prefill(
                 M,
                 N,
             )
-
-            # See the matching comment in `_forward_decode`: by default
-            # (VLLM_ROCM_USE_V4_TRITON_FALLBACK=True) we send the prefill
-            # forward through `flash_mla_sparse_fwd`, which on ROCm is bound
-            # to our pre-rebase `flash_mla_sparse_fwd_rocm` chunked-online-
-            # softmax kernel via `vllm.v1.attention.ops.flashmla`. Set the env
-            # var to "0" to opt back into upstream's `rocm_sparse_attn_prefill`
-            # torch reference (kept for bisection / regression testing).
-            if (
-                current_platform.is_rocm()
-                and not envs.VLLM_ROCM_USE_V4_TRITON_FALLBACK
-            ):
-                rocm_sparse_attn_prefill(
-                    q=q[query_start:query_end],
-                    kv=kv.view(-1, 1, q.shape[-1]),
-                    indices=combined_indices.unsqueeze(1),
-                    topk_length=combined_lens,
-                    scale=self.scale,
-                    head_dim=self.head_dim,
-                    attn_sink=self.attn_sink,
-                    output=output[query_start:query_end],
-                )
-            else:
-                output_chunk, _, _ = flash_mla_sparse_fwd(
-                    q=q[query_start:query_end],
-                    kv=kv.view(-1, 1, q.shape[-1]),
-                    indices=combined_indices.unsqueeze(1),
-                    sm_scale=self.scale,
-                    attn_sink=self.attn_sink,
-                    topk_length=combined_lens,
-                    out=output[query_start:query_end],
-                )
+            flash_mla_sparse_fwd(
+                q=q[query_start:query_end],
+                kv=kv.view(-1, 1, q.shape[-1]),
+                indices=combined_indices.unsqueeze(1),
+                sm_scale=self.scale,
+                attn_sink=self.attn_sink,
+                topk_length=combined_lens,
+                out=output[query_start:query_end],
+            )
 
 
 class DeepseekV4IndexerCache(torch.nn.Module, AttentionLayerBase):
diff --git a/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py b/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py
index 3f7628487d69..a715504da6b7 100644
--- a/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py
+++ b/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py
@@ -83,7 +83,7 @@ def chunk_scaled_dot_kkt_fwd_kernel(
         )
         b_k = tl.load(p_k, boundary_check=(0, 1))
         b_kb = b_k * b_beta[:, None]
-        b_A += tl.dot(b_kb.to(b_k.dtype), tl.trans(b_k))
+        b_A += tl.dot(b_kb, tl.trans(b_k).to(b_kb.dtype))
 
     if USE_G:
         p_g = tl.make_block_ptr(g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 75a9faddc1f0..aeb704e18d86 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -85,6 +85,13 @@ def get_config() -> dict[str, Any] | None:
     from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
         DeepGemmExperts,
     )
+    from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import (
+        AiterExperts,
+    )
+    from vllm.model_executor.layers.fused_moe.experts.triton_moe import (
+        TritonExperts,
+        TritonWNA16Experts,
+    )
     from vllm.model_executor.layers.fused_moe.experts.xpu_moe import (
         XPUExperts,
         XPUExpertsFp8,
@@ -94,14 +101,9 @@ def get_config() -> dict[str, Any] | None:
         BatchedTritonExperts,
     )
     from vllm.model_executor.layers.fused_moe.fused_moe import (
-        TritonExperts,
-        TritonWNA16Experts,
         fused_experts,
         get_config_file_name,
     )
-    from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-        AiterExperts,
-    )
     from vllm.model_executor.layers.fused_moe.router.fused_topk_router import (
         fused_topk,
     )
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/experts/flashinfer_cutlass_moe.py
similarity index 100%
rename from vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
rename to vllm/model_executor/layers/fused_moe/experts/flashinfer_cutlass_moe.py
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/experts/marlin_moe.py
similarity index 99%
rename from vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
rename to vllm/model_executor/layers/fused_moe/experts/marlin_moe.py
index 3487ac1766e6..376ef3cee549 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/marlin_moe.py
@@ -29,6 +29,7 @@
 from vllm.model_executor.layers.fused_moe.utils import (
     _resize_cache,
     disable_inplace,
+    swiglu_limit_func,
 )
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     get_marlin_input_dtype,
@@ -50,8 +51,6 @@
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
 
-from .utils import swiglu_limit_func
-
 
 def _fused_marlin_moe(
     hidden_states: torch.Tensor,
@@ -414,6 +413,7 @@ def batched_fused_marlin_moe(
     is_k_full: bool = True,
     output: torch.Tensor | None = None,
     inplace: bool = False,
+    clamp_limit: float | None = None,
 ) -> torch.Tensor:
     """
     This function massages the inputs so the batched hidden_states can be
@@ -536,6 +536,7 @@ def batched_fused_marlin_moe(
         intermediate_cache2=intermediate_cache2,
         output=output.view(-1, K) if output is not None else output,
         is_k_full=is_k_full,
+        clamp_limit=clamp_limit,
     )
 
     output = output.view(B, BATCH_TOKENS_MAX, K)
@@ -769,6 +770,7 @@ def apply(
                 sort_indices2=self.w2_g_idx_sort_indices,
                 is_k_full=self.is_k_full,
                 input_dtype=self.input_dtype,
+                clamp_limit=self.gemm1_clamp_limit,
             )
             return
 
@@ -971,4 +973,5 @@ def apply(
             sort_indices1=self.w13_g_idx_sort_indices,
             sort_indices2=self.w2_g_idx_sort_indices,
             is_k_full=self.is_k_full,
+            clamp_limit=self.gemm1_clamp_limit,
         )
diff --git a/vllm/model_executor/layers/fused_moe/experts/nvfp4_emulation_moe.py b/vllm/model_executor/layers/fused_moe/experts/nvfp4_emulation_moe.py
index f1a0ee7ac52d..de5b45ccb87b 100644
--- a/vllm/model_executor/layers/fused_moe/experts/nvfp4_emulation_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/nvfp4_emulation_moe.py
@@ -20,7 +20,7 @@
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
-from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (
     dequantize_to_dtype,
diff --git a/vllm/model_executor/layers/fused_moe/experts/ocp_mx_emulation_moe.py b/vllm/model_executor/layers/fused_moe/experts/ocp_mx_emulation_moe.py
index 9fb163ef42af..feb8c2ea769b 100644
--- a/vllm/model_executor/layers/fused_moe/experts/ocp_mx_emulation_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/ocp_mx_emulation_moe.py
@@ -20,7 +20,7 @@
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
-from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.model_executor.layers.quantization.utils.mxfp4_utils import dequant_mxfp4
 from vllm.model_executor.layers.quantization.utils.mxfp6_utils import dequant_mxfp6
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/experts/rocm_aiter_moe.py
similarity index 100%
rename from vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
rename to vllm/model_executor/layers/fused_moe/experts/rocm_aiter_moe.py
diff --git a/vllm/model_executor/layers/fused_moe/experts/triton_moe.py b/vllm/model_executor/layers/fused_moe/experts/triton_moe.py
new file mode 100644
index 000000000000..e99f307ec0ec
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/experts/triton_moe.py
@@ -0,0 +1,522 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Triton-based MoE expert implementations."""
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEParallelConfig,
+    FusedMoEQuantConfig,
+)
+from vllm.model_executor.layers.fused_moe.fused_moe import (
+    _prepare_expert_assignment,
+    invoke_fused_moe_triton_kernel,
+    invoke_fused_moe_wna16_triton_kernel,
+    try_get_optimal_moe_config,
+)
+from vllm.model_executor.layers.fused_moe.lora_experts_mixin import (
+    LoRAExpertsMixin,
+)
+from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
+    moe_align_block_size,
+)
+from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
+    TopKWeightAndReduceNoOP,
+)
+from vllm.model_executor.layers.fused_moe.utils import (
+    _resize_cache,
+    moe_kernel_quantize_input,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    QuantKey,
+    kFp8Dynamic128Sym,
+    kFp8DynamicTensorSym,
+    kFp8DynamicTokenSym,
+    kFp8Static128BlockSym,
+    kFp8StaticChannelSym,
+    kFp8StaticTensorSym,
+    kInt8DynamicTokenSym,
+    kInt8StaticChannelSym,
+)
+from vllm.platforms import current_platform
+from vllm.triton_utils import tl
+
+
+class TritonExperts(LoRAExpertsMixin, mk.FusedMoEExpertsModular):
+    """Fused MoE experts in Standard activation format, run via Triton kernels."""
+
+    def __init__(
+        self,
+        moe_config: FusedMoEConfig,
+        quant_config: FusedMoEQuantConfig,
+    ):
+        # Whether quantized MOE runs natively, or through
+        # higher-precision + activation QDQ.
+        self.quantization_emulation = False
+        super().__init__(moe_config, quant_config)
+
+    @staticmethod
+    def activation_format() -> mk.FusedMoEActivationFormat:
+        return mk.FusedMoEActivationFormat.Standard
+
+    @staticmethod
+    def _supports_current_device() -> bool:
+        return current_platform.is_cuda_alike() or current_platform.is_xpu()
+
+    @staticmethod
+    def _supports_no_act_and_mul() -> bool:
+        return True
+
+    @staticmethod
+    def _supports_quant_scheme(
+        weight_key: QuantKey | None,
+        activation_key: QuantKey | None,
+    ) -> bool:
+        # INT8 requires CUDA with compute capability >= 7.5 (Turing).
+        device_supports_int8 = (
+            current_platform.is_cuda()
+            and current_platform.has_device_capability((7, 5))
+        )
+
+        supported: list[tuple[QuantKey | None, QuantKey | None]] = [(None, None)]
+        if device_supports_int8:
+            supported.append((kInt8StaticChannelSym, kInt8DynamicTokenSym))
+        if current_platform.supports_fp8():
+            supported += [
+                (kFp8Static128BlockSym, kFp8Dynamic128Sym),
+                (kFp8StaticChannelSym, kFp8DynamicTokenSym),
+                (kFp8StaticTensorSym, kFp8DynamicTokenSym),
+                (kFp8StaticTensorSym, kFp8StaticTensorSym),
+                (kFp8StaticTensorSym, kFp8DynamicTensorSym),
+            ]
+        return (weight_key, activation_key) in supported
+
+    @staticmethod
+    def _supports_activation(activation: MoEActivation) -> bool:
+        return activation in [
+            MoEActivation.SILU,
+            MoEActivation.GELU,
+            MoEActivation.GELU_TANH,
+            MoEActivation.SWIGLUOAI,
+            MoEActivation.SWIGLUSTEP,
+            MoEActivation.SILU_NO_MUL,
+            MoEActivation.GELU_NO_MUL,
+            MoEActivation.GELU_TANH_NO_MUL,
+            MoEActivation.RELU2_NO_MUL,
+        ]
+
+    @staticmethod
+    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
+        return not (
+            moe_parallel_config.use_fi_nvl_two_sided_kernels
+            or moe_parallel_config.use_fi_nvl_one_sided_kernels
+        )
+
+    @staticmethod
+    def _supports_batch_invariance() -> bool:
+        return True
+
+    def supports_expert_map(self) -> bool:
+        return True
+
+    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
+        return TopKWeightAndReduceNoOP()
+
+    def workspace_shapes(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: MoEActivation,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
+        workspace1 = (M, topk, max(activation_out_dim, K))
+        workspace2 = (M, topk, max(N, K))
+        output = (M, K)
+        return (workspace1, workspace2, output)
+
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: MoEActivation,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool,
+    ):
+        # Check shape, layout and dtype constraints on the inputs.
+        if self.quant_config.use_int4_w4a16:
+            assert hidden_states.size(-1) // 2 == w1.size(2), "Hidden size mismatch"
+        else:
+            assert hidden_states.size(-1) == w1.size(2), (
+                f"Hidden size mismatch {hidden_states.size(-1)} != {w1.size(2)}"
+            )
+
+        assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+        assert hidden_states.dim() == 2
+        assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
+        assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
+        assert hidden_states.dtype in [
+            torch.float32,
+            torch.float16,
+            torch.bfloat16,
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
+        ]
+
+        E, num_tokens, N, K, top_k_num = self.moe_problem_size(
+            hidden_states, w1, w2, topk_ids
+        )
+
+        if global_num_experts == -1:
+            global_num_experts = E
+
+        config = try_get_optimal_moe_config(
+            w1.size(),
+            w2.size(),
+            top_k_num,
+            self.quant_config.config_name(hidden_states.dtype),
+            num_tokens,
+            block_shape=self.block_shape,
+        )
+
+        if hidden_states.dtype == torch.bfloat16:
+            compute_type = tl.bfloat16
+        elif hidden_states.dtype == torch.float16:
+            compute_type = tl.float16
+        elif hidden_states.dtype == torch.float32:
+            compute_type = tl.float32
+        elif (
+            hidden_states.dtype == torch.float8_e4m3fn
+            or hidden_states.dtype == torch.float8_e4m3fnuz
+        ):
+            compute_type = tl.bfloat16
+        else:
+            raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
+
+        # Note: output might alias "workspace1" (presumably workspace13; confirm).
+        intermediate_cache1 = _resize_cache(workspace2, (num_tokens, top_k_num, N))
+        cache2_dim = self.adjust_N_for_activation(N, activation)
+        intermediate_cache2 = _resize_cache(
+            workspace13, (num_tokens * top_k_num, cache2_dim)
+        )
+        intermediate_cache3 = _resize_cache(workspace2, (num_tokens, top_k_num, K))
+
+        sorted_token_ids, expert_ids, num_tokens_post_padded = (
+            _prepare_expert_assignment(
+                topk_ids,
+                config,
+                num_tokens,
+                top_k_num,
+                global_num_experts,
+                expert_map,
+                use_int8_w8a16=self.quant_config.use_int8_w8a16,
+                use_int4_w4a16=self.quant_config.use_int4_w4a16,
+                block_shape=self.block_shape,
+            )
+        )
+
+        invoke_fused_moe_triton_kernel(
+            hidden_states,
+            w1,
+            intermediate_cache1,
+            a1q_scale,
+            self.w1_scale,
+            None,  # topk_weights
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            False,  # mul_routed_weights
+            top_k_num,
+            config,
+            compute_type=compute_type,
+            use_fp8_w8a8=self.quant_config.use_fp8_w8a8,
+            use_int8_w8a8=self.quant_config.use_int8_w8a8,
+            use_int8_w8a16=self.quant_config.use_int8_w8a16,
+            use_int4_w4a16=self.quant_config.use_int4_w4a16,
+            per_channel_quant=self.per_act_token_quant,
+            block_shape=self.block_shape,
+            B_bias=self.w1_bias,
+        )
+
+        # LoRA w13: applied to intermediate_cache1 before activation, using
+        # hidden_states as the lora_a input.  moe_lora_align_block_size is
+        # called once here and results reused for the w2 LoRA below.
+        sorted_token_ids_lora = None
+        expert_ids_lora = None
+        num_tokens_post_padded_lora = None
+        token_lora_mapping = None
+        lora_context = self._lora_context
+        if lora_context is not None:
+            (
+                sorted_token_ids_lora,
+                expert_ids_lora,
+                num_tokens_post_padded_lora,
+                token_lora_mapping,
+            ) = self.apply_w13_lora(
+                lora_context,
+                y=intermediate_cache1,
+                x=hidden_states,
+                topk_ids=topk_ids,
+                topk_weights=topk_weights,
+                expert_map=expert_map,
+                w1=w1,
+                w2=w2,
+                num_tokens=num_tokens,
+                top_k_num=top_k_num,
+            )
+
+        self.activation(
+            activation, intermediate_cache2, intermediate_cache1.view(-1, N)
+        )
+
+        a2q_scale: torch.Tensor | None = None
+
+        qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
+            intermediate_cache2,
+            a2_scale,
+            self.quant_dtype,
+            self.per_act_token_quant,
+            self.block_shape,
+            quantization_emulation=self.quantization_emulation,
+        )
+
+        invoke_fused_moe_triton_kernel(
+            qintermediate_cache2,
+            w2,
+            intermediate_cache3,
+            a2q_scale,
+            self.w2_scale,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            not apply_router_weight_on_input,  # mul_routed_weights
+            1,  # top_k slot (top_k_num in the w1 call)
+            config,
+            compute_type=compute_type,
+            use_fp8_w8a8=self.quant_config.use_fp8_w8a8,
+            use_int8_w8a8=self.quant_config.use_int8_w8a8,
+            use_int8_w8a16=self.quant_config.use_int8_w8a16,
+            use_int4_w4a16=self.quant_config.use_int4_w4a16,
+            per_channel_quant=self.per_act_token_quant,
+            block_shape=self.block_shape,
+            B_bias=self.w2_bias,
+        )
+
+        # LoRA w2: applied to intermediate_cache3 before moe_sum, using the
+        # unquantized intermediate_cache2 as the lora_a input.  Reuses the
+        # sorted_token_ids_lora computed above.
+        if lora_context is not None:
+            self.apply_w2_lora(
+                lora_context,
+                y=intermediate_cache3,
+                x=intermediate_cache2,
+                topk_weights=topk_weights,
+                sorted_token_ids_lora=sorted_token_ids_lora,
+                expert_ids_lora=expert_ids_lora,
+                num_tokens_post_padded_lora=num_tokens_post_padded_lora,
+                token_lora_mapping=token_lora_mapping,
+                num_tokens=num_tokens,
+                w1=w1,
+                w2=w2,
+                top_k_num=top_k_num,
+            )
+
+        # separate function is required for MoE + LoRA
+        self.moe_sum(intermediate_cache3, output)
+
+    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
+        ops.moe_sum(input, output)
+
+
+class TritonWNA16Experts(TritonExperts):  # experts run via the WNA16 Triton kernel
+    @staticmethod
+    def _supports_current_device() -> bool:
+        raise NotImplementedError(
+            "TritonWNA16Experts is not yet used by an Oracle. "
+            "This method should not be called."
+        )
+
+    @staticmethod
+    def _supports_no_act_and_mul() -> bool:
+        raise NotImplementedError(
+            "TritonWNA16Experts is not yet used by an Oracle. "
+            "This method should not be called."
+        )
+
+    @staticmethod
+    def _supports_quant_scheme(
+        weight_key: QuantKey | None,
+        activation_key: QuantKey | None,
+    ) -> bool:
+        raise NotImplementedError(
+            "TritonWNA16Experts is not yet used by an Oracle. "
+            "This method should not be called."
+        )
+
+    @staticmethod
+    def _supports_activation(activation: MoEActivation) -> bool:
+        raise NotImplementedError(
+            "TritonWNA16Experts is not yet used by an Oracle. "
+            "This method should not be called."
+        )
+
+    @staticmethod
+    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
+        raise NotImplementedError(
+            "TritonWNA16Experts is not yet used by an Oracle. "
+            "This method should not be called."
+        )
+
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: MoEActivation,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool,
+    ):
+        # Check shape, layout and dtype constraints on the inputs.
+        if self.quant_config.use_int4_w4a16:
+            assert hidden_states.size(-1) // 2 == w1.size(2), "Hidden size mismatch"
+        else:
+            assert hidden_states.size(-1) == w1.size(2), (
+                f"Hidden size mismatch {hidden_states.size(-1)} != {w1.size(2)}"
+            )
+
+        assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+        assert hidden_states.dim() == 2
+        assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
+        assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
+        assert hidden_states.dtype in [
+            torch.float32,
+            torch.float16,
+            torch.bfloat16,
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
+        ]
+
+        E, num_tokens, N, K, top_k_num = self.moe_problem_size(
+            hidden_states, w1, w2, topk_ids
+        )
+
+        if global_num_experts == -1:
+            global_num_experts = E
+
+        config = try_get_optimal_moe_config(
+            w1.size(),
+            w2.size(),
+            top_k_num,
+            self.quant_config.config_name(hidden_states.dtype),
+            num_tokens,
+            block_shape=self.block_shape,
+        )
+
+        if hidden_states.dtype == torch.bfloat16:
+            compute_type = tl.bfloat16
+        elif hidden_states.dtype == torch.float16:
+            compute_type = tl.float16
+        elif hidden_states.dtype == torch.float32:
+            compute_type = tl.float32
+        elif (
+            hidden_states.dtype == torch.float8_e4m3fn
+            or hidden_states.dtype == torch.float8_e4m3fnuz
+        ):
+            compute_type = tl.bfloat16
+        else:
+            raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
+
+        # Note: output might alias "workspace1" (presumably workspace13; confirm).
+        intermediate_cache1 = _resize_cache(workspace2, (num_tokens, top_k_num, N))
+        activation_out_dim = self.adjust_N_for_activation(N, activation)
+        intermediate_cache2 = _resize_cache(
+            workspace13, (num_tokens * top_k_num, activation_out_dim)
+        )
+        intermediate_cache3 = _resize_cache(workspace2, (num_tokens, top_k_num, K))
+
+        sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+            topk_ids, config["BLOCK_SIZE_M"], global_num_experts, expert_map
+        )
+
+        invoke_fused_moe_wna16_triton_kernel(
+            hidden_states,
+            w1,
+            intermediate_cache1,
+            self.w1_scale,
+            self.quant_config.w1_zp,
+            None,  # topk_weights
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            False,  # mul_routed_weights
+            top_k_num,
+            config,
+            compute_type=compute_type,
+            use_int8_w8a16=self.quant_config.use_int8_w8a16,
+            use_int4_w4a16=self.quant_config.use_int4_w4a16,
+            block_shape=self.block_shape,
+        )
+
+        self.activation(
+            activation, intermediate_cache2, intermediate_cache1.view(-1, N)
+        )
+
+        a2q_scale: torch.Tensor | None = None
+
+        qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
+            intermediate_cache2,
+            a2_scale,
+            self.quant_dtype,
+            self.per_act_token_quant,
+            self.block_shape,
+        )
+
+        invoke_fused_moe_wna16_triton_kernel(
+            qintermediate_cache2,
+            w2,
+            intermediate_cache3,
+            self.w2_scale,
+            self.quant_config.w2_zp,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            not apply_router_weight_on_input,  # mul_routed_weights
+            1,  # top_k slot (top_k_num in the w1 call)
+            config,
+            compute_type=compute_type,
+            use_int8_w8a16=self.quant_config.use_int8_w8a16,
+            use_int4_w4a16=self.quant_config.use_int4_w4a16,
+            block_shape=self.block_shape,
+        )
+
+        # separate function is required for MoE + LoRA
+        self.moe_sum(intermediate_cache3, output)
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 1a655934259a..8d974ea5671d 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -20,34 +20,16 @@
 )
 from vllm.model_executor.layers.fused_moe.config import (
     FUSED_MOE_UNQUANTIZED_CONFIG,
-    FusedMoEConfig,
-    FusedMoEParallelConfig,
     FusedMoEQuantConfig,
     _get_config_dtype_str,
 )
-from vllm.model_executor.layers.fused_moe.lora_experts_mixin import LoRAExpertsMixin
 from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
     moe_align_block_size,
 )
-from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceNoOP,
-)
 from vllm.model_executor.layers.fused_moe.utils import (
-    _resize_cache,
     disable_inplace,
     moe_kernel_quantize_input,
 )
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    QuantKey,
-    kFp8Dynamic128Sym,
-    kFp8DynamicTensorSym,
-    kFp8DynamicTokenSym,
-    kFp8Static128BlockSym,
-    kFp8StaticChannelSym,
-    kFp8StaticTensorSym,
-    kInt8DynamicTokenSym,
-    kInt8StaticChannelSym,
-)
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.torch_utils import direct_register_custom_op
@@ -1885,479 +1867,3 @@ def fused_experts_impl(
     )
 
     return out_hidden_states
-
-
-class TritonExperts(LoRAExpertsMixin, mk.FusedMoEExpertsModular):
-    """Triton-based fused MoE expert implementation."""
-
-    def __init__(
-        self,
-        moe_config: FusedMoEConfig,
-        quant_config: FusedMoEQuantConfig,
-    ):
-        # Whether quantized MOE runs natively, or through
-        # higher-precision + activation QDQ.
-        self.quantization_emulation = False
-        super().__init__(moe_config, quant_config)
-
-    @staticmethod
-    def activation_format() -> mk.FusedMoEActivationFormat:
-        return mk.FusedMoEActivationFormat.Standard
-
-    @staticmethod
-    def _supports_current_device() -> bool:
-        return current_platform.is_cuda_alike() or current_platform.is_xpu()
-
-    @staticmethod
-    def _supports_no_act_and_mul() -> bool:
-        return True
-
-    @staticmethod
-    def _supports_quant_scheme(
-        weight_key: QuantKey | None,
-        activation_key: QuantKey | None,
-    ) -> bool:
-        # INT8 requires at least 7.5 (Turing).
-        device_supports_int8 = (
-            current_platform.is_cuda()
-            and current_platform.has_device_capability((7, 5))
-        )
-
-        supported: list[tuple[QuantKey | None, QuantKey | None]] = [(None, None)]
-        if device_supports_int8:
-            supported.append((kInt8StaticChannelSym, kInt8DynamicTokenSym))
-        if current_platform.supports_fp8():
-            supported += [
-                (kFp8Static128BlockSym, kFp8Dynamic128Sym),
-                (kFp8StaticChannelSym, kFp8DynamicTokenSym),
-                (kFp8StaticTensorSym, kFp8DynamicTokenSym),
-                (kFp8StaticTensorSym, kFp8StaticTensorSym),
-                (kFp8StaticTensorSym, kFp8DynamicTensorSym),
-            ]
-        return (weight_key, activation_key) in supported
-
-    @staticmethod
-    def _supports_activation(activation: MoEActivation) -> bool:
-        return activation in [
-            MoEActivation.SILU,
-            MoEActivation.GELU,
-            MoEActivation.GELU_TANH,
-            MoEActivation.SWIGLUOAI,
-            MoEActivation.SWIGLUSTEP,
-            MoEActivation.SILU_NO_MUL,
-            MoEActivation.GELU_NO_MUL,
-            MoEActivation.GELU_TANH_NO_MUL,
-            MoEActivation.RELU2_NO_MUL,
-        ]
-
-    @staticmethod
-    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
-        return not (
-            moe_parallel_config.use_fi_nvl_two_sided_kernels
-            or moe_parallel_config.use_fi_nvl_one_sided_kernels
-        )
-
-    @staticmethod
-    def _supports_batch_invariance():
-        return True
-
-    def supports_expert_map(self) -> bool:
-        return True
-
-    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
-        return TopKWeightAndReduceNoOP()
-
-    def workspace_shapes(
-        self,
-        M: int,
-        N: int,
-        K: int,
-        topk: int,
-        global_num_experts: int,
-        local_num_experts: int,
-        expert_tokens_meta: mk.ExpertTokensMetadata | None,
-        activation: MoEActivation,
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
-        activation_out_dim = self.adjust_N_for_activation(N, activation)
-        workspace1 = (M, topk, max(activation_out_dim, K))
-        workspace2 = (M, topk, max(N, K))
-        output = (M, K)
-        return (workspace1, workspace2, output)
-
-    def apply(
-        self,
-        output: torch.Tensor,
-        hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        activation: MoEActivation,
-        global_num_experts: int,
-        expert_map: torch.Tensor | None,
-        a1q_scale: torch.Tensor | None,
-        a2_scale: torch.Tensor | None,
-        workspace13: torch.Tensor,
-        workspace2: torch.Tensor,
-        expert_tokens_meta: mk.ExpertTokensMetadata | None,
-        apply_router_weight_on_input: bool,
-    ):
-        # Check constraints.
-        if self.quant_config.use_int4_w4a16:
-            assert hidden_states.size(-1) // 2 == w1.size(2), "Hidden size mismatch"
-        else:
-            assert hidden_states.size(-1) == w1.size(2), (
-                f"Hidden size mismatch {hidden_states.size(-1)} != {w1.size(2)}"
-            )
-
-        assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
-        assert hidden_states.dim() == 2
-        assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
-        assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
-        assert hidden_states.dtype in [
-            torch.float32,
-            torch.float16,
-            torch.bfloat16,
-            torch.float8_e4m3fn,
-            torch.float8_e4m3fnuz,
-        ]
-
-        E, num_tokens, N, K, top_k_num = self.moe_problem_size(
-            hidden_states, w1, w2, topk_ids
-        )
-
-        if global_num_experts == -1:
-            global_num_experts = E
-
-        config = try_get_optimal_moe_config(
-            w1.size(),
-            w2.size(),
-            top_k_num,
-            self.quant_config.config_name(hidden_states.dtype),
-            num_tokens,
-            block_shape=self.block_shape,
-        )
-
-        if hidden_states.dtype == torch.bfloat16:
-            compute_type = tl.bfloat16
-        elif hidden_states.dtype == torch.float16:
-            compute_type = tl.float16
-        elif hidden_states.dtype == torch.float32:
-            compute_type = tl.float32
-        elif (
-            hidden_states.dtype == torch.float8_e4m3fn
-            or hidden_states.dtype == torch.float8_e4m3fnuz
-        ):
-            compute_type = tl.bfloat16
-        else:
-            raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
-
-        # Note that the output tensor might be in workspace1
-        intermediate_cache1 = _resize_cache(workspace2, (num_tokens, top_k_num, N))
-        cache2_dim = self.adjust_N_for_activation(N, activation)
-        intermediate_cache2 = _resize_cache(
-            workspace13, (num_tokens * top_k_num, cache2_dim)
-        )
-        intermediate_cache3 = _resize_cache(workspace2, (num_tokens, top_k_num, K))
-
-        sorted_token_ids, expert_ids, num_tokens_post_padded = (
-            _prepare_expert_assignment(
-                topk_ids,
-                config,
-                num_tokens,
-                top_k_num,
-                global_num_experts,
-                expert_map,
-                use_int8_w8a16=self.quant_config.use_int8_w8a16,
-                use_int4_w4a16=self.quant_config.use_int4_w4a16,
-                block_shape=self.block_shape,
-            )
-        )
-
-        invoke_fused_moe_triton_kernel(
-            hidden_states,
-            w1,
-            intermediate_cache1,
-            a1q_scale,
-            self.w1_scale,
-            None,  # topk_weights
-            sorted_token_ids,
-            expert_ids,
-            num_tokens_post_padded,
-            False,  # mul_routed_weights
-            top_k_num,
-            config,
-            compute_type=compute_type,
-            use_fp8_w8a8=self.quant_config.use_fp8_w8a8,
-            use_int8_w8a8=self.quant_config.use_int8_w8a8,
-            use_int8_w8a16=self.quant_config.use_int8_w8a16,
-            use_int4_w4a16=self.quant_config.use_int4_w4a16,
-            per_channel_quant=self.per_act_token_quant,
-            block_shape=self.block_shape,
-            B_bias=self.w1_bias,
-        )
-
-        # LoRA w13: applied to intermediate_cache1 before activation, using
-        # hidden_states as the lora_a input.  moe_lora_align_block_size is
-        # called once here and results reused for the w2 LoRA below.
-        sorted_token_ids_lora = None
-        expert_ids_lora = None
-        num_tokens_post_padded_lora = None
-        token_lora_mapping = None
-        lora_context = self._lora_context
-        if lora_context is not None:
-            (
-                sorted_token_ids_lora,
-                expert_ids_lora,
-                num_tokens_post_padded_lora,
-                token_lora_mapping,
-            ) = self.apply_w13_lora(
-                lora_context,
-                y=intermediate_cache1,
-                x=hidden_states,
-                topk_ids=topk_ids,
-                topk_weights=topk_weights,
-                expert_map=expert_map,
-                w1=w1,
-                w2=w2,
-                num_tokens=num_tokens,
-                top_k_num=top_k_num,
-            )
-
-        self.activation(
-            activation, intermediate_cache2, intermediate_cache1.view(-1, N)
-        )
-
-        a2q_scale: torch.Tensor | None = None
-
-        qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
-            intermediate_cache2,
-            a2_scale,
-            self.quant_dtype,
-            self.per_act_token_quant,
-            self.block_shape,
-            quantization_emulation=self.quantization_emulation,
-        )
-
-        invoke_fused_moe_triton_kernel(
-            qintermediate_cache2,
-            w2,
-            intermediate_cache3,
-            a2q_scale,
-            self.w2_scale,
-            topk_weights,
-            sorted_token_ids,
-            expert_ids,
-            num_tokens_post_padded,
-            not apply_router_weight_on_input,
-            1,
-            config,
-            compute_type=compute_type,
-            use_fp8_w8a8=self.quant_config.use_fp8_w8a8,
-            use_int8_w8a8=self.quant_config.use_int8_w8a8,
-            use_int8_w8a16=self.quant_config.use_int8_w8a16,
-            use_int4_w4a16=self.quant_config.use_int4_w4a16,
-            per_channel_quant=self.per_act_token_quant,
-            block_shape=self.block_shape,
-            B_bias=self.w2_bias,
-        )
-
-        # LoRA w2: applied to intermediate_cache3 before moe_sum, using the
-        # unquantized intermediate_cache2 as the lora_a input.  Reuses the
-        # sorted_token_ids_lora computed above.
-        if lora_context is not None:
-            self.apply_w2_lora(
-                lora_context,
-                y=intermediate_cache3,
-                x=intermediate_cache2,
-                topk_weights=topk_weights,
-                sorted_token_ids_lora=sorted_token_ids_lora,
-                expert_ids_lora=expert_ids_lora,
-                num_tokens_post_padded_lora=num_tokens_post_padded_lora,
-                token_lora_mapping=token_lora_mapping,
-                num_tokens=num_tokens,
-                w1=w1,
-                w2=w2,
-                top_k_num=top_k_num,
-            )
-
-        # separate function is required for MoE + LoRA
-        self.moe_sum(intermediate_cache3, output)
-
-    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
-        ops.moe_sum(input, output)
-
-
-class TritonWNA16Experts(TritonExperts):
-    @staticmethod
-    def _supports_current_device() -> bool:
-        raise NotImplementedError(
-            "TritonWNA16Experts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
-
-    @staticmethod
-    def _supports_no_act_and_mul() -> bool:
-        raise NotImplementedError(
-            "TritonWNA16Experts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
-
-    @staticmethod
-    def _supports_quant_scheme(
-        weight_key: QuantKey | None,
-        activation_key: QuantKey | None,
-    ) -> bool:
-        raise NotImplementedError(
-            "TritonWNA16Experts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
-
-    @staticmethod
-    def _supports_activation(activation: MoEActivation) -> bool:
-        raise NotImplementedError(
-            "TritonWNA16Experts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
-
-    @staticmethod
-    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
-        raise NotImplementedError(
-            "TritonWNA16Experts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
-
-    def apply(
-        self,
-        output: torch.Tensor,
-        hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        activation: MoEActivation,
-        global_num_experts: int,
-        expert_map: torch.Tensor | None,
-        a1q_scale: torch.Tensor | None,
-        a2_scale: torch.Tensor | None,
-        workspace13: torch.Tensor,
-        workspace2: torch.Tensor,
-        expert_tokens_meta: mk.ExpertTokensMetadata | None,
-        apply_router_weight_on_input: bool,
-    ):
-        # Check constraints.
-        if self.quant_config.use_int4_w4a16:
-            assert hidden_states.size(-1) // 2 == w1.size(2), "Hidden size mismatch"
-        else:
-            assert hidden_states.size(-1) == w1.size(2), (
-                f"Hidden size mismatch {hidden_states.size(-1)} != {w1.size(2)}"
-            )
-
-        assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
-        assert hidden_states.dim() == 2
-        assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
-        assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
-        assert hidden_states.dtype in [
-            torch.float32,
-            torch.float16,
-            torch.bfloat16,
-            torch.float8_e4m3fn,
-            torch.float8_e4m3fnuz,
-        ]
-
-        E, num_tokens, N, K, top_k_num = self.moe_problem_size(
-            hidden_states, w1, w2, topk_ids
-        )
-
-        if global_num_experts == -1:
-            global_num_experts = E
-
-        config = try_get_optimal_moe_config(
-            w1.size(),
-            w2.size(),
-            top_k_num,
-            self.quant_config.config_name(hidden_states.dtype),
-            num_tokens,
-            block_shape=self.block_shape,
-        )
-
-        if hidden_states.dtype == torch.bfloat16:
-            compute_type = tl.bfloat16
-        elif hidden_states.dtype == torch.float16:
-            compute_type = tl.float16
-        elif hidden_states.dtype == torch.float32:
-            compute_type = tl.float32
-        elif (
-            hidden_states.dtype == torch.float8_e4m3fn
-            or hidden_states.dtype == torch.float8_e4m3fnuz
-        ):
-            compute_type = tl.bfloat16
-        else:
-            raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
-
-        # Note that the output tensor might be in workspace1
-        intermediate_cache1 = _resize_cache(workspace2, (num_tokens, top_k_num, N))
-        activation_out_dim = self.adjust_N_for_activation(N, activation)
-        intermediate_cache2 = _resize_cache(
-            workspace13, (num_tokens * top_k_num, activation_out_dim)
-        )
-        intermediate_cache3 = _resize_cache(workspace2, (num_tokens, top_k_num, K))
-
-        sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
-            topk_ids, config["BLOCK_SIZE_M"], global_num_experts, expert_map
-        )
-
-        invoke_fused_moe_wna16_triton_kernel(
-            hidden_states,
-            w1,
-            intermediate_cache1,
-            self.w1_scale,
-            self.quant_config.w1_zp,
-            None,  # topk_weights
-            sorted_token_ids,
-            expert_ids,
-            num_tokens_post_padded,
-            False,  # mul_routed_weights
-            top_k_num,
-            config,
-            compute_type=compute_type,
-            use_int8_w8a16=self.quant_config.use_int8_w8a16,
-            use_int4_w4a16=self.quant_config.use_int4_w4a16,
-            block_shape=self.block_shape,
-        )
-
-        self.activation(
-            activation, intermediate_cache2, intermediate_cache1.view(-1, N)
-        )
-
-        a2q_scale: torch.Tensor | None = None
-
-        qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
-            intermediate_cache2,
-            a2_scale,
-            self.quant_dtype,
-            self.per_act_token_quant,
-            self.block_shape,
-        )
-
-        invoke_fused_moe_wna16_triton_kernel(
-            qintermediate_cache2,
-            w2,
-            intermediate_cache3,
-            self.w2_scale,
-            self.quant_config.w2_zp,
-            topk_weights,
-            sorted_token_ids,
-            expert_ids,
-            num_tokens_post_padded,
-            not apply_router_weight_on_input,
-            1,
-            config,
-            compute_type=compute_type,
-            use_int8_w8a16=self.quant_config.use_int8_w8a16,
-            use_int4_w4a16=self.quant_config.use_int4_w4a16,
-            block_shape=self.block_shape,
-        )
-
-        # separate function is required for MoE + LoRA
-        self.moe_sum(intermediate_cache3, output)
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index acb601eff558..2eef89793a6e 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -26,15 +26,15 @@
     FusedMoEQuantConfig,
     RoutingMethodType,
 )
+from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import (
+    init_aiter_topK_meta_data,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
     FusedMoEMethodBase,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import (
     FusedMoEModularMethod,
 )
-from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-    init_aiter_topK_meta_data,
-)
 from vllm.model_executor.layers.fused_moe.router.router_factory import (
     create_fused_moe_router,
 )
diff --git a/vllm/model_executor/layers/fused_moe/lora_context.py b/vllm/model_executor/layers/fused_moe/lora_context.py
index 92500a7bb47d..ab1f0bfc1476 100644
--- a/vllm/model_executor/layers/fused_moe/lora_context.py
+++ b/vllm/model_executor/layers/fused_moe/lora_context.py
@@ -42,3 +42,10 @@ class MoELoRAContext:
     # Whether VLLM_TUNED_CONFIG_FOLDER is set; selects get_lora_op_configs vs
     # try_get_optimal_moe_lora_config for Triton kernel tile configs.
     use_tuned_config: bool
+
+    # Per-rank token→LoRA mapping after EP dispatch. Set by
+    # FusedMoEPrepareAndFinalizeModular.prepare() when EP+LoRA is active, read
+    # by LoRAExpertsMixin helpers in place of punica_wrapper's global mapping.
+    # None means no dispatch happened (non-EP path), in which case callers
+    # fall back to punica_wrapper.token_mapping_meta.
+    local_token_lora_mapping: torch.Tensor | None = None
diff --git a/vllm/model_executor/layers/fused_moe/lora_experts_mixin.py b/vllm/model_executor/layers/fused_moe/lora_experts_mixin.py
index c609c5cf56b5..10707b91b70e 100644
--- a/vllm/model_executor/layers/fused_moe/lora_experts_mixin.py
+++ b/vllm/model_executor/layers/fused_moe/lora_experts_mixin.py
@@ -70,6 +70,7 @@ def apply_w13_lora(
             lora_context.w13_num_slices,
             lora_context.fully_sharded,
             lora_context.use_tuned_config,
+            token_lora_mapping=lora_context.local_token_lora_mapping,
         )
 
     def apply_w2_lora(
diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
index b65f8efa4a70..1ce5a1e3213b 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
@@ -123,7 +123,7 @@ def backend_to_kernel_cls(
         return [TrtLlmFp8ExpertsMonolithic, TrtLlmFp8ExpertsModular]
 
     elif backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
-        from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.flashinfer_cutlass_moe import (  # noqa: E501
             FlashInferExperts,
         )
 
@@ -144,14 +144,14 @@ def backend_to_kernel_cls(
         return [BatchedDeepGemmExperts]
 
     elif backend == Fp8MoeBackend.MARLIN:
-        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.marlin_moe import (
             MarlinExperts,
         )
 
         return [MarlinExperts]
 
     elif backend == Fp8MoeBackend.TRITON:
-        from vllm.model_executor.layers.fused_moe.fused_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.triton_moe import (
             TritonExperts,
         )
 
@@ -165,7 +165,7 @@ def backend_to_kernel_cls(
         return [BatchedTritonExperts]
 
     elif backend == Fp8MoeBackend.AITER:
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import (
             AiterExperts,
         )
 
diff --git a/vllm/model_executor/layers/fused_moe/oracle/int8.py b/vllm/model_executor/layers/fused_moe/oracle/int8.py
index cdb1be108b5d..0c1d641a8e62 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/int8.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/int8.py
@@ -46,7 +46,7 @@ def backend_to_kernel_cls(
     backend: Int8MoeBackend,
 ) -> list[type[mk.FusedMoEExperts]]:
     if backend == Int8MoeBackend.TRITON:
-        from vllm.model_executor.layers.fused_moe.fused_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.triton_moe import (
             TritonExperts,
         )
 
@@ -79,9 +79,6 @@ def select_int8_moe_backend(
     Note: Shape-specific fallbacks may still occur at runtime.
     """
 
-    if config.is_lora_enabled:
-        return Int8MoeBackend.TRITON, backend_to_kernel_cls(Int8MoeBackend.TRITON)[0]
-
     AVAILABLE_BACKENDS = _get_priority_backends(config)
 
     activation_format = (
diff --git a/vllm/model_executor/layers/fused_moe/oracle/int_wna16.py b/vllm/model_executor/layers/fused_moe/oracle/int_wna16.py
index 5503d233f128..a0dde3cb7f77 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/int_wna16.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/int_wna16.py
@@ -12,7 +12,7 @@
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+from vllm.model_executor.layers.fused_moe.experts.marlin_moe import (
     BatchedMarlinExperts,
     MarlinExperts,
 )
@@ -42,14 +42,14 @@ def backend_to_kernel_cls(
 ) -> list[type[mk.FusedMoEExperts]]:
     """Return the experts class for the given backend, or None for NONE."""
     if backend == WNA16MoEBackend.MARLIN:
-        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.marlin_moe import (
             MarlinExperts,
         )
 
         return [MarlinExperts]
 
     elif backend == WNA16MoEBackend.BATCHED_MARLIN:
-        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.marlin_moe import (
             BatchedMarlinExperts,
         )
 
diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
index 1cc94a347c42..4de0d722d50b 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
@@ -124,7 +124,7 @@ def backend_to_kernel_cls(
         Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
         Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
     ):
-        from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.flashinfer_cutlass_moe import (  # noqa: E501
             FlashInferExperts,
         )
 
@@ -160,21 +160,21 @@ def backend_to_kernel_cls(
         ]
 
     elif backend == Mxfp4MoeBackend.MARLIN:
-        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.marlin_moe import (
             MarlinExperts,
         )
 
         return [MarlinExperts]
 
     elif backend == Mxfp4MoeBackend.BATCHED_MARLIN:
-        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.marlin_moe import (
             BatchedMarlinExperts,
         )
 
         return [BatchedMarlinExperts]
 
     elif backend == Mxfp4MoeBackend.AITER_MXFP4_BF16:
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import (
             AiterExperts,
         )
 
@@ -301,28 +301,6 @@ def select_mxfp4_moe_backend(
     """
     # If activation_key is explicitly provided (e.g., W4A8), use it
     requested_activation_key = activation_key
-    device_capability = current_platform.get_device_capability()
-    triton_kernels_supported = (
-        has_triton_kernels()
-        and device_capability is not None
-        and (9, 0) <= device_capability < (11, 0)
-    )
-
-    # LoRA: separate experts backend path
-    if config.is_lora_enabled:
-        if not current_platform.is_cuda():
-            # ROCm: Triton mxfp4 LoRA hits GPU memory faults due to
-            # triton_kernels.tensor.Tensor / HIP read-only page issues
-            # during weight swizzle and LoRA forward. Needs work from
-            # the triton_kernels/aiter side.
-            raise NotImplementedError("Mxfp4 LoRA is currently only supported on CUDA.")
-        if envs.VLLM_MXFP4_USE_MARLIN is False and triton_kernels_supported:
-            logger.info_once("Using Triton backend for mxfp4 lora")
-            return Mxfp4MoeBackend.TRITON_UNFUSED, backend_to_kernel_cls(
-                Mxfp4MoeBackend.TRITON_UNFUSED
-            )[0]
-        logger.info_once("Using Marlin backend for mxfp4 lora")
-        return Mxfp4MoeBackend.MARLIN, backend_to_kernel_cls(Mxfp4MoeBackend.MARLIN)[0]
 
     activation_format = (
         mk.FusedMoEActivationFormat.BatchedExperts
diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py
index c67def149b9d..8133902d519b 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py
@@ -61,8 +61,6 @@ def select_mxfp8_moe_backend(
     Returns:
         A tuple of (fp8_backend, experts_cls).
     """
-    if config.is_lora_enabled:
-        raise NotImplementedError("LoRA is not supported for MXFP8 MoE.")
 
     runner_backend = config.moe_backend
     if runner_backend != "auto":
diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
index 12c543b419fb..d22fe95f1e51 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
@@ -89,7 +89,7 @@ def backend_to_kernel_cls(
         ]
 
     elif backend == NvFp4MoeBackend.FLASHINFER_CUTLASS:
-        from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.flashinfer_cutlass_moe import (  # noqa: E501
             FlashInferExperts,
         )
 
@@ -117,7 +117,7 @@ def backend_to_kernel_cls(
         return [CutlassExpertsFp4]
 
     elif backend == NvFp4MoeBackend.MARLIN:
-        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.marlin_moe import (
             MarlinExperts,
         )
 
diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
index 8240a5e8c963..f1fd196c09c1 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
@@ -95,21 +95,23 @@ def backend_to_kernel_cls(
         return TrtLlmBf16Experts
 
     elif backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS:
-        from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.flashinfer_cutlass_moe import (  # noqa: E501
             FlashInferExperts,
         )
 
         return FlashInferExperts
 
     elif backend == UnquantizedMoeBackend.AITER:
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import (
             AiterExperts,
         )
 
         return AiterExperts
 
     elif backend == UnquantizedMoeBackend.TRITON:
-        from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+        from vllm.model_executor.layers.fused_moe.experts.triton_moe import (
+            TritonExperts,
+        )
 
         return TritonExperts
 
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py
index 54d77101a3fc..b8633726c72b 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py
@@ -84,6 +84,14 @@ def __init__(
         super().__init__()
         self.is_sequence_parallel = is_sequence_parallel
         self._num_dispatchers = num_dispatchers
+        # Set by FusedMoEWithLoRA.set_mapping() when LoRA is active. When
+        # present, prepare() dispatches the per-token LoRA mapping alongside
+        # hidden_states and writes the gathered result back to the context so
+        # experts can use the per-rank-local mapping.
+        self._lora_context = None
+
+    def set_lora_context(self, ctx) -> None:
+        self._lora_context = ctx
 
     @property
     def activation_format(self) -> mk.FusedMoEActivationFormat:
@@ -124,22 +132,54 @@ def prepare(
 
         a1q, scales = _quantize_and_setup_dispatch(a1, quant_config, defer_input_quant)
 
+        # When LoRA is active, dispatch the per-token LoRA id along with
+        # hidden_states so every rank receives the correct mapping for the
+        # tokens it ends up processing. The punica_wrapper stores indices as
+        # int64 but the moe_lora_align_block_size kernel expects int32, so
+        # pull the pre-cast view from token_mapping_meta.
+        lora_ctx = self._lora_context
+        local_token_lora_mapping = None
+        if lora_ctx is not None:
+            local_token_lora_mapping = (
+                lora_ctx.punica_wrapper.token_mapping_meta.token_lora_mapping[
+                    : a1.shape[0]
+                ]
+            )
+
+        extra_tensors: list[torch.Tensor] | None = None
+        if scales is not None:
+            extra_tensors = list(scales)
+        if local_token_lora_mapping is not None:
+            if extra_tensors is None:
+                extra_tensors = []
+            extra_tensors.append(local_token_lora_mapping)
+
         res = get_ep_group().dispatch(
             a1q,
             topk_weights,
             topk_ids,
             is_sequence_parallel=self.is_sequence_parallel,
-            extra_tensors=scales,
+            extra_tensors=extra_tensors,
         )
 
-        if scales is None:
+        if extra_tensors is None:
             assert len(res) == 3
             a1q, topk_weights, topk_ids = res
             a1q_scale = None
         else:
             assert len(res) == 4
-            a1q, topk_weights, topk_ids, scales = res
-            a1q_scale = _unwrap_scale_and_prepare_for_moe(scales, quant_config)
+            a1q, topk_weights, topk_ids, gathered_extras = res
+            gathered_extras = list(gathered_extras)
+            if local_token_lora_mapping is not None:
+                dispatched_lora_mapping = gathered_extras.pop()
+                assert lora_ctx is not None
+                lora_ctx.local_token_lora_mapping = dispatched_lora_mapping
+            if scales is not None:
+                a1q_scale = _unwrap_scale_and_prepare_for_moe(
+                    gathered_extras, quant_config
+                )
+            else:
+                a1q_scale = None
 
         return a1q, a1q_scale, None, topk_ids, topk_weights
 
diff --git a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py
index 8e35169d9005..a3e0075f2b7d 100644
--- a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py
+++ b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py
@@ -34,7 +34,7 @@ def __init__(
 
     @property
     def routing_method_type(self) -> RoutingMethodType:
-        from vllm.model_executor.models.cohere_moe import token_choice_with_bias
+        from vllm.model_executor.models.cohere2_moe import token_choice_with_bias
         from vllm.model_executor.models.llama4 import Llama4MoE
 
         # NOTE: FLASHINFER_TRTLLM support the Llama4 router.
diff --git a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py
index 74c3a62a1f11..37d812a24bbe 100644
--- a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py
+++ b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py
@@ -14,7 +14,7 @@
     RoutingMethodType,
     get_routing_method_type,
 )
-from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import (
     rocm_aiter_grouped_topk,
 )
 from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter
diff --git a/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py
index 70431878932d..ee3622b4ebe3 100644
--- a/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py
@@ -11,8 +11,8 @@
     FusedMoEQuantConfig,
 )
 from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8
+from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts
 from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts
-from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
 from vllm.platforms import current_platform
 
 
diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
index d58dbc20e244..fbcf030a9256 100644
--- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
@@ -14,8 +14,8 @@
     _valid_deep_gemm,
     _valid_deep_gemm_shape,
 )
+from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts
 from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts
-from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
 from vllm.utils.deep_gemm import (
     is_deep_gemm_e8m0_used,
 )
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index 91de16f79c68..d9cfce2c2141 100644
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -171,6 +171,11 @@ def _setup_kernel(
         replace_parameter(layer, "w13_weight", w13_new, prefer_copy=is_weight_update)
         replace_parameter(layer, "w2_weight", w2_new, prefer_copy=is_weight_update)
 
+        # AITER backend requires weights to be marked as shuffled.
+        if self.unquantized_backend == UnquantizedMoeBackend.AITER:
+            layer.w13_weight.is_shuffled = True
+            layer.w2_weight.is_shuffled = True
+
         if not is_weight_update:
             # Setup moe kernel only on the first call. For the unquantized
             # method, moe_quant_config is either the constant
diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py
index 70c67f33f0ac..f079f3dc9765 100644
--- a/vllm/model_executor/layers/kda.py
+++ b/vllm/model_executor/layers/kda.py
@@ -17,6 +17,7 @@
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 
 from .fla.ops.kda import (
     FusedRMSNormGated,
@@ -84,8 +85,8 @@ def kda_attention_fake(
 
 class KimiDeltaAttention(nn.Module, MambaBase):
     @property
-    def mamba_type(self) -> str:
-        return "gdn_attention"
+    def mamba_type(self) -> MambaAttentionBackendEnum:
+        return MambaAttentionBackendEnum.GDN_ATTN
 
     def get_state_dtype(
         self,
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 765e79331d1e..323cad9427d3 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -60,6 +60,7 @@
     "ModelOptFp8PbWoLinearMethod",
     "QuarkLinearMethod",
     "ModelOptNvFp4LinearMethod",
+    "ModelOptNvFp4W4A16LinearMethod",
     "HummingLinearMethod",
 ]
 
diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py
index 2c05880c0fe1..8bbb21d7bc90 100644
--- a/vllm/model_executor/layers/mamba/abstract.py
+++ b/vllm/model_executor/layers/mamba/abstract.py
@@ -8,6 +8,7 @@
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.v1.attention.backend import AttentionBackend
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 from vllm.v1.attention.selector import get_mamba_attn_backend
 from vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec
 
@@ -33,7 +34,7 @@ def get_state_shape(self) -> Iterable[tuple[int, ...]]:
 
     @property
     @abstractmethod
-    def mamba_type(self) -> str:
+    def mamba_type(self) -> MambaAttentionBackendEnum:
         pass
 
     @abstractmethod
diff --git a/vllm/model_executor/layers/mamba/gdn_linear_attn.py b/vllm/model_executor/layers/mamba/gdn_linear_attn.py
index 518e9d4f0cff..4493ee783630 100644
--- a/vllm/model_executor/layers/mamba/gdn_linear_attn.py
+++ b/vllm/model_executor/layers/mamba/gdn_linear_attn.py
@@ -64,6 +64,7 @@
     direct_register_custom_op,
 )
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 
 # Optional ROCm AITER Triton kernels for the GDN decode fast-path.
 # Availability is checked centrally via rocm_aiter_ops; the actual function
@@ -237,8 +238,8 @@ def forward_native(
 @PluggableLayer.register("gated_delta_net_attention")
 class GatedDeltaNetAttention(PluggableLayer, MambaBase):
     @property
-    def mamba_type(self) -> str:
-        return "gdn_attention"
+    def mamba_type(self) -> MambaAttentionBackendEnum:
+        return MambaAttentionBackendEnum.GDN_ATTN
 
     def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]:
         return MambaStateDtypeCalculator.gated_delta_net_state_dtype(
@@ -263,7 +264,6 @@ def __init__(
         config: Qwen3NextConfig,
         vllm_config: VllmConfig,
         prefix: str = "",
-        create_in_proj_qkvz: bool = True,
         gqa_interleaved_layout=False,
     ) -> None:
         super().__init__()
@@ -323,32 +323,14 @@ def __init__(
         # we need to create qkvz_proj adaptively here.
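-        # When create_in_proj_qkvz is False (e.g. LoRA enabled in Qwen3.5),
-        # in_proj_qkv and in_proj_z are created separately instead.
+        # A single fused in_proj_qkvz is always built via create_qkvz_proj;
+        # q/k/v and z are unpacked from its output in the forward methods.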
-        self.has_lora_projections = not create_in_proj_qkvz
-        if create_in_proj_qkvz:
-            self.in_proj_qkvz = self.create_qkvz_proj(
-                hidden_size=self.hidden_size,
-                key_dim=self.key_dim,
-                value_dim=self.value_dim,
-                quant_config=quant_config,
-                prefix=f"{prefix}.in_proj_qkvz",
-            )
-        else:
-            # LoRA case (Qwen3.5 only): keep q/k/v and z as separate modules
-            # so that LoRA adapters can be applied independently.
-            self.in_proj_qkv = MergedColumnParallelLinear(
-                input_size=self.hidden_size,
-                output_sizes=[self.key_dim, self.key_dim, self.value_dim],
-                bias=False,
-                quant_config=quant_config,
-                prefix=f"{prefix}.in_proj_qkv",
-            )
-            self.in_proj_z = ColumnParallelLinear(
-                input_size=self.hidden_size,
-                output_size=self.value_dim,
-                bias=False,
-                quant_config=quant_config,
-                prefix=f"{prefix}.in_proj_z",
-            )
+        self.in_proj_qkvz = self.create_qkvz_proj(
+            hidden_size=self.hidden_size,
+            key_dim=self.key_dim,
+            value_dim=self.value_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.in_proj_qkvz",
+        )
+
         # ba_proj doesn't support blockwise fp8 quantization.
         # Qwen3-Next and Qwen3.5 have different in_proj_ba checkpoint
         # layouts, so we use a factory method to create the projection.
@@ -637,7 +619,6 @@ def prepare_gdn_attention_core_inputs(
 
         return mixed_qkv_out, z_out, b_out, a_out
 
-    @torch.compile(fullgraph=True)
     def rearrange_mixed_qkv(self, mixed_qkv):
         """Split packed qkv into contiguous (1, seq, heads, dim) tensors.
 
@@ -708,7 +689,7 @@ def forward_hip(
     ):
         """ROCm forward using AITER Triton fused projection+attention when
         available, otherwise falling back to the generic CUDA path."""
-        if not self.has_lora_projections and GDN_AITER_TRITON_AVAILABLE:
+        if GDN_AITER_TRITON_AVAILABLE:
             num_tokens = hidden_states.size(0)
             projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
             projected_states_ba, _ = self.in_proj_ba(hidden_states)
@@ -753,37 +734,27 @@ def forward_cuda(
         # ============================================================
         # Part 1: Input Projection
         # ============================================================
-        if self.has_lora_projections:
-            # LoRA path (Qwen3.5 only): separate in_proj_qkv and in_proj_z
-            mixed_qkv, _ = self.in_proj_qkv(hidden_states)
-            ba, _ = self.in_proj_ba(hidden_states)
-            z, _ = self.in_proj_z(hidden_states)
+        mixed_qkvz, _ = self.in_proj_qkvz(hidden_states)
+        ba, _ = self.in_proj_ba(hidden_states)
+
+        if self.gqa_interleaved_layout:
+            # Qwen3-Next: unpack the interleaved GQA layout
+            query, key, value, z, b, a = self.fix_query_key_value_ordering(
+                mixed_qkvz, ba
+            )
+            query, key, value = map(
+                lambda x: rearrange(x, "l p d -> l (p d)"), (query, key, value)
+            )
+            mixed_qkv = torch.cat((query, key, value), dim=-1)
+        else:
+            # Qwen3.5: weights are already in [q, k, v, z] and [b, a] order
+            qkv_size = (self.key_dim * 2 + self.value_dim) // self.tp_size
+            z_size = self.value_dim // self.tp_size
+            mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
             z = z.reshape(z.size(0), -1, self.head_v_dim)
             b, a = ba.chunk(2, dim=-1)
             b = b.contiguous()
             a = a.contiguous()
-        else:
-            mixed_qkvz, _ = self.in_proj_qkvz(hidden_states)
-            ba, _ = self.in_proj_ba(hidden_states)
-
-            if self.gqa_interleaved_layout:
-                # Qwen3-Next: unpack the interleaved GQA layout
-                query, key, value, z, b, a = self.fix_query_key_value_ordering(
-                    mixed_qkvz, ba
-                )
-                query, key, value = map(
-                    lambda x: rearrange(x, "l p d -> l (p d)"), (query, key, value)
-                )
-                mixed_qkv = torch.cat((query, key, value), dim=-1)
-            else:
-                # Qwen3.5: weights are already in [q, k, v, z] and [b, a] order
-                qkv_size = (self.key_dim * 2 + self.value_dim) // self.tp_size
-                z_size = self.value_dim // self.tp_size
-                mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
-                z = z.reshape(z.size(0), -1, self.head_v_dim)
-                b, a = ba.chunk(2, dim=-1)
-                b = b.contiguous()
-                a = a.contiguous()
 
         # ============================================================
         # Part 2: Core Attention (Custom Op)
@@ -823,8 +794,6 @@ def forward_xpu(
         """
         num_tokens = hidden_states.size(0)
 
-        assert not self.has_lora_projections, "lora isn't supported on XPU."
-
         # ============================================================
         # Part 1: Input Projection
         # ============================================================
diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py
index b277c9ed6b74..47508e3a8d85 100644
--- a/vllm/model_executor/layers/mamba/linear_attn.py
+++ b/vllm/model_executor/layers/mamba/linear_attn.py
@@ -32,6 +32,7 @@
 from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 
 
 @CustomOp.register("minimax_text01_rmsnorm_tp")
@@ -246,8 +247,8 @@ def jit_linear_forward_prefix(
 
 class MiniMaxText01LinearAttention(nn.Module, MambaBase):
     @property
-    def mamba_type(self) -> str:
-        return "linear_attention"
+    def mamba_type(self) -> MambaAttentionBackendEnum:
+        return MambaAttentionBackendEnum.LINEAR
 
     def get_state_dtype(self) -> tuple[torch.dtype]:
         assert self.model_config is not None
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 0e476755201e..1d3159d1e7b3 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -42,6 +42,7 @@
 )
 from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 
 
 # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
@@ -361,6 +362,7 @@ def forward_impl(self, hidden_states: torch.Tensor, output: torch.Tensor):
                 initial_state_idx=block_idx_last_computed_token_p,
                 num_computed_tokens=num_computed_tokens_p,
                 block_size_to_align=mamba_block_size,
+                metadata=attn_metadata,
             )
             # 3. State Space Model sequence transformations.
             discrete_time_step_p, B_p, C_p = self._ssm_transform(
@@ -476,8 +478,8 @@ def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]:
         )
 
     @property
-    def mamba_type(self) -> str:
-        return "mamba1"
+    def mamba_type(self) -> MambaAttentionBackendEnum:
+        return MambaAttentionBackendEnum.MAMBA1
 
     def _time_proj_bias(self) -> torch.Tensor | None:
         if hasattr(self.dt_proj, "bias") and self.dt_proj.bias is not None:
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index 2b4b1934f9b3..05cb9aadfe79 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -52,6 +52,7 @@
 )
 from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 
 # Added by the IBM Team, 2024
 
@@ -935,8 +936,8 @@ def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]:
         )
 
     @property
-    def mamba_type(self) -> str:
-        return "mamba2"
+    def mamba_type(self) -> MambaAttentionBackendEnum:
+        return MambaAttentionBackendEnum.MAMBA2
 
 
 def mamba_mixer2(
diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
index 1160105ad101..d87a7638533e 100644
--- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
@@ -37,7 +37,7 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
     num_cache_lines: tl.constexpr,  # added to support vLLM larger cache lines
     # Strides
     stride_x_dim: tl.constexpr,  # stride to get to next feature-value,
-    stride_x_token: tl.constexpr,  # stride to get to next token (same feature-index, same sequence-index)
+    stride_x_token: tl.int64,  # stride to get to next token (same feature-index, same sequence-index)
     stride_w_dim: tl.constexpr,  # stride to get to next dim-axis value
     stride_w_width: tl.constexpr,  # stride to get to next width-axis value
     stride_istate_seq: tl.constexpr,
@@ -45,7 +45,7 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
     stride_istate_token: tl.constexpr,
     stride_cache_indices: tl.constexpr,
     stride_o_dim: tl.constexpr,
-    stride_o_token: tl.constexpr,
+    stride_o_token: tl.int64,
     stride_block_m: tl.constexpr,  # Stride block to align divided by BLOCK_M
     # others
     pad_slot_id: tl.constexpr,
@@ -769,7 +769,7 @@ def _causal_conv1d_update_kernel(
     # Strides
     stride_x_seq: tl.constexpr,
     stride_x_dim: tl.constexpr,
-    stride_x_token: tl.constexpr,
+    stride_x_token: tl.int64,
     stride_w_dim: tl.constexpr,
     stride_w_width: tl.constexpr,
     stride_conv_state_seq: tl.constexpr,
@@ -778,7 +778,7 @@ def _causal_conv1d_update_kernel(
     stride_state_indices: tl.constexpr,
     stride_o_seq: tl.constexpr,
     stride_o_dim: tl.constexpr,
-    stride_o_token: tl.constexpr,
+    stride_o_token: tl.int64,
     # others
     null_block_id: tl.constexpr,
     # Meta-parameters
diff --git a/vllm/model_executor/layers/mamba/ops/ssu_dispatch.py b/vllm/model_executor/layers/mamba/ops/ssu_dispatch.py
index 33a08feb9cfb..92258ef204bd 100644
--- a/vllm/model_executor/layers/mamba/ops/ssu_dispatch.py
+++ b/vllm/model_executor/layers/mamba/ops/ssu_dispatch.py
@@ -14,6 +14,7 @@
 
 from vllm.config.mamba import MambaBackendEnum, MambaConfig
 from vllm.logger import init_logger
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 from vllm.v1.attention.backends.utils import NULL_BLOCK_ID
 from vllm.v1.kv_cache_interface import KVCacheConfig, MambaSpec
 
@@ -200,7 +201,8 @@ def initialize_mamba_ssu_backend(
     """
     if not any(
         isinstance(g.kv_cache_spec, MambaSpec)
-        and g.kv_cache_spec.mamba_type in ("mamba1", "mamba2")
+        and g.kv_cache_spec.mamba_type
+        in (MambaAttentionBackendEnum.MAMBA1, MambaAttentionBackendEnum.MAMBA2)
         for g in kv_cache_config.kv_cache_groups
     ):
         return
diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py
index 629167acfe52..79976dfff14e 100644
--- a/vllm/model_executor/layers/mamba/short_conv.py
+++ b/vllm/model_executor/layers/mamba/short_conv.py
@@ -25,6 +25,7 @@
 )
 from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backend import AttentionMetadata
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionMetadata
 
 
@@ -223,8 +224,8 @@ def get_state_shape(self) -> tuple[tuple[int, ...]]:
         )
 
     @property
-    def mamba_type(self) -> str:
-        return "short_conv"
+    def mamba_type(self) -> MambaAttentionBackendEnum:
+        return MambaAttentionBackendEnum.SHORT_CONV
 
 
 def short_conv(
diff --git a/vllm/model_executor/layers/mhc.py b/vllm/model_executor/layers/mhc.py
index cbc5ec2962ec..b3ea3b5e98e6 100644
--- a/vllm/model_executor/layers/mhc.py
+++ b/vllm/model_executor/layers/mhc.py
@@ -441,6 +441,131 @@ def mhc_post_tilelang(
         T.pdl_trigger()
 
 
+@tilelang.jit(
+    pass_configs={
+        tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+        tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+        tilelang.PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL: 10,
+    },
+)
+def mhc_fused_tilelang(
+    comb_mix,
+    residual_in,
+    post_mix,
+    x_in,
+    weight_t,
+    yp_out,
+    rp_out,
+    residual_out,
+    hc: int,
+    hidden: int,
+    n_out: int,
+    n_thr: int = 256,
+    h_blk: int = 256,
+    tile_n: int = 1,
+    split_k: int = 1,
+) -> tilelang.JITKernel:
+    """Fused MHC post-mapping + pre-norm GEMM partial sums (per-thread FMA)."""
+    m = T.dynamic("num_tokens")
+    split_k = T.dynamic("split_k")  # shadows the split_k argument
+    h = hidden
+    h_blk = math.gcd(hidden, h_blk)  # NOTE(review): h_blk is unused below
+    h_per_split = h // split_k
+    n_tiles = n_out // tile_n
+
+    comb_mix: T.Tensor((m, hc, hc), T.float32)  # type: ignore[no-redef, valid-type]
+    residual_in: T.Tensor((m, hc, h), T.bfloat16)  # type: ignore[no-redef, valid-type]
+    post_mix: T.Tensor((m, hc), T.float32)  # type: ignore[no-redef, valid-type]
+    x_in: T.Tensor((m, h), T.bfloat16)  # type: ignore[no-redef, valid-type]
+    weight_t: T.Tensor((n_out, hc, h), T.float32)  # type: ignore[no-redef, valid-type]
+    yp_out: T.Tensor((split_k, m, n_out), T.float32)  # type: ignore[no-redef, valid-type]
+    rp_out: T.Tensor((split_k, m), T.float32)  # type: ignore[no-redef, valid-type]
+    residual_out: T.Tensor((m, hc, h), T.bfloat16)  # type: ignore[no-redef, valid-type]
+
+    h_iters = h_per_split // n_thr  # any remainder elements are not visited
+    num_warps = n_thr // 32
+
+    with T.Kernel(m, n_tiles, split_k, threads=n_thr) as (i_n, i_nt, i_ks):
+        tid = T.get_thread_binding()
+        warp_id = T.get_warp_idx()
+        lane = T.get_lane_idx()
+
+        s_warp = T.alloc_shared((num_warps, tile_n + 1), T.float32)
+        s_post = T.alloc_shared((hc,), T.float32)
+        s_comb = T.alloc_shared((hc, hc), T.float32)
+
+        pm = T.alloc_local((hc,), T.float32)
+        cm = T.alloc_local((hc, hc), T.float32)
+        acc = T.alloc_local((tile_n,), T.float32)
+        sqr = T.alloc_local((1,), T.float32)
+        new_r = T.alloc_local((hc,), T.float32)
+
+        T.clear(acc)
+        T.clear(sqr)
+        h_split_start = i_ks * h_per_split
+
+        T.pdl_sync()
+
+        T.copy(post_mix[i_n, 0], s_post)
+        T.copy(comb_mix[i_n, 0, 0], s_comb)
+
+        for j in T.unroll(hc):
+            pm[j] = s_post[j]
+        for j in T.unroll(hc):
+            for k in T.unroll(hc):
+                cm[k, j] = s_comb[k, j]
+
+        # Each thread owns h_iters elements of the k-split's h slice.
+        for it in T.serial(h_iters):
+            h_idx = h_split_start + it * n_thr + tid
+
+            # new_r[j] = pm[j] * x_in[h] + sum_k cm[k, j] * residual_in[k, h]
+            for j in T.unroll(hc):
+                new_r[j] = pm[j] * x_in[i_n, h_idx]
+                for k in T.unroll(hc):
+                    new_r[j] += cm[k, j] * residual_in[i_n, k, h_idx]
+
+            # Only the first n-tile stores residual_out and sums the squares
+            if i_nt == 0:
+                for j in T.unroll(hc):
+                    residual_out[i_n, j, h_idx] = new_r[j]
+                    sqr[0] += new_r[j] * new_r[j]
+
+            # Per-thread FMA into acc[n]
+            for n in T.unroll(tile_n):
+                for j in T.unroll(hc):
+                    acc[n] += weight_t[i_nt * tile_n + n, j, h_idx] * new_r[j]
+
+        for n in T.unroll(tile_n):
+            acc[n] = T.warp_reduce_sum(acc[n])
+        if i_nt == 0:
+            sqr[0] = T.warp_reduce_sum(sqr[0])
+
+        # Cross-warp reduce: lane 0 of every warp parks its sums in shared mem
+        if lane == 0:
+            for n in T.unroll(tile_n):
+                s_warp[warp_id, n] = acc[n]
+            if i_nt == 0:
+                s_warp[warp_id, tile_n] = sqr[0]
+        T.sync_threads()
+
+        # Warp 0 does the final cross-warp sum and writes outputs
+        if warp_id == 0:
+            if lane < tile_n:
+                v = T.alloc_var(T.float32, init=0.0)
+                for w in T.unroll(num_warps):
+                    v += s_warp[w, lane]
+                yp_out[i_ks, i_n, i_nt * tile_n + lane] = v
+
+            if i_nt == 0 and lane == 0:
+                v2 = T.alloc_var(T.float32, init=0.0)
+                for w in T.unroll(num_warps):
+                    v2 += s_warp[w, tile_n]
+                rp_out[i_ks, i_n] = v2
+
+        T.pdl_trigger()
+
+
 def mhc_post(
     x: torch.Tensor,
     residual: torch.Tensor,
@@ -468,6 +593,242 @@ def mhc_post(
     return out
 
 
+def mhc_fused_post_pre(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    post_layer_mix: torch.Tensor,
+    comb_res_mix: torch.Tensor,
+    fn: torch.Tensor,
+    hc_scale: torch.Tensor,
+    hc_base: torch.Tensor,
+    rms_eps: float,
+    hc_pre_eps: float,
+    hc_sinkhorn_eps: float,
+    hc_post_mult_value: float,
+    sinkhorn_repeat: int,
+    n_splits: int = 1,
+    tile_n: int = 1,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Run one MHC post block followed by the next MHC pre block.
+
+    Returns:
+        residual_cur: post-mapped residual, shape (..., hc_mult, hidden_size)
+        post_mix_cur: shape (..., hc_mult, 1)
+        comb_mix_cur: shape (..., hc_mult, hc_mult)
+        layer_input_cur: shape (..., hidden_size)
+    """
+
+    assert residual.dtype == torch.bfloat16
+    assert x.dtype == torch.bfloat16
+    assert post_layer_mix.dtype == torch.float32
+    assert comb_res_mix.dtype == torch.float32
+    assert fn.dtype == torch.float32
+    assert hc_scale.dtype == torch.float32
+    assert hc_base.dtype == torch.float32
+
+    hc_mult = residual.shape[-2]
+    hidden_size = residual.shape[-1]
+    hc_mult2 = hc_mult * hc_mult
+    hc_mult3 = hc_mult * 2 + hc_mult2
+    hc_hidden_size = hc_mult * hidden_size
+    outer_shape = residual.shape[:-2]
+
+    assert x.shape == (*outer_shape, hidden_size)
+    assert post_layer_mix.shape in (
+        (*outer_shape, hc_mult, 1),
+        (*outer_shape, hc_mult),
+    )
+    assert comb_res_mix.shape == (*outer_shape, hc_mult, hc_mult)
+    assert fn.shape == (hc_mult3, hc_hidden_size)
+    assert hc_scale.shape == (3,)
+    assert hc_base.shape == (hc_mult3,)
+
+    assert n_splits in (1, 2, 4, 8)
+    assert hidden_size % n_splits == 0
+
+    if current_platform.is_rocm():
+        # tilelang ships only CUDA codegen and the fused kernels here
+        # additionally use PDL (Hopper-only). Compose the existing torch
+        # fallbacks of ``mhc_post`` + ``mhc_pre`` instead — both already
+        # have a ROCm branch and produce the exact output shapes/dtypes
+        # the fused op contracts on.
+        if post_layer_mix.ndim == residual.ndim - 1:
+            post_layer_mix_3d = post_layer_mix.unsqueeze(-1)
+        else:
+            post_layer_mix_3d = post_layer_mix
+        residual_cur = mhc_post(x, residual, post_layer_mix_3d, comb_res_mix)
+        post_mix_cur, comb_mix_cur, layer_input_cur = mhc_pre(
+            residual_cur,
+            fn,
+            hc_scale,
+            hc_base,
+            rms_eps,
+            hc_pre_eps,
+            hc_sinkhorn_eps,
+            hc_post_mult_value,
+            sinkhorn_repeat,
+        )
+        return residual_cur, post_mix_cur, comb_mix_cur, layer_input_cur
+
+    residual_flat = residual.view(-1, hc_mult, hidden_size)
+    num_tokens = residual_flat.shape[0]
+    x_flat = x.view(num_tokens, hidden_size)
+    post_layer_mix_flat = post_layer_mix.view(num_tokens, hc_mult)
+    comb_res_mix_flat = comb_res_mix.view(num_tokens, hc_mult, hc_mult)
+    # NOTE: the caller's n_splits (and tile_n for small batches) is overridden below.
+    fma_token_threshold = 16
+    if num_tokens <= fma_token_threshold:
+        # TODO(gnovack): investigate autotuning these heuristics
+        tile_n = 2 if num_tokens < 8 else 3
+        n_splits = 8 if (num_tokens < 8 and hidden_size <= 4096) else 4
+    else:
+        # these numbers are from the deepgemm kernel impl
+        block_k = 64
+        block_m = 64
+        n_splits = compute_num_split(block_k, hc_hidden_size, cdiv(num_tokens, block_m))
+
+    gemm_out_mul = torch.empty(
+        n_splits,
+        num_tokens,
+        hc_mult3,
+        dtype=torch.float32,
+        device=residual.device,
+    )
+    gemm_out_sqrsum = torch.empty(
+        n_splits,
+        num_tokens,
+        dtype=torch.float32,
+        device=residual.device,
+    )
+    residual_cur = torch.empty_like(residual_flat)
+    post_mix_cur = torch.empty(
+        num_tokens,
+        hc_mult,
+        dtype=torch.float32,
+        device=residual.device,
+    )
+    comb_mix_cur = torch.empty(
+        num_tokens,
+        hc_mult2,
+        dtype=torch.float32,
+        device=residual.device,
+    )
+    layer_input_cur = torch.empty(
+        num_tokens,
+        hidden_size,
+        dtype=torch.bfloat16,
+        device=residual.device,
+    )
+
+    if num_tokens <= fma_token_threshold:
+        mhc_fused_tilelang(
+            comb_res_mix_flat,
+            residual_flat,
+            post_layer_mix_flat,
+            x_flat,
+            fn.view(hc_mult3, hc_mult, hidden_size),
+            gemm_out_mul,
+            gemm_out_sqrsum,
+            residual_cur,
+            hc_mult,
+            hidden_size,
+            hc_mult3,
+            tile_n=tile_n,
+            split_k=n_splits,  # the kernel's parameter is named split_k
+        )
+    else:
+        mhc_post_tilelang(
+            comb_res_mix_flat,
+            residual_flat,
+            post_layer_mix_flat,
+            x_flat,
+            residual_cur,
+            residual.shape[-2],
+            residual.shape[-1],
+        )
+
+        from vllm.utils.deep_gemm import tf32_hc_prenorm_gemm
+
+        tf32_hc_prenorm_gemm(
+            residual_cur.view(num_tokens, hc_mult * hidden_size),
+            fn,
+            gemm_out_mul,
+            gemm_out_sqrsum,
+            n_splits,
+        )
+
+    mhc_pre_big_fuse_tilelang(
+        gemm_out_mul,
+        gemm_out_sqrsum,
+        hc_scale,
+        hc_base,
+        residual_cur,
+        post_mix_cur,
+        comb_mix_cur,
+        layer_input_cur,
+        hidden_size,
+        rms_eps,
+        hc_pre_eps,
+        hc_sinkhorn_eps,
+        hc_post_mult_value,
+        sinkhorn_repeat,
+        n_splits,
+        hc_mult,
+    )
+
+    return (
+        residual_cur.view(*outer_shape, hc_mult, hidden_size),
+        post_mix_cur.view(*outer_shape, hc_mult, 1),
+        comb_mix_cur.view(*outer_shape, hc_mult, hc_mult),
+        layer_input_cur.view(*outer_shape, hidden_size),
+    )
+
+
+def _mhc_fused_post_pre_fake(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    post_layer_mix: torch.Tensor,
+    comb_res_mix: torch.Tensor,
+    fn: torch.Tensor,
+    hc_scale: torch.Tensor,
+    hc_base: torch.Tensor,
+    rms_eps: float,
+    hc_pre_eps: float,
+    hc_sinkhorn_eps: float,
+    hc_post_mult_value: float,
+    sinkhorn_repeat: int,
+    n_splits: int = 1,
+    tile_n: int = 1,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Fake impl of mhc_fused_post_pre: allocates outputs of matching shape/dtype."""
+    hc_mult = residual.shape[-2]
+    hidden_size = residual.shape[-1]
+    outer_shape = residual.shape[:-2]
+    residual_cur = torch.empty_like(residual)
+    post_mix_cur = torch.empty(
+        *outer_shape,
+        hc_mult,
+        1,
+        dtype=torch.float32,
+        device=residual.device,
+    )
+    comb_mix_cur = torch.empty(
+        *outer_shape,
+        hc_mult,
+        hc_mult,
+        dtype=torch.float32,
+        device=residual.device,
+    )
+    layer_input_cur = torch.empty(
+        *outer_shape,
+        hidden_size,
+        dtype=torch.bfloat16,
+        device=residual.device,
+    )
+    return residual_cur, post_mix_cur, comb_mix_cur, layer_input_cur
+
+
 def _mhc_post_fake(
     x: torch.Tensor,
     residual: torch.Tensor,
@@ -489,6 +850,12 @@ def _mhc_post_fake(
     mutates_args=[],
     fake_impl=_mhc_post_fake,
 )
+direct_register_custom_op(
+    op_name="mhc_fused_post_pre",
+    op_func=mhc_fused_post_pre,
+    mutates_args=[],
+    fake_impl=_mhc_fused_post_pre_fake,
+)
 
 
 @tilelang.jit(
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index cfad1f86faa2..8f455220fe3b 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -20,7 +20,7 @@
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+from vllm.model_executor.layers.fused_moe.experts.marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE,
     FusedMoEMethodBase,
@@ -764,7 +764,7 @@ def select_gemm_impl(
             )
 
         from vllm.model_executor.layers.fused_moe import modular_kernel as mk
-        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.marlin_moe import (
             BatchedMarlinExperts,
             MarlinExperts,
         )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py
index 629e1c5ef1be..2ac2e28f20b5 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py
@@ -17,7 +17,7 @@
 from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
     CutlassExpertsMxfp4,
 )
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+from vllm.model_executor.layers.fused_moe.experts.marlin_moe import (
     MarlinExperts,
 )
 from vllm.model_executor.layers.fused_moe.oracle.mxfp4 import (
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py
index 8f86e687b7f6..10a7302dc1d0 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py
@@ -20,7 +20,7 @@
     FusedMoEQuantConfig,
     int4_w4a16_moe_quant_config,
 )
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+from vllm.model_executor.layers.fused_moe.experts.marlin_moe import (
     BatchedMarlinExperts,
     MarlinExperts,
     fused_marlin_moe,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 1c9237d3f60a..af4419ccbe98 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -16,7 +16,10 @@
 from vllm.model_executor.kernels.linear import (
     init_fp8_linear_kernel,
 )
-from vllm.model_executor.kernels.linear.scaled_mm import MarlinFP8ScaledMMLinearKernel
+from vllm.model_executor.kernels.linear.scaled_mm import (
+    CutlassFP8ScaledMMLinearKernel,
+    MarlinFP8ScaledMMLinearKernel,
+)
 from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import (
     FusedMoE,
@@ -28,6 +31,7 @@
 )
 from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
 from vllm.model_executor.layers.fused_moe.oracle.fp8 import (
+    Fp8MoeBackend,
     convert_to_fp8_moe_kernel_format,
     make_fp8_moe_kernel,
     make_fp8_moe_quant_config,
@@ -436,8 +440,8 @@ def apply(
         x: torch.Tensor,
         bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        # if batch invariant mode is enabled, prefer DeepGEMM FP8 path
-        # we will use BF16 dequant when DeepGEMM is not supported.
+        # if batch invariant mode is enabled, prefer direct FP8 path
+        # we will use BF16 dequant when direct FP8 is not supported.
         if envs.VLLM_BATCH_INVARIANT:
             if self.block_quant:
                 assert self.weight_block_size is not None
@@ -447,6 +451,9 @@ def apply(
                     bias,
                 )
             else:
+                if isinstance(self.fp8_linear, CutlassFP8ScaledMMLinearKernel):
+                    return self.fp8_linear.apply_weights(layer, x, bias)
+
                 # per-tensor/channel: dequant to BF16 and run GEMM
                 weight_fp8 = layer.weight.to(torch.bfloat16)
                 weight_scale = layer.weight_scale.to(torch.bfloat16)
@@ -766,6 +773,11 @@ def _setup_kernel(
         replace_parameter(layer, f"w13_{self.weight_scale_name}", w13_scale)
         replace_parameter(layer, f"w2_{self.weight_scale_name}", w2_scale)
 
+        # AITER backend requires weights to be marked as shuffled.
+        if self.fp8_backend == Fp8MoeBackend.AITER:
+            layer.w13_weight.is_shuffled = True
+            layer.w2_weight.is_shuffled = True
+
         self.moe_quant_config = self.get_fused_moe_quant_config(layer)
         if self.moe_quant_config:
             assert self.experts_cls is not None
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 0862efbea294..6566d671532a 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -11,6 +11,8 @@
 from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.kernels.linear import (
+    MarlinNvFp4LinearKernel,
+    NvFp4LinearLayerConfig,
     init_fp8_linear_kernel,
     init_mxfp8_linear_kernel,
     init_nvfp4_linear_kernel,
@@ -89,6 +91,7 @@
 from vllm.model_executor.parameter import (
     BlockQuantScaleParameter,
     ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
     ModelWeightParameter,
     PerTensorScaleParameter,
 )
@@ -107,8 +110,10 @@
     "FP8_PER_CHANNEL_PER_TOKEN",
     # FP8 per-block weight-only (ModelOpt may emit this as lowercase).
     "FP8_PB_WO",
-    # FP4
+    # NVFP4 W4A4 (4-bit float weights AND 4-bit float activations).
     "NVFP4",
+    # W4A16 NVFP4 (4-bit float weights, fp16/bf16 activations).
+    "W4A16_NVFP4",
     # MXFP8
     "MXFP8",
     # MIXED_PRECISION,
@@ -245,7 +250,7 @@ def _extract_modelopt_quant_algo(
         """
         if hf_quant_cfg is None:
             return None
-        if hf_quant_cfg.get("quant_method", "").lower() != "modelopt":
+        if not hf_quant_cfg.get("quant_method", "").lower().startswith("modelopt"):
             return None
         if "quantization" in hf_quant_cfg:
             quant_config = hf_quant_cfg["quantization"]
@@ -1003,22 +1008,41 @@ class ModelOptNvFp4Config(ModelOptQuantConfigBase):
 
     def __init__(
         self,
-        is_checkpoint_nvfp4_serialized: bool,
-        kv_cache_quant_algo: str | None,
-        exclude_modules: list[str],
+        quant_method: str = "NVFP4",
+        is_checkpoint_nvfp4_serialized: bool = False,
+        kv_cache_quant_algo: str | None = None,
+        exclude_modules: list[str] | None = None,
         group_size: int = 16,
     ) -> None:
+        if exclude_modules is None:
+            exclude_modules = []
         super().__init__(exclude_modules)
+        self.quant_method = quant_method
         self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
         if is_checkpoint_nvfp4_serialized:
             logger.warning(
-                "Detected ModelOpt NVFP4 checkpoint. Please note that"
-                " the format is experimental and could change in future."
+                "Detected ModelOpt NVFP4 checkpoint (quant_algo=%s). Please "
+                "note that the format is experimental and could change in "
+                "future.",
+                quant_method,
             )
-
-            self.group_size = group_size
-            self.kv_cache_quant_algo = kv_cache_quant_algo
+        # Set unconditionally so the attributes exist for non-serialized configs.
+        self.group_size = group_size
+        self.kv_cache_quant_algo = kv_cache_quant_algo
 
+        # Select LinearMethod implementation based on quant_algo (FP8 pattern).
+        # NVFP4         -> W4A4: cutlass NVFP4 GEMM with input quantization
+        # W4A16_NVFP4   -> W4A16: FP4 Marlin GEMM with bf16/fp16 activations
+        if quant_method == "NVFP4":
+            self.LinearMethodCls = ModelOptNvFp4LinearMethod
+        elif quant_method == "W4A16_NVFP4":
+            self.LinearMethodCls = ModelOptNvFp4W4A16LinearMethod
+        else:
+            raise ValueError(
+                f"Unsupported ModelOpt NVFP4 quant_algo: {quant_method}. "
+                "Supported: NVFP4 / W4A16_NVFP4."
+            )
+
     def get_name(self) -> QuantizationMethods:
         return "modelopt_fp4"
 
@@ -1069,6 +1093,7 @@ def _from_config(
                 )
 
         return cls(
+            quant_method,
             is_checkpoint_nvfp4_serialized,
             kv_cache_quant_method,
             exclude_modules,
@@ -1208,6 +1233,152 @@ def apply(
         return self.kernel.apply_weights(layer=layer, x=x, bias=bias)
 
 
+class ModelOptNvFp4W4A16LinearMethod(LinearMethodBase):
+    """Linear method for ModelOpt NVFP4 W4A16.
+
+    4-bit NVFP4 weights, fp16/bf16 activations. Loads ModelOpt-style names
+    directly (no on-disk conversion) and dispatches to the FP4 Marlin GEMM:
+
+        weight          uint8     packed NVFP4 (2 nibbles/byte along input dim)
+        weight_scale    fp8-e4m3  per 16-elem group along input dim
+        weight_scale_2  fp32      per-tensor global scale = amax / (6.0 * 448.0)
+
+    No activation quantization. Marlin expects the global scale in the same
+    form ModelOpt stores (amax/2688), so we rename weight_scale_2 ->
+    weight_global_scale **without reciprocation** -- the CT W4A16 path
+    reciprocates only because CT stores the inverse on disk.
+
+    We also register a placeholder input_scale parameter so that W4A4-shaped
+    checkpoints (which contain *_proj.input_scale tensors) can be loaded
+    under this method without the per-shard loader hitting a KeyError on
+    the merged-name lookup. The placeholder is discarded in
+    process_weights_after_loading -- its value is never used.
+    """
+
+    def __init__(self, quant_config: ModelOptNvFp4Config) -> None:
+        self.quant_config = quant_config
+        # Unused placeholder: nothing in this class reads it. Kept so instances
+        # expose the same attribute as ModelOptNvFp4LinearMethod, where (per the
+        # original note) the config's get_quant_method fills it only when
+        # backend == "marlin"; here the kernel is pinned below instead.
+        self.marlin_input_dtype = None
+        # Direct-instantiate the Marlin NVFP4 adapter rather than going through
+        # init_nvfp4_linear_kernel(): the latter's priority list returns a
+        # cutlass W4A4 kernel as first-pick on this hardware, which would
+        # silently try to quantize activations (we have no input_scale). For
+        # W4A16 there is exactly one valid kernel, so we pin it.
+        self.kernel = MarlinNvFp4LinearKernel(NvFp4LinearLayerConfig())
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del input_size, output_size  # not needed by this method
+        if not self.quant_config.is_checkpoint_nvfp4_serialized:
+            raise ValueError(
+                "W4A16_NVFP4 quantization was selected; "
+                "dynamic quantization is not supported."
+            )
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+
+        if input_size_per_partition % 16 != 0:
+            raise ValueError(
+                "Unsupported model: input feature size is not a multiple of 16."
+            )
+
+        # Packed NVFP4 weights: uint8, 2 nibbles per byte along the input dim.
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // 2,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # Per-tensor global weight scale (fp32). ModelOpt stores
+        # amax / (NVFP4_max * fp8_e4m3_max) = amax / 2688. PerTensorScaleParameter
+        # holds one entry per fused output partition (e.g. q/k/v in a fused QKV).
+        weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale_2", weight_scale_2)
+
+        # Per-group fp8 weight scale.
+        weight_scale = GroupQuantScaleParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // self.quant_config.group_size,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # Placeholder input_scale param so W4A4-shaped checkpoints can be
+        # loaded under this method without KeyError on the merged-name
+        # lookup (qwen2-style stacked-loader path renames *_proj.input_scale
+        # to e.g. qkv_proj.input_scale and looks it up unconditionally).
+        # Discarded in process_weights_after_loading; never read by the kernel.
+        # For native W4A16 checkpoints (no input_scale on disk) the param
+        # stays uninitialized and is simply deleted.
+        input_scale = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("input_scale", input_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # Discard the input_scale placeholder. Whether it carries values
+        # (W4A4 ckpt loaded as W4A16) or is uninitialized (native W4A16
+        # ckpt), W4A16 mode does not quantize activations, so this is unused.
+        if hasattr(layer, "input_scale"):
+            del layer.input_scale
+
+        if torch.unique(layer.weight_scale_2).numel() != 1:
+            logger.warning_once(
+                "In W4A16_NVFP4 linear, the global weight scale "
+                "(weight_scale_2) differs across fused parallel layers "
+                "(e.g. q/k/v_proj). This will likely reduce accuracy. "
+                "Consider a checkpoint with a shared global scale."
+            )
+
+        # Collapse the per-partition scales to one scalar (their max), stored as
+        # weight_global_scale. NO reciprocation: ModelOpt already stores
+        # amax/2688, which Marlin consumes via nvfp4_marlin_process_global_scale
+        # (called inside the Marlin adapter's process_weights_after_loading).
+        layer.weight_global_scale = Parameter(
+            layer.weight_scale_2.max().to(torch.float32), requires_grad=False
+        )
+        del layer.weight_scale_2
+
+        self.kernel.process_weights_after_loading(layer)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return self.kernel.apply_weights(layer=layer, x=x, bias=bias)
+
+
 class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
     """
     MoE Method for FP4 Quantization.
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index d6fef0b3d3d5..d3f3033be7b3 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -350,6 +350,11 @@ def _setup_kernel(
             self.w13_precision_config = w13_scale
             self.w2_precision_config = w2_scale
 
+        # AITER backend requires weights to be marked as shuffled.
+        if self.mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_BF16:
+            layer.w13_weight.is_shuffled = True
+            layer.w2_weight.is_shuffled = True
+
         if w13_bias is not None and w2_bias is not None:
             replace_parameter(layer, "w13_bias", w13_bias)
             replace_parameter(layer, "w2_bias", w2_bias)
@@ -678,6 +683,11 @@ def _setup_kernel(
             self.w13_precision_config = w13_scale
             self.w2_precision_config = w2_scale
 
+        # AITER backend requires weights to be marked as shuffled.
+        if self.mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_BF16:
+            layer.w13_weight.is_shuffled = True
+            layer.w2_weight.is_shuffled = True
+
         if w13_bias is not None and w2_bias is not None:
             replace_parameter(layer, "w13_bias", w13_bias)
             replace_parameter(layer, "w2_bias", w2_bias)
diff --git a/vllm/model_executor/layers/quantization/online/fp8.py b/vllm/model_executor/layers/quantization/online/fp8.py
index 9cb697289d7e..cad65c4c9fe4 100644
--- a/vllm/model_executor/layers/quantization/online/fp8.py
+++ b/vllm/model_executor/layers/quantization/online/fp8.py
@@ -18,6 +18,9 @@
 from vllm import _custom_ops as ops
 from vllm.config import get_current_vllm_config
 from vllm.model_executor.kernels.linear import init_fp8_linear_kernel
+from vllm.model_executor.kernels.linear.scaled_mm import (
+    CutlassFP8ScaledMMLinearKernel,
+)
 from vllm.model_executor.layers.fused_moe.oracle.fp8 import (
     select_fp8_moe_backend,
 )
@@ -161,6 +164,9 @@ def apply(
     ) -> torch.Tensor:
         # if batch invariant mode is enabled, use BF16 dequant
         if envs.VLLM_BATCH_INVARIANT:
+            if isinstance(self.fp8_linear, CutlassFP8ScaledMMLinearKernel):
+                return self.fp8_linear.apply_weights(layer, x, bias)
+
             weight_fp8 = layer.weight.to(torch.bfloat16)
             weight_scale = layer.weight_scale.to(torch.bfloat16)
             if weight_scale.numel() == 1:
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index a14bfbc9c19b..3889e376b560 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -26,7 +26,7 @@
     mxfp4_w4a16_moe_quant_config,
     ocp_mx_moe_quant_config,
 )
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+from vllm.model_executor.layers.fused_moe.experts.marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.oracle.mxfp4 import (
     TRITON_BACKENDS,
     Mxfp4MoeBackend,
@@ -444,7 +444,7 @@ def apply(
         shared_experts_input: torch.Tensor | None,
     ) -> torch.Tensor:
         if self.rocm_aiter_moe_enabled:
-            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+            from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import (
                 rocm_aiter_fused_experts,
             )
 
@@ -909,7 +909,7 @@ def apply(
         topk_ids: torch.Tensor,
         shared_experts_input: torch.Tensor | None,
     ) -> torch.Tensor:
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import (
             rocm_aiter_fused_experts,
         )
 
@@ -1436,7 +1436,7 @@ def apply(
 
         # AITER path
         # TODO: Refactor this to use modular MOE kernel as well.
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import (
             rocm_aiter_fused_experts,
         )
 
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index d9aab35c25f4..7a54620c255c 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -171,7 +171,7 @@ def _silu_mul_quant_fp8_packed_kernel(
 
     pid_pack = tl.program_id(0)
     pid_m = tl.program_id(1)
-    m_offset = pid_m * BLOCK_M
+    m_offset = pid_m.to(tl.int64) * BLOCK_M
 
     if m_offset >= M:
         return
@@ -321,8 +321,8 @@ def _silu_mul_per_token_group_quant_fp8_colmajor(
     pid_n = tl.program_id(1)
     N_2 = N // 2
 
-    m_offset = pid_m * BLOCK_M
-    n_offset = pid_n * BLOCK_N
+    m_offset = pid_m.to(tl.int64) * BLOCK_M
+    n_offset = pid_n.to(tl.int64) * BLOCK_N
     if m_offset >= M:
         return
 
diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
index 9a06eedd0f7d..7362abcc8fbc 100644
--- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
@@ -127,29 +127,52 @@ def forward_native(
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """PyTorch-native implementation equivalent to forward()."""
         assert key is not None
-        cos_sin_cache = self._match_cos_sin_cache_dtype(query)
-        query_rot = query[..., : self.rotary_dim]
-        key_rot = key[..., : self.rotary_dim]
-        if self.rotary_dim < self.head_size:
-            query_pass = query[..., self.rotary_dim :]
-            key_pass = key[..., self.rotary_dim :]
+        return self.forward_static(
+            positions,
+            query,
+            key,
+            self.head_size,
+            self.rotary_dim,
+            self.cos_sin_cache,
+            self.is_neox_style,
+            offsets,
+        )
+
+    @staticmethod
+    def forward_static(
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None,
+        head_size: int,
+        rotary_dim: int,
+        cos_sin_cache: torch.Tensor,
+        is_neox_style: bool,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """A static implementation of forward()."""
+        assert key is not None
+        query_rot = query[..., :rotary_dim]
+        key_rot = key[..., :rotary_dim]
+        if rotary_dim < head_size:
+            query_pass = query[..., rotary_dim:]
+            key_pass = key[..., rotary_dim:]
 
         cos_sin = cos_sin_cache[
             torch.add(positions, offsets) if offsets is not None else positions
         ]
         cos, sin = cos_sin.chunk(2, dim=-1)
-        if self.is_neox_style:
+        if is_neox_style:
             cos = torch.cat((cos, cos), dim=-1).unsqueeze(-2)
             sin = torch.cat((sin, sin), dim=-1).unsqueeze(-2)
         else:
             cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
             sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
 
-        rotate_fn = rotate_neox if self.is_neox_style else rotate_gptj
+        rotate_fn = rotate_neox if is_neox_style else rotate_gptj
         query_rot = query_rot * cos + rotate_fn(query_rot) * sin
         key_rot = key_rot * cos + rotate_fn(key_rot) * sin
 
-        if self.rotary_dim < self.head_size:
+        if rotary_dim < head_size:
             query = torch.cat((query_rot, query_pass), dim=-1)
             key = torch.cat((key_rot, key_pass), dim=-1)
         else:
diff --git a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
index ec03fc6533f9..59ad6359aa1c 100644
--- a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
@@ -195,10 +195,8 @@ def forward_cuda(
     def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass):
         cos, sin = cos_sin.chunk(2, dim=-1)
         if self.is_neox_style:
-            # NOTE(woosuk): Here we assume that the positions tensor has the
-            # shape [batch_size, seq_len].
-            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
-            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+            cos = torch.cat((cos, cos), dim=-1).unsqueeze(-2)
+            sin = torch.cat((sin, sin), dim=-1).unsqueeze(-2)
         else:
             cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
             sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index a76092028671..5b5632e319c9 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -256,6 +256,12 @@ def _get_weights_iterator(
                         self.load_config.use_tqdm_on_load,
                         self.load_config.safetensors_load_strategy,
                         local_expert_ids=self.local_expert_ids,
+                        safetensors_prefetch_num_threads=(
+                            self.load_config.safetensors_prefetch_num_threads
+                        ),
+                        safetensors_prefetch_block_size=(
+                            self.load_config.safetensors_prefetch_block_size
+                        ),
                     )
         else:
             if extra_config.get("enable_multithread_load"):
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 31b00df4e4c3..c418c29a8c72 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -30,7 +30,11 @@
 
 from vllm import envs
 from vllm.config import ModelConfig
-from vllm.config.load import LoadConfig
+from vllm.config.load import (
+    DEFAULT_SAFETENSORS_PREFETCH_BLOCK_SIZE,
+    DEFAULT_SAFETENSORS_PREFETCH_NUM_THREADS,
+    LoadConfig,
+)
 from vllm.distributed import get_tensor_model_parallel_rank, get_world_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (
@@ -810,40 +814,57 @@ def _get_fs_type(files: list[str]) -> str:
         return ""
 
 
-def _prefetch_checkpoint(file_path: str) -> None:
+def _prefetch_checkpoint(
+    file_path: str,
+    block_size: int = DEFAULT_SAFETENSORS_PREFETCH_BLOCK_SIZE,
+) -> None:
     """Prefetch a checkpoint file into the OS page cache.
 
-    Reads the file in 16MB blocks so the kernel caches its pages before
-    workers load the same file.
+    Reads the file in blocks so the kernel caches its pages before workers load
+    the same file.
     """
-    block_size = 16 * 1024 * 1024  # 16MB
+    if block_size < 1:
+        raise ValueError("safetensors prefetch block size must be >= 1")
+
     with open(file_path, "rb") as f:
         while f.read(block_size):
             pass
 
 
-def _prefetch_all_checkpoints(sorted_files: list[str]) -> None:
+def _prefetch_all_checkpoints(
+    sorted_files: list[str],
+    num_prefetch_threads: int = DEFAULT_SAFETENSORS_PREFETCH_NUM_THREADS,
+    block_size: int = DEFAULT_SAFETENSORS_PREFETCH_BLOCK_SIZE,
+) -> None:
     """Start prefetching checkpoint files into page cache in a background thread."""
+    if num_prefetch_threads < 1:
+        raise ValueError("safetensors prefetch num threads must be >= 1")
+    if block_size < 1:
+        raise ValueError("safetensors prefetch block size must be >= 1")
+
     if torch.distributed.is_initialized():
         rank = torch.distributed.get_rank()
         world_size = torch.distributed.get_world_size()
     else:
         rank = 0
         world_size = 1
-    num_prefetch_threads = 8
     paths_to_prefetch = sorted_files[rank::world_size]
     total_for_rank = len(paths_to_prefetch)
 
     async def _prefetch_all() -> None:
-        semaphore = asyncio.Semaphore(num_prefetch_threads)
+        loop = asyncio.get_running_loop()
         completed = 0
         next_log_pct = 10
 
-        async def prefetch_one(path: str) -> None:
+        async def prefetch_one(
+            path: str,
+            executor: concurrent.futures.ThreadPoolExecutor,
+        ) -> None:
             nonlocal completed, next_log_pct
             try:
-                async with semaphore:
-                    await asyncio.to_thread(_prefetch_checkpoint, path)
+                await loop.run_in_executor(
+                    executor, _prefetch_checkpoint, path, block_size
+                )
                 completed += 1
                 if total_for_rank > 0 and next_log_pct <= 100:
                     pct = 100 * completed / total_for_rank
@@ -860,7 +881,12 @@ async def prefetch_one(path: str) -> None:
                     "Failed to prefetch checkpoint file %r.", path, exc_info=True
                 )
 
-        await asyncio.gather(*(prefetch_one(p) for p in paths_to_prefetch))
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=num_prefetch_threads
+        ) as executor:
+            await asyncio.gather(
+                *(prefetch_one(p, executor) for p in paths_to_prefetch)
+            )
 
     def _run_prefetch() -> None:
         start = time.perf_counter()
@@ -871,7 +897,12 @@ def _run_prefetch() -> None:
             elapsed,
         )
 
-    logger.info("Prefetching checkpoint files into page cache started (in background)")
+    logger.info(
+        "Prefetching checkpoint files into page cache started "
+        "(in background, num_threads=%d, block_size=%d bytes)",
+        num_prefetch_threads,
+        block_size,
+    )
     threading.Thread(target=_run_prefetch, daemon=True).start()
 
 
@@ -880,6 +911,9 @@ def safetensors_weights_iterator(
     use_tqdm_on_load: bool,
     safetensors_load_strategy: str | None = None,
     local_expert_ids: set[int] | None = None,
+    *,
+    safetensors_prefetch_num_threads: int = DEFAULT_SAFETENSORS_PREFETCH_NUM_THREADS,
+    safetensors_prefetch_block_size: int = DEFAULT_SAFETENSORS_PREFETCH_BLOCK_SIZE,
 ) -> Generator[tuple[str, torch.Tensor], None, None]:
     """Iterate over the weights in the model safetensor files.
 
@@ -951,7 +985,11 @@ def safetensors_weights_iterator(
         )
 
     if should_prefetch:
-        _prefetch_all_checkpoints(sorted_files)
+        _prefetch_all_checkpoints(
+            sorted_files,
+            num_prefetch_threads=safetensors_prefetch_num_threads,
+            block_size=safetensors_prefetch_block_size,
+        )
 
     leftover_state_dict: dict[str, torch.Tensor] = {}
     for st_file in tqdm(
diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py
index bc4f85bf7ddb..eb8c3e3f65e1 100644
--- a/vllm/model_executor/models/arcee.py
+++ b/vllm/model_executor/models/arcee.py
@@ -45,6 +45,7 @@
     is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
+    maybe_prefix,
 )
 
 
@@ -367,7 +368,10 @@ def __init__(self, *, vllm_config, prefix: str = "") -> None:
         self.config = config
 
         # Initialize the inner Transformer model (ArceeModel)
-        self.model = ArceeModel(vllm_config=vllm_config, prefix=f"{prefix}.model")
+        self.model = ArceeModel(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+        )
         # On the last pipeline stage, set up the LM head and logits processor
         if get_pp_group().is_last_rank:
             # Determine vocabulary size (including any LoRA extra tokens
@@ -378,7 +382,7 @@ def __init__(self, *, vllm_config, prefix: str = "") -> None:
                 config.hidden_size,
                 quant_config=vllm_config.quant_config,
                 bias=getattr(config, "lm_head_bias", False),
-                prefix=f"{prefix}.lm_head",
+                prefix=maybe_prefix(prefix, "lm_head"),
             )
             if config.tie_word_embeddings:
                 # Tie output weights with input embedding matrix
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 48c8d9a441e5..55bc64cd94a9 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -540,7 +540,7 @@ def __init__(
             self.vision_tower = AriaVisionTransformer(
                 config.vision_config,
                 quant_config=quant_config,
-                prefix=f"{prefix}.vision_tower",
+                prefix=maybe_prefix(prefix, "vision_tower"),
             )
             self.multi_modal_projector = AriaProjector(
                 config, prefix=maybe_prefix(prefix, "multi_modal_projector")
diff --git a/vllm/model_executor/models/bailing_moe_linear.py b/vllm/model_executor/models/bailing_moe_linear.py
index bcf1200bd1b4..a1fd1646a222 100644
--- a/vllm/model_executor/models/bailing_moe_linear.py
+++ b/vllm/model_executor/models/bailing_moe_linear.py
@@ -64,6 +64,7 @@
 from vllm.sequence import IntermediateTensors
 from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 
 from .interfaces import HasInnerState, IsHybrid, SupportsPP
 from .utils import (
@@ -444,8 +445,8 @@ class BailingMoELinearAttention(PluggableLayer, MambaBase):
     # --8<-- [end:bailing_moe_linear_attention]
 
     @property
-    def mamba_type(self) -> str:
-        return "linear_attention"
+    def mamba_type(self) -> MambaAttentionBackendEnum:
+        return MambaAttentionBackendEnum.LINEAR
 
     def get_state_shape(self) -> tuple[tuple[int, ...], ...]:
         """Return state shape for linear attention cache.
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 8b5fd452e8ff..49d2a5a89f5f 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -561,7 +561,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 config.qformer_config,
                 cache_config=cache_config,
                 quant_config=quant_config,
-                prefix=f"{prefix}.qformer",
+                prefix=maybe_prefix(prefix, "qformer"),
             )
             self.language_projection = nn.Linear(
                 config.qformer_config.hidden_size,
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 05a494683a85..779bbb23e2b4 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -777,7 +777,7 @@ def __init__(
             quant_config=quant_config,
             num_hidden_layers_override=num_hidden_layers_override,
             require_post_norm=require_post_norm,
-            prefix=f"{prefix}.vision_model",
+            prefix=maybe_prefix(prefix, "vision_model"),
         )
 
     def forward(
diff --git a/vllm/model_executor/models/cohere_moe.py b/vllm/model_executor/models/cohere2_moe.py
similarity index 83%
rename from vllm/model_executor/models/cohere_moe.py
rename to vllm/model_executor/models/cohere2_moe.py
index a059d68c9d02..aa8adff188f7 100644
--- a/vllm/model_executor/models/cohere_moe.py
+++ b/vllm/model_executor/models/cohere2_moe.py
@@ -30,7 +30,9 @@
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader,
     maybe_remap_kv_scale_name,
+    row_parallel_weight_loader,
 )
+from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
@@ -53,7 +55,7 @@ def token_choice_with_bias(
     topk: int,
     renormalize: bool,
 ):
-    """Sigmoid -> top-k (-> renormalize) custom routing for CohereMoe."""
+    """Sigmoid -> top-k (-> renormalize) custom routing for Cohere2Moe."""
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
     scores = gating_output.float().sigmoid()
@@ -65,7 +67,38 @@ def token_choice_with_bias(
     return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
 
 
-class CohereMoeMLP(nn.Module):
+@torch.compile(backend=current_platform.simple_compile_backend)
+def rms_norm_func(hidden_states, weight, variance_epsilon):
+    # RMS-normalize over the last dim in fp32, scale by weight, cast back.
+    orig_dtype = hidden_states.dtype
+    x = hidden_states.to(torch.float32)
+    inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + variance_epsilon)
+    x = weight.to(torch.float32) * (x * inv_rms)
+    return x.to(orig_dtype)
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, param_shape=None, eps=1e-6):
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(param_shape))
+        set_weight_attrs(self.weight, {"weight_loader": row_parallel_weight_loader})
+
+    def forward(self, hidden_states, residuals=None):
+        normed = rms_norm_func(hidden_states, self.weight, self.variance_epsilon)
+        return normed, residuals
+
+
+def select_norm_impl(config: CohereConfig) -> tuple[type[nn.Module], float]:
+    """Pick the norm layer for ``config``: ``(RMSNorm, rms_norm_eps)`` when that
+    attribute exists and is not None, else ``(LayerNorm, layer_norm_eps)``."""
+    eps = getattr(config, "rms_norm_eps", None)
+    if eps is None:
+        return LayerNorm, config.layer_norm_eps
+    return RMSNorm, eps
+
+
+class Cohere2MoeMLP(nn.Module):
     """Cohere MLP used as shared experts in the MoE block."""
 
     def __init__(
@@ -73,6 +106,7 @@ def __init__(
         config: CohereConfig,
         intermediate_size: int | None = None,
         quant_config: QuantizationConfig | None = None,
+        reduce_results: bool = False,
         prefix: str = "",
     ):
         super().__init__()
@@ -95,7 +129,7 @@ def __init__(
             self.hidden_size,
             bias=False,
             quant_config=quant_config,
-            reduce_results=False,
+            reduce_results=reduce_results,
             prefix=f"{prefix}.down_proj",
         )
         self.act_fn = SiluAndMul()
@@ -107,7 +141,7 @@ def forward(self, x):
         return x
 
 
-class CohereMoeAttention(nn.Module):
+class Cohere2MoeAttention(nn.Module):
     """Cohere MoE attention with sliding-window interleave."""
 
     def __init__(
@@ -170,6 +204,19 @@ def __init__(
         ):
             self.sliding_window = config.sliding_window
 
+        # Prefix-dense layers (layer_idx < first_k_dense_replace) have full
+        # attention (no sliding window). When prefix_dense_sliding_window_pattern
+        # == 1, they keep RoPE even though they are not sliding-window layers.
+        first_k_dense_replace = getattr(config, "first_k_dense_replace", 0)
+        prefix_dense_sliding_window_pattern = getattr(
+            config, "prefix_dense_sliding_window_pattern", 1
+        )
+        self.force_rope = bool(
+            first_k_dense_replace
+            and prefix_dense_sliding_window_pattern == 1
+            and self.layer_idx < first_k_dense_replace
+        )
+
         self.attn = Attention(
             self.num_heads,
             self.head_dim,
@@ -188,15 +235,15 @@ def forward(
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        if self.sliding_window:
+        if self.sliding_window or self.force_rope:
             q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
         return output
 
 
-class CohereMoe(nn.Module):
-    """Tensor-parallel MoE block for CohereMoe with shared experts."""
+class Cohere2Moe(nn.Module):
+    """Tensor-parallel MoE block for Cohere2Moe with shared experts."""
 
     def __init__(
         self,
@@ -234,7 +281,7 @@ def __init__(
         )
 
         if hasattr(config, "num_shared_experts") and config.num_shared_experts > 0:
-            self.shared_experts = CohereMoeMLP(
+            self.shared_experts = Cohere2MoeMLP(
                 config=config,
                 intermediate_size=config.intermediate_size * config.num_shared_experts,
                 quant_config=quant_config,
@@ -276,7 +323,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return final_hidden_states.view(orig_shape)
 
 
-class CohereMoeDecoderLayer(nn.Module):
+class Cohere2MoeDecoderLayer(nn.Module):
     def __init__(
         self,
         config: CohereConfig,
@@ -287,19 +334,34 @@ def __init__(
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
+        self.layer_idx = extract_layer_index(prefix)
 
-        self.self_attn = CohereMoeAttention(
+        self.self_attn = Cohere2MoeAttention(
             config,
             cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
         )
-        self.mlp = CohereMoe(
-            config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
-        )
-        self.input_layernorm = LayerNorm(
-            param_shape=(config.hidden_size,), eps=config.layer_norm_eps
-        )
+
+        # Layers before first_k_dense_replace use a dense MLP instead of MoE.
+        first_k_dense_replace = getattr(config, "first_k_dense_replace", 0)
+        if self.layer_idx < first_k_dense_replace:
+            self.mlp = Cohere2MoeMLP(
+                config=config,
+                intermediate_size=getattr(
+                    config, "prefix_dense_intermediate_size", config.intermediate_size
+                ),
+                quant_config=quant_config,
+                reduce_results=True,
+                prefix=f"{prefix}.mlp",
+            )
+        else:
+            self.mlp = Cohere2Moe(
+                config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+            )
+
+        norm_cls, norm_eps = select_norm_impl(config)
+        self.input_layernorm = norm_cls(param_shape=(config.hidden_size,), eps=norm_eps)
 
     def forward(
         self,
@@ -320,8 +382,8 @@ def forward(
 
 
 @support_torch_compile
-class CohereMoeModel(nn.Module):
-    """Transformer decoder for CohereMoe."""
+class Cohere2MoeModel(nn.Module):
+    """Transformer decoder for Cohere2Moe."""
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -339,14 +401,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: CohereMoeDecoderLayer(
+            lambda prefix: Cohere2MoeDecoderLayer(
                 config, cache_config, quant_config, prefix=prefix
             ),
             prefix=f"{prefix}.layers",
         )
-        self.norm = LayerNorm(
-            param_shape=(config.hidden_size,), eps=config.layer_norm_eps
-        )
+        norm_cls, norm_eps = select_norm_impl(config)
+        self.norm = norm_cls(param_shape=(config.hidden_size,), eps=norm_eps)
         self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
             ["hidden_states", "residual"], config.hidden_size
         )
@@ -471,7 +532,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         return loaded_params
 
 
-class CohereMoeForCausalLM(nn.Module, SupportsPP, SupportsQuant):
+class Cohere2MoeForCausalLM(nn.Module, SupportsPP, SupportsQuant):
     is_text_generation_model = True
 
     packed_modules_mapping = {
@@ -498,7 +559,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.logits_processor = LogitsProcessor(
             self.unpadded_vocab_size, config.vocab_size, scale=self.logits_scale
         )
-        self.model = CohereMoeModel(
+        self.model = Cohere2MoeModel(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
         self.make_empty_intermediate_tensors = (
diff --git a/vllm/model_executor/models/cohere_asr.py b/vllm/model_executor/models/cohere_asr.py
index 584793832dc7..da74404139aa 100644
--- a/vllm/model_executor/models/cohere_asr.py
+++ b/vllm/model_executor/models/cohere_asr.py
@@ -64,7 +64,7 @@
     SupportsMultiModal,
     SupportsTranscription,
 )
-from .utils import AutoWeightsLoader, WeightsMapper, make_layers
+from .utils import AutoWeightsLoader, WeightsMapper, make_layers, maybe_prefix
 
 logger = init_logger(__name__)
 
@@ -1717,7 +1717,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.encoder = ConformerEncoder(vllm_config=vllm_config)
 
         self.decoder = CohereASRDecoder(
-            vllm_config=vllm_config, prefix=f"{prefix}.decoder"
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "decoder"),
         )
 
         if self.encoder.d_model != self.decoder.hidden_size:
diff --git a/vllm/model_executor/models/cohere_eagle.py b/vllm/model_executor/models/cohere_eagle.py
new file mode 100644
index 000000000000..5c22d6e34dd5
--- /dev/null
+++ b/vllm/model_executor/models/cohere_eagle.py
@@ -0,0 +1,247 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Iterable
+
+import torch
+import torch.nn as nn
+from transformers import CohereConfig
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.commandr import (
+    CohereDecoderLayer,
+    CohereForCausalLM,
+    LayerNorm,
+)
+
+from .utils import (
+    AutoWeightsLoader,
+    get_draft_quant_config,
+    maybe_prefix,
+    process_eagle_weight,
+)
+
+logger = init_logger(__name__)
+
+
+class CohereEagleDecoderLayer(CohereDecoderLayer):
+    """EAGLE draft decoder layer: forwards its arguments to CohereDecoderLayer."""
+
+    def __init__(
+        self,
+        config: CohereConfig,
+        cache_config=None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+
+
+@support_torch_compile
+class CohereEagleModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        start_layer_id: int = 0,
+    ) -> None:
+        super().__init__()
+        self.config = vllm_config.speculative_config.draft_model_config.hf_config
+        self.quant_config = get_draft_quant_config(vllm_config)
+
+        # Cohere2-targeted EAGLE drafts inherit the target's sliding-window pattern.
+        # ``CohereAttention`` reads ``config.layer_types[layer_idx]`` and the draft
+        # layers use absolute indices (target_layer_num + i), so the target's
+        # ``layer_types`` are prepended to the draft's. NOTE(review): this mutates
+        # the draft hf_config in place; building twice would prepend twice.
+        target_text_config = vllm_config.model_config.hf_text_config
+        if hasattr(target_text_config, "layer_types") and hasattr(
+            self.config, "layer_types"
+        ):
+            self.config.layer_types = list(target_text_config.layer_types) + list(
+                self.config.layer_types
+            )
+
+        self.vocab_size = self.config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "embed_tokens"),
+        )
+
+        self.layers = nn.ModuleList(
+            [
+                CohereEagleDecoderLayer(
+                    self.config,
+                    cache_config=vllm_config.cache_config,
+                    quant_config=self.quant_config,
+                    prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
+                )
+                for i in range(self.config.num_hidden_layers)
+            ]
+        )
+
+        # Cohere EAGLE checkpoints include a bias term on the input fusion
+        # projection (unlike LLaMA EAGLE which uses bias=False).
+        self.fc = ReplicatedLinear(
+            input_size=self.config.hidden_size * 2,
+            output_size=self.config.hidden_size,
+            bias=True,
+            params_dtype=vllm_config.model_config.dtype,
+            quant_config=self.quant_config,
+            prefix=maybe_prefix(prefix, "fc"),
+            return_bias=False,
+        )
+
+        # Cohere EAGLE applies an explicit final LayerNorm to the draft
+        # hidden states before they are consumed by the logits processor.
+        self.norm = LayerNorm(
+            param_shape=(self.config.hidden_size,),
+            eps=self.config.layer_norm_eps,
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Fuse token embeddings with the incoming hidden states via ``fc``.
+        fused_input = torch.cat((self.embed_tokens(input_ids), hidden_states), dim=-1)
+        hidden_states, residual = self.fc(fused_input), None
+        for decoder_layer in self.layers:
+            hidden_states, residual = decoder_layer(
+                positions,
+                hidden_states,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states, hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class EagleCohereForCausalLM(CohereForCausalLM):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        nn.Module.__init__(self)
+        self.config = vllm_config.speculative_config.draft_model_config.hf_config
+        # Flags checked by the speculative proposer to decide whether to share
+        # embed_tokens / lm_head with the target model. Cohere EAGLE checkpoints
+        # use tied embeddings so these weights are absent from the draft file.
+        self.has_own_embed_tokens = False
+        self.has_own_lm_head = False
+        target_layer_num = vllm_config.model_config.get_num_layers(
+            vllm_config.parallel_config
+        )
+        self.model = CohereEagleModel(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+            start_layer_id=target_layer_num,
+        )
+
+        logit_scale = getattr(self.config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(
+            self.config.vocab_size, scale=logit_scale
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if inputs_embeds is not None:
+            raise NotImplementedError(
+                f"{type(self).__name__} does not support multimodal inputs yet."
+            )
+        return self.model(input_ids, positions, hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        def _track_and_forward(inputs):
+            name, weight = inputs
+            process_eagle_weight(self, name)
+            return name, weight
+
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(
+                ["lm_head.", "model.embed_tokens."]
+                if self.config.tie_word_embeddings
+                else None
+            ),
+        )
+
+        loaded_weight_names = loader.load_weights(map(_track_and_forward, weights))
+
+        # Embed tokens are shared with the target model and so may be absent
+        # from the EAGLE checkpoint; always mark them as loaded (regardless of
+        # ``tie_word_embeddings``) to avoid a spurious "weight not found"
+        # warning from the default weight loader.
+        loaded_weight_names.add("model.embed_tokens.weight")
+        return loaded_weight_names
diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py
index f975b32adc19..76e90e327655 100644
--- a/vllm/model_executor/models/deepseek_eagle.py
+++ b/vllm/model_executor/models/deepseek_eagle.py
@@ -198,7 +198,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             vllm_config.parallel_config
         )
         self.model = DeepseekV2Model(
-            vllm_config=vllm_config, prefix="model", start_layer_id=target_layer_num
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+            start_layer_id=target_layer_num,
         )
 
         self.lm_head = ParallelLMHead(
diff --git a/vllm/model_executor/models/deepseek_eagle3.py b/vllm/model_executor/models/deepseek_eagle3.py
index 640ba89914b2..47dfc0670697 100644
--- a/vllm/model_executor/models/deepseek_eagle3.py
+++ b/vllm/model_executor/models/deepseek_eagle3.py
@@ -318,7 +318,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.config.target_layer_count = target_layer_num
 
         self.model = DeepseekV2Eagle3Model(
-            vllm_config=vllm_config, prefix="model", start_layer_id=target_layer_num
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+            start_layer_id=target_layer_num,
         )
 
         logit_scale = getattr(self.config, "logit_scale", 1.0)
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 86881376a106..44797874a4c5 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -83,6 +83,7 @@
     maybe_remap_kv_scale_name,
 )
 from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
     extract_layer_index,
     sequence_parallel_chunk,
 )
@@ -1254,6 +1255,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.aux_hidden_state_layers = tuple[int, ...]()
 
+        # Needed by load_weights
+        qk_nope_head_dim = getattr(config, "qk_nope_head_dim", 0)
+        qk_rope_head_dim = getattr(config, "qk_rope_head_dim", 0)
+        self.use_mha = config.model_type == "deepseek" or all(
+            dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
+        )
+        self.num_redundant_experts = (
+            vllm_config.parallel_config.eplb_config.num_redundant_experts
+        )
+
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
@@ -1315,174 +1326,6 @@ def forward(
             return hidden_states, aux_hidden_states
         return hidden_states
 
-
-class DeepseekV2MixtureOfExperts(MixtureOfExperts):
-    moe_mlp_layers: list[DeepseekV2MoE]
-    """
-    List of MoE MLP layers in the model.
-    """
-
-    def extract_moe_parameters(self, example_moe: DeepseekV2MoE | None):
-        if example_moe is None:
-            self.num_moe_layers = 0
-            self.num_expert_groups = 0
-            self.num_logical_experts = 0
-            self.num_physical_experts = 0
-            self.num_local_physical_experts = 0
-            self.num_routed_experts = 0
-            self.num_shared_experts = 0
-            self.num_redundant_experts = 0
-            logger.warning("DeepSeekV2: No DeepseekV2MoE layer found in model.layers.")
-        else:
-            self.num_logical_experts = example_moe.n_logical_experts
-            self.num_physical_experts = example_moe.n_physical_experts
-            self.num_local_physical_experts = example_moe.n_local_physical_experts
-            self.num_routed_experts = example_moe.n_routed_experts
-            self.num_shared_experts = example_moe.n_shared_experts
-            self.num_redundant_experts = example_moe.n_redundant_experts
-
-    def update_physical_experts_metadata(
-        self,
-        num_physical_experts: int,
-        num_local_physical_experts: int,
-    ) -> None:
-        assert self.num_local_physical_experts == num_local_physical_experts
-        self.num_physical_experts = num_physical_experts
-        self.num_local_physical_experts = num_local_physical_experts
-        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
-        for moe in self.moe_mlp_layers:
-            moe.n_local_physical_experts = num_local_physical_experts
-            moe.n_physical_experts = num_physical_experts
-            moe.n_redundant_experts = self.num_redundant_experts
-            moe.experts.update_expert_map()
-
-
-class DeepseekV2ForCausalLM(
-    nn.Module,
-    SupportsPP,
-    DeepseekV2MixtureOfExperts,
-    SupportsLoRA,
-    SupportsEagle,
-    SupportsEagle3,
-):
-    packed_modules_mapping = {
-        "gate_up_proj": ["gate_proj", "up_proj"],
-    }
-    model_cls = DeepseekV2Model
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-        self.config = config
-        self.quant_config = quant_config
-
-        qk_nope_head_dim = getattr(config, "qk_nope_head_dim", 0)
-        qk_rope_head_dim = getattr(config, "qk_rope_head_dim", 0)
-        self.use_mha = config.model_type == "deepseek" or all(
-            dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
-        )
-
-        if self.use_mha:
-            self.packed_modules_mapping["qkv_proj"] = ["q_proj", "k_proj", "v_proj"]
-
-        # `packed_modules_mapping` needs to be modified before
-        # initializing DeepseekV2Model, as it is passed inplace to
-        # quantization config init and may be used to select the
-        # quant_method for relevant layers during initialization.
-        self.fuse_qkv_a_proj = (
-            hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
-        )
-        if self.fuse_qkv_a_proj:
-            self.packed_modules_mapping["fused_qkv_a_proj"] = [
-                "q_a_proj",
-                "kv_a_proj_with_mqa",
-            ]
-
-        self.model = self.model_cls(
-            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
-        )
-        if get_pp_group().is_last_rank:
-            self.lm_head = ParallelLMHead(
-                config.vocab_size,
-                config.hidden_size,
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "lm_head"),
-            )
-        else:
-            self.lm_head = PPMissingLayer()
-        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors
-        )
-        # Set MoE hyperparameters
-        self.num_moe_layers = (
-            self.config.num_hidden_layers - self.config.first_k_dense_replace
-        )
-        self.set_moe_parameters()
-
-    def set_moe_parameters(self):
-        self.expert_weights = []
-
-        self.num_expert_groups = getattr(self.config, "n_group", 1)
-
-        self.moe_layers = []
-        self.moe_mlp_layers = []
-        example_moe = None
-        for layer in self.model.layers:
-            if isinstance(layer, PPMissingLayer):
-                continue
-
-            assert isinstance(layer, DeepseekV2DecoderLayer)
-            if isinstance(layer.mlp, DeepseekV2MoE):
-                # Pick last one layer since the first ones may be dense layers.
-                example_moe = layer.mlp
-                self.moe_mlp_layers.append(layer.mlp)
-                self.moe_layers.append(layer.mlp.experts)
-
-        self.extract_moe_parameters(example_moe)
-
-    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
-        self.model.aux_hidden_state_layers = layers
-
-    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
-        num_layers = len(self.model.layers)
-        return (2, num_layers // 2, num_layers - 3)
-
-    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.embed_input_ids(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor | None,
-        positions: torch.Tensor,
-        intermediate_tensors: IntermediateTensors | None = None,
-        inputs_embeds: torch.Tensor | None = None,
-    ) -> torch.Tensor | IntermediateTensors:
-        hidden_states = self.model(
-            input_ids, positions, intermediate_tensors, inputs_embeds
-        )
-        return hidden_states
-
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-    ) -> torch.Tensor | None:
-        logits = self.logits_processor(self.lm_head, hidden_states)
-        return logits
-
-    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-        # Params for weights, fp8 weight scales, fp8 activation scales
-        # (param_name, weight_name, expert_id, shard_id)
-        return fused_moe_make_expert_params_mapping(
-            self,
-            ckpt_gate_proj_name="gate_proj",
-            ckpt_down_proj_name="down_proj",
-            ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts,
-            num_redundant_experts=0,
-        )
-
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         rocm_aiter_moe_shared_expert_enabled = (
             rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
@@ -1703,6 +1546,178 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         return loaded_params
 
 
+class DeepseekV2MixtureOfExperts(MixtureOfExperts):
+    moe_mlp_layers: list[DeepseekV2MoE]
+    """
+    List of MoE MLP layers in the model.
+    """
+
+    def extract_moe_parameters(self, example_moe: DeepseekV2MoE | None):
+        if example_moe is None:
+            self.num_moe_layers = 0
+            self.num_expert_groups = 0
+            self.num_logical_experts = 0
+            self.num_physical_experts = 0
+            self.num_local_physical_experts = 0
+            self.num_routed_experts = 0
+            self.num_shared_experts = 0
+            self.num_redundant_experts = 0
+            logger.warning("DeepSeekV2: No DeepseekV2MoE layer found in model.layers.")
+        else:
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for moe in self.moe_mlp_layers:
+            moe.n_local_physical_experts = num_local_physical_experts
+            moe.n_physical_experts = num_physical_experts
+            moe.n_redundant_experts = self.num_redundant_experts
+            moe.experts.update_expert_map()
+
+
+class DeepseekV2ForCausalLM(
+    nn.Module,
+    SupportsPP,
+    DeepseekV2MixtureOfExperts,
+    SupportsLoRA,
+    SupportsEagle,
+    SupportsEagle3,
+):
+    packed_modules_mapping = {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+    model_cls = DeepseekV2Model
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+
+        qk_nope_head_dim = getattr(config, "qk_nope_head_dim", 0)
+        qk_rope_head_dim = getattr(config, "qk_rope_head_dim", 0)
+        self.use_mha = config.model_type == "deepseek" or all(
+            dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
+        )
+
+        if self.use_mha:
+            self.packed_modules_mapping["qkv_proj"] = ["q_proj", "k_proj", "v_proj"]
+
+        # `packed_modules_mapping` needs to be modified before
+        # initializing DeepseekV2Model, as it is passed inplace to
+        # quantization config init and may be used to select the
+        # quant_method for relevant layers during initialization.
+        self.fuse_qkv_a_proj = (
+            hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
+        )
+        if self.fuse_qkv_a_proj:
+            self.packed_modules_mapping["fused_qkv_a_proj"] = [
+                "q_a_proj",
+                "kv_a_proj_with_mqa",
+            ]
+
+        self.model = self.model_cls(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+        # Set MoE hyperparameters
+        self.num_moe_layers = (
+            self.config.num_hidden_layers - self.config.first_k_dense_replace
+        )
+        self.set_moe_parameters()
+
+    def set_moe_parameters(self):
+        self.expert_weights = []
+
+        self.num_expert_groups = getattr(self.config, "n_group", 1)
+
+        self.moe_layers = []
+        self.moe_mlp_layers = []
+        example_moe = None
+        for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+
+            assert isinstance(layer, DeepseekV2DecoderLayer)
+            if isinstance(layer.mlp, DeepseekV2MoE):
+                # Pick last one layer since the first ones may be dense layers.
+                example_moe = layer.mlp
+                self.moe_mlp_layers.append(layer.mlp)
+                self.moe_layers.append(layer.mlp.experts)
+
+        self.extract_moe_parameters(example_moe)
+
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        self.model.aux_hidden_state_layers = layers
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        num_layers = len(self.model.layers)
+        return (2, num_layers // 2, num_layers - 3)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        return fused_moe_make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts,
+            num_redundant_experts=0,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
+
+
 class DeepseekForCausalLM(DeepseekV2ForCausalLM):
     pass
 
@@ -1726,6 +1741,8 @@ def get_spec_layer_idx_from_weight_name(
     ):
         layer_idx = config.num_hidden_layers
         for i in range(config.num_nextn_predict_layers):
-            if weight_name.startswith(f"model.layers.{layer_idx + i}."):
+            if weight_name.startswith(
+                f"model.layers.{layer_idx + i}."
+            ) or weight_name.startswith(f"layers.{layer_idx + i}."):
                 return layer_idx + i
     return None
diff --git a/vllm/model_executor/models/deepseek_v4.py b/vllm/model_executor/models/deepseek_v4.py
index 2a0c8c8772c1..3f9a8357cb97 100644
--- a/vllm/model_executor/models/deepseek_v4.py
+++ b/vllm/model_executor/models/deepseek_v4.py
@@ -12,6 +12,7 @@
 from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.distributed import (
     get_ep_group,
+    get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
@@ -49,6 +50,7 @@
     VocabParallelEmbedding,
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.interfaces import SupportsPP
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
@@ -57,8 +59,10 @@
 
 from .utils import (
     AutoWeightsLoader,
+    PPMissingLayer,
     WeightsMapper,
     extract_layer_index,
+    is_pp_missing_parameter,
     make_layers,
     maybe_prefix,
 )
@@ -1243,23 +1247,53 @@ def forward(
         x: torch.Tensor,
         positions: torch.Tensor,
         input_ids: torch.Tensor | None,
+        post_mix: torch.Tensor | None,
+        res_mix: torch.Tensor | None,
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
-        residual = x
-        x, post, comb = self.hc_pre(
-            x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base
-        )
+        if residual is None:
+            # Run standalone hc_pre on first layer
+            residual = x
+            x, post_mix, res_mix = self.hc_pre(
+                x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base
+            )
+        else:
+            residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre(
+                x,
+                residual,
+                post_mix,
+                res_mix,
+                self.hc_attn_fn,
+                self.hc_attn_scale,
+                self.hc_attn_base,
+                self.rms_norm_eps,
+                self.hc_eps,
+                self.hc_eps,
+                self.hc_post_alpha,
+                self.hc_sinkhorn_iters,
+            )
+
         x = self.attn_norm(x)
         x = self.attn(positions, x, None)
-        x = self.hc_post(x, residual, post, comb)
 
-        residual = x
-        x, post, comb = self.hc_pre(
-            x, self.hc_ffn_fn, self.hc_ffn_scale, self.hc_ffn_base
+        residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre(
+            x,
+            residual,
+            post_mix,
+            res_mix,
+            self.hc_ffn_fn,
+            self.hc_ffn_scale,
+            self.hc_ffn_base,
+            self.rms_norm_eps,
+            self.hc_eps,
+            self.hc_eps,
+            self.hc_post_alpha,
+            self.hc_sinkhorn_iters,
         )
+
         x = self.ffn_norm(x)
         x = self.ffn(x, input_ids)
-        x = self.hc_post(x, residual, post, comb)
-        return x
+        return x, residual, post_mix, res_mix
 
 
 @support_torch_compile
@@ -1305,12 +1339,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             device=self.device,
         )
 
-        self.embed_tokens = VocabParallelEmbedding(
-            config.vocab_size,
-            config.hidden_size,
-            quant_config=quant_config,
-            prefix=f"{prefix}.embed_tokens",
-        )
+        if get_pp_group().is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
@@ -1323,7 +1360,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             prefix=f"{prefix}.layers",
         )
 
-        self.norm = RMSNorm(config.hidden_size, self.rms_norm_eps)
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, self.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
 
         self.hc_head_fn = nn.Parameter(
             torch.empty(
@@ -1348,16 +1388,42 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # Pre-hc_head residual stream buffer for the MTP draft. Stable
         # address (outside the cudagraph pool) so the copy_ in forward()
         # refreshes it correctly across captured shapes.
-        self._mtp_hidden_buffer = torch.empty(
-            vllm_config.scheduler_config.max_num_batched_tokens,
-            self.hc_dim,
-            dtype=vllm_config.model_config.dtype,
-            device=self.device,
-        )
+        # The buffer is only allocated on the last PP rank — that's where
+        # MTP target hidden states are produced; every other rank stores
+        # None in its place.
+        if get_pp_group().is_last_rank:
+            self._mtp_hidden_buffer = torch.empty(
+                vllm_config.scheduler_config.max_num_batched_tokens,
+                self.hc_dim,
+                dtype=vllm_config.model_config.dtype,
+                device=self.device,
+            )
+        else:
+            self._mtp_hidden_buffer = None
 
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
+    def make_empty_intermediate_tensors(
+        self,
+        batch_size: int,
+        dtype: torch.dtype,
+        device: torch.device,
+    ) -> IntermediateTensors:
+        """Create zero-filled placeholder tensors for pipeline parallelism.
+
+        V4 expands the token embedding to hc_mult streams before the first
+        decoder layer and keeps that layout until hc_head() collapses it, so
+        the "hidden_states" entry is (batch_size, hc_mult, hidden_size).
+
+        Returns:
+            IntermediateTensors holding a single "hidden_states" tensor with
+            the requested dtype and device.
+        """
+        streams_shape = (batch_size, self.hc_mult, self.config.hidden_size)
+        placeholder = torch.zeros(streams_shape, dtype=dtype, device=device)
+        return IntermediateTensors({"hidden_states": placeholder})
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -1365,16 +1431,34 @@ def forward(
         intermediate_tensors: IntermediateTensors | None,
         inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor | IntermediateTensors:
-        hidden_states = self.embed_input_ids(input_ids)
-        hidden_states = hidden_states.unsqueeze(-2).repeat(1, self.hc_mult, 1)
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+            hidden_states = hidden_states.unsqueeze(-2).repeat(1, self.hc_mult, 1)
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+
         if self.use_mega_moe:
             input_ids = input_ids.to(torch.int64)
+
+        residual, post_mix, res_mix = None, None, None
         for layer in islice(self.layers, self.start_layer, self.end_layer):
-            hidden_states = layer(
+            hidden_states, residual, post_mix, res_mix = layer(
                 hidden_states,
                 positions,
                 input_ids,
+                post_mix,
+                res_mix,
+                residual,
             )
+        else:
+            hidden_states = layer.hc_post(hidden_states, residual, post_mix, res_mix)
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({"hidden_states": hidden_states})
 
         # Stash pre-hc_head residual for the MTP draft (captured copy_).
         num_tokens = hidden_states.shape[0]
@@ -1424,6 +1508,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                     continue
                 name = name.replace(weight_name, param_name)
 
+                if is_pp_missing_parameter(name, self):
+                    break
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
@@ -1445,6 +1531,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                         if weight_name not in name:
                             continue
                         name_mapped = name.replace(weight_name, param_name)
+                        if is_pp_missing_parameter(name_mapped, self):
+                            continue
                         param = params_dict[name_mapped]
                         # We should ask the weight loader to return success or not
                         # here since otherwise we may skip experts with other
@@ -1466,12 +1554,16 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                     loaded_params.add(name_mapped)
                     continue
                 elif "attn_sink" in name:
+                    if is_pp_missing_parameter(name, self):
+                        continue
                     narrow_weight = loaded_weight[head_rank_start:head_rank_end]
                     n = narrow_weight.shape[0]
                     params_dict[name][:n].copy_(narrow_weight)
                     loaded_params.add(name)
                     continue
                 else:
+                    if is_pp_missing_parameter(name, self):
+                        continue
                     param = params_dict[name]
                     weight_loader = getattr(
                         param, "weight_loader", default_weight_loader
@@ -1569,7 +1661,7 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
     )
 
 
-class DeepseekV4ForCausalLM(nn.Module):
+class DeepseekV4ForCausalLM(nn.Module, SupportsPP):
     model_cls = DeepseekV4Model
 
     # Default mapper assumes the original FP4-expert checkpoint layout.
@@ -1588,12 +1680,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.model = self.model_cls(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
-        self.lm_head = ParallelLMHead(
-            config.vocab_size,
-            config.hidden_size,
-            prefix=maybe_prefix(prefix, "lm_head"),
-        )
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+        else:
+            self.lm_head = PPMissingLayer()
         self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
 
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.embed_input_ids(input_ids)
diff --git a/vllm/model_executor/models/exaone4_5.py b/vllm/model_executor/models/exaone4_5.py
index 1eac43ccb0c4..b44708466cf5 100644
--- a/vllm/model_executor/models/exaone4_5.py
+++ b/vllm/model_executor/models/exaone4_5.py
@@ -23,7 +23,6 @@
 import torch.nn as nn
 from transformers.models.exaone4_5 import (
     Exaone4_5_Config,
-    Exaone4_5_ImageProcessor,
     Exaone4_5_Processor,
 )
 from transformers.models.exaone4_5.configuration_exaone4_5 import Exaone4_5_VisionConfig
@@ -304,9 +303,6 @@ def get_hf_processor(self, **kwargs: object) -> Exaone4_5_Processor:
             **kwargs,
         )
 
-    def get_image_processor(self, **kwargs: object) -> Exaone4_5_ImageProcessor:
-        return Exaone4_5_ImageProcessor(**kwargs)
-
 
 @MULTIMODAL_REGISTRY.register_processor(
     Exaone4_5_MultiModalProcessor,
diff --git a/vllm/model_executor/models/exaone4_5_mtp.py b/vllm/model_executor/models/exaone4_5_mtp.py
index 99bf724bdaa4..7711f72e42ca 100644
--- a/vllm/model_executor/models/exaone4_5_mtp.py
+++ b/vllm/model_executor/models/exaone4_5_mtp.py
@@ -9,6 +9,7 @@
 
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
+from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import ColumnParallelLinear
@@ -22,6 +23,7 @@
     ExaoneMoeMTP,
     ExaoneMoeMultiTokenPredictor,
 )
+from vllm.sequence import IntermediateTensors
 
 from .interfaces import (
     MultiModalEmbeddings,
@@ -48,6 +50,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
         config = model_config.hf_config
+        text_config = config.text_config
 
         self.config = config
         lora_vocab = (
@@ -58,18 +61,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.vocab_size = config.vocab_size + lora_vocab
         self.org_vocab_size = config.vocab_size
 
-        self.mtp_start_layer_idx = config.num_hidden_layers
+        self.mtp_start_layer_idx = text_config.num_hidden_layers
         self.num_mtp_layers = getattr(config, "num_nextn_predict_layers", 1)
 
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
-            config.hidden_size,
+            text_config.hidden_size,
             org_num_embeddings=config.vocab_size,
         )
 
         self.fc = ColumnParallelLinear(
-            self.config.hidden_size * 2,
-            self.config.hidden_size,
+            text_config.hidden_size * 2,
+            text_config.hidden_size,
             gather_output=True,
             bias=False,
             return_bias=False,
@@ -78,27 +81,68 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         )
         self.layers = nn.ModuleList(
             Exaone4DecoderLayer(
-                vllm_config.model_config.hf_config,
+                text_config,
                 quant_config=quant_config,
                 prefix=f"{prefix}.layers.{idx}",
             )
             for idx in range(self.num_mtp_layers)
         )
 
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.pre_fc_norm_hidden = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = RMSNorm(text_config.hidden_size, eps=text_config.rms_norm_eps)
+        self.pre_fc_norm_hidden = RMSNorm(
+            text_config.hidden_size, eps=text_config.rms_norm_eps
+        )
         self.pre_fc_norm_embedding = RMSNorm(
-            config.hidden_size, eps=config.rms_norm_eps
+            text_config.hidden_size, eps=text_config.rms_norm_eps
         )
 
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor:
+        """Run one MTP decoder layer for this pipeline-parallel stage."""
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is None:
+                inputs_embeds = self.embed_input_ids(input_ids)
+            assert hidden_states.shape[-1] == inputs_embeds.shape[-1]
+            inputs_embeds = self.pre_fc_norm_embedding(inputs_embeds)
+            hidden_states = self.pre_fc_norm_hidden(hidden_states)
+            hidden_states = torch.cat([inputs_embeds, hidden_states], dim=-1)
+            hidden_states = self.fc(hidden_states)  # fuse 2 * hidden -> hidden
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        current_step_idx = spec_step_idx % self.num_mtp_layers  # wrap index
+        hidden_states, residual = self.layers[current_step_idx](
+            positions=positions,
+            hidden_states=hidden_states,
+            residual=residual,
+        )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        return self.norm(hidden_states)
+
 
 @support_torch_compile
 class Exaone4_5_MTP(ExaoneMoeMTP, SupportsMultiModal):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
+        text_config = config.text_config
         self.vllm_config = vllm_config
         self.quant_config = vllm_config.quant_config
 
@@ -110,7 +154,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.unpadded_vocab_size = config.vocab_size
         self.lm_head = ParallelLMHead(
             self.unpadded_vocab_size,
-            config.hidden_size,
+            text_config.hidden_size,
             org_num_embeddings=config.vocab_size,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
diff --git a/vllm/model_executor/models/gemma4.py b/vllm/model_executor/models/gemma4.py
index 62b7f5576673..1b50db61c288 100644
--- a/vllm/model_executor/models/gemma4.py
+++ b/vllm/model_executor/models/gemma4.py
@@ -40,6 +40,7 @@
 from vllm.model_executor.layers.fused_moe import (
     FusedMoE,
     GateLinear,
+    fused_moe_make_expert_params_mapping,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -1368,30 +1369,35 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         #   moe.experts.{id}.gate_proj → FusedMoE w1 (shard of w13)
         #   moe.experts.{id}.up_proj   → FusedMoE w3 (shard of w13)
         #   moe.experts.{id}.down_proj → FusedMoE w2
-        #
-        # Use prefix matching to handle both weights and
-        # quantization scale parameters. The param_name is a prefix ending
-        # in underscore, and weight_name ends with a dot, so that:
-        #   "experts.0.gate_proj.weight_scale" -> "experts.w13_weight_scale"
-        #   "experts.0.gate_proj.weight" -> "experts.w13_weight"
         num_experts = getattr(self.config, "num_experts", None) or 0
-        expert_params_mapping = [
-            # (param_name, weight_name, expert_id, shard_id)
+        # Strategy A: dot-separated suffix
+        # (standard AWQ/GPTQ e.g. .qweight, .scales, .weight)
+        dot_suffix_expert_params_mapping = fused_moe_make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=num_experts,
+        )
+        # Strategy B: underscore-separated suffix
+        # (CompressedTensors-format AWQ/W4A16 _packed, _scale)
+        underscore_suffix_expert_params_mapping = [
             (
-                "experts.w13_"
-                if proj_name in ["gate_proj", "up_proj"]
-                else "experts.w2_",
-                f"experts.{expert_id}.{proj_name}.",
+                f"{param_name}weight_",
+                f"{weight_name.rstrip('.')}_",
                 expert_id,
                 shard_id,
             )
-            for expert_id in range(num_experts)
-            for shard_id, proj_name in [
-                ("w1", "gate_proj"),
-                ("w2", "down_proj"),
-                ("w3", "up_proj"),
-            ]
+            for (
+                param_name,
+                weight_name,
+                expert_id,
+                shard_id,
+            ) in dot_suffix_expert_params_mapping
         ]
+        expert_params_mapping = (
+            dot_suffix_expert_params_mapping + underscore_suffix_expert_params_mapping
+        )
         params_dict = dict(self.named_parameters())
         # Include buffers (e.g. layer_scalar) so they can be loaded too
         params_dict.update(dict(self.named_buffers()))
diff --git a/vllm/model_executor/models/gemma4_mm.py b/vllm/model_executor/models/gemma4_mm.py
index 73a5e701e7af..91be7d47f6f0 100644
--- a/vllm/model_executor/models/gemma4_mm.py
+++ b/vllm/model_executor/models/gemma4_mm.py
@@ -124,12 +124,12 @@ class Gemma4ImagePixelInputs(TensorSchema):
 
     type: Literal["pixel_values"] = "pixel_values"
     pixel_values: Annotated[
-        torch.Tensor,
-        TensorShape("bn", "np", "pp"),
+        torch.Tensor | list[torch.Tensor],
+        TensorShape("bn", "np", "pp", dynamic_dims={"np"}),
     ]
     pixel_position_ids: Annotated[
-        torch.Tensor,
-        TensorShape("bn", "np", 2),
+        torch.Tensor | list[torch.Tensor],
+        TensorShape("bn", "np", 2, dynamic_dims={"np"}),
     ]
 
 
@@ -1128,15 +1128,20 @@ def _process_image_input(
         # metadata, and validating numerical equivalence with the
         # current per-image path.
         #
+        # Concurrent requests with different image resolutions may
+        # arrive as a list of per-image tensors, while same-resolution
+        # batches may arrive as a stacked tensor. Both forms are
+        # iterable over the per-image dimension.
+
         # Process each image individually through the vision tower.
         # The vision tower's forward() strips padding and returns a
         # flat tensor of valid tokens. We process per-image to get
         # variable-length outputs matching the dynamic token count
         # from get_image_repl.
         per_image_features = []
-        for i in range(pixel_values.shape[0]):
-            pv = pixel_values[i].unsqueeze(0)  # (1, max_patches, patch_pixels)
-            pp = pixel_position_ids[i].unsqueeze(0)  # (1, max_patches, 2)
+        for pv, pp in zip(pixel_values, pixel_position_ids, strict=True):
+            pv = pv.unsqueeze(0)  # (1, max_patches, patch_pixels)
+            pp = pp.unsqueeze(0)  # (1, max_patches, 2)
 
             # Derive the pooler's output_length from the total patch
             # count (including padding).  The vision tower encoder
diff --git a/vllm/model_executor/models/granite4_vision.py b/vllm/model_executor/models/granite4_vision.py
index 710fc94ee5f8..c6e4df2992cb 100644
--- a/vllm/model_executor/models/granite4_vision.py
+++ b/vllm/model_executor/models/granite4_vision.py
@@ -149,7 +149,7 @@ def __init__(
             qformer_config,
             quant_config=quant_config,
             cache_config=cache_config,
-            prefix=f"{prefix}.qformer",
+            prefix=maybe_prefix(prefix, "qformer"),
         )
 
         self.image_side = (
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index 036b92ed8808..2b9a03be08f2 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -620,7 +620,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.encoder = GraniteSpeechCTCEncoder(
                 config=config.encoder_config,
                 quant_config=quant_config,
-                prefix=f"{prefix}.encoder",
+                prefix=maybe_prefix(prefix, "encoder"),
             )
 
             # Blip2 QFormer
@@ -628,7 +628,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 config=config,
                 quant_config=quant_config,
                 cache_config=cache_config,
-                prefix=f"{prefix}.projector",
+                prefix=maybe_prefix(prefix, "projector"),
             )
 
         self.make_empty_intermediate_tensors = (
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index fca801b74823..b900c0ed83ea 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -930,7 +930,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.config = config
         self.quant_config = quant_config
 
-        self.model = HunYuanModel(vllm_config=vllm_config, prefix="model")
+        self.model = HunYuanModel(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+        )
         if get_pp_group().is_last_rank:
             self.lm_head = ParallelLMHead(
                 config.vocab_size,
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 7166f725774f..86d7edc25f92 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -729,7 +729,7 @@ def __init__(
         self.vision_model = KeyeSiglipVisionTransformer(
             config,
             quant_config=quant_config,
-            prefix=f"{prefix}.vision_model",
+            prefix=maybe_prefix(prefix, "vision_model"),
         )
         self.quant_config = quant_config
 
diff --git a/vllm/model_executor/models/kimi_k25_vit.py b/vllm/model_executor/models/kimi_k25_vit.py
index 69524293c54b..237c28506ed0 100644
--- a/vllm/model_executor/models/kimi_k25_vit.py
+++ b/vllm/model_executor/models/kimi_k25_vit.py
@@ -618,6 +618,9 @@ def mm_projector_forward(mm_projector: torch.nn.Module, vt_output: list[torch.Te
     """Apply MM projector to vision tower outputs."""
     num_embedding_list = [x.shape[0] for x in vt_output]
     batched = torch.cat(vt_output, dim=0)
+    projector_dtype = mm_projector.pre_norm.weight.dtype
+    if batched.dtype != projector_dtype:
+        batched = batched.to(projector_dtype)
     proj_out = mm_projector(batched)
     proj_out = proj_out.reshape(-1, proj_out.shape[-1])
     proj_out = torch.split(proj_out, num_embedding_list)
diff --git a/vllm/model_executor/models/lfm2_siglip2.py b/vllm/model_executor/models/lfm2_siglip2.py
index 70ffa2afccf8..cb51c6bd8cc8 100644
--- a/vllm/model_executor/models/lfm2_siglip2.py
+++ b/vllm/model_executor/models/lfm2_siglip2.py
@@ -25,6 +25,7 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from .utils import maybe_prefix
 from .vision import (
     is_vit_use_data_parallel,
     resolve_visual_encoder_outputs,
@@ -472,7 +473,7 @@ def __init__(
             quant_config=quant_config,
             num_hidden_layers_override=num_hidden_layers_override,
             require_post_norm=require_post_norm,
-            prefix=f"{prefix}.vision_model",
+            prefix=maybe_prefix(prefix, "vision_model"),
         )
 
     def forward(
diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py
index 6c7b53d4d525..962377fd178d 100644
--- a/vllm/model_executor/models/llama4_eagle.py
+++ b/vllm/model_executor/models/llama4_eagle.py
@@ -176,7 +176,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         )
         self.model = LlamaModel(
             vllm_config=vllm_config,
-            prefix="model",
+            prefix=maybe_prefix(prefix, "model"),
             start_layer_id=target_layer_num,
             quant_config=quant_config,
         )
diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py
index 99a69adf1fc3..585c8f6dbd26 100644
--- a/vllm/model_executor/models/llama_eagle.py
+++ b/vllm/model_executor/models/llama_eagle.py
@@ -174,7 +174,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             vllm_config.parallel_config
         )
         self.model = LlamaModel(
-            vllm_config=vllm_config, prefix="model", start_layer_id=target_layer_num
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+            start_layer_id=target_layer_num,
         )
 
         logit_scale = getattr(self.config, "logit_scale", 1.0)
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index fcec4a4d8609..af115c456776 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -287,7 +287,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # proper layer_types indexing in draft models
         self.config.target_layer_count = target_layer_num
         self.model = LlamaModel(
-            vllm_config=vllm_config, prefix="model", start_layer_id=target_layer_num
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+            start_layer_id=target_layer_num,
         )
 
         logit_scale = getattr(self.config, "logit_scale", 1.0)
diff --git a/vllm/model_executor/models/mimo_v2_mtp.py b/vllm/model_executor/models/mimo_v2_mtp.py
index 442f4986b669..c863cedaeb88 100644
--- a/vllm/model_executor/models/mimo_v2_mtp.py
+++ b/vllm/model_executor/models/mimo_v2_mtp.py
@@ -49,7 +49,7 @@
 from .utils import _merge_multimodal_embeddings, maybe_prefix
 
 # MiMo-V2 checkpoints contain multiple MTP layers, but vLLM currently supports
-# only the first layer and only one speculative token.
+# only the first layer.
 _MIMO_V2_PRO_NUM_MTP_LAYERS = 1
 _MIMO_V2_FLASH_NUM_MTP_LAYERS = 1
 
@@ -170,10 +170,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         config = vllm_config.model_config.hf_config
         spec_cfg = vllm_config.speculative_config
         assert spec_cfg is not None
-        if spec_cfg.num_speculative_tokens != 1:
-            raise ValueError(
-                "MiMo-V2 MTP in vLLM only supports num_speculative_tokens=1."
-            )
         num_mtp_layers = 1
 
         self.num_mtp_layers = num_mtp_layers
@@ -203,10 +199,10 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-        assert spec_step_idx == 0, "MiMo-V2 MTP only supports one speculative token."
         if inputs_embeds is None:
             inputs_embeds = self.embed_input_ids(input_ids)
-        return self.mtp.layers[str(spec_step_idx)](
+        current_step_idx = spec_step_idx % self.num_mtp_layers
+        return self.mtp.layers[str(current_step_idx)](
             inputs_embeds, positions, previous_hidden_states
         )
 
@@ -216,7 +212,6 @@ def compute_logits(
         lm_head: ParallelLMHead,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-        assert spec_step_idx == 0, "MiMo-V2 MTP only supports one speculative token."
         return self.logits_processor(lm_head, hidden_states)
 
 
@@ -245,7 +240,6 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-        assert spec_step_idx == 0, "MiMo-V2 MTP only supports one speculative token."
         return self.model(
             input_ids, positions, hidden_states, inputs_embeds, spec_step_idx
         )
@@ -255,7 +249,6 @@ def compute_logits(
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
     ) -> torch.Tensor | None:
-        assert spec_step_idx == 0, "MiMo-V2 MTP only supports one speculative token."
         return self.model.compute_logits(hidden_states, self.lm_head, spec_step_idx)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/mistral_eagle.py b/vllm/model_executor/models/mistral_eagle.py
index 908b50f7ca00..8865742d6495 100644
--- a/vllm/model_executor/models/mistral_eagle.py
+++ b/vllm/model_executor/models/mistral_eagle.py
@@ -128,7 +128,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
             vllm_config.parallel_config
         )
         self.model = EagleMistralModel(
-            vllm_config=vllm_config, prefix="model", start_layer_id=target_layer_num
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+            start_layer_id=target_layer_num,
         )
 
         logit_scale = getattr(self.config, "logit_scale", 1.0)
diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py
index aa58fa6d1583..9ad3810e41fd 100644
--- a/vllm/model_executor/models/molmo2.py
+++ b/vllm/model_executor/models/molmo2.py
@@ -489,7 +489,7 @@ def __init__(
         self.transformer = Molmo2VisionBlockCollection(
             config,
             quant_config,
-            prefix=f"{prefix}.transformer",
+            prefix=maybe_prefix(prefix, "transformer"),
         )
 
     def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor:
@@ -1338,6 +1338,9 @@ def exif_transpose(
 def build_flat_image_bool_length(
     image_grids: torch.LongTensor,
     hf_config: PretrainedConfig,
+    image_use_col_tokens: bool = True,
+    use_single_crop_col_tokens: bool | None = None,
+    use_single_crop_start_token: bool = True,
 ) -> tuple[torch.LongTensor, torch.LongTensor]:
     image_patch_id = hf_config.image_patch_id
     low_res_image_start_id = hf_config.low_res_image_start_token_id
@@ -1353,7 +1356,17 @@ def build_flat_image_bool_length(
     h = image_grids[:, 2]
     w = image_grids[:, 3]
 
-    lengths = resized_h * resized_w + h * (w + 1) + 4  # [B]
+    low_res_use_col_tokens = (
+        image_use_col_tokens
+        if use_single_crop_col_tokens is None
+        else use_single_crop_col_tokens
+    )
+    low_res_extra = int(low_res_use_col_tokens)
+    high_res_extra = int(image_use_col_tokens)
+
+    lengths = (
+        resized_h * (resized_w + low_res_extra) + h * (w + high_res_extra) + 4
+    )  # [B]
     total_len = int(lengths.sum().item())
 
     flat = torch.empty(total_len, dtype=torch.long, device=device)
@@ -1363,16 +1376,24 @@ def build_flat_image_bool_length(
         resized_h_i, resized_w_i, h_i, w_i = image_grids[i].tolist()
         L_i = int(lengths[i].item())
 
-        num_low_res_patches = resized_h_i * resized_w_i
-
         idx = offset
 
-        flat[idx] = low_res_image_start_id
+        flat[idx] = (
+            low_res_image_start_id if use_single_crop_start_token else image_start_id
+        )
         idx += 1
 
-        if num_low_res_patches > 0:
-            flat[idx : idx + num_low_res_patches] = image_patch_id
-            idx += num_low_res_patches
+        low_res_block_len = resized_w_i + low_res_extra
+        if low_res_block_len > 0 and resized_h_i > 0:
+            line = torch.empty(low_res_block_len, dtype=torch.long, device=device)
+            if resized_w_i > 0:
+                line[:resized_w_i] = image_patch_id
+            if low_res_use_col_tokens:
+                line[resized_w_i] = image_col_id
+
+            block = line.repeat(resized_h_i)
+            flat[idx : idx + resized_h_i * low_res_block_len] = block
+            idx += resized_h_i * low_res_block_len
 
         flat[idx] = image_end_id
         idx += 1
@@ -1380,12 +1401,13 @@ def build_flat_image_bool_length(
         flat[idx] = image_start_id
         idx += 1
 
-        block_len = w_i + 1
+        block_len = w_i + high_res_extra
         if block_len > 0 and h_i > 0:
             line = torch.empty(block_len, dtype=torch.long, device=device)
             if w_i > 0:
                 line[:w_i] = image_patch_id
-            line[w_i] = image_col_id
+            if image_use_col_tokens:
+                line[w_i] = image_col_id
 
             block = line.repeat(h_i)
             flat[idx : idx + h_i * block_len] = block
@@ -2108,7 +2130,13 @@ def patched_call(text=None, images=None, videos=None, **kwargs) -> BatchFeature:
             (
                 processed_outputs["image_tokens"],
                 processed_outputs["num_image_tokens"],
-            ) = build_flat_image_bool_length(image_grids, hf_config)
+            ) = build_flat_image_bool_length(
+                image_grids,
+                hf_config,
+                image_use_col_tokens=hf_processor.image_use_col_tokens,
+                use_single_crop_col_tokens=hf_processor.use_single_crop_col_tokens,
+                use_single_crop_start_token=hf_processor.use_single_crop_start_token,
+            )
 
         return BatchFeature({**processed_outputs, **all_video_outputs})
 
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 994b52606b18..64667503d578 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -1518,37 +1518,51 @@ def is_vision_weights(name: str) -> bool:
         def is_sound_weights(name: str) -> bool:
             return name.startswith("sound")
 
-        # Separate weights by component
-        llm_weights = []
-        vision_weights = []
-        sound_weights = []
-
-        for name, w in weights:
-            if is_llm(name):
-                # Strip 'language_model.' prefix for LLM weights
-                llm_weights.append((".".join(name.split(".")[1:]), w))
-            elif is_adapter_weights((name, w)):
-                if not load_multimodal_weights:
-                    continue
+        # LLM weights (the bulk of the model) are streamed lazily through a
+        # generator so each tensor is copied into its parameter before the
+        # iterator advances, avoiding stale-reference corruption with
+        # reusable-buffer streamers. The smaller mm components (mlp1, vision,
+        # sound) are detach+cloned on append so they are independent of any
+        # reusable buffer the streamer may use, then loaded after the LLM.
+        adapter_weights: list[tuple[str, torch.Tensor]] = []
+        vision_weights: list[tuple[str, torch.Tensor]] = []
+        sound_weights: list[tuple[str, torch.Tensor]] = []
+
+        def llm_weights_gen():
+            for name, w in weights:
+                if is_llm(name):
+                    # Strip 'language_model.' prefix for LLM weights
+                    yield ".".join(name.split(".")[1:]), w
+                elif is_adapter_weights((name, w)):
+                    if not load_multimodal_weights:
+                        continue
+                    trimmed_name = ".".join(name.split(".")[1:])
+                    adapter_weights.append((trimmed_name, w.detach().clone()))
+                elif is_vision_weights(name):
+                    if not load_multimodal_weights:
+                        continue
+                    # Convert: vision_model.radio_model.* → radio_model.*
+                    hf_key = name[len("vision_model.") :]
+                    vision_weights.append((hf_key, w.detach().clone()))
+                elif is_sound_weights(name):
+                    if not load_multimodal_weights:
+                        continue
+                    assert self.sound_encoder is not None
+                    sound_weights.append((name, w.detach().clone()))
+
+        # Fully drain the generator so every mm tensor is buffered, even if
+        # the LLM loader stops iterating early.
+        llm_weights_iter = llm_weights_gen()
+        self.language_model.load_weights(llm_weights_iter)
+        for _ in llm_weights_iter:
+            pass
+
+        if load_multimodal_weights:
+            for trimmed_name, w in adapter_weights:
                 # Load vision-language adapter weights directly
-                trimmed_name = ".".join(name.split(".")[1:])
                 param = adapter_dict[trimmed_name]
                 with torch.no_grad():
                     default_weight_loader(param, w)
-            elif is_vision_weights(name):
-                if not load_multimodal_weights:
-                    continue
-                # Convert: vision_model.radio_model.* → radio_model.*
-                hf_key = name[len("vision_model.") :]  # Remove "vision_model." prefix
-                vision_weights.append((hf_key, w))
-            elif is_sound_weights(name):
-                if not load_multimodal_weights:
-                    continue
-                assert self.sound_encoder is not None
-                sound_weights.append((name, w))
-
-        self.language_model.load_weights(llm_weights)
-        if load_multimodal_weights:
             self.vision_model.load_weights(vision_weights)
             if self.sound_encoder is not None and len(sound_weights) > 0:
                 self.sound_encoder.load_weights(sound_weights)
diff --git a/vllm/model_executor/models/olmo_hybrid.py b/vllm/model_executor/models/olmo_hybrid.py
index d070132fc4e4..8d94e400c4ed 100644
--- a/vllm/model_executor/models/olmo_hybrid.py
+++ b/vllm/model_executor/models/olmo_hybrid.py
@@ -92,6 +92,7 @@
 from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 
 from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
 from .utils import (
@@ -136,8 +137,8 @@ class OlmoHybridGatedDeltaNet(nn.Module, MambaBase):
     """
 
     @property
-    def mamba_type(self) -> str:
-        return "gdn_attention"
+    def mamba_type(self) -> MambaAttentionBackendEnum:
+        return MambaAttentionBackendEnum.GDN_ATTN
 
     def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]:
         return MambaStateDtypeCalculator.gated_delta_net_state_dtype(
diff --git a/vllm/model_executor/models/openpangu_vl.py b/vllm/model_executor/models/openpangu_vl.py
index e9288e6ddb14..a0c13f1e0f1e 100644
--- a/vllm/model_executor/models/openpangu_vl.py
+++ b/vllm/model_executor/models/openpangu_vl.py
@@ -857,7 +857,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         with self._mark_language_model(vllm_config):
             self.language_model = init_vllm_registered_model(
                 vllm_config=vllm_config,
-                prefix=maybe_prefix("openpangu", "language_model"),
+                prefix=maybe_prefix(prefix, "openpangu.language_model"),
                 architectures=["PanguEmbeddedForCausalLM"],
             )
 
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index 3d9cf1c3415f..f25585fd7643 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -96,7 +96,7 @@ def __init__(
         self.backbone = self._init_backbone(
             config=config,
             quant_config=quant_config,
-            prefix=f"{prefix}.backbone",
+            prefix=maybe_prefix(prefix, "backbone"),
         )
         # reserved tokens for IMAGE_INDICATORS
         head_dim = config.vocab_size - len(IMAGE_INDICATOR_IDS)
@@ -442,7 +442,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.visual_tokenizer = VisualTokenizer(
                 config=config.visual_tokenizer_config,
                 quant_config=quant_config,
-                prefix=f"{prefix}.visual_tokenizer",
+                prefix=maybe_prefix(prefix, "visual_tokenizer"),
             )
             self.vte = VisualEmbedding(
                 self.config.visual_tokenizer_config.vocab_size, self.config.hidden_size
diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
index 4acad73c502e..6dbed78a6fc6 100644
--- a/vllm/model_executor/models/ovis2_5.py
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -465,7 +465,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 config=config.vit_config,
                 visual_vocab_size=config.visual_vocab_size,
                 quant_config=quant_config,
-                prefix=f"{prefix}.visual_tokenizer",
+                prefix=maybe_prefix(prefix, "visual_tokenizer"),
             )
             self.vte = VisualEmbedding(config.visual_vocab_size, config.hidden_size)
 
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index 8eaf94620c11..cd88009c739a 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -895,7 +895,7 @@ def __init__(
         self.vision_model = SiglipVisionTransformer(
             config,
             quant_config=quant_config,
-            prefix=f"{prefix}.vision_model",
+            prefix=maybe_prefix(prefix, "vision_model"),
         )
         self.quant_config = quant_config
 
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 2db95b857563..6163b809670c 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1034,7 +1034,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.vision_encoder = Phi4MMImageEncoder(
                 config,
                 quant_config,
-                prefix="model.vision_embed_tokens",
+                prefix=maybe_prefix(prefix, "model.vision_embed_tokens"),
                 model_dir=config._name_or_path,
             )
 
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index c6d369fffa1d..5fd925cf0bee 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -72,6 +72,7 @@
 from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 
 # Only used for type hinting.
 if TYPE_CHECKING:
@@ -478,8 +479,8 @@ def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]:
         )
 
     @property
-    def mamba_type(self) -> str:
-        return "mamba2"
+    def mamba_type(self) -> MambaAttentionBackendEnum:
+        return MambaAttentionBackendEnum.MAMBA2
 
 
 def plamo2_mamba_mixer(
diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py
index 2449724cd04f..dc761e69b487 100644
--- a/vllm/model_executor/models/qwen3_5.py
+++ b/vllm/model_executor/models/qwen3_5.py
@@ -138,7 +138,6 @@ def __init__(
                 vllm_config=vllm_config,
                 prefix=f"{prefix}.linear_attn",
                 gqa_interleaved_layout=False,
-                create_in_proj_qkvz=vllm_config.lora_config is None,
             )
         elif self.layer_type == "full_attention":
             self.self_attn = Qwen3NextAttention(
@@ -217,7 +216,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.num_redundant_experts = eplb_config.num_redundant_experts
 
         self.config = config
-        self.enable_lora = vllm_config.lora_config is not None
 
         self.vocab_size = config.vocab_size
 
@@ -276,6 +274,9 @@ def load_fused_expert_weights(
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
+            # GDN
+            ("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
+            ("in_proj_qkvz", "in_proj_z", 3),
             # self attention
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
@@ -287,21 +288,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             ("in_proj_ba", "in_proj_a", 1),
         ]
 
-        if self.enable_lora:
-            stacked_params_mapping.extend(
-                [
-                    ("in_proj_qkv", "in_proj_qkv", (0, 1, 2)),
-                    ("in_proj_z", "in_proj_z", 0),
-                ]
-            )
-        else:
-            stacked_params_mapping.extend(
-                [
-                    ("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
-                    ("in_proj_qkvz", "in_proj_z", 3),
-                ]
-            )
-
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         expert_params_mapping = self.get_expert_mapping()
@@ -352,10 +338,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                     continue
                 param = params_dict[name]
                 weight_loader = param.weight_loader
-                if param_name == "in_proj_z" and self.enable_lora:
-                    weight_loader(param, loaded_weight)
-                else:
-                    weight_loader(param, loaded_weight, shard_id)
+                weight_loader(param, loaded_weight, shard_id)
                 break
             else:
                 is_expert_weight = False
@@ -485,15 +468,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
 
-        # When LoRA is enabled, GDN uses separate in_proj_qkv and in_proj_z
-        # instead of merged in_proj_qkvz; pack mapping must match.
-        if vllm_config.lora_config:
-            base = getattr(Qwen3_5ForCausalLMBase, "packed_modules_mapping", {})
-            self.packed_modules_mapping = {k: list(v) for k, v in base.items()}
-            self.packed_modules_mapping.pop("in_proj_qkvz", None)
-            self.packed_modules_mapping["in_proj_qkv"] = ["in_proj_qkv"]
-            self.packed_modules_mapping["in_proj_z"] = ["in_proj_z"]
-
         if get_pp_group().is_last_rank:
             if config.tie_word_embeddings:
                 self.lm_head = self.model.embed_tokens
@@ -586,7 +560,6 @@ class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid)
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
         # protocols have not __init__ method, so we need to use nn.Module.__init__
         nn.Module.__init__(self)
-        self.update_packed_mapping(enable_lora=vllm_config.lora_config is not None)
         config: Qwen3_5Config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
@@ -614,17 +587,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
             self.language_model.make_empty_intermediate_tensors
         )
 
-    def update_packed_mapping(self, enable_lora: bool):
-        # When LoRA is enabled, GDN uses separate in_proj_qkv and in_proj_z
-        if enable_lora:
-            base = getattr(
-                Qwen3_5ForConditionalGeneration, "packed_modules_mapping", {}
-            )
-            self.packed_modules_mapping = {k: list(v) for k, v in base.items()}
-            self.packed_modules_mapping.pop("in_proj_qkvz", None)
-            self.packed_modules_mapping["in_proj_qkv"] = ["in_proj_qkv"]
-            self.packed_modules_mapping["in_proj_z"] = ["in_proj_z"]
-
     def embed_input_ids(
         self,
         input_ids: torch.Tensor,
@@ -811,7 +773,6 @@ class Qwen3_5MoeForConditionalGeneration(
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
         # protocols have not __init__ method, so we need to use nn.Module.__init__
         nn.Module.__init__(self)
-        self.update_packed_mapping(enable_lora=vllm_config.lora_config is not None)
         config: Qwen3_5MoeConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
diff --git a/vllm/model_executor/models/qwen3_dflash.py b/vllm/model_executor/models/qwen3_dflash.py
index cffe6267a4b3..dcc23bc40954 100644
--- a/vllm/model_executor/models/qwen3_dflash.py
+++ b/vllm/model_executor/models/qwen3_dflash.py
@@ -510,7 +510,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.config.target_layer_count = target_layer_num
         self.model = DFlashQwen3Model(
             vllm_config=vllm_config,
-            prefix="model",
+            prefix=maybe_prefix(prefix, "model"),
             start_layer_id=target_layer_num,
         )
 
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index d38cd63b90b1..5b77da7aab11 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -89,7 +89,7 @@
     "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
     "CohereForCausalLM": ("commandr", "CohereForCausalLM"),
     "Cohere2ForCausalLM": ("commandr", "CohereForCausalLM"),
-    "CohereMoeForCausalLM": ("cohere_moe", "CohereMoeForCausalLM"),
+    "Cohere2MoeForCausalLM": ("cohere2_moe", "Cohere2MoeForCausalLM"),
     "CwmForCausalLM": ("llama", "LlamaForCausalLM"),
     "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
     "DeciLMForCausalLM": ("nemotron_nas", "DeciLMForCausalLM"),
@@ -582,6 +582,7 @@
     "MiMoMTPModel": ("mimo_mtp", "MiMoMTP"),
     "MiMoV2MTPModel": ("mimo_v2_mtp", "MiMoV2MTP"),
     "MiMoV2OmniMTPModel": ("mimo_v2_mtp", "MiMoV2OmniMTP"),
+    "EagleCohereForCausalLM": ("cohere_eagle", "EagleCohereForCausalLM"),
     "EagleLlamaForCausalLM": ("llama_eagle", "EagleLlamaForCausalLM"),
     "EagleLlama4ForCausalLM": ("llama4_eagle", "EagleLlama4ForCausalLM"),
     "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"),
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index ce3a260d0ef6..28d725e7a36c 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -867,7 +867,7 @@ def __init__(
             quant_config=quant_config,
             num_hidden_layers_override=num_hidden_layers_override,
             require_post_norm=require_post_norm,
-            prefix=f"{prefix}.vision_model",
+            prefix=maybe_prefix(prefix, "vision_model"),
             use_head=use_head,
         )
 
diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py
index 6c7c33b75481..906a51bd7b1e 100644
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@@ -29,6 +29,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.platforms import current_platform
 
+from .utils import maybe_prefix
 from .vision import is_vit_use_data_parallel
 
 
@@ -598,7 +599,7 @@ def __init__(
         self.vision_model = Siglip2VisionTransformer(
             config,
             quant_config=quant_config,
-            prefix=f"{prefix}.vision_model",
+            prefix=maybe_prefix(prefix, "vision_model"),
         )
 
     def forward(
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 82a5f0b95e7a..984b706e7041 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -426,7 +426,6 @@ class RocmPlatform(Platform):
         "fp8_per_block",
         "online",
         "gpt_oss_mxfp4",
-        "deepseek_v4_fp8",
     ]
 
     @classmethod
diff --git a/vllm/reasoning/cohere_command_reasoning_parser.py b/vllm/reasoning/cohere_command_reasoning_parser.py
index c96b21d4e8fb..b28a59089e73 100644
--- a/vllm/reasoning/cohere_command_reasoning_parser.py
+++ b/vllm/reasoning/cohere_command_reasoning_parser.py
@@ -39,16 +39,20 @@
 
 
 class CohereTagRegistry(NamedTuple):
-    """A single ``structural_tag`` begin("trigger")/end pair."""
+    """A single ``structural_tag`` trigger / end pair (``begin`` uses ``trigger``)."""
 
     trigger: str
     end: str
 
 
 class CohereTagStyle(NamedTuple):
-    """The structural tags style for a given model architecture."""
+    """The structural tags style for a given model architecture.
 
-    json: CohereTagRegistry
+    ``json_tags`` lists every JSON-schema wrapper the model may emit (MOE uses
+    both response and text delimiters). ``tools`` is the tool-call wrapper.
+    """
+
+    json_tags: tuple[CohereTagRegistry, ...]
     tools: CohereTagRegistry
 
 
@@ -64,18 +68,30 @@ class CohereNormalizedTool(TypedDict):
 
 
 COMMAND_A_TOOLS_TAG = CohereTagRegistry(
-    trigger="<|START_ACTION|>", end="<|END_ACTION|>"
+    trigger="<|START_ACTION|>",
+    end="<|END_ACTION|>",
 )
 COMMAND_A_JSON_TAG = CohereTagRegistry(
-    trigger="<|START_RESPONSE|>", end="<|END_RESPONSE|>"
+    trigger="<|START_RESPONSE|>",
+    end="<|END_RESPONSE|>",
+)
+COMMAND_A_PLUS_JSON_TAG = CohereTagRegistry(
+    trigger="<|START_TEXT|>",
+    end="<|END_TEXT|>",
 )
 
 MODEL_TO_TAG_STYLE: dict[str, CohereTagStyle] = {
     "Cohere2ForCausalLM": CohereTagStyle(
-        json=COMMAND_A_JSON_TAG, tools=COMMAND_A_TOOLS_TAG
+        json_tags=(COMMAND_A_JSON_TAG,),
+        tools=COMMAND_A_TOOLS_TAG,
     ),
     "Cohere2VisionForConditionalGeneration": CohereTagStyle(
-        json=COMMAND_A_JSON_TAG, tools=COMMAND_A_TOOLS_TAG
+        json_tags=(COMMAND_A_JSON_TAG,),
+        tools=COMMAND_A_TOOLS_TAG,
+    ),
+    "Cohere2MoeForCausalLM": CohereTagStyle(
+        json_tags=(COMMAND_A_JSON_TAG, COMMAND_A_PLUS_JSON_TAG),  # response + text
+        tools=COMMAND_A_TOOLS_TAG,
     ),
 }
 
@@ -211,15 +227,18 @@ def convert_schema_to_structural_tags(
     style = MODEL_TO_TAG_STYLE[model_architecture]
 
     tags: list[dict] = []
+    triggers: list[str] = []
 
     def _add_tag(tag: CohereTagRegistry, content: dict) -> None:
         tags.append({"begin": tag.trigger, "content": content, "end": tag.end})
+        triggers.append(tag.trigger)
 
     if schema is not None:
-        # Add the JSON-schema tag both for schema-only requests and for the
-        # "tools plus JSON mode" case (North use case: follow the schema when
-        # the model decides not to call any tool).
-        _add_tag(style.json, {"type": "json_schema", "json_schema": schema})
+        # One structural tag per JSON wrapper (e.g. MOE: response + text).
+        # Same for schema-only and "tools plus JSON mode" (North: schema when
+        # the model does not call tools).
+        for jt in style.json_tags:
+            _add_tag(jt, {"type": "json_schema", "json_schema": schema})
 
     if _has_effective_tools(tools):
         # ``tools`` may be a JSON string (poseidon / RESPONSE_FORMAT_TOOL_DEFINITIONS)
@@ -240,7 +259,7 @@ def _add_tag(tag: CohereTagRegistry, content: dict) -> None:
             "type": "structural_tag",
             "format": {
                 "type": "triggered_tags",
-                "triggers": [t["begin"] for t in tags],
+                "triggers": triggers,
                 "tags": tags,
             },
         }
@@ -505,7 +524,7 @@ def adjust_request(
             model_architecture=model_architecture,
         )
         if result is None:
-            # Unsupported architectures are not in ``MODEL_TO_TAG_STYLE``; conversion
+            # Unsupported architectures are not in ``MODEL_TO_TAG_STYLE``.
             raise ValueError(
                 "Failed to build structural_tag guided decoding constraints from "
                 "this request's JSON schema and/or tools. The configured model "
diff --git a/vllm/reasoning/nemotron_v3_reasoning_parser.py b/vllm/reasoning/nemotron_v3_reasoning_parser.py
index 52a57ccc8e93..7256f0f1283d 100644
--- a/vllm/reasoning/nemotron_v3_reasoning_parser.py
+++ b/vllm/reasoning/nemotron_v3_reasoning_parser.py
@@ -26,7 +26,7 @@ def extract_reasoning(
                 chat_template_kwargs.get("enable_thinking") is False
                 or chat_template_kwargs.get("force_nonempty_content") is True
             )
-            and final_content is None
+            and (final_content is None or not final_content.strip())
         ):
             reasoning, final_content = final_content, reasoning
 
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 88b1b0b8e8e9..76834e9bd779 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -690,6 +690,14 @@ def _validate_logprobs(self, model_config: ModelConfig) -> None:
                     parameter="logprob_token_ids",
                     value=n,
                 )
+            if self.logprobs is not None and self.logprobs != n:
+                raise VLLMValidationError(
+                    f"When both logprobs and logprob_token_ids are set, "
+                    f"logprobs must equal len(logprob_token_ids). Got "
+                    f"logprobs={self.logprobs}, len(logprob_token_ids)={n}.",
+                    parameter="logprob_token_ids",
+                    value=n,
+                )
 
         # Validate prompt logprobs.
         if num_prompt_logprobs := self.prompt_logprobs:
diff --git a/vllm/tool_parsers/glm4_moe_tool_parser.py b/vllm/tool_parsers/glm4_moe_tool_parser.py
index ac266ede3f48..64c4beb14357 100644
--- a/vllm/tool_parsers/glm4_moe_tool_parser.py
+++ b/vllm/tool_parsers/glm4_moe_tool_parser.py
@@ -210,9 +210,10 @@ def extract_tool_calls(
                 arg_dct: dict[str, Any] = {}
                 for key, value in pairs:
                     arg_key = key.strip()
-                    arg_val = value.strip()
-                    if not self._is_string_type(tc_name, arg_key, self.tools):
-                        arg_val = self._deserialize(arg_val)
+                    if self._is_string_type(tc_name, arg_key, self.tools):
+                        arg_val = value
+                    else:
+                        arg_val = self._deserialize(value.strip())
                     logger.debug("arg_key = %s, arg_val = %s", arg_key, arg_val)
                     arg_dct[arg_key] = arg_val
                 tool_calls.append(
diff --git a/vllm/tool_parsers/qwen3coder_tool_parser.py b/vllm/tool_parsers/qwen3coder_tool_parser.py
index 73850b2ab0c5..46aa8a4acc38 100644
--- a/vllm/tool_parsers/qwen3coder_tool_parser.py
+++ b/vllm/tool_parsers/qwen3coder_tool_parser.py
@@ -19,6 +19,7 @@
     FunctionCall,
     ToolCall,
 )
+from vllm.envs import VLLM_ENFORCE_STRICT_TOOL_CALLING
 from vllm.logger import init_logger
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.abstract_tool_parser import (
@@ -35,7 +36,7 @@
 
 
 class Qwen3CoderToolParser(ToolParser):
-    supports_required_and_named: bool = False
+    supports_required_and_named: bool = not VLLM_ENFORCE_STRICT_TOOL_CALLING
 
     def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
         super().__init__(tokenizer, tools)
diff --git a/vllm/utils/cpu_triton_utils.py b/vllm/utils/cpu_triton_utils.py
index d823e8b14aba..ea0383a9d4b9 100644
--- a/vllm/utils/cpu_triton_utils.py
+++ b/vllm/utils/cpu_triton_utils.py
@@ -204,8 +204,16 @@ def _rejection_greedy_sample_kernel_impl(
     bonus_token_ids,
     is_greedy,
     max_spec_len,
+    uniform_probs=None,
+    synthetic_conditional_rates=None,
+    SYNTHETIC_MODE=False,
 ):
     # C++ kernel expects int64 for all integer tensors.
+    # Note: uniform_probs, synthetic_conditional_rates, and SYNTHETIC_MODE are
+    # passed by the rejection sampler for synthetic mode support, but are not
+    # yet implemented in the C++ CPU kernel. We accept them here to maintain
+    # compatibility with the kernel calling convention.
+    assert not SYNTHETIC_MODE, "Synthetic acceptance not supported with CPU sampling"
     orig_dtype = output_token_ids.dtype
     output_token_ids_i64 = _ensure_int64(output_token_ids)
     torch.ops._C.rejection_greedy_sample_kernel_impl(
@@ -233,11 +241,18 @@ def _rejection_random_sample_kernel_impl(
     is_greedy,
     max_spec_len,
     vocab_size,
+    synthetic_conditional_rates=None,
     NO_DRAFT_PROBS=False,
+    SYNTHETIC_MODE=False,
 ):
     # C++ kernel expects int64 for all integer tensors and float32 for probs.
     # uniform_probs is intentionally float64 in Python to avoid exact-zero
     # samples; cast to float32 here for C++ compatibility.
+    # Note: synthetic_conditional_rates and SYNTHETIC_MODE are passed by the
+    # rejection sampler for synthetic mode support, but are not yet implemented
+    # in the C++ CPU kernel. We accept them here to maintain compatibility with
+    # the kernel calling convention.
+    assert not SYNTHETIC_MODE, "Synthetic acceptance not supported with CPU sampling"
     orig_dtype = output_token_ids.dtype
     output_token_ids_i64 = _ensure_int64(output_token_ids)
     torch.ops._C.rejection_random_sample_kernel_impl(
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 828ff08a067d..44fcc19c2d2b 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -685,6 +685,47 @@ def flashinfer_scaled_fp4_mm(
     )
 
 
+def flashinfer_scaled_fp4_mm_out(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    block_scale_a: torch.Tensor,
+    block_scale_b: torch.Tensor,
+    alpha: torch.Tensor,
+    out: torch.Tensor,
+    out_dtype: torch.dtype | None,
+    use_8x4_sf_layout: bool,
+    backend: str,
+) -> torch.Tensor:
+    assert a.ndim == 2 and b.ndim == 2 and out.ndim == 2
+    assert block_scale_a.ndim == 2 and block_scale_b.ndim == 2
+    assert a.stride(-1) == 1
+    assert a.shape[1] == b.shape[0]
+    assert out.shape == (a.shape[0], b.shape[1])
+    assert out.device.type == "cuda"
+
+    if backend in ("cutlass", "cudnn"):
+        if block_scale_a.dtype != torch.uint8:
+            block_scale_a = block_scale_a.view(torch.uint8)
+        if block_scale_b.dtype != torch.uint8:
+            block_scale_b = block_scale_b.view(torch.uint8)
+
+    from flashinfer import mm_fp4 as flashinfer_mm_fp4_
+
+    flashinfer_mm_fp4_(
+        a,
+        b,
+        block_scale_a,
+        block_scale_b,
+        alpha,
+        out_dtype or out.dtype,
+        out=out,
+        block_size=16,
+        use_8x4_sf_layout=use_8x4_sf_layout,
+        backend=backend,
+    )
+    return out
+
+
 def flashinfer_scaled_fp8_mm(
     a: torch.Tensor,
     b: torch.Tensor,
@@ -864,6 +905,7 @@ def is_flashinfer_cudnn_fp8_prefill_attn_supported() -> bool:
     "can_use_trtllm_attention",
     "use_trtllm_attention",
     "flashinfer_scaled_fp4_mm",
+    "flashinfer_scaled_fp4_mm_out",
     "flashinfer_scaled_fp8_mm",
     "flashinfer_scaled_fp8_mm_out",
     "flashinfer_quant_nvfp4_8x4_sf_layout",
diff --git a/vllm/utils/import_utils.py b/vllm/utils/import_utils.py
index 6cf57c6894ab..5822e5840afc 100644
--- a/vllm/utils/import_utils.py
+++ b/vllm/utils/import_utils.py
@@ -469,3 +469,8 @@ def has_mori() -> bool:
 def has_fbgemm_gpu() -> bool:
     """Whether the optional `fbgemm_gpu` package is available."""
     return _has_module("fbgemm_gpu")
+
+
+def has_cutedsl() -> bool:
+    """Whether the optional CuTe DSL package (`cutlass` module) is available."""
+    return _has_module("cutlass")
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 7c1a784888eb..dc4b1cccac7f 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -223,9 +223,7 @@ def __init__(
         self._context = BatchPrefillWithPagedKVCacheWrapper(
             workspace_buffer, get_kv_cache_layout()
         )
-        self._new_tokens = BatchPrefillWithRaggedKVCacheWrapper(
-            workspace_buffer, get_kv_cache_layout()
-        )
+        self._new_tokens = BatchPrefillWithRaggedKVCacheWrapper(workspace_buffer)
 
     def plan(
         self,
diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py
index 474a5b2d421e..797179076969 100644
--- a/vllm/v1/attention/backends/mla/flashmla_sparse.py
+++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar
 
 import numpy as np
 import torch
@@ -112,7 +112,7 @@ def get_builder_cls() -> type["FlashMLASparseMetadataBuilder"]:
         return FlashMLASparseMetadataBuilder
 
     @staticmethod
-    def get_impl_cls() -> type["FlashMLASparseImpl"]:
+    def get_impl_cls() -> type[SparseMLAAttentionImpl[Any]]:
         return FlashMLASparseImpl
 
     @classmethod
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse_dsv4.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse_dsv4.py
new file mode 100644
index 000000000000..a647b7d2c654
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse_dsv4.py
@@ -0,0 +1,682 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, cast
+
+import torch
+
+from vllm.forward_context import get_forward_context
+from vllm.triton_utils import tl, triton
+from vllm.v1.attention.backend import (
+    AttentionLayer,
+    CommonAttentionMetadata,
+    SparseMLAAttentionImpl,
+)
+from vllm.v1.attention.backends.mla.flashmla_sparse import (
+    DeepseekV4FlashMLASparseBackend,
+    FlashMLASparseMetadata,
+    FlashMLASparseMetadataBuilder,
+)
+from vllm.v1.attention.backends.mla.sparse_swa import (
+    DeepseekSparseSWAMetadata,
+    DeepseekSparseSWAMetadataBuilder,
+)
+from vllm.v1.attention.ops.deepseek_v4_ops import dequantize_and_gather_k_cache
+from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
+    build_ragged_indices_from_dense,
+    rocm_sparse_attn_decode,
+    rocm_sparse_attn_prefill,
+)
+from vllm.v1.worker.workspace import current_workspace_manager
+
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.deepseek_v4_attention import (
+        DeepseekV4MLAAttention,
+    )
+
+
+def _build_indptr_from_lengths(lengths: torch.Tensor) -> torch.Tensor:
+    lengths = lengths.to(dtype=torch.int32).contiguous()
+    indptr = torch.zeros(lengths.shape[0] + 1, dtype=torch.int32, device=lengths.device)
+    torch.cumsum(lengths, dim=0, out=indptr[1:])
+    return indptr
+
+
+@triton.jit
+def _compute_topk_lens_kernel(
+    topk_lens_ptr,
+    topk_indices_ptr,
+    topk_indices_stride,
+    topk,
+    is_valid_token_ptr,
+    TRITON_BLOCK_SIZE: tl.constexpr,
+):
+    token_idx = tl.program_id(0)
+    is_valid_token = tl.load(is_valid_token_ptr + token_idx)
+
+    count = tl.zeros((), dtype=tl.int32)
+    for i in range(0, topk, TRITON_BLOCK_SIZE):
+        offset = i + tl.arange(0, TRITON_BLOCK_SIZE)
+        mask = offset < topk
+        local_idx = tl.load(
+            topk_indices_ptr + token_idx * topk_indices_stride + offset,
+            mask=mask,
+            other=-1,
+        )
+        count += tl.sum((local_idx >= 0).to(tl.int32), axis=0)
+
+    tl.store(topk_lens_ptr + token_idx, tl.where(is_valid_token, count, 0))
+
+
+@triton.jit
+def _pack_global_topk_ragged_kernel(
+    global_topk_ragged_ptr,
+    topk_indptr_ptr,
+    topk_indices_ptr,
+    topk_indices_stride,
+    token_to_req_indices_ptr,
+    block_table_ptr,
+    block_table_stride,
+    block_size,
+    topk,
+    BLOCK_SIZE: tl.constexpr,
+):
+    token_idx = tl.program_id(0)
+    block_idx = tl.program_id(1)
+    offset = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+
+    out_start = tl.load(topk_indptr_ptr + token_idx)
+    out_end = tl.load(topk_indptr_ptr + token_idx + 1)
+    out_len = out_end - out_start
+    if block_idx * BLOCK_SIZE >= out_len:
+        return
+
+    req_idx = tl.load(token_to_req_indices_ptr + token_idx)
+    mask = (offset < out_len) & (offset < topk)
+    local_idx = tl.load(
+        topk_indices_ptr + token_idx * topk_indices_stride + offset,
+        mask=mask,
+        other=-1,
+    )
+    valid = mask & (local_idx >= 0)
+    block_indices = local_idx // block_size
+    block_numbers = tl.load(
+        block_table_ptr + req_idx * block_table_stride + block_indices,
+        mask=valid,
+        other=0,
+    )
+    block_offsets = local_idx % block_size
+    slot_ids = tl.where(valid, block_numbers * block_size + block_offsets, -1)
+    tl.store(global_topk_ragged_ptr + out_start + offset, slot_ids, mask=mask)
+
+
+def compute_global_topk_ragged_indices_and_indptr(
+    topk_indices: torch.Tensor,
+    token_to_req_indices: torch.Tensor,
+    block_table: torch.Tensor,
+    block_size: int,
+    is_valid_token: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    topk_indices = topk_indices.reshape(topk_indices.shape[0], -1).contiguous()
+    num_tokens = topk_indices.shape[0]
+    topk = topk_indices.shape[1]
+
+    topk_lens = torch.empty(num_tokens, dtype=torch.int32, device=topk_indices.device)
+    _compute_topk_lens_kernel[(num_tokens,)](
+        topk_lens,
+        topk_indices,
+        topk_indices.stride(0),
+        topk,
+        is_valid_token,
+        TRITON_BLOCK_SIZE=1024,
+    )
+
+    topk_indptr = _build_indptr_from_lengths(topk_lens)
+    global_topk_ragged = torch.empty(
+        num_tokens * topk,
+        dtype=torch.int32,
+        device=topk_indices.device,
+    )
+    if global_topk_ragged.numel() > 0:
+        block = 128
+        _pack_global_topk_ragged_kernel[(num_tokens, triton.cdiv(topk, block))](
+            global_topk_ragged,
+            topk_indptr,
+            topk_indices,
+            topk_indices.stride(0),
+            token_to_req_indices,
+            block_table,
+            block_table.stride(0),
+            block_size,
+            topk,
+            BLOCK_SIZE=block,
+        )
+    return global_topk_ragged, topk_indptr, topk_lens
+
+
+@triton.jit
+def _compute_combined_lens_kernel(
+    combined_lens_ptr,
+    query_start_loc_ptr,
+    seq_lens_ptr,
+    TOP_K: tl.constexpr,
+    COMPRESS_RATIO: tl.constexpr,
+    WINDOW_SIZE: tl.constexpr,
+):
+    batch_idx = tl.program_id(0)
+    worker_id = tl.program_id(1)
+    num_workers = tl.num_programs(1)
+
+    base = tl.load(query_start_loc_ptr)
+    query_start = tl.load(query_start_loc_ptr + batch_idx) - base
+    query_end = tl.load(query_start_loc_ptr + batch_idx + 1) - base
+    query_len = query_end - query_start
+    seq_len = tl.load(seq_lens_ptr + batch_idx)
+    start_pos = seq_len - query_len
+
+    for token_idx in range(query_start + worker_id, query_end, num_workers):
+        token_idx_in_query = token_idx - query_start
+        pos = start_pos + token_idx_in_query
+        topk_len = tl.minimum((pos + 1) // COMPRESS_RATIO, TOP_K)
+        swa_len = tl.minimum(pos + 1, WINDOW_SIZE)
+        tl.store(combined_lens_ptr + token_idx, topk_len + swa_len)
+
+
+@triton.jit
+def _combine_topk_swa_indices_ragged_kernel(
+    combined_ragged_ptr,
+    combined_indptr_ptr,
+    topk_indices_ptr,
+    topk_indices_stride,
+    query_start_loc_ptr,
+    seq_lens_ptr,
+    gather_lens_ptr,
+    M,
+    N,
+    topk_width,
+    TOP_K: tl.constexpr,
+    COMPRESS_RATIO: tl.constexpr,
+    WINDOW_SIZE: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    batch_idx = tl.program_id(0)
+    worker_id = tl.program_id(1)
+    block_idx = tl.program_id(2)
+    num_workers = tl.num_programs(1)
+
+    base = tl.load(query_start_loc_ptr)
+    query_start = tl.load(query_start_loc_ptr + batch_idx) - base
+    query_end = tl.load(query_start_loc_ptr + batch_idx + 1) - base
+    query_len = query_end - query_start
+    seq_len = tl.load(seq_lens_ptr + batch_idx)
+    gather_len = tl.load(gather_lens_ptr + batch_idx)
+    start_pos = seq_len - query_len
+    gather_start = seq_len - gather_len
+
+    for token_idx in range(query_start + worker_id, query_end, num_workers):
+        token_idx_in_query = token_idx - query_start
+        pos = start_pos + token_idx_in_query
+        topk_len = tl.minimum((pos + 1) // COMPRESS_RATIO, TOP_K)
+        swa_len = tl.minimum(pos + 1, WINDOW_SIZE)
+        combined_len = topk_len + swa_len
+
+        offset = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+        if block_idx * BLOCK_SIZE < combined_len:
+            out_start = tl.load(combined_indptr_ptr + token_idx)
+            topk_mask = (offset < topk_len) & (offset < topk_width)
+            topk_vals = tl.load(
+                topk_indices_ptr + token_idx * topk_indices_stride + offset,
+                mask=topk_mask,
+                other=-1,
+            )
+            tl.store(
+                combined_ragged_ptr + out_start + offset,
+                topk_vals + M * batch_idx,
+                mask=topk_mask,
+            )
+
+            swa_offset = offset - topk_len
+            swa_mask = (offset >= topk_len) & (swa_offset < swa_len)
+            tl.store(
+                combined_ragged_ptr + out_start + offset,
+                M * batch_idx + N + swa_offset + pos - swa_len + 1 - gather_start,
+                mask=swa_mask,
+            )
+
+
+def combine_topk_swa_indices_ragged(
+    topk_indices: torch.Tensor,
+    query_start_loc: torch.Tensor,
+    seq_lens: torch.Tensor,
+    gather_lens: torch.Tensor,
+    window_size: int,
+    compress_ratio: int,
+    topk: int,
+    M: int,
+    N: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    topk_indices = topk_indices.reshape(topk_indices.shape[0], -1).contiguous()
+    num_tokens = topk_indices.shape[0]
+    num_reqs = seq_lens.shape[0]
+    combined_lens = torch.empty(
+        num_tokens, dtype=torch.int32, device=topk_indices.device
+    )
+
+    num_workers = 128
+    _compute_combined_lens_kernel[(num_reqs, num_workers)](
+        combined_lens,
+        query_start_loc,
+        seq_lens,
+        TOP_K=topk,
+        COMPRESS_RATIO=compress_ratio,
+        WINDOW_SIZE=window_size,
+    )
+
+    combined_indptr = _build_indptr_from_lengths(combined_lens)
+    combined_ragged = torch.empty(
+        num_tokens * (topk + window_size),
+        dtype=torch.int32,
+        device=topk_indices.device,
+    )
+    if combined_ragged.numel() > 0:
+        block = 128
+        _combine_topk_swa_indices_ragged_kernel[
+            (num_reqs, num_workers, triton.cdiv(topk + window_size, block))
+        ](
+            combined_ragged,
+            combined_indptr,
+            topk_indices,
+            topk_indices.stride(0),
+            query_start_loc,
+            seq_lens,
+            gather_lens,
+            M,
+            N,
+            topk_indices.shape[-1],
+            TOP_K=topk,
+            COMPRESS_RATIO=compress_ratio,
+            WINDOW_SIZE=window_size,
+            BLOCK_SIZE=block,
+        )
+    return combined_ragged, combined_indptr, combined_lens
+
+
+@dataclass
+class DeepseekV4ROCMAiterMLASparseMetadata(FlashMLASparseMetadata):
+    """ROCm-specific DeepSeek V4 metadata with ragged decode top-k indices/indptr."""
+
+    c128a_decode_topk_ragged_indices: torch.Tensor | None = None
+    c128a_decode_topk_ragged_indptr: torch.Tensor | None = None
+
+
+@dataclass
+class DeepseekV4ROCMAiterSparseSWAMetadata(DeepseekSparseSWAMetadata):
+    decode_swa_ragged_indices: torch.Tensor | None = None
+    decode_swa_ragged_indptr: torch.Tensor | None = None
+
+
+class DeepseekV4ROCMAiterMLASparseMetadataBuilder(FlashMLASparseMetadataBuilder):
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> DeepseekV4ROCMAiterMLASparseMetadata:
+        base = super().build(
+            common_prefix_len=common_prefix_len,
+            common_attn_metadata=common_attn_metadata,
+            fast_build=fast_build,
+        )
+
+        ragged_indices = None
+        ragged_indptr = None
+        dense_decode = base.c128a_global_decode_topk_indices
+        decode_lens = base.c128a_decode_topk_lens
+        if dense_decode is not None and decode_lens is not None:
+            ragged_indices, ragged_indptr = build_ragged_indices_from_dense(
+                dense_decode.reshape(dense_decode.shape[0], -1),
+                decode_lens,
+            )
+
+        return DeepseekV4ROCMAiterMLASparseMetadata(
+            **vars(base),
+            c128a_decode_topk_ragged_indices=ragged_indices,
+            c128a_decode_topk_ragged_indptr=ragged_indptr,
+        )
+
+
+class DeepseekV4ROCMAiterSparseSWAMetadataBuilder(DeepseekSparseSWAMetadataBuilder):
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+    ) -> DeepseekV4ROCMAiterSparseSWAMetadata:
+        base = super().build(
+            common_prefix_len=common_prefix_len,
+            common_attn_metadata=common_attn_metadata,
+            fast_build=fast_build,
+        )
+
+        ragged_indices = None
+        ragged_indptr = None
+        if (
+            base.num_decode_tokens > 0
+            and base.decode_swa_indices is not None
+            and base.decode_swa_lens is not None
+        ):
+            ragged_indices, ragged_indptr = build_ragged_indices_from_dense(
+                base.decode_swa_indices.reshape(base.num_decode_tokens, -1),
+                base.decode_swa_lens,
+            )
+
+        return DeepseekV4ROCMAiterSparseSWAMetadata(
+            **vars(base),
+            decode_swa_ragged_indices=ragged_indices,
+            decode_swa_ragged_indptr=ragged_indptr,
+        )
+
+
+class DeepseekV4ROCMAiterMLASparseImpl(
+    SparseMLAAttentionImpl[DeepseekV4ROCMAiterMLASparseMetadata]
+):
+    """ROCm sparse MLA implementation used by DeepSeek V4's custom MLA layer."""
+
+    _PREFILL_CHUNK_SIZE = 4
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
+        kv_cache_dtype: str,
+        logits_soft_cap: float | None,
+        attn_type: str,
+        kv_sharing_target_layer_name: str | None,
+        **_: object,
+    ) -> None:
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.num_kv_heads = num_kv_heads
+        self.kv_cache_dtype = kv_cache_dtype
+
+    def forward_mqa(
+        self,
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: DeepseekV4ROCMAiterMLASparseMetadata,
+        layer: AttentionLayer,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        raise NotImplementedError(
+            "DeepseekV4ROCMAiterMLASparseImpl is driven by "
+            "DeepseekV4MLAAttention.forward."
+        )
+
+    @classmethod
+    def forward(
+        cls,
+        layer: "DeepseekV4MLAAttention",
+        q: torch.Tensor,
+        kv: torch.Tensor,
+        positions: torch.Tensor,
+        output: torch.Tensor,
+    ) -> None:
+        assert output.shape == q.shape, (
+            f"output buffer shape {output.shape} must match q shape {q.shape}"
+        )
+        assert output.dtype == q.dtype, (
+            f"output buffer dtype {output.dtype} must match q dtype {q.dtype}"
+        )
+
+        forward_context = get_forward_context()
+        attn_metadata = forward_context.attn_metadata
+        assert isinstance(attn_metadata, dict)
+        rocm_metadata = cast(
+            DeepseekV4ROCMAiterMLASparseMetadata | None,
+            attn_metadata.get(layer.prefix),
+        )
+        swa_metadata = cast(
+            DeepseekV4ROCMAiterSparseSWAMetadata | None,
+            attn_metadata.get(layer.swa_cache_layer.prefix),
+        )
+        assert swa_metadata is not None
+
+        swa_only = layer.compress_ratio <= 1
+        self_kv_cache = layer.kv_cache if not swa_only else None
+        swa_kv_cache = layer.swa_cache_layer.kv_cache
+
+        num_decodes = swa_metadata.num_decodes
+        num_prefills = swa_metadata.num_prefills
+        num_decode_tokens = swa_metadata.num_decode_tokens
+
+        if num_prefills > 0:
+            cls._forward_prefill(
+                layer=layer,
+                q=q[num_decode_tokens:],
+                positions=positions[num_decode_tokens:],
+                compressed_k_cache=self_kv_cache,
+                swa_k_cache=swa_kv_cache,
+                output=output[num_decode_tokens:],
+                attn_metadata=rocm_metadata,
+                swa_metadata=swa_metadata,
+            )
+        if num_decodes > 0:
+            cls._forward_decode(
+                layer=layer,
+                q=q[:num_decode_tokens],
+                kv_cache=self_kv_cache,
+                swa_metadata=swa_metadata,
+                attn_metadata=rocm_metadata,
+                swa_only=swa_only,
+                output=output[:num_decode_tokens],
+            )
+
+    @classmethod
+    def _forward_decode(
+        cls,
+        layer: "DeepseekV4MLAAttention",
+        q: torch.Tensor,
+        kv_cache: torch.Tensor | None,
+        swa_metadata: DeepseekV4ROCMAiterSparseSWAMetadata,
+        attn_metadata: DeepseekV4ROCMAiterMLASparseMetadata | None,
+        swa_only: bool,
+        output: torch.Tensor,
+    ) -> None:
+        num_decodes = swa_metadata.num_decodes
+        num_decode_tokens = swa_metadata.num_decode_tokens
+
+        topk_indices = None
+        topk_lens = None
+        topk_ragged_indices = None
+        topk_ragged_indptr = None
+        if not swa_only:
+            assert attn_metadata is not None
+            assert swa_metadata.is_valid_token is not None
+            block_size = attn_metadata.block_size // layer.compress_ratio
+            is_valid = swa_metadata.is_valid_token[:num_decode_tokens]
+            if layer.compress_ratio == 4:
+                assert layer.topk_indices_buffer is not None
+                (
+                    topk_ragged_indices,
+                    topk_ragged_indptr,
+                    topk_lens,
+                ) = compute_global_topk_ragged_indices_and_indptr(
+                    layer.topk_indices_buffer[:num_decode_tokens],
+                    swa_metadata.token_to_req_indices,
+                    attn_metadata.block_table[:num_decodes],
+                    block_size,
+                    is_valid,
+                )
+            else:
+                topk_indices = attn_metadata.c128a_global_decode_topk_indices
+                topk_lens = attn_metadata.c128a_decode_topk_lens
+                topk_ragged_indices = attn_metadata.c128a_decode_topk_ragged_indices
+                topk_ragged_indptr = attn_metadata.c128a_decode_topk_ragged_indptr
+
+        rocm_sparse_attn_decode(
+            q=q,
+            kv_cache=kv_cache,
+            swa_k_cache=layer.swa_cache_layer.kv_cache,
+            swa_only=swa_only,
+            topk_indices=topk_indices,
+            topk_lens=topk_lens,
+            swa_indices=swa_metadata.decode_swa_indices,
+            swa_lens=swa_metadata.decode_swa_lens,
+            swa_ragged_indices=swa_metadata.decode_swa_ragged_indices,
+            swa_ragged_indptr=swa_metadata.decode_swa_ragged_indptr,
+            topk_ragged_indices=topk_ragged_indices,
+            topk_ragged_indptr=topk_ragged_indptr,
+            attn_sink=layer.attn_sink,
+            scale=layer.scale,
+            head_dim=layer.head_dim,
+            nope_head_dim=layer.nope_head_dim,
+            rope_head_dim=layer.rope_head_dim,
+            output=output,
+        )
+
+    @classmethod
+    def _forward_prefill(
+        cls,
+        layer: "DeepseekV4MLAAttention",
+        q: torch.Tensor,
+        positions: torch.Tensor,
+        compressed_k_cache: torch.Tensor | None,
+        swa_k_cache: torch.Tensor,
+        output: torch.Tensor,
+        attn_metadata: DeepseekV4ROCMAiterMLASparseMetadata | None,
+        swa_metadata: DeepseekV4ROCMAiterSparseSWAMetadata,
+    ) -> None:
+        swa_only = attn_metadata is None
+
+        num_prefills = swa_metadata.num_prefills
+        num_prefill_tokens = swa_metadata.num_prefill_tokens
+        num_decodes = swa_metadata.num_decodes
+        num_decode_tokens = swa_metadata.num_decode_tokens
+
+        seq_lens = swa_metadata.prefill_seq_lens
+        gather_lens = swa_metadata.prefill_gather_lens
+        assert seq_lens is not None
+        assert gather_lens is not None
+
+        query_start_loc_cpu = swa_metadata.query_start_loc_cpu
+        query_start_loc = swa_metadata.query_start_loc
+        assert query_start_loc_cpu is not None
+        assert query_start_loc is not None
+        prefill_token_base = query_start_loc_cpu[num_decodes]
+
+        if not swa_only:
+            if layer.compress_ratio == 4:
+                assert layer.topk_indices_buffer is not None
+                topk_indices = layer.topk_indices_buffer[num_decode_tokens:]
+                topk_indices = topk_indices[:num_prefill_tokens]
+            else:
+                assert attn_metadata is not None
+                topk_indices = attn_metadata.c128a_prefill_topk_indices
+            assert topk_indices is not None
+            top_k = topk_indices.shape[-1]
+            N = (layer.max_model_len + layer.compress_ratio - 1) // layer.compress_ratio
+        else:
+            assert layer.topk_indices_buffer is not None
+            topk_indices = layer.topk_indices_buffer[num_decode_tokens:]
+            top_k = 0
+            N = 0
+
+        M = N + layer.window_size + layer.max_num_batched_tokens
+        num_chunks = (num_prefills + cls._PREFILL_CHUNK_SIZE - 1) // (
+            cls._PREFILL_CHUNK_SIZE
+        )
+
+        workspace_manager = current_workspace_manager()
+        kv = workspace_manager.get_simultaneous(
+            ((cls._PREFILL_CHUNK_SIZE, M, q.shape[-1]), torch.bfloat16),
+        )[0]
+        for chunk_idx in range(num_chunks):
+            chunk_start = chunk_idx * cls._PREFILL_CHUNK_SIZE
+            chunk_end = min(chunk_start + cls._PREFILL_CHUNK_SIZE, num_prefills)
+            chunk_size = chunk_end - chunk_start
+            if not swa_only:
+                assert attn_metadata is not None
+                assert compressed_k_cache is not None
+                block_table = attn_metadata.block_table[num_decodes:]
+                dequantize_and_gather_k_cache(
+                    kv[:chunk_size],
+                    compressed_k_cache,
+                    seq_lens=seq_lens[chunk_start:chunk_end] // layer.compress_ratio,
+                    gather_lens=None,
+                    block_table=block_table[chunk_start:chunk_end],
+                    block_size=attn_metadata.block_size // layer.compress_ratio,
+                    offset=0,
+                )
+
+            swa_block_table = swa_metadata.block_table[num_decodes:]
+            dequantize_and_gather_k_cache(
+                kv[:chunk_size],
+                swa_k_cache,
+                seq_lens=seq_lens[chunk_start:chunk_end],
+                gather_lens=gather_lens[chunk_start:chunk_end],
+                block_table=swa_block_table[chunk_start:chunk_end],
+                block_size=swa_metadata.block_size,
+                offset=N,
+            )
+
+            query_start = (
+                query_start_loc_cpu[num_decodes + chunk_start] - prefill_token_base
+            )
+            query_end = (
+                query_start_loc_cpu[num_decodes + chunk_end] - prefill_token_base
+            )
+
+            combined_ragged_indices, combined_ragged_indptr, combined_lens = (
+                combine_topk_swa_indices_ragged(
+                    topk_indices[query_start:query_end],
+                    query_start_loc[
+                        num_decodes + chunk_start : num_decodes + chunk_end + 1
+                    ],
+                    seq_lens[chunk_start:chunk_end],
+                    gather_lens[chunk_start:chunk_end],
+                    layer.window_size,
+                    layer.compress_ratio,
+                    top_k,
+                    M,
+                    N,
+                )
+            )
+            rocm_sparse_attn_prefill(
+                q=q[query_start:query_end],
+                kv=kv.view(-1, 1, q.shape[-1]),
+                indices=torch.empty(
+                    q[query_start:query_end].shape[0],
+                    1,
+                    0,
+                    dtype=torch.int32,
+                    device=q.device,
+                ),
+                topk_length=combined_lens,
+                scale=layer.scale,
+                head_dim=layer.head_dim,
+                nope_head_dim=layer.nope_head_dim,
+                rope_head_dim=layer.rope_head_dim,
+                attn_sink=layer.attn_sink,
+                output=output[query_start:query_end],
+                ragged_indices=combined_ragged_indices,
+                ragged_indptr=combined_ragged_indptr,
+            )
+
+
+class DeepseekV4ROCMAiterMLASparseBackend(DeepseekV4FlashMLASparseBackend):
+    @staticmethod
+    def get_name() -> str:
+        return "ROCM_V4_FLASHMLA_SPARSE"
+
+    @staticmethod
+    def get_builder_cls() -> type["DeepseekV4ROCMAiterMLASparseMetadataBuilder"]:
+        return DeepseekV4ROCMAiterMLASparseMetadataBuilder
+
+    @staticmethod
+    def get_impl_cls() -> type["DeepseekV4ROCMAiterMLASparseImpl"]:
+        return DeepseekV4ROCMAiterMLASparseImpl
diff --git a/vllm/v1/attention/backends/mla/sparse_swa.py b/vllm/v1/attention/backends/mla/sparse_swa.py
index 28564e6a97d3..bfa3b7285dbd 100644
--- a/vllm/v1/attention/backends/mla/sparse_swa.py
+++ b/vllm/v1/attention/backends/mla/sparse_swa.py
@@ -112,6 +112,12 @@ def get_supported_head_sizes(cls) -> list[int]:
 
     @staticmethod
     def get_builder_cls() -> type["DeepseekSparseSWAMetadataBuilder"]:
+        if current_platform.is_rocm():
+            from vllm.v1.attention.backends.mla.rocm_aiter_mla_sparse_dsv4 import (
+                DeepseekV4ROCMAiterSparseSWAMetadataBuilder,
+            )
+
+            return DeepseekV4ROCMAiterSparseSWAMetadataBuilder
         return DeepseekSparseSWAMetadataBuilder
 
     @staticmethod
diff --git a/vllm/v1/attention/backends/registry.py b/vllm/v1/attention/backends/registry.py
index 552b7f3373c7..fe567a7f2343 100644
--- a/vllm/v1/attention/backends/registry.py
+++ b/vllm/v1/attention/backends/registry.py
@@ -193,16 +193,6 @@ def clear_override(self) -> None:
         _MAMBA_ATTN_OVERRIDES.pop(self, None)
 
 
-MAMBA_TYPE_TO_BACKEND_MAP = {
-    "mamba1": MambaAttentionBackendEnum.MAMBA1.name,
-    "mamba2": MambaAttentionBackendEnum.MAMBA2.name,
-    "short_conv": MambaAttentionBackendEnum.SHORT_CONV.name,
-    "linear_attention": MambaAttentionBackendEnum.LINEAR.name,
-    "gdn_attention": MambaAttentionBackendEnum.GDN_ATTN.name,
-    "custom": MambaAttentionBackendEnum.CUSTOM.name,
-}
-
-
 _ATTN_OVERRIDES: dict[AttentionBackendEnum, str] = {}
 _MAMBA_ATTN_OVERRIDES: dict[MambaAttentionBackendEnum, str] = {}
 
diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
index 01280cd4f48b..5dbedc86bc02 100644
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -324,22 +324,17 @@ def reshape_and_cache_shuffle_triton(
 @dataclass
 class AiterFlashAttentionDecodeMetadata:
     max_query_len: int
-    min_query_len: int
-    max_seq_len: int
-    query_start_loc: torch.Tensor
 
 
 @dataclass
 class AiterFlashAttentionPrefillMetadata:
     max_query_len: int
-    min_query_len: int
     max_seq_len: int
     query_start_loc: torch.Tensor
 
 
 @dataclass
 class AiterChunkSlidingWindowMetadata:
-    swa_seqlens: torch.Tensor
     swa_cu_seqlens: torch.Tensor
     swa_seq_starts: torch.Tensor
     swa_token_to_batch: torch.Tensor
@@ -354,9 +349,7 @@ class AiterChunkContextMetadata:
     cu_seq_lens_chunk: torch.Tensor
     chunk_starts: torch.Tensor
     token_to_batch: torch.Tensor
-    seq_tot: list[int]
     max_seq_lens: list[int]
-    seq_lens: torch.Tensor
     num_chunks: int
     total_token_per_batch: list[int]
     swa_metadata: AiterChunkSlidingWindowMetadata | None
@@ -365,7 +358,6 @@ class AiterChunkContextMetadata:
 @dataclass
 class AiterFlashAttentionChunkPrefillMetadata:
     max_query_len: int
-    min_query_len: int
     max_seq_len: int
     query_start_loc: torch.Tensor
     chunk_context_metadata: AiterChunkContextMetadata
@@ -382,8 +374,6 @@ class AiterFlashAttentionMetadata:
     #                                   |-- query_len ---|
 
     num_actual_tokens: int  # Number of tokens excluding padding.
-    num_actual_kv_tokens: int
-    max_query_len: int
     query_start_loc: torch.Tensor
     max_seq_len: int
     seq_lens: torch.Tensor
@@ -395,7 +385,6 @@ class AiterFlashAttentionMetadata:
     num_decodes: int
     num_decode_tokens: int
     num_prefills: int
-    num_prefill_tokens: int
     num_extends: int
     num_extend_tokens: int
 
@@ -405,8 +394,6 @@ class AiterFlashAttentionMetadata:
 
     # For cascade attention.
     use_cascade: bool
-    common_prefix_len: int
-    total_tokens: int
 
     # Only for fp8 shuffle layout kv cache, we allocate kv_scale for each layer
     # since we might integrate per token quant for kv cache in the future.
@@ -441,7 +428,6 @@ def __init__(
         # Sliding window size to be used with the AOT scheduler will be
         # populated on first build() call.
         self.aot_sliding_window: tuple[int, int] | None = None
-        self.total_tokens: int = 0
         self._init_reorder_batch_threshold(1, supports_spec_as_decode=True)
 
         sliding_window_configs: set[tuple[int, int] | None] = set()
@@ -473,13 +459,9 @@ def __init__(
     def build_for_cudagraph_capture(
         self, common_attn_metadata: CommonAttentionMetadata
     ):
-        self.total_tokens = (
-            self.model_config.max_model_len
-            * self.vllm_config.scheduler_config.max_num_partial_prefills
+        return self.build(
+            common_prefix_len=0, common_attn_metadata=common_attn_metadata
         )
-        res = self.build(common_prefix_len=0, common_attn_metadata=common_attn_metadata)
-        self.total_tokens = 0
-        return res
 
     def build(
         self,
@@ -515,12 +497,18 @@ def build(
             num_prefills,
             num_decode_tokens,
             num_extend_tokens,
-            num_prefill_tokens,
+            _,
         ) = split_ret
 
         query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
 
-        seq_lens = common_attn_metadata.seq_lens.cpu()
+        # Only copy seq_lens to CPU when prefill or extend is present to avoid a
+        # blocking device→host transfer.
+        seq_lens = (
+            common_attn_metadata.seq_lens.cpu()
+            if num_prefills > 0 or num_extends > 0
+            else None
+        )
 
         query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
 
@@ -528,26 +516,24 @@ def build(
         if num_decodes > 0:
             decode_metadata = AiterFlashAttentionDecodeMetadata(
                 max_query_len=query_lens_cpu[:num_decodes].max().item(),
-                min_query_len=query_lens_cpu[:num_decodes].min().item(),
-                max_seq_len=seq_lens[:num_decodes].max().item(),
-                query_start_loc=common_attn_metadata.query_start_loc[: num_decodes + 1],
             )
 
         prefill_metadata = None
         if num_prefills > 0:
+            assert seq_lens is not None
             query_lens_for_prefill = query_lens_cpu[num_decodes + num_extends :]
             query_start_loc_device = common_attn_metadata.query_start_loc[
                 num_decodes + num_extends :
             ]
             prefill_metadata = AiterFlashAttentionPrefillMetadata(
                 max_query_len=query_lens_for_prefill.max().item(),
-                min_query_len=query_lens_for_prefill.min().item(),
                 max_seq_len=seq_lens[num_decodes + num_extends :].max().item(),
                 query_start_loc=query_start_loc_device - query_start_loc_device[0],
             )
 
         extend_metadata = None
         if num_extends > 0:
+            assert seq_lens is not None
             num_extends_slice = slice(num_decodes, num_decodes + num_extends)
             query_lens_for_extend = query_lens_cpu[num_extends_slice]
             seq_lens_for_extend = seq_lens[num_extends_slice]
@@ -591,9 +577,6 @@ def build(
                 total_tokens = cu_seq_lens[-1].item()
 
                 swa_metadata = AiterChunkSlidingWindowMetadata(
-                    swa_seqlens=swa_seqlen_for_extend.to(
-                        self.device, non_blocking=True
-                    ),
                     swa_cu_seqlens=cu_seq_lens.to(self.device, non_blocking=True),
                     swa_seq_starts=seq_starts.to(self.device, non_blocking=True),
                     swa_token_to_batch=token_to_seq.to(self.device, non_blocking=True),
@@ -638,10 +621,8 @@ def build(
                 workspace=self.extend_workspace,
                 cu_seq_lens_chunk=cu_seq_lens_cpu.to(self.device, non_blocking=True),
                 chunk_starts=chunk_starts.to(self.device, non_blocking=True),
-                seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
-                max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
-                seq_lens=chunk_seq_lens,
                 token_to_batch=token_to_batch_tensor.to(self.device, non_blocking=True),
+                max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
                 num_chunks=num_chunks,
                 total_token_per_batch=cu_seq_lens_cpu[:, -1].tolist(),
                 swa_metadata=swa_metadata,
@@ -659,20 +640,15 @@ def build(
             )
             extend_metadata = AiterFlashAttentionChunkPrefillMetadata(
                 max_query_len=query_lens_for_extend.max().item(),
-                min_query_len=query_lens_for_extend.min().item(),
                 max_seq_len=seq_lens[num_extends_slice].max().item(),
                 query_start_loc=query_start_loc_device - query_start_loc_device[0],
                 chunk_context_metadata=chunk_context_metadata,
             )
 
-        num_actual_kv_tokens = torch.sum(seq_lens).item()
-
         use_cascade = common_prefix_len > 0
 
         attn_metadata = AiterFlashAttentionMetadata(
             num_actual_tokens=common_attn_metadata.num_actual_tokens,
-            num_actual_kv_tokens=num_actual_kv_tokens,
-            max_query_len=common_attn_metadata.max_query_len,
             query_start_loc=common_attn_metadata.query_start_loc,
             max_seq_len=common_attn_metadata.max_seq_len,
             seq_lens=common_attn_metadata.seq_lens,
@@ -682,15 +658,12 @@ def build(
             num_decodes=num_decodes,
             num_decode_tokens=num_decode_tokens,
             num_prefills=num_prefills,
-            num_prefill_tokens=num_prefill_tokens,
             num_extends=num_extends,
             num_extend_tokens=num_extend_tokens,
             decode_metadata=decode_metadata,
             prefill_metadata=prefill_metadata,
             extend_metadata=extend_metadata,
             use_cascade=use_cascade,
-            common_prefix_len=common_prefix_len,
-            total_tokens=self.total_tokens,
             k_scale=self.scale,
             v_scale=self.scale,
         )
@@ -713,15 +686,10 @@ def build_for_drafting(
 
         decode_metadata = AiterFlashAttentionDecodeMetadata(
             max_query_len=common_attn_metadata.max_query_len,
-            min_query_len=common_attn_metadata.max_query_len,  # uniform batch
-            max_seq_len=common_attn_metadata.max_seq_len,
-            query_start_loc=common_attn_metadata.query_start_loc,
         )
 
         return AiterFlashAttentionMetadata(
             num_actual_tokens=num_tokens,
-            num_actual_kv_tokens=0,  # not used in unified_attention path
-            max_query_len=common_attn_metadata.max_query_len,
             query_start_loc=common_attn_metadata.query_start_loc,
             max_seq_len=common_attn_metadata.max_seq_len,
             seq_lens=common_attn_metadata.seq_lens,
@@ -731,15 +699,12 @@ def build_for_drafting(
             num_decodes=num_reqs,
             num_decode_tokens=num_tokens,
             num_prefills=0,
-            num_prefill_tokens=0,
             num_extends=0,
             num_extend_tokens=0,
             decode_metadata=decode_metadata,
             prefill_metadata=None,
             extend_metadata=None,
             use_cascade=False,
-            common_prefix_len=0,
-            total_tokens=self.total_tokens,
             k_scale=self.scale,
             v_scale=self.scale,
         )
@@ -932,8 +897,8 @@ def extend_forward(
         output: torch.Tensor,
         cu_seqlens_q: torch.Tensor,
         max_seqlen_q: int,
-        max_seqlen_k: int,
         min_seqlen_q: int,
+        max_seqlen_k: int,
         block_table: torch.Tensor,
         slot_mapping: torch.Tensor,
         k_scale: torch.Tensor,
@@ -1160,8 +1125,8 @@ def forward(
                     output=extend_outputs,
                     cu_seqlens_q=attn_metadata.extend_metadata.query_start_loc,
                     max_seqlen_q=attn_metadata.extend_metadata.max_query_len,
-                    max_seqlen_k=attn_metadata.extend_metadata.max_seq_len,
                     min_seqlen_q=1,
+                    max_seqlen_k=attn_metadata.extend_metadata.max_seq_len,
                     block_table=attn_metadata.block_table[
                         num_decodes : num_decodes + num_extends
                     ],
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index a238ff4ad530..d533268e2176 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -29,6 +29,7 @@
 )
 from vllm.v1.attention.ops.chunked_prefill_paged_decode import (
     chunked_prefill_paged_decode,
+    has_native_kv_cache_layout,
 )
 from vllm.v1.attention.ops.paged_attn import PagedAttention
 from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
@@ -468,9 +469,10 @@ def do_kv_cache_update(
         # Get the actual block_size from value_cache
         # value_cache shape: [num_blocks, num_heads, head_size, block_size]
         block_size = value_cache.shape[3]
+        has_native_layout = has_native_kv_cache_layout(key_cache, value_cache)
 
-        if block_size in (16, 32):
-            # Normal 16, 32, use vLLM native HIP C++ logic
+        if block_size in (16, 32) and has_native_layout:
+            # Normal 16, 32 with contiguous blocks: use vLLM native HIP C++ logic.
             PagedAttention.write_to_paged_cache(
                 key,
                 value,
@@ -482,8 +484,10 @@ def do_kv_cache_update(
                 layer._v_scale,
             )
         else:
-            # Case B: Non-standard blocks (e.g., 64, 128, 544 in Qwen3Next or Qwen3.5 ),
-            # force using our modified Triton logic
+            # Non-standard blocks and hybrid attention/Mamba layouts need the
+            # stride-aware Triton writer. The native reshape_and_cache kernel
+            # assumes contiguous block storage and writes to the wrong hybrid
+            # cache blocks.
             triton_reshape_and_cache_flash(
                 key,
                 value,
diff --git a/vllm/v1/attention/ops/chunked_prefill_paged_decode.py b/vllm/v1/attention/ops/chunked_prefill_paged_decode.py
index ea1f075ef65a..77eb3ac60b1f 100644
--- a/vllm/v1/attention/ops/chunked_prefill_paged_decode.py
+++ b/vllm/v1/attention/ops/chunked_prefill_paged_decode.py
@@ -21,6 +21,22 @@
 float8_info = torch.finfo(current_platform.fp8_dtype())
 
 
+def has_native_kv_cache_layout(
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+) -> bool:
+    """Return True when both KV caches store their blocks densely packed.
+
+    Packed means stride(0) equals the element count of one block. The native
+    reshape_and_cache writer assumes that; a stride-padded hybrid layout needs
+    reshape_and_cache_flash, and decode should use the matching Triton path.
+    """
+    return (
+        key_cache.stride(0) == key_cache.shape[1:].numel()
+        and value_cache.stride(0) == value_cache.shape[1:].numel()
+    )
+
+
 @triton.jit
 def cdiv_fn(x, y):
     return (x + y - 1) // y
@@ -346,14 +362,12 @@ def chunked_prefill_paged_decode(
         alibi_slopes,
         sinks,
     )
-    # Triton is only forced when encountering a non-standard block
-    # like Qwen3 with a size of 544.
-    # 1. Check if block_size is a power of 2 (16, 32, 64...)
-    # 2. If it's a power of 2, we trust the vLLM's native use_custom decision.
-    # 3. If it's not a power of 2 (such as Qwen3's 544),
-    # then our Triton path is forced.
+    has_native_layout = has_native_kv_cache_layout(key_cache, value_cache)
+    # Force Triton for non-standard blocks like Qwen3's 544 and for
+    # stride-padded hybrid layouts. The latter use reshape_and_cache_flash
+    # during cache update, so keep decode on the matching stride-aware path.
     is_pow2 = block_size > 0 and (block_size & (block_size - 1) == 0)
-    if not is_pow2:
+    if not is_pow2 or not has_native_layout:
         use_custom = False
 
     if use_custom:
@@ -404,7 +418,12 @@ def chunked_prefill_paged_decode(
         real_block_size = value_cache.shape[3]
         # The standard model directly uses the original block_size.
         # Non-standard 544 uses 32 to accommodate integer division logic.
-        TRITON_BLOCK_SIZE = block_size if is_pow2 else 32
+        # Cap at 128 to avoid exceeding GPU shared memory limits
+        # (e.g. hybrid Mamba models inflate block_size to 2048).
+        # The kernel handles TRITON_BLOCK_SIZE != PHYSICAL_BLOCK_SIZE
+        # via the l_block_idx/internal_offsets addressing logic.
+        MAX_TRITON_BLOCK_SIZE = 128
+        TRITON_BLOCK_SIZE = min(block_size, MAX_TRITON_BLOCK_SIZE) if is_pow2 else 32
         if is_block_table_ptr:
             # Using the physical base address of tensors
             kv_element_size = key_cache.element_size()
diff --git a/vllm/v1/attention/ops/deepseek_v4_ops/cache_utils.py b/vllm/v1/attention/ops/deepseek_v4_ops/cache_utils.py
index 69d20c107e11..dfb107b515eb 100644
--- a/vllm/v1/attention/ops/deepseek_v4_ops/cache_utils.py
+++ b/vllm/v1/attention/ops/deepseek_v4_ops/cache_utils.py
@@ -17,6 +17,7 @@
 import torch
 
 from vllm.triton_utils import tl, triton
+from vllm.utils.import_utils import has_cutedsl
 
 
 @triton.jit
@@ -303,7 +304,7 @@ def _dequantize_and_gather_k_kernel(
             tl.store(output_row_ptr + bf16_output_offset + chunk_offsets, bf16_vals)
 
 
-def dequantize_and_gather_k_cache(
+def dequantize_and_gather_k_cache_triton(
     # [num_reqs, max_num_tokens, head_size]
     out: torch.Tensor,
     # [num_blocks, block_size, head_bytes]
@@ -349,6 +350,34 @@ def dequantize_and_gather_k_cache(
     )
 
 
+def dequantize_and_gather_k_cache(
+    # [num_reqs, max_num_tokens, head_size]
+    out: torch.Tensor,
+    # [num_blocks, block_size, head_bytes]
+    k_cache: torch.Tensor,
+    # [num_reqs]
+    seq_lens: torch.Tensor,
+    # [num_reqs]
+    gather_lens: torch.Tensor | None,
+    # [num_reqs, max_blocks_per_seq]
+    block_table: torch.Tensor,
+    block_size: int,
+    offset: int,
+) -> None:
+    """Dispatch to the CuTe DSL implementation when has_cutedsl() is true,
+    otherwise to the Triton implementation.
+    """
+    args = (out, k_cache, seq_lens, gather_lens, block_table, block_size, offset)
+    if not has_cutedsl():
+        dequantize_and_gather_k_cache_triton(*args)
+        return
+
+    # Import lazily; otherwise some tests fail due to CUDA driver init failure.
+    from .dequant_gather_k_cutedsl import dequantize_and_gather_k_cache_cutedsl
+
+    dequantize_and_gather_k_cache_cutedsl(*args)
+
+
 def compute_global_topk_indices_and_lens(
     topk_indices: torch.Tensor,
     token_to_req_indices: torch.Tensor,
diff --git a/vllm/v1/attention/ops/deepseek_v4_ops/cutedsl_utils.py b/vllm/v1/attention/ops/deepseek_v4_ops/cutedsl_utils.py
new file mode 100644
index 000000000000..dddd3f544f8e
--- /dev/null
+++ b/vllm/v1/attention/ops/deepseek_v4_ops/cutedsl_utils.py
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import cutlass
+import cutlass.cute as cute
+from cutlass import Float32, Uint32
+from cutlass._mlir import ir
+from cutlass._mlir.dialects import llvm, vector
+from cutlass.cutlass_dsl import T, dsl_user_op
+
+
+@dsl_user_op
+def _recast_val(x, dtype, *, loc=None, ip=None):
+    return dtype(llvm.bitcast(dtype.mlir_type, x.ir_value(loc=loc, ip=ip)))
+
+
+@dsl_user_op
+def _fp32x2_to_bf16x2(a: Float32, b: Float32, *, loc=None, ip=None) -> Uint32:
+    out = llvm.inline_asm(
+        T.i32(),
+        [a.ir_value(loc=loc, ip=ip), b.ir_value(loc=loc, ip=ip)],
+        "cvt.rn.bf16x2.f32 $0, $2, $1;",
+        "=r,f,f",
+        has_side_effects=False,
+        is_align_stack=False,
+    )
+    return Uint32(out)
+
+
+@dsl_user_op
+def _bf16x2_to_fp32(data: Uint32, *, loc=None, ip=None) -> tuple[Float32, Float32]:
+    out = llvm.inline_asm(
+        llvm.StructType.get_literal([T.f32(), T.f32()]),
+        [data.ir_value(loc=loc, ip=ip)],
+        "shl.b32 $0, $2, 16;\n\tand.b32 $1, $2, 0xFFFF0000;\n",
+        "=f,=f,r",
+        has_side_effects=False,
+        is_align_stack=False,
+    )
+    return (
+        Float32(llvm.extractvalue(T.f32(), out, [0], loc=loc, ip=ip)),
+        Float32(llvm.extractvalue(T.f32(), out, [1], loc=loc, ip=ip)),
+    )
+
+
+@dsl_user_op
+def _bf16x2_abs(a: Uint32, *, loc=None, ip=None) -> Uint32:
+    out = llvm.inline_asm(
+        T.i32(),
+        [a.ir_value(loc=loc, ip=ip)],
+        "abs.bf16x2 $0, $1;",
+        "=r,r",
+        has_side_effects=False,
+        is_align_stack=False,
+    )
+    return Uint32(out)
+
+
+@dsl_user_op
+def _bf16x2_max(a: Uint32, b: Uint32, *, loc=None, ip=None) -> Uint32:
+    out = llvm.inline_asm(
+        T.i32(),
+        [a.ir_value(loc=loc, ip=ip), b.ir_value(loc=loc, ip=ip)],
+        "max.bf16x2 $0, $1, $2;",
+        "=r,r,r",
+        has_side_effects=False,
+        is_align_stack=False,
+    )
+    return Uint32(out)
+
+
+@dsl_user_op
+def _bf16x2_mul(a: Uint32, b: Uint32, *, loc=None, ip=None) -> Uint32:
+    out = llvm.inline_asm(
+        T.i32(),
+        [a.ir_value(loc=loc, ip=ip), b.ir_value(loc=loc, ip=ip)],
+        "mul.rn.bf16x2 $0, $1, $2;",
+        "=r,r,r",
+        has_side_effects=False,
+        is_align_stack=False,
+    )
+    return Uint32(out)
+
+
+@dsl_user_op
+def _fp8x4_to_bf16x4(x: Uint32, *, loc=None, ip=None) -> cute.TensorSSA:
+    # The fp8 cvt used below only targets f16 (cvt.rn.f16x2.e4m3x2), so the
+    # bf16 result is reached in two steps: e4m3 -> f16 -> bf16.
+    out = llvm.inline_asm(
+        llvm.StructType.get_literal([T.i32()] * 2),
+        [x.ir_value(loc=loc, ip=ip)],
+        "{\n\t"
+        ".reg .b16 x0, x1;\n\t"
+        ".reg .b16 t00, t01, t10, t11;\n\t"
+        "mov.b32 {x0, x1}, $2;\n\t"
+        "cvt.rn.f16x2.e4m3x2 $0, x0;\n\t"
+        "cvt.rn.f16x2.e4m3x2 $1, x1;\n\t"
+        "mov.b32 {t00, t01}, $0;\n\t"
+        "mov.b32 {t10, t11}, $1;\n\t"
+        "cvt.rn.bf16.f16 t00, t00;\n\t"
+        "cvt.rn.bf16.f16 t01, t01;\n\t"
+        "cvt.rn.bf16.f16 t10, t10;\n\t"
+        "cvt.rn.bf16.f16 t11, t11;\n\t"
+        "mov.b32 $0, {t00, t01};\n\t"
+        "mov.b32 $1, {t10, t11};\n\t"
+        "}\n",
+        "=r,=r,r",
+        has_side_effects=False,
+        is_align_stack=False,
+    )
+    vec = vector.from_elements(
+        ir.VectorType.get([2], T.i32(), loc=loc),
+        [llvm.extractvalue(T.i32(), out, [i], loc=loc, ip=ip) for i in range(2)],
+        loc=loc,
+        ip=ip,
+    )
+    return cute.TensorSSA(vec, 2, Uint32)
+
+
+@dsl_user_op
+def _fp32x8_to_fp4x8(
+    vals: cute.Tensor,
+    offset: cutlass.Constexpr[int],
+    *,
+    loc=None,
+    ip=None,
+) -> Uint32:
+    # Pack eight scaled FP32 values into four E2M1x2 bytes, returned as one b32.
+    assert vals.element_type is Float32
+    out = llvm.inline_asm(
+        T.i32(),
+        [vals[offset + i].ir_value(loc=loc, ip=ip) for i in range(8)],
+        "{\n\t"
+        ".reg .b8 x0, x1, x2, x3;\n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 x0, $2, $1;\n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 x1, $4, $3;\n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 x2, $6, $5;\n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 x3, $8, $7;\n\t"
+        "mov.b32 $0, {x0, x1, x2, x3};\n\t"
+        "}\n",
+        "=r,f,f,f,f,f,f,f,f",
+        has_side_effects=False,
+        is_align_stack=False,
+    )
+    return Uint32(out)
diff --git a/vllm/v1/attention/ops/deepseek_v4_ops/dequant_gather_k_cutedsl.py b/vllm/v1/attention/ops/deepseek_v4_ops/dequant_gather_k_cutedsl.py
new file mode 100644
index 000000000000..bb0818ada39f
--- /dev/null
+++ b/vllm/v1/attention/ops/deepseek_v4_ops/dequant_gather_k_cutedsl.py
@@ -0,0 +1,334 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from functools import cache
+
+import cutlass
+import cutlass.cute as cute
+import torch
+from cuda.bindings.driver import CUstream
+from cutlass import BFloat16, Int32, Uint8, Uint32
+from cutlass.cute.nvgpu import cpasync
+from quack.compile_utils import make_fake_tensor
+
+from vllm.v1.attention.ops.deepseek_v4_ops.cutedsl_utils import (
+    _bf16x2_mul,
+    _fp8x4_to_bf16x4,
+)
+
+
+def dequantize_and_gather_k_cache_cutedsl(
+    out: torch.Tensor,
+    k_cache: torch.Tensor,
+    seq_lens: torch.Tensor,
+    gather_lens: torch.Tensor | None,
+    block_table: torch.Tensor,
+    block_size: int,
+    offset: int,
+) -> None:
+    compiled = DequantGatherKCacheKernel.compile(
+        block_size=block_size, has_gather_lens=gather_lens is not None
+    )
+    compiled(out, k_cache, seq_lens, gather_lens, block_table, offset)
+
+
+class DequantGatherKCacheKernel:
+    # Hard-coded for DSv4.
+    head_dim = 512
+    group_size = 64  # 1 scale per 64 elems
+
+    def __init__(self, fp8_dim: int = 448, block_size: int = 64):
+        self.fp8_dim = fp8_dim
+        self.bf16_dim = self.head_dim - fp8_dim
+        self.data_dim = fp8_dim + self.bf16_dim * 2
+        self.block_size = block_size
+
+        self.num_warps = 4
+        self.tb_size = self.num_warps * 32
+        self.num_stages = 4
+
+    @cute.jit
+    def __call__(
+        self,
+        out: cute.Tensor,
+        k_cache: cute.Tensor,
+        seq_lens: cute.Tensor,
+        gather_lens: cute.Tensor | None,
+        block_table: cute.Tensor,
+        offset: Int32,
+        stream: CUstream,
+    ):
+        # Split k_cache into k_data and k_scale. Each [block_size, head_bytes]
+        # block is actually a concat of
+        # [block_size, fp8_dim + bf16_dim * 2] and [block_size, 8].
+        k_data = cute.make_tensor(
+            k_cache.iterator,
+            layout=cute.make_layout(
+                (k_cache.shape[0], self.block_size, self.data_dim),
+                stride=(k_cache.stride[0], self.data_dim, 1),
+            ),
+        )
+        k_scale = cute.make_tensor(
+            k_cache.iterator + (self.block_size * self.data_dim),
+            layout=cute.make_layout(
+                (k_cache.shape[0], self.block_size, 8),
+                stride=(k_cache.stride[0], 8, 1),
+            ),
+        )
+
+        grid = (out.shape[0], 1024, 1)
+        self.kernel(
+            out,
+            k_data,
+            k_scale,
+            seq_lens,
+            gather_lens,
+            block_table,
+            offset,
+        ).launch(grid=grid, block=(self.tb_size, 1, 1), stream=stream)
+
+    @cute.jit
+    def load_g2s(
+        self,
+        k_data_slice: cute.Tensor,
+        k_scale: cute.Tensor,
+        block_table: cute.Tensor,
+        s_kdata_slice: cute.Tensor,
+        s_kscale: cute.Tensor,
+        req_id,
+        pos,
+        lane_id,
+        stage_id,
+    ):
+        # k_data_slice: [num_blocks, block_size, (16, data_dim/16)]
+        # s_kdata_slice: [(4, data_dim/16), num_stages]
+
+        op = cpasync.CopyG2SOp(cute.nvgpu.LoadCacheMode.GLOBAL)
+        cp16_atom = cute.make_copy_atom(op, Uint32, num_bits_per_copy=128)
+        cp8_atom = cute.make_copy_atom(cpasync.CopyG2SOp(), Uint8, num_bits_per_copy=64)
+        page_id = block_table[req_id, pos // self.block_size]
+        block_offset = pos % self.block_size
+
+        # Load the first 512 bytes (32x16B).
+        idx = lane_id
+        src = k_data_slice[page_id, block_offset, (None, idx)]
+        cute.copy(
+            cp16_atom,
+            cute.recast_tensor(src, Uint32),
+            s_kdata_slice[(None, idx), stage_id],
+        )
+
+        # Load the tail 64 bytes.
+        idx += 32
+        if idx < cutlass.const_expr(self.data_dim // 16):
+            src = k_data_slice[page_id, block_offset, (None, idx)]
+            cute.copy(
+                cp16_atom,
+                cute.recast_tensor(src, Uint32),
+                s_kdata_slice[(None, idx), stage_id],
+            )
+        elif idx == cutlass.const_expr(self.data_dim // 16):
+            cute.copy(
+                cp8_atom,
+                k_scale[page_id, block_offset, None],
+                s_kscale[None, stage_id],
+            )
+
+    @cute.kernel
+    def kernel(
+        self,
+        out: cute.Tensor,
+        k_data: cute.Tensor,
+        k_scale: cute.Tensor,
+        seq_lens: cute.Tensor,
+        gather_lens: cute.Tensor | None,
+        block_table: cute.Tensor,
+        offset: Int32,
+    ):
+        req_id, worker_id, _ = cute.arch.block_idx()
+        tid, _, _ = cute.arch.thread_idx()
+        warp_id = cute.arch.make_warp_uniform(tid // 32)
+        lane_id = tid % 32
+
+        _, num_workers, _ = cute.arch.grid_dim()
+
+        # Prepare smem.
+        smem = cutlass.utils.SmemAllocator()
+        s_kdata = smem.allocate_tensor(
+            Uint32,
+            cute.make_layout((self.data_dim // 4, self.num_warps, self.num_stages)),
+            byte_alignment=16,
+        )[None, warp_id, None]
+        s_kscale = smem.allocate_tensor(
+            Uint8,
+            cute.make_layout((8, self.num_warps, self.num_stages)),
+            byte_alignment=8,
+        )[None, warp_id, None]
+
+        # Prepare for 16B cp.async, also for BF16 smem loads later.
+        k_data_slice = cute.logical_divide(k_data, (None, None, 16))
+        s_kdata_16B_slice = cute.logical_divide(s_kdata, (4, None))
+
+        # Load FP8 elems in 8B units, so once dequantized, they are 16B units.
+        s_kdata_8B_slice = cute.logical_divide(s_kdata, (2, None))
+
+        # 16B st.global.
+        out_slice = cute.logical_divide(out, (None, None, 8))
+
+        cp_op = cute.nvgpu.CopyUniversalOp()
+        cp8_atom = cute.make_copy_atom(cp_op, Uint32, num_bits_per_copy=64)
+        cp16_atom = cute.make_copy_atom(cp_op, Uint32, num_bits_per_copy=128)
+
+        seq_len = seq_lens[req_id]
+        gather_len = seq_len
+        if cutlass.const_expr(gather_lens is not None):
+            gather_len = gather_lens[req_id]  # type: ignore[index]
+        start_pos = seq_len - gather_len
+
+        # Start prefetch.
+        for i in cutlass.range_constexpr(self.num_stages - 1):
+            next_pos = (
+                start_pos
+                + worker_id * self.num_warps
+                + warp_id
+                + i * num_workers * self.num_warps
+            )
+            if next_pos < seq_len:
+                self.load_g2s(
+                    k_data_slice,
+                    k_scale,
+                    block_table,
+                    s_kdata_16B_slice,
+                    s_kscale,
+                    req_id,
+                    next_pos,
+                    lane_id,
+                    i,
+                )
+            cute.arch.cp_async_commit_group()
+        prefetch_stage = self.num_stages - 1
+        compute_stage = 0
+
+        # Main loop.
+        for i in range(
+            worker_id * self.num_warps + warp_id,
+            gather_len,
+            num_workers * self.num_warps,
+        ):
+            pos = start_pos + i
+
+            # Prefetch next stage.
+            next_pos = pos + num_workers * self.num_warps * (self.num_stages - 1)
+            if next_pos < seq_len:
+                self.load_g2s(
+                    k_data_slice,
+                    k_scale,
+                    block_table,
+                    s_kdata_16B_slice,
+                    s_kscale,
+                    req_id,
+                    next_pos,
+                    lane_id,
+                    prefetch_stage,
+                )
+                prefetch_stage = (prefetch_stage + 1) % self.num_stages
+            cute.arch.cp_async_commit_group()
+
+            # Wait for gmem->smem to finish.
+            cute.arch.cp_async_wait_group(self.num_stages - 1)
+            cute.arch.sync_warp()
+
+            # There are 512 elems per token. As a warp, data0 holds the first
+            # 256 elems and data1 holds the second 256 elems, i.e. each thread
+            # holds 8 FP8 elems. This keeps the dequantized 8 BF16 elems as
+            # contiguous 16B global stores. On Blackwell, this might not be
+            # necessary as we have 32B global stores, but doing it this way
+            # does not seem to be slower.
+            data0 = cute.make_rmem_tensor((2,), Uint32)
+            data1 = cute.make_rmem_tensor((2,), Uint32)
+            cute.copy(cp8_atom, s_kdata_8B_slice[(None, lane_id), compute_stage], data0)
+            cute.copy(
+                cp8_atom,
+                s_kdata_8B_slice[(None, lane_id + 32), compute_stage],
+                data1,
+            )
+
+            # Convert to bf16x2 via bit manipulation. FP8 scales are per 64
+            # elements. An 8-element chunk advances the scale index by
+            # chunk_id * 8 // group_size.
+            scale0_u32 = Uint32(s_kscale[lane_id * 8 // self.group_size, compute_stage])
+            scale0_bf16x2 = (scale0_u32 << Uint32(23)) | (scale0_u32 << Uint32(7))
+            scale1_u32 = Uint32(
+                s_kscale[(lane_id + 32) * 8 // self.group_size, compute_stage]
+            )
+            scale1_bf16x2 = (scale1_u32 << Uint32(23)) | (scale1_u32 << Uint32(7))
+
+            # cvt.rn.scaled::n2::ue8m0.bf16x2.e4m3x2 requires PTX 9.2
+            # (CUDA 13.2).
+            dequant0 = cute.make_rmem_tensor(4, Uint32)
+            dequant1 = cute.make_rmem_tensor(4, Uint32)
+            for j in cutlass.range_constexpr(2):
+                tmp0 = _fp8x4_to_bf16x4(data0[j])
+                tmp1 = _fp8x4_to_bf16x4(data1[j])
+
+                # BF16 multiply is safe because the scales are exact powers of 2.
+                dequant0[j * 2] = _bf16x2_mul(tmp0[0], scale0_bf16x2)
+                dequant1[j * 2] = _bf16x2_mul(tmp1[0], scale1_bf16x2)
+                dequant0[j * 2 + 1] = _bf16x2_mul(tmp0[1], scale0_bf16x2)
+                dequant1[j * 2 + 1] = _bf16x2_mul(tmp1[1], scale1_bf16x2)
+
+            # Last 64 elems are BF16 tail, corresponds to dequant1 of last
+            # 8 threads. We have 448 FP8 + 64 BF16 -> 28x 16B for FP8 +
+            # 8x 16B for BF16.
+            if lane_id + 32 >= self.fp8_dim // 8:
+                idx = self.fp8_dim // 16 + (lane_id + 32) - self.fp8_dim // 8
+                cute.copy(
+                    cp16_atom,
+                    s_kdata_16B_slice[(None, idx), compute_stage],
+                    dequant1,
+                )
+
+            # Store two 16B BF16 chunks per lane: first half, then second half.
+            dst = out_slice[req_id, offset + i, (None, lane_id)]
+            cute.copy(cp16_atom, dequant0, cute.recast_tensor(dst, Uint32))
+
+            dst = out_slice[req_id, offset + i, (None, lane_id + 32)]
+            cute.copy(cp16_atom, dequant1, cute.recast_tensor(dst, Uint32))
+
+            compute_stage = (compute_stage + 1) % self.num_stages
+
+    @cache
+    @staticmethod
+    def compile(
+        fp8_dim: int = 448,
+        block_size: int = 64,
+        has_gather_lens: bool = True,
+    ):
+        num_reqs = cute.sym_int()
+        head_dim = DequantGatherKCacheKernel.head_dim
+        head_bytes = fp8_dim + (head_dim - fp8_dim) * 2 + 8
+
+        out = make_fake_tensor(BFloat16, (num_reqs, cute.sym_int(), head_dim), 16)
+        k_cache = cute.runtime.make_fake_tensor(
+            Uint8,
+            (cute.sym_int(), block_size, head_bytes),
+            stride=(cute.sym_int64(divisibility=32), head_bytes, 1),
+            assumed_align=32,
+        )
+        seq_lens = make_fake_tensor(Int32, (num_reqs,))
+        gather_lens = make_fake_tensor(Int32, (num_reqs,)) if has_gather_lens else None
+        block_table = make_fake_tensor(Int32, (num_reqs, cute.sym_int()))
+
+        kernel = DequantGatherKCacheKernel(fp8_dim, block_size)
+        stream = cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True)
+        return cute.compile(
+            kernel,
+            out,
+            k_cache,
+            seq_lens,
+            gather_lens,
+            block_table,
+            Int32(0),
+            stream,
+            options="--enable-tvm-ffi",
+        )
diff --git a/vllm/v1/attention/ops/deepseek_v4_ops/fused_indexer_q.py b/vllm/v1/attention/ops/deepseek_v4_ops/fused_indexer_q.py
index f94fc013f5c6..ec880f7ab4c4 100644
--- a/vllm/v1/attention/ops/deepseek_v4_ops/fused_indexer_q.py
+++ b/vllm/v1/attention/ops/deepseek_v4_ops/fused_indexer_q.py
@@ -1,8 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 
 from vllm.triton_utils import tl, triton
+from vllm.utils.import_utils import has_cutedsl
 
 # MXFP4: 32 elements per block, packed 2 nibbles per byte, ue8m0 block scale.
 MXFP4_BLOCK_SIZE = 32
@@ -342,30 +344,49 @@ def fused_indexer_q_rope_quant(
             dtype=torch.uint8,
             device=index_q.device,
         )
-        _fused_indexer_q_rope_mxfp4_kernel[(num_tokens, num_index_q_heads)](
-            positions,
-            index_q,
-            index_q.stride(0),
-            index_q.stride(1),
-            index_q_cos_sin_cache,
-            index_q_cos_sin_cache.stride(0),
-            index_q_cos_sin_cache.shape[-1] // 2,
-            index_q_packed,
-            index_q_packed.stride(0),
-            index_q_packed.stride(1),
-            index_q_scale,
-            index_q_scale.stride(0),
-            index_q_scale.stride(1),
-            index_q_head_dim,
-            MXFP4_BLOCK_SIZE,
-            index_weights,
-            index_weights.stride(0),
-            index_weights_softmax_scale,
-            index_weights_head_scale,
-            index_weights_out,
-            index_weights_out.stride(0),
-            num_warps=1,  # TODO: Tune this
-        )
+        if has_cutedsl():
+            # lazily import, otherwise some tests fail due to CUDA driver init failure.
+            from .fused_indexer_q_cutedsl import (
+                fused_indexer_q_rope_quant_mxfp4_cutedsl,
+            )
+
+            fused_indexer_q_rope_quant_mxfp4_cutedsl(
+                positions,
+                index_q,
+                index_q_cos_sin_cache,
+                index_weights,
+                index_weights_softmax_scale,
+                index_weights_head_scale,
+                index_q_packed,
+                index_q_scale,
+                index_weights_out,
+            )
+        else:
+            _fused_indexer_q_rope_mxfp4_kernel[(num_tokens, num_index_q_heads)](
+                positions,
+                index_q,
+                index_q.stride(0),
+                index_q.stride(1),
+                index_q_cos_sin_cache,
+                index_q_cos_sin_cache.stride(0),
+                index_q_cos_sin_cache.shape[-1] // 2,
+                index_q_packed,
+                index_q_packed.stride(0),
+                index_q_packed.stride(1),
+                index_q_scale,
+                index_q_scale.stride(0),
+                index_q_scale.stride(1),
+                index_q_head_dim,
+                MXFP4_BLOCK_SIZE,
+                index_weights,
+                index_weights.stride(0),
+                index_weights_softmax_scale,
+                index_weights_head_scale,
+                index_weights_out,
+                index_weights_out.stride(0),
+                num_warps=1,  # TODO: Tune this
+            )
+
         # Values stay uint8 (2 E2M1 nibbles per byte). Scales are 4 ue8m0
         # bytes per (token, head) reinterpreted as one int32, then squeezed
         # from (T, H, 1) to (T, H) to match DeepGEMM's expected q_sf rank
diff --git a/vllm/v1/attention/ops/deepseek_v4_ops/fused_indexer_q_cutedsl.py b/vllm/v1/attention/ops/deepseek_v4_ops/fused_indexer_q_cutedsl.py
new file mode 100644
index 000000000000..4468a95651bf
--- /dev/null
+++ b/vllm/v1/attention/ops/deepseek_v4_ops/fused_indexer_q_cutedsl.py
@@ -0,0 +1,339 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from functools import cache
+
+import cutlass
+import cutlass.cute as cute
+import torch
+from cuda.bindings.driver import CUstream
+from cutlass import BFloat16, Float32, Int64, Uint8, Uint32, const_expr
+from quack.compile_utils import make_fake_tensor
+
+from vllm.v1.attention.ops.deepseek_v4_ops.cutedsl_utils import (
+    _bf16x2_abs,
+    _bf16x2_max,
+    _bf16x2_to_fp32,
+    _fp32x2_to_bf16x2,
+    _fp32x8_to_fp4x8,
+    _recast_val,
+)
+from vllm.vllm_flash_attn.cute import utils as cute_utils
+
+# MXFP4: 32 elements per block, packed 2 nibbles per byte, ue8m0 block scale.
+MXFP4_BLOCK_SIZE = 32
+
+_TORCH_TO_CUTE = {
+    torch.bfloat16: BFloat16,
+    torch.float32: Float32,
+}
+
+
+def fused_indexer_q_rope_quant_mxfp4_cutedsl(
+    positions: torch.Tensor,
+    index_q: torch.Tensor,
+    index_q_cos_sin_cache: torch.Tensor,
+    index_weights: torch.Tensor,
+    index_weights_softmax_scale: float,
+    index_weights_head_scale: float,
+    index_q_packed: torch.Tensor,
+    index_q_scale: torch.Tensor,
+    index_weights_out: torch.Tensor,
+) -> None:
+    num_tokens, num_heads, head_dim = index_q.shape
+    rope_dim = index_q_cos_sin_cache.shape[-1]
+    rope_type = _TORCH_TO_CUTE[index_q_cos_sin_cache.dtype]
+
+    # warm both coarsen variants on first call; later calls hit compile()'s @cache
+    for coarsen in (1, 4):
+        IndexerQMxFp4Kernel.compile(head_dim, rope_dim, num_heads, rope_type, coarsen)
+
+    # heuristic: 1 head per subwarp below 512 tokens, else 4 heads per subwarp
+    coarsen = 1 if num_tokens < 512 else 4
+    compiled = IndexerQMxFp4Kernel.compile(
+        head_dim, rope_dim, num_heads, rope_type, coarsen
+    )
+    scale = float(index_weights_softmax_scale * index_weights_head_scale)
+    compiled(
+        positions,
+        index_q,
+        index_q_cos_sin_cache,
+        index_weights,
+        index_q_packed,
+        index_q_scale,
+        index_weights_out,
+        scale,
+    )
+
+
+class IndexerQMxFp4Kernel:
+    """Eight-thread subwarps process one ``(token, head)`` row."""
+
+    def __init__(
+        self,
+        head_dim: int = 128,
+        rope_dim: int = 64,
+        num_heads: int = 64,
+        cos_sin_dtype: type[cutlass.Numeric] = Float32,
+        coarsen: int = 4,
+    ):
+        self.head_dim = head_dim
+        self.rope_dim = rope_dim
+        self.nope_dim = head_dim - rope_dim
+        self.num_heads = num_heads
+        self.cos_sin_dtype = cos_sin_dtype
+
+        # process multiple heads at the same time to amortize RoPE load costs
+        assert num_heads % coarsen == 0
+        self.coarsen = coarsen
+
+        # later we will use 32B load = 16 BF16 elems
+        # thus, head_dim=128 requires 8 threads to handle.
+        # let's call subwarp = 8 threads.
+        self.subwarp_size = head_dim // 16
+        self.tb_size = 128  # threads per threadblock
+        self.threads_per_token = (self.num_heads // self.coarsen) * self.subwarp_size
+
+    @cute.jit
+    def __call__(
+        self,
+        positions: cute.Tensor,
+        q: cute.Tensor,
+        cos_sin_cache: cute.Tensor,
+        weights: cute.Tensor,
+        q_fp4: cute.Tensor,
+        q_scale: cute.Tensor,
+        weights_out: cute.Tensor,
+        scale: Float32,
+        stream: CUstream,
+    ):
+        total_threads = q.shape[0] * self.threads_per_token  # q.shape[0] = tokens
+        grid = (cute.ceil_div(total_threads, self.tb_size), 1, 1)  # 1-D, rounded up
+        self.kernel(
+            positions,
+            q,
+            cos_sin_cache,
+            weights,
+            q_fp4,
+            q_scale,
+            weights_out,
+            scale,
+        ).launch(grid=grid, block=(self.tb_size, 1, 1), stream=stream)
+
+    @cute.kernel
+    def kernel(
+        self,
+        positions: cute.Tensor,
+        q: cute.Tensor,
+        cos_sin_cache: cute.Tensor,
+        weights: cute.Tensor,
+        q_fp4: cute.Tensor,
+        q_scale: cute.Tensor,
+        weights_out: cute.Tensor,
+        scale: Float32,
+    ):
+        block_id, _, _ = cute.arch.block_idx()
+        tid, _, _ = cute.arch.thread_idx()
+
+        num_token_heads = q.shape[0] * self.num_heads
+        global_tid = block_id * self.tb_size + tid
+
+        global_subwarp_id = global_tid // self.subwarp_size
+        sublane = tid % self.subwarp_size
+
+        token_id = global_subwarp_id // (self.num_heads // self.coarsen)
+        head_tile_id = global_subwarp_id % (self.num_heads // self.coarsen)
+        head_start = head_tile_id * self.coarsen
+
+        # NOTE: token_id may exceed bounds, hence we need to add load/store guards
+        # we can't do early exit because CuteDSL doesn't support it. and we also need
+        # all threads in a warp to be active since we utilize warp shuffle later.
+        # must_in_bounds is constexpr, True when threads_per_token is a multiple of
+        # tb_size: the grid then has no tail threads and the guards compile away.
+        must_in_bounds = cutlass.const_expr(self.threads_per_token % self.tb_size == 0)
+        in_bounds = must_in_bounds or (token_id < q.shape[0])
+
+        cp_op = cute.nvgpu.CopyUniversalOp()
+
+        _layout = cute.make_layout((self.coarsen, 8), stride=(8, 1))
+        q_bf16x2 = cute.make_rmem_tensor(_layout, Uint32)
+
+        if in_bounds:
+            # we can't do cute.copy() on the whole 2D tile directly because
+            # cute.copy() wants the 1st mode to be covered by the copy atom,
+            # and other modes as for loop. there is no fast way to
+            # "transpose" the tensor view.
+            q_tile = cute.local_tile(
+                q[token_id, None, None],
+                tiler=(self.coarsen, 16),
+                coord=(head_tile_id, sublane),
+            )
+            cp_u32x8 = cute.make_copy_atom(cp_op, Uint32, num_bits_per_copy=256)
+            for i in cutlass.range_constexpr(self.coarsen):
+                src = cute.recast_tensor(q_tile[i, None], Uint32)
+                cute.copy(cp_u32x8, src, q_bf16x2[i, None])
+
+        # RoPE applies only to the trailing rope_dim values. We keep the rounded
+        # BF16 result in q_bf16x2 so the later amax and quantization see BF16.
+        # cos_sin_cache layout: [max_pos, rope_dim]
+        if in_bounds and sublane * 16 >= self.nope_dim:
+            cos_vals = cute.make_rmem_tensor((8,), Float32)
+            sin_vals = cute.make_rmem_tensor((8,), Float32)
+
+            pos = positions[token_id]
+
+            # select 8 elems from cos and sin
+            cos_id = sublane - self.nope_dim // 16
+            sin_id = cos_id + self.rope_dim // 16
+            cos_src = cute.local_tile(
+                cos_sin_cache[pos, None], tiler=(8,), coord=(cos_id,)
+            )
+            sin_src = cute.local_tile(
+                cos_sin_cache[pos, None], tiler=(8,), coord=(sin_id,)
+            )
+
+            cp_f32x8 = cute.make_copy_atom(cp_op, Float32, num_bits_per_copy=256)
+            cp_u32x4 = cute.make_copy_atom(cp_op, Uint32, num_bits_per_copy=128)
+
+            if const_expr(self.cos_sin_dtype is Float32):
+                cute.copy(cp_f32x8, cos_src, cos_vals)
+                cute.copy(cp_f32x8, sin_src, sin_vals)
+            else:
+                cos_bf16x2 = cute.make_rmem_tensor((4,), Uint32)
+                sin_bf16x2 = cute.make_rmem_tensor((4,), Uint32)
+                cute.copy(cp_u32x4, cute.recast_tensor(cos_src, Uint32), cos_bf16x2)
+                cute.copy(cp_u32x4, cute.recast_tensor(sin_src, Uint32), sin_bf16x2)
+
+                for i in cutlass.range_constexpr(4):
+                    cos0, cos1 = _bf16x2_to_fp32(cos_bf16x2[i])
+                    sin0, sin1 = _bf16x2_to_fp32(sin_bf16x2[i])
+                    cos_vals[i * 2] = cos0
+                    cos_vals[i * 2 + 1] = cos1
+                    sin_vals[i * 2] = sin0
+                    sin_vals[i * 2 + 1] = sin1
+
+            for i in cutlass.range_constexpr(self.coarsen):
+                for j in cutlass.range_constexpr(8):
+                    q0, q1 = _bf16x2_to_fp32(q_bf16x2[i, j])
+                    rot0 = q0 * cos_vals[j] - q1 * sin_vals[j]
+                    rot1 = q0 * sin_vals[j] + q1 * cos_vals[j]
+                    # convert back to BF16 to match numerics
+                    q_bf16x2[i, j] = _fp32x2_to_bf16x2(rot0, rot1)
+
+        # layout: [coarsen, 8]
+        q_fp4_tile = cute.local_tile(
+            q_fp4[token_id, None, None],
+            tiler=(self.coarsen, 8),
+            coord=(head_tile_id, sublane),
+        )
+
+        for i in cutlass.range_constexpr(self.coarsen):
+            # compute amax in packed bf16x2 to save instructions
+            # Each thread holds 16 elems. Two adjacent threads form one 32-elem
+            # MXFP4 block, so a width-2 shuffle gives the block amax.
+            amax_bf16x2 = _bf16x2_abs(q_bf16x2[i, 0])
+            for j in cutlass.range_constexpr(1, 8):
+                amax_bf16x2 = _bf16x2_max(amax_bf16x2, _bf16x2_abs(q_bf16x2[i, j]))
+            amax_bf16x2 = cute_utils.warp_reduce(
+                amax_bf16x2,
+                _bf16x2_max,
+                width=MXFP4_BLOCK_SIZE // 16,
+            )
+            amax_pair = _bf16x2_to_fp32(amax_bf16x2)
+            amax = cute_utils.fmax(amax_pair[0], amax_pair[1])
+
+            if in_bounds:
+                # compute block scale with bit manipulation
+                # UE8M0 stores ceil(log2(fp4_scale)) + 127. Adding the mantissa mask
+                # increments the exponent whenever fp4_scale is not exactly a power of 2
+                eps = cutlass.const_expr(float.fromhex("0x6p-126"))
+                fp4_scale = cute_utils.fmax(amax, eps) * Float32(1.0 / 6.0)
+                bits = _recast_val(fp4_scale, Uint32)
+                ue8m0 = cute_utils.shr_u32(
+                    bits + Uint32(0x7FFFFF), Uint32(23)
+                ) & Uint32(0xFF)
+
+                # Only one of the two threads in an MXFP4 block writes the shared scale.
+                if tid % 2 == 0:
+                    mx_block = sublane // 2
+                    q_scale[token_id, head_start + i, mx_block] = Uint8(ue8m0)
+
+                # If scale = 2^A and ue8m0 = A + 127, then inverse scale has exponent
+                # -A + 127 = 254 - ue8m0.
+                inv_scale_bits = (Uint32(254) - ue8m0) << Uint32(23)
+                inv_fp4_scale = _recast_val(inv_scale_bits, Float32)
+
+                vals = cute.make_rmem_tensor(16, Float32)
+                for j in cutlass.range_constexpr(8):
+                    q0, q1 = _bf16x2_to_fp32(q_bf16x2[i, j])
+                    vals[j * 2] = q0 * inv_fp4_scale
+                    vals[j * 2 + 1] = q1 * inv_fp4_scale
+
+                # pack to FP4
+                packed = cute.make_rmem_tensor((2,), Uint32)
+                packed[0] = _fp32x8_to_fp4x8(vals, 0)
+                packed[1] = _fp32x8_to_fp4x8(vals, 8)
+
+                dst = q_fp4_tile[i, None]
+                cp_u32x2 = cute.make_copy_atom(cp_op, Uint32, num_bits_per_copy=64)
+                cute.copy(cp_u32x2, packed, cute.recast_tensor(dst, Uint32))
+
+        # Weight scaling is independent of the Q subwarp work. The first
+        # num_tokens * num_heads logical threads cover one weight each.
+        if global_tid < num_token_heads:
+            weight_token_id = global_tid // self.num_heads
+            weight_head_id = global_tid % self.num_heads
+            weights_out[weight_token_id, weight_head_id] = (
+                weights[weight_token_id, weight_head_id].to(Float32) * scale
+            )
+
+    @cache  # memoized on the call arguments
+    @staticmethod
+    def compile(
+        head_dim: int = 128,
+        rope_dim: int = 64,
+        num_heads: int = 64,
+        cos_sin_dtype: type[cutlass.Numeric] = Float32,
+        coarsen: int = 4,
+    ):
+        num_tokens = cute.sym_int()  # symbolic, shared by every per-token tensor
+        max_pos = cute.sym_int()
+
+        q = make_fake_tensor(
+            BFloat16, (num_tokens, num_heads, head_dim), divisibility=16
+        )
+        positions = make_fake_tensor(Int64, (num_tokens,), divisibility=1)
+        cos_sin_cache = make_fake_tensor(
+            cos_sin_dtype,
+            (max_pos, rope_dim),
+            divisibility=8,
+        )
+        weights = make_fake_tensor(BFloat16, (num_tokens, num_heads), divisibility=8)
+        q_fp4 = make_fake_tensor(
+            Uint8,
+            (num_tokens, num_heads, head_dim // 2),
+            divisibility=16,
+        )
+        q_scale = make_fake_tensor(
+            Uint8,
+            (num_tokens, num_heads, head_dim // MXFP4_BLOCK_SIZE),
+            divisibility=4,
+        )
+        weights_out = make_fake_tensor(Float32, (num_tokens, num_heads), divisibility=4)
+
+        kernel = IndexerQMxFp4Kernel(
+            head_dim, rope_dim, num_heads, cos_sin_dtype, coarsen
+        )
+        stream = cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True)
+        return cute.compile(
+            kernel,
+            positions,
+            q,
+            cos_sin_cache,
+            weights,
+            q_fp4,
+            q_scale,
+            weights_out,
+            Float32(0.0),  # placeholder for the runtime `scale` argument
+            stream,
+            options="--enable-tvm-ffi",
+        )
diff --git a/vllm/v1/attention/ops/rocm_aiter_mla_sparse.py b/vllm/v1/attention/ops/rocm_aiter_mla_sparse.py
index 5d0343ffd607..936839b457ed 100644
--- a/vllm/v1/attention/ops/rocm_aiter_mla_sparse.py
+++ b/vllm/v1/attention/ops/rocm_aiter_mla_sparse.py
@@ -905,185 +905,757 @@ def rocm_inv_rope_einsum(
     return torch.einsum("tgd,grd->tgr", o_ref, wo_a_weight)
 
 
-def rocm_ref_sparse_attn_prefill(
+_DSV4_SPARSE_NOPE_DIM = 448
+_DSV4_SPARSE_ROPE_DIM = 64
+
+
+def _validate_dsv4_sparse_dims(
+    head_dim: int,
+    nope_head_dim: int,
+    rope_head_dim: int,
+    op_name: str,  # only used in the assertion messages
+) -> None:
+    assert head_dim == nope_head_dim + rope_head_dim, (  # split must cover head_dim
+        f"{op_name} expected head_dim={nope_head_dim + rope_head_dim}, got {head_dim}"
+    )
+    assert (  # only the module-level DSV4 NoPE/RoPE split is accepted
+        nope_head_dim == _DSV4_SPARSE_NOPE_DIM
+        and rope_head_dim == _DSV4_SPARSE_ROPE_DIM
+    ), (
+        f"{op_name} expects {_DSV4_SPARSE_NOPE_DIM} NoPE dims and "
+        f"{_DSV4_SPARSE_ROPE_DIM} RoPE dims"
+    )
+
+
+@triton.jit
+def _pack_dense_prefix_to_ragged_kernel(
+    indices_ptr,
+    lengths_ptr,
+    indptr_ptr,
+    out_ptr,
+    indices_stride0,
+    num_rows_limit,
+    row_width,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0)  # grid axis 0: dense row
+    block_idx = tl.program_id(1)  # grid axis 1: BLOCK_SIZE-wide column chunk
+    offsets = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+
+    row_len = tl.load(lengths_ptr + row_idx)
+    if block_idx * BLOCK_SIZE >= row_len:  # chunk lies past this row's prefix
+        return
+
+    mask = offsets < row_len
+    vals = tl.load(
+        indices_ptr + row_idx * indices_stride0 + offsets,
+        mask=mask & (offsets < row_width),
+        other=-1,
+    ).to(tl.int32)
+    if num_rows_limit >= 0:  # a negative limit disables the range filter
+        vals = tl.where((vals >= 0) & (vals < num_rows_limit), vals, -1)
+
+    out_start = tl.load(indptr_ptr + row_idx)  # row's offset in the flat output
+    tl.store(out_ptr + out_start + offsets, vals, mask=mask)
+
+
+def build_ragged_indices_from_dense(
+    indices: torch.Tensor,
+    lengths: torch.Tensor,
+    num_rows: int = -1,  # negative: the kernel skips its index range filter
+) -> tuple[torch.Tensor, torch.Tensor]:
+    indices = indices.reshape(indices.shape[0], -1)  # flatten to [rows, width]
+    lengths = lengths.to(device=indices.device, dtype=torch.int32).reshape(-1)
+    assert lengths.numel() == indices.shape[0], (
+        f"Expected one length per row, got {lengths.shape} for indices {indices.shape}"
+    )
+
+    max_width = indices.shape[1] if indices.ndim == 2 else 0
+    lengths = lengths.clamp(min=0, max=max_width).contiguous()
+
+    indptr = torch.empty(indices.shape[0] + 1, dtype=torch.int32, device=indices.device)
+    indptr[0] = 0
+    torch.cumsum(lengths, dim=0, out=indptr[1:])  # indptr[i + 1] = end of row i
+
+    if indices.numel() == 0:
+        flat = torch.empty(0, dtype=torch.int32, device=indices.device)
+    else:
+        flat = torch.empty(
+            int(indptr[-1].item()), dtype=torch.int32, device=indices.device
+        )
+        if flat.numel() > 0:
+            block_size = 128
+            _pack_dense_prefix_to_ragged_kernel[
+                (indices.shape[0], triton.cdiv(max_width, block_size))
+            ](
+                indices,
+                lengths,
+                indptr,
+                flat,
+                indices.stride(0),
+                int(num_rows),
+                max_width,
+                BLOCK_SIZE=block_size,
+            )
+
+    return flat, indptr
+
+
+def _as_int32_contiguous_1d(x: torch.Tensor) -> torch.Tensor:
+    if x.dtype == torch.int32 and x.ndim == 1 and x.is_contiguous():
+        return x  # fast path: input returned as-is
+    return x.to(torch.int32).contiguous()  # NOTE: does not flatten non-1-D input
+
+
+@triton.jit
+def _sparse_attn_prefill_ragged_kernel(
+    q_ptr,
+    kv_ptr,
+    kv_indices_ptr,
+    kv_indptr_ptr,
+    attn_sink_ptr,
+    out_ptr,
+    q_stride_t,
+    q_stride_h,
+    q_stride_d,
+    kv_stride_n,
+    kv_stride_d,
+    out_stride_t,
+    out_stride_h,
+    out_stride_d,
+    num_heads,
+    head_dim,
+    num_kv,
+    scale,
+    HAS_ATTN_SINK: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    query_idx = tl.program_id(0)  # grid axis 0: query token
+    pid_h = tl.program_id(1)  # grid axis 1: tile of BLOCK_H heads
+
+    head_offsets = pid_h * BLOCK_H + tl.arange(0, BLOCK_H)
+    dim_offsets = tl.arange(0, BLOCK_D)
+    head_mask = head_offsets < num_heads
+    dim_mask = dim_offsets < head_dim
+
+    q = tl.load(
+        q_ptr
+        + query_idx * q_stride_t
+        + head_offsets[:, None] * q_stride_h
+        + dim_offsets[None, :] * q_stride_d,
+        mask=head_mask[:, None] & dim_mask[None, :],
+        other=0.0,
+    )
+
+    neg_large = -3.4028234663852886e38  # about -FLT_MAX; finite stand-in for -inf
+    m_i = tl.full((BLOCK_H,), neg_large, dtype=tl.float32)  # running row max
+    l_i = tl.zeros((BLOCK_H,), dtype=tl.float32)  # running softmax sum
+    acc = tl.zeros((BLOCK_H, BLOCK_D), dtype=tl.float32)  # unnormalized output
+
+    kv_start = tl.load(kv_indptr_ptr + query_idx)
+    kv_end = tl.load(kv_indptr_ptr + query_idx + 1)
+    kv_len = kv_end - kv_start
+
+    k_offsets = tl.arange(0, BLOCK_K)
+    for k_start in tl.range(0, kv_len, BLOCK_K):
+        k_pos = k_start + k_offsets
+        in_range = k_pos < kv_len
+        slot = tl.load(kv_indices_ptr + kv_start + k_pos, mask=in_range, other=-1)
+        valid = in_range & (slot >= 0) & (slot < num_kv)
+
+        kv = tl.load(
+            kv_ptr + slot[:, None] * kv_stride_n + dim_offsets[None, :] * kv_stride_d,
+            mask=valid[:, None] & dim_mask[None, :],
+            other=0.0,
+        )
+        kv = tl.where(valid[:, None] & dim_mask[None, :], kv, 0.0)
+
+        scores = tl.dot(q, tl.trans(kv)) * scale
+        scores = tl.where(head_mask[:, None] & valid[None, :], scores, neg_large)
+
+        m_block = tl.max(scores, axis=1)
+        m_new = tl.maximum(m_i, m_block)
+        alpha = tl.exp(m_i - m_new)  # rescales the earlier blocks to the new max
+        p = tl.exp(scores - m_new[:, None])
+        p = tl.where(head_mask[:, None] & valid[None, :], p, 0.0)
+        l_new = l_i * alpha + tl.sum(p, axis=1)
+
+        acc = acc * alpha[:, None] + tl.dot(p.to(kv.dtype), kv)
+        m_i = m_new
+        l_i = l_new
+
+    if HAS_ATTN_SINK:  # the sink adds to the denominator only, not to acc
+        sink = tl.load(
+            attn_sink_ptr + head_offsets, mask=head_mask, other=neg_large
+        ).to(tl.float32)
+        m_final = tl.maximum(m_i, sink)
+        alpha = tl.exp(m_i - m_final)
+        l_final = l_i * alpha + tl.exp(sink - m_final)
+        denom = tl.maximum(l_final, 1.0e-30)  # avoid division by zero
+        out = tl.where(
+            l_final[:, None] > 0.0,
+            (acc * alpha[:, None]) / denom[:, None],
+            0.0,
+        )
+    else:
+        denom = tl.maximum(l_i, 1.0e-30)  # avoid division by zero
+        out = tl.where(l_i[:, None] > 0.0, acc / denom[:, None], 0.0)
+
+    tl.store(
+        out_ptr
+        + query_idx * out_stride_t
+        + head_offsets[:, None] * out_stride_h
+        + dim_offsets[None, :] * out_stride_d,
+        out,
+        mask=head_mask[:, None] & dim_mask[None, :],
+    )
+
+
+@triton.jit
+def _sparse_attn_decode_ragged_kernel(
+    q_ptr,
+    main_cache_ptr,
+    main_indices_ptr,
+    main_indptr_ptr,
+    extra_cache_ptr,
+    extra_indices_ptr,
+    extra_indptr_ptr,
+    attn_sink_ptr,
+    out_ptr,
+    q_stride0,
+    q_stride1,
+    out_stride0,
+    out_stride1,
+    main_cache_stride0,
+    extra_cache_stride0,
+    main_num_rows,
+    extra_num_rows,
+    main_block_size,
+    extra_block_size,
+    scale,
+    num_heads,
+    HAS_ATTN_SINK: tl.constexpr,
+    HAS_EXTRA: tl.constexpr,
+    NOPE_DIM: tl.constexpr,
+    NOPE_BLOCK: tl.constexpr,
+    ROPE_DIM: tl.constexpr,
+    IS_FNUZ: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    query_idx = tl.program_id(0)  # grid axis 0: query token
+    pid_h = tl.program_id(1)  # grid axis 1: tile of BLOCK_H heads
+
+    head_offsets = pid_h * BLOCK_H + tl.arange(0, BLOCK_H)
+    head_mask = head_offsets < num_heads
+    nope_offsets = tl.arange(0, NOPE_BLOCK)
+    nope_mask = nope_offsets < NOPE_DIM
+    rope_offsets = tl.arange(0, ROPE_DIM)
+
+    q_row_ptr = q_ptr + query_idx * q_stride0 + head_offsets[:, None] * q_stride1
+    q_nope = tl.load(
+        q_row_ptr + nope_offsets[None, :],
+        mask=head_mask[:, None] & nope_mask[None, :],
+        other=0.0,
+    )
+    q_rope = tl.load(
+        q_row_ptr + NOPE_DIM + rope_offsets[None, :],
+        mask=head_mask[:, None],
+        other=0.0,
+    )
+
+    neg_large = -3.4028234663852886e38  # about -FLT_MAX; finite stand-in for -inf
+    m_i = tl.full((BLOCK_H,), neg_large, dtype=tl.float32)  # running row max
+    l_i = tl.zeros((BLOCK_H,), dtype=tl.float32)  # running softmax sum
+    acc_nope = tl.zeros((BLOCK_H, NOPE_BLOCK), dtype=tl.float32)
+    acc_rope = tl.zeros((BLOCK_H, ROPE_DIM), dtype=tl.float32)
+    k_offsets = tl.arange(0, BLOCK_K)
+
+    main_start = tl.load(main_indptr_ptr + query_idx)
+    main_end = tl.load(main_indptr_ptr + query_idx + 1)
+    main_len = main_end - main_start
+
+    zero_nope = tl.zeros((BLOCK_K, NOPE_BLOCK), dtype=tl.bfloat16)
+    zero_rope = tl.zeros((BLOCK_K, ROPE_DIM), dtype=tl.bfloat16)
+
+    for k_start in tl.range(0, main_len, BLOCK_K):
+        k_pos = k_start + k_offsets
+        in_range = k_pos < main_len
+        slot = tl.load(main_indices_ptr + main_start + k_pos, mask=in_range, other=-1)
+        valid = in_range & (slot >= 0) & (slot < main_num_rows)
+        safe_slot = tl.where(valid, slot, 0)
+
+        block_idx = safe_slot // main_block_size
+        pos_in_block = safe_slot % main_block_size
+        cache_block_ptr = main_cache_ptr + block_idx.to(tl.int64) * main_cache_stride0
+        token_data_ptr = cache_block_ptr + pos_in_block * 576
+        token_scale_ptr = cache_block_ptr + main_block_size * 576 + pos_in_block * 8
+
+        x_uint8 = tl.load(
+            token_data_ptr[:, None] + nope_offsets[None, :],
+            mask=valid[:, None] & nope_mask[None, :],
+            other=0,
+        )
+        if IS_FNUZ:
+            x_fp8 = x_uint8.to(tl.float8e4b8, bitcast=True)  # e4m3fnuz
+        else:
+            x_fp8 = x_uint8.to(tl.float8e4nv, bitcast=True)
+        encoded_scales = tl.load(
+            token_scale_ptr[:, None] + nope_offsets[None, :] // 64,
+            mask=valid[:, None] & nope_mask[None, :],
+            other=127,
+        )
+        scales = tl.exp2(encoded_scales.to(tl.float32) - 127.0)  # 2^(byte - 127)
+        k_nope = x_fp8.to(tl.bfloat16) * scales.to(tl.bfloat16)
+        k_nope = tl.where(valid[:, None] & nope_mask[None, :], k_nope, zero_nope)
+        k_nope = tl.where(k_nope == k_nope, k_nope, zero_nope)  # zero out NaNs
+
+        rope_ptr = (token_data_ptr + NOPE_DIM).to(tl.pointer_type(tl.bfloat16))
+        k_rope = tl.load(
+            rope_ptr[:, None] + rope_offsets[None, :],
+            mask=valid[:, None],
+            other=0.0,
+        )
+        k_rope = tl.where(valid[:, None], k_rope, zero_rope)
+        k_rope = tl.where(k_rope == k_rope, k_rope, zero_rope)  # zero out NaNs
+
+        scores = tl.dot(q_nope, tl.trans(k_nope)) + tl.dot(q_rope, tl.trans(k_rope))
+        scores *= scale
+        scores = tl.where(head_mask[:, None] & valid[None, :], scores, neg_large)
+
+        m_block = tl.max(scores, axis=1)
+        m_new = tl.maximum(m_i, m_block)
+        alpha = tl.exp(m_i - m_new)
+        p = tl.exp(scores - m_new[:, None])
+        p = tl.where(head_mask[:, None] & valid[None, :], p, 0.0)
+        l_new = l_i * alpha + tl.sum(p, axis=1)
+
+        acc_nope = acc_nope * alpha[:, None] + tl.dot(p.to(k_nope.dtype), k_nope)
+        acc_rope = acc_rope * alpha[:, None] + tl.dot(p.to(k_rope.dtype), k_rope)
+        m_i = m_new
+        l_i = l_new
+
+    if HAS_EXTRA:
+        extra_start = tl.load(extra_indptr_ptr + query_idx)
+        extra_end = tl.load(extra_indptr_ptr + query_idx + 1)
+        extra_len = extra_end - extra_start
+
+        for k_start in tl.range(0, extra_len, BLOCK_K):
+            k_pos = k_start + k_offsets
+            in_range = k_pos < extra_len
+            slot = tl.load(
+                extra_indices_ptr + extra_start + k_pos, mask=in_range, other=-1
+            )
+            valid = in_range & (slot >= 0) & (slot < extra_num_rows)
+            safe_slot = tl.where(valid, slot, 0)
+
+            block_idx = safe_slot // extra_block_size
+            pos_in_block = safe_slot % extra_block_size
+            cache_block_ptr = (
+                extra_cache_ptr + block_idx.to(tl.int64) * extra_cache_stride0
+            )
+            token_data_ptr = cache_block_ptr + pos_in_block * 576
+            token_scale_ptr = (
+                cache_block_ptr + extra_block_size * 576 + pos_in_block * 8
+            )
+
+            x_uint8 = tl.load(
+                token_data_ptr[:, None] + nope_offsets[None, :],
+                mask=valid[:, None] & nope_mask[None, :],
+                other=0,
+            )
+            if IS_FNUZ:
+                x_fp8 = x_uint8.to(tl.float8e4b8, bitcast=True)  # e4m3fnuz
+            else:
+                x_fp8 = x_uint8.to(tl.float8e4nv, bitcast=True)
+            encoded_scales = tl.load(
+                token_scale_ptr[:, None] + nope_offsets[None, :] // 64,
+                mask=valid[:, None] & nope_mask[None, :],
+                other=127,
+            )
+            scales = tl.exp2(encoded_scales.to(tl.float32) - 127.0)  # 2^(byte - 127)
+            k_nope = x_fp8.to(tl.bfloat16) * scales.to(tl.bfloat16)
+            k_nope = tl.where(valid[:, None] & nope_mask[None, :], k_nope, zero_nope)
+            k_nope = tl.where(k_nope == k_nope, k_nope, zero_nope)  # zero out NaNs
+
+            rope_ptr = (token_data_ptr + NOPE_DIM).to(tl.pointer_type(tl.bfloat16))
+            k_rope = tl.load(
+                rope_ptr[:, None] + rope_offsets[None, :],
+                mask=valid[:, None],
+                other=0.0,
+            )
+            k_rope = tl.where(valid[:, None], k_rope, zero_rope)
+            k_rope = tl.where(k_rope == k_rope, k_rope, zero_rope)  # zero out NaNs
+
+            scores = tl.dot(q_nope, tl.trans(k_nope)) + tl.dot(
+                q_rope,
+                tl.trans(k_rope),
+            )
+            scores *= scale
+            scores = tl.where(head_mask[:, None] & valid[None, :], scores, neg_large)
+
+            m_block = tl.max(scores, axis=1)
+            m_new = tl.maximum(m_i, m_block)
+            alpha = tl.exp(m_i - m_new)
+            p = tl.exp(scores - m_new[:, None])
+            p = tl.where(head_mask[:, None] & valid[None, :], p, 0.0)
+            l_new = l_i * alpha + tl.sum(p, axis=1)
+
+            acc_nope = acc_nope * alpha[:, None] + tl.dot(p.to(k_nope.dtype), k_nope)
+            acc_rope = acc_rope * alpha[:, None] + tl.dot(p.to(k_rope.dtype), k_rope)
+            m_i = m_new
+            l_i = l_new
+
+    if HAS_ATTN_SINK:  # the sink adds to the denominator only, not to the output
+        sink = tl.load(
+            attn_sink_ptr + head_offsets, mask=head_mask, other=neg_large
+        ).to(tl.float32)
+        m_final = tl.maximum(m_i, sink)
+        alpha = tl.exp(m_i - m_final)
+        l_final = l_i * alpha + tl.exp(sink - m_final)
+        denom = tl.maximum(l_final, 1.0e-30)  # avoid division by zero
+        out_nope = tl.where(
+            l_final[:, None] > 0.0,
+            (acc_nope * alpha[:, None]) / denom[:, None],
+            0.0,
+        )
+        out_rope = tl.where(
+            l_final[:, None] > 0.0,
+            (acc_rope * alpha[:, None]) / denom[:, None],
+            0.0,
+        )
+    else:
+        denom = tl.maximum(l_i, 1.0e-30)  # avoid division by zero
+        out_nope = tl.where(l_i[:, None] > 0.0, acc_nope / denom[:, None], 0.0)
+        out_rope = tl.where(l_i[:, None] > 0.0, acc_rope / denom[:, None], 0.0)
+
+    out_row_ptr = (
+        out_ptr + query_idx * out_stride0 + head_offsets[:, None] * out_stride1
+    )
+    tl.store(
+        out_row_ptr + nope_offsets[None, :],
+        out_nope,
+        mask=head_mask[:, None] & nope_mask[None, :],
+    )
+    tl.store(
+        out_row_ptr + NOPE_DIM + rope_offsets[None, :],
+        out_rope,
+        mask=head_mask[:, None],
+    )
+
+
+def _rocm_sparse_attn_prefill_ragged_triton(
     q: torch.Tensor,
     kv: torch.Tensor,
     indices: torch.Tensor,
-    topk_length: torch.Tensor | None,
+    indptr: torch.Tensor,
     scale: float,
-    head_dim: int,
     attn_sink: torch.Tensor | None,
+    nope_head_dim: int,
+    rope_head_dim: int,
 ) -> torch.Tensor:
-    indices = indices.clone().squeeze(1)
-    s_q, h_q, d_qk = q.shape
-    topk = indices.shape[-1]
-    s_kv = kv.shape[0]
-    if topk_length is not None:
-        mask = torch.arange(topk, device=indices.device).unsqueeze(
-            0
-        ) >= topk_length.unsqueeze(1)
-        indices[mask] = -1
-    invalid_mask = (indices < 0) | (indices >= s_kv)
-    indices[invalid_mask] = 0
-
-    qf = q.float()
-    gathered_kv = kv.index_select(0, indices.flatten()).reshape(s_q, topk, d_qk).float()
-    scores = qf @ gathered_kv.transpose(1, 2)
-    scores *= scale
-    scores[invalid_mask.unsqueeze(1).expand_as(scores)] = float("-inf")
-
-    orig_lse = torch.logsumexp(scores, dim=-1)
-    lse_for_o = orig_lse
-    if attn_sink is not None:
-        lse_for_o = torch.logsumexp(
-            torch.stack(
-                [orig_lse, attn_sink[:h_q].view(1, h_q).expand_as(orig_lse)],
-                dim=0,
-            ),
-            dim=0,
-        )
-    lse_for_o = lse_for_o.clone()
-    lse_for_o[lse_for_o == float("-inf")] = float("+inf")
-    probs = torch.exp(scores - lse_for_o.unsqueeze(-1))
-    out = probs @ gathered_kv[..., :head_dim]
-    lonely_q_mask = orig_lse == float("-inf")
-    out[lonely_q_mask.unsqueeze(-1).expand_as(out)] = 0.0
-    return out.to(torch.bfloat16)
+    assert q.ndim == 3, f"expected q=[sq,h,d], got {q.shape}"
+    assert kv.ndim == 2, f"expected kv=[skv,d], got {kv.shape}"
+    assert indices.ndim == 1, f"expected indices=[nnz], got {indices.shape}"
+    assert indptr.ndim == 1, f"expected indptr=[sq+1], got {indptr.shape}"
+    assert q.is_cuda and kv.is_cuda and indices.is_cuda and indptr.is_cuda
+
+    indices = _as_int32_contiguous_1d(indices)
+    indptr = _as_int32_contiguous_1d(indptr)
+    has_attn_sink = attn_sink is not None
+    if attn_sink is None:
+        attn_sink = torch.empty(1, device=q.device, dtype=torch.float32)
+    else:
+        attn_sink = attn_sink.contiguous()
 
+    num_queries, num_heads, head_dim = q.shape
+    assert indptr.numel() == num_queries + 1, (
+        f"expected indptr shape [{num_queries + 1}], got {indptr.shape}"
+    )
+    _validate_dsv4_sparse_dims(
+        head_dim,
+        nope_head_dim,
+        rope_head_dim,
+        "_rocm_sparse_attn_prefill_ragged_triton",
+    )
 
-def rocm_sparse_attn_prefill(
+    block_h = 16
+    block_d = triton.next_power_of_2(head_dim)
+    block_k = 16 if head_dim >= 256 else 32
+    out = torch.empty_like(q, dtype=torch.bfloat16)
+    _sparse_attn_prefill_ragged_kernel[(num_queries, triton.cdiv(num_heads, block_h))](
+        q,
+        kv,
+        indices,
+        indptr,
+        attn_sink,
+        out,
+        q.stride(0),
+        q.stride(1),
+        q.stride(2),
+        kv.stride(0),
+        kv.stride(1),
+        out.stride(0),
+        out.stride(1),
+        out.stride(2),
+        num_heads,
+        head_dim,
+        kv.shape[0],
+        float(scale),
+        HAS_ATTN_SINK=has_attn_sink,
+        BLOCK_H=block_h,
+        BLOCK_D=block_d,
+        BLOCK_K=block_k,
+        num_warps=8,
+    )
+    return out
+
+
+def _rocm_sparse_attn_prefill_triton(
     q: torch.Tensor,
     kv: torch.Tensor,
     indices: torch.Tensor,
-    topk_length: torch.Tensor | None,
     scale: float,
-    head_dim: int,
     attn_sink: torch.Tensor | None,
-    output: torch.Tensor,
-) -> None:
-    output_chunk = rocm_ref_sparse_attn_prefill(
+    nope_head_dim: int,
+    rope_head_dim: int,
+    topk_length: torch.Tensor | None = None,
+) -> torch.Tensor:
+    ragged_indices, ragged_indptr = build_ragged_indices_from_dense(
+        indices,
+        topk_length
+        if topk_length is not None
+        else (indices >= 0).sum(dim=-1, dtype=torch.int32),
+        num_rows=kv.shape[0],
+    )
+    return _rocm_sparse_attn_prefill_ragged_triton(
         q=q,
         kv=kv,
-        indices=indices,
-        topk_length=topk_length,
+        indices=ragged_indices,
+        indptr=ragged_indptr,
         scale=scale,
-        head_dim=head_dim,
         attn_sink=attn_sink,
+        nope_head_dim=nope_head_dim,
+        rope_head_dim=rope_head_dim,
     )
-    output.copy_(output_chunk.to(output.dtype))
 
 
-def rocm_dequantize_blocked_k_cache(
-    quant_k_cache: torch.Tensor,
-    head_dim: int,
+def _rocm_sparse_attn_decode_ragged_triton(
+    q: torch.Tensor,
+    main_cache: torch.Tensor,
+    main_indices: torch.Tensor,
+    main_indptr: torch.Tensor,
+    scale: float,
+    attn_sink: torch.Tensor | None,
     nope_head_dim: int,
     rope_head_dim: int,
+    extra_cache: torch.Tensor | None = None,
+    extra_indices: torch.Tensor | None = None,
+    extra_indptr: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    fp8_dtype = current_platform.fp8_dtype()
-    tile_size = 64
-    num_tiles = nope_head_dim // tile_size
-
-    num_blocks, block_size, _ = quant_k_cache.shape
-    quant_k_cache = quant_k_cache.view(num_blocks, -1)
-    input_nope_rope = quant_k_cache[
-        :, : block_size * (nope_head_dim + 2 * rope_head_dim)
-    ].view(num_blocks, block_size, nope_head_dim + 2 * rope_head_dim)
-    input_nope = input_nope_rope[:, :, :nope_head_dim].view(fp8_dtype)
-    input_rope = input_nope_rope[:, :, nope_head_dim:].view(torch.bfloat16)
-    input_scale = (
-        quant_k_cache[:, block_size * (nope_head_dim + 2 * rope_head_dim) :]
-        .view(num_blocks, block_size, 8)[:, :, :num_tiles]
-        .view(torch.float8_e8m0fnu)
+    assert q.ndim == 3, f"expected q=[b,h,d], got {q.shape}"
+    assert main_cache.ndim == 3, (
+        f"expected main_cache=[blocks,block,bytes], got {main_cache.shape}"
+    )
+    assert main_indices.ndim == 1, (
+        f"expected main_indices=[nnz], got {main_indices.shape}"
+    )
+    assert main_indptr.ndim == 1, f"expected main_indptr=[b+1], got {main_indptr.shape}"
+    assert (
+        q.is_cuda
+        and main_cache.is_cuda
+        and main_indices.is_cuda
+        and main_indptr.is_cuda
     )
 
-    result = torch.empty(
-        (num_blocks, block_size, 1, head_dim),
-        dtype=torch.bfloat16,
-        device=quant_k_cache.device,
+    main_indices = _as_int32_contiguous_1d(main_indices)
+    main_indptr = _as_int32_contiguous_1d(main_indptr)
+    has_attn_sink = attn_sink is not None
+    if attn_sink is None:
+        attn_sink = torch.empty(1, device=q.device, dtype=torch.float32)
+    else:
+        attn_sink = attn_sink.contiguous()
+
+    num_queries, num_heads, head_dim = q.shape
+    assert main_indptr.numel() == num_queries + 1, (
+        f"expected main_indptr shape [{num_queries + 1}], got {main_indptr.shape}"
+    )
+    _validate_dsv4_sparse_dims(
+        head_dim,
+        nope_head_dim,
+        rope_head_dim,
+        "_rocm_sparse_attn_decode_ragged_triton",
+    )
+
+    has_extra = (
+        extra_cache is not None
+        and extra_indices is not None
+        and extra_indptr is not None
+    )
+    if has_extra:
+        assert extra_cache is not None
+        assert extra_indices is not None
+        assert extra_indptr is not None
+        assert extra_indices.ndim == 1, (
+            f"expected extra_indices=[nnz], got {extra_indices.shape}"
+        )
+        assert extra_indptr.ndim == 1, (
+            f"expected extra_indptr=[b+1], got {extra_indptr.shape}"
+        )
+        extra_indices = _as_int32_contiguous_1d(extra_indices)
+        extra_indptr = _as_int32_contiguous_1d(extra_indptr)
+        assert extra_indptr.numel() == num_queries + 1, (
+            f"expected extra_indptr shape [{num_queries + 1}], got {extra_indptr.shape}"
+        )
+    else:
+        extra_cache = main_cache
+        extra_indices = torch.empty(0, device=q.device, dtype=torch.int32)
+        extra_indptr = torch.zeros(num_queries + 1, device=q.device, dtype=torch.int32)
+
+    block_h = 16
+    block_k = 16 if head_dim >= 256 else 32
+    out = torch.empty_like(q, dtype=torch.bfloat16)
+    _sparse_attn_decode_ragged_kernel[(num_queries, triton.cdiv(num_heads, block_h))](
+        q,
+        main_cache,
+        main_indices,
+        main_indptr,
+        extra_cache,
+        extra_indices,
+        extra_indptr,
+        attn_sink,
+        out,
+        q.stride(0),
+        q.stride(1),
+        out.stride(0),
+        out.stride(1),
+        main_cache.stride(0),
+        extra_cache.stride(0),
+        main_cache.shape[0] * main_cache.shape[1],
+        extra_cache.shape[0] * extra_cache.shape[1],
+        main_cache.shape[1],
+        extra_cache.shape[1],
+        scale,
+        num_heads,
+        HAS_ATTN_SINK=has_attn_sink,
+        HAS_EXTRA=has_extra,
+        NOPE_DIM=nope_head_dim,
+        NOPE_BLOCK=triton.next_power_of_2(nope_head_dim),
+        ROPE_DIM=rope_head_dim,
+        IS_FNUZ=current_platform.is_fp8_fnuz(),
+        BLOCK_H=block_h,
+        BLOCK_K=block_k,
+        num_warps=8,
     )
-    result[..., nope_head_dim:] = input_rope.unsqueeze(2)
-    for tile_idx in range(num_tiles):
-        cur_nope = input_nope[
-            ..., tile_idx * tile_size : (tile_idx + 1) * tile_size
-        ].to(torch.bfloat16)
-        cur_scales = input_scale[:, :, tile_idx].to(torch.bfloat16).unsqueeze(-1)
-        result[..., tile_idx * tile_size : (tile_idx + 1) * tile_size] = (
-            cur_nope * cur_scales
-        ).unsqueeze(2)
-    return result
-
-
-def rocm_ref_sparse_attn_decode(
+    return out
+
+
+def _rocm_sparse_attn_decode_triton(
     q: torch.Tensor,
-    blocked_k: torch.Tensor,
-    indices_in_kvcache: torch.Tensor,
-    topk_length: torch.Tensor | None,
+    main_cache: torch.Tensor,
+    main_indices: torch.Tensor,
     scale: float,
-    head_dim: int,
     attn_sink: torch.Tensor | None,
-    extra_blocked_k: torch.Tensor | None = None,
-    extra_indices_in_kvcache: torch.Tensor | None = None,
-    extra_topk_length: torch.Tensor | None = None,
+    nope_head_dim: int,
+    rope_head_dim: int,
+    extra_cache: torch.Tensor | None = None,
+    extra_indices: torch.Tensor | None = None,
+    main_lengths: torch.Tensor | None = None,
+    extra_lengths: torch.Tensor | None = None,
+    main_ragged_indices: torch.Tensor | None = None,
+    main_ragged_indptr: torch.Tensor | None = None,
+    extra_ragged_indices: torch.Tensor | None = None,
+    extra_ragged_indptr: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    b, s_q, h_q, d_qk = q.shape
-
-    def process_scope(
-        cur_blocked_k: torch.Tensor,
-        cur_indices: torch.Tensor,
-        cur_topk_length: torch.Tensor | None,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        cur_indices = cur_indices.reshape(b, s_q, -1)
-        topk = cur_indices.size(-1)
-        fixed_indices = torch.clamp_min(cur_indices, 0)
-        gathered_kv = (
-            cur_blocked_k.view(-1, d_qk)
-            .index_select(0, fixed_indices.view(-1))
-            .view(b, s_q, topk, d_qk)
+    if main_ragged_indices is None or main_ragged_indptr is None:
+        main_ragged_indices, main_ragged_indptr = build_ragged_indices_from_dense(
+            main_indices,
+            main_lengths
+            if main_lengths is not None
+            else (main_indices >= 0).sum(dim=-1, dtype=torch.int32),
+            num_rows=main_cache.shape[0] * main_cache.shape[1],
+        )
+
+    if (
+        (extra_ragged_indices is None or extra_ragged_indptr is None)
+        and extra_cache is not None
+        and extra_indices is not None
+    ):
+        extra_ragged_indices, extra_ragged_indptr = build_ragged_indices_from_dense(
+            extra_indices,
+            extra_lengths
+            if extra_lengths is not None
+            else (extra_indices >= 0).sum(dim=-1, dtype=torch.int32),
+            num_rows=extra_cache.shape[0] * extra_cache.shape[1],
         )
-        invalid_mask = cur_indices == -1
-        if cur_topk_length is not None:
-            cur_topk_length = cur_topk_length.reshape(b)
-            invalid_mask |= torch.arange(0, topk, device=invalid_mask.device).view(
-                1, 1, topk
-            ) >= cur_topk_length.view(b, 1, 1)
-        return gathered_kv, invalid_mask
-
-    gathered_kv, invalid_mask = process_scope(
-        blocked_k, indices_in_kvcache, topk_length
+
+    return _rocm_sparse_attn_decode_ragged_triton(
+        q=q,
+        main_cache=main_cache,
+        main_indices=main_ragged_indices,
+        main_indptr=main_ragged_indptr,
+        scale=scale,
+        attn_sink=attn_sink,
+        nope_head_dim=nope_head_dim,
+        rope_head_dim=rope_head_dim,
+        extra_cache=extra_cache,
+        extra_indices=extra_ragged_indices,
+        extra_indptr=extra_ragged_indptr,
+    )
+
+
+def rocm_sparse_attn_prefill(
+    q: torch.Tensor,
+    kv: torch.Tensor,
+    indices: torch.Tensor,
+    topk_length: torch.Tensor | None,
+    scale: float,
+    head_dim: int,
+    nope_head_dim: int,
+    rope_head_dim: int,
+    attn_sink: torch.Tensor | None,
+    output: torch.Tensor,
+    ragged_indices: torch.Tensor | None = None,
+    ragged_indptr: torch.Tensor | None = None,
+) -> None:
+    assert kv.ndim == 3 and kv.shape[1] == 1, (
+        f"ROCm Triton sparse prefill expects kv=[skv,1,d], got {kv.shape}"
+    )
+    _validate_dsv4_sparse_dims(
+        head_dim,
+        nope_head_dim,
+        rope_head_dim,
+        "rocm_sparse_attn_prefill",
     )
-    if extra_blocked_k is not None:
-        assert extra_indices_in_kvcache is not None
-        gathered_kv1, invalid_mask1 = process_scope(
-            extra_blocked_k, extra_indices_in_kvcache, extra_topk_length
+    if ragged_indices is not None and ragged_indptr is not None:
+        output_chunk = _rocm_sparse_attn_prefill_ragged_triton(
+            q=q,
+            kv=kv.squeeze(1),
+            indices=ragged_indices,
+            indptr=ragged_indptr,
+            scale=scale,
+            attn_sink=None if attn_sink is None else attn_sink[: q.shape[1]],
+            nope_head_dim=nope_head_dim,
+            rope_head_dim=rope_head_dim,
         )
-        gathered_kv = torch.cat([gathered_kv, gathered_kv1], dim=2)
-        invalid_mask = torch.cat([invalid_mask, invalid_mask1], dim=2)
-
-    gathered_kv = gathered_kv.view(b * s_q, -1, d_qk).float()
-    gathered_kv[gathered_kv != gathered_kv] = 0.0
-    qf = q.float().view(b * s_q, h_q, d_qk)
-    attn_weight = qf @ gathered_kv.transpose(-1, -2)
-    attn_weight *= scale
-    attn_weight[
-        invalid_mask.view(b * s_q, 1, -1).expand(b * s_q, h_q, invalid_mask.size(-1))
-    ] = float("-inf")
-    lse = attn_weight.logsumexp(dim=-1)
-    attn_weight = torch.exp(attn_weight - lse.unsqueeze(-1))
-    output = attn_weight @ gathered_kv[..., :head_dim]
-    output = output.view(b, s_q, h_q, head_dim)
-    lse = lse.view(b, s_q, h_q)
-
-    if attn_sink is not None:
-        output *= (1.0 / (1.0 + torch.exp(attn_sink.view(1, 1, h_q) - lse))).unsqueeze(
-            -1
+    else:
+        indices_2d = indices.reshape(indices.shape[0], -1)
+        output_chunk = _rocm_sparse_attn_prefill_triton(
+            q=q,
+            kv=kv.squeeze(1),
+            indices=indices_2d,
+            scale=scale,
+            attn_sink=None if attn_sink is None else attn_sink[: q.shape[1]],
+            nope_head_dim=nope_head_dim,
+            rope_head_dim=rope_head_dim,
+            topk_length=topk_length,
         )
-
-    lonely_q_mask = lse == float("-inf")
-    output[lonely_q_mask.unsqueeze(-1).expand_as(output)] = 0.0
-    return output.squeeze(1).to(torch.bfloat16)
+    output.copy_(output_chunk.to(output.dtype))
 
 
-def rocm_forward_decode_fallback(
+def rocm_sparse_attn_decode(
     q: torch.Tensor,
     kv_cache: torch.Tensor | None,
     swa_k_cache: torch.Tensor,
@@ -1092,6 +1664,10 @@ def rocm_forward_decode_fallback(
     topk_lens: torch.Tensor | None,
     swa_indices: torch.Tensor,
     swa_lens: torch.Tensor,
+    swa_ragged_indices: torch.Tensor | None,
+    swa_ragged_indptr: torch.Tensor | None,
+    topk_ragged_indices: torch.Tensor | None,
+    topk_ragged_indptr: torch.Tensor | None,
     attn_sink: torch.Tensor | None,
     scale: float,
     head_dim: int,
@@ -1099,31 +1675,49 @@ def rocm_forward_decode_fallback(
     rope_head_dim: int,
     output: torch.Tensor,
 ) -> None:
-    blocked_swa = rocm_dequantize_blocked_k_cache(
-        swa_k_cache,
-        head_dim=head_dim,
-        nope_head_dim=nope_head_dim,
-        rope_head_dim=rope_head_dim,
+    assert swa_k_cache.dtype == torch.uint8, (
+        "ROCm Triton sparse decode expects uint8 fp8_ds_mla SWA cache, "
+        f"got {swa_k_cache.dtype}"
+    )
+    _validate_dsv4_sparse_dims(
+        head_dim,
+        nope_head_dim,
+        rope_head_dim,
+        "rocm_sparse_attn_decode",
     )
-    blocked_extra = None
+
+    main_indices = swa_indices.reshape(swa_indices.shape[0], -1)
+
+    extra_cache = None
+    extra_indices = None
     if not swa_only:
         assert kv_cache is not None
-        blocked_extra = rocm_dequantize_blocked_k_cache(
-            kv_cache,
-            head_dim=head_dim,
-            nope_head_dim=nope_head_dim,
-            rope_head_dim=rope_head_dim,
+        assert topk_indices is not None or (
+            topk_ragged_indices is not None and topk_ragged_indptr is not None
         )
-    attn_out = rocm_ref_sparse_attn_decode(
-        q=q.unsqueeze(1),
-        blocked_k=blocked_swa,
-        indices_in_kvcache=swa_indices.unsqueeze(1),
-        topk_length=swa_lens,
+        assert kv_cache.dtype == torch.uint8, (
+            "ROCm Triton sparse decode expects uint8 fp8_ds_mla extra cache, "
+            f"got {kv_cache.dtype}"
+        )
+        extra_cache = kv_cache
+        if topk_indices is not None:
+            extra_indices = topk_indices.reshape(topk_indices.shape[0], -1)
+
+    attn_out = _rocm_sparse_attn_decode_triton(
+        q=q,
+        main_cache=swa_k_cache,
+        main_indices=main_indices,
         scale=scale,
-        head_dim=head_dim,
-        attn_sink=attn_sink[: q.shape[1]] if attn_sink is not None else None,
-        extra_blocked_k=blocked_extra,
-        extra_indices_in_kvcache=topk_indices,
-        extra_topk_length=topk_lens,
+        attn_sink=None if attn_sink is None else attn_sink[: q.shape[1]],
+        nope_head_dim=nope_head_dim,
+        rope_head_dim=rope_head_dim,
+        extra_cache=extra_cache,
+        extra_indices=extra_indices,
+        main_lengths=swa_lens,
+        extra_lengths=topk_lens,
+        main_ragged_indices=swa_ragged_indices,
+        main_ragged_indptr=swa_ragged_indptr,
+        extra_ragged_indices=topk_ragged_indices,
+        extra_ragged_indptr=topk_ragged_indptr,
     )
     output.copy_(attn_out.to(output.dtype))
diff --git a/vllm/v1/attention/ops/triton_decode_attention.py b/vllm/v1/attention/ops/triton_decode_attention.py
index e1059b47bcba..c58a7026e89b 100644
--- a/vllm/v1/attention/ops/triton_decode_attention.py
+++ b/vllm/v1/attention/ops/triton_decode_attention.py
@@ -459,25 +459,29 @@ def _decode_grouped_att_m_fwd(
 ):
     # with is_mla there is only a single c_kv in smem.
     # could increase BLOCK or num_stages.
-    BLOCK = 32
     Lk = k_buffer.shape[-1]
     Lv = v_buffer.shape[-1]
 
-    # [TODO] work around shmem limit on MI3xx
-    if is_hip_ and Lk >= 576:
-        BLOCK = 16
-
-    if Lk == 576:
-        BLOCK_DMODEL = 512
-        BLOCK_DPE = 64
-    elif Lk == 288:
-        BLOCK_DMODEL = 256
-        BLOCK_DPE = 32
+    # Align tile dimensions with latent rank for MLA to avoid shape mismatch.
+    if is_mla:
+        if not is_hip_ and Lk == 576:
+            BLOCK_DMODEL = 512
+            BLOCK_DPE = 64
+        elif not is_hip_ and Lk == 288:
+            BLOCK_DMODEL = 256
+            BLOCK_DPE = 32
+        else:
+            BLOCK_DMODEL = triton.next_power_of_2(Lv)
+            BLOCK_DPE = triton.next_power_of_2(Lk - Lv) if Lk > Lv else 0
     else:
         BLOCK_DMODEL = triton.next_power_of_2(Lk)
         BLOCK_DPE = 0
     BLOCK_DV = triton.next_power_of_2(Lv)
 
+    BLOCK = 32
+    if is_hip_:
+        BLOCK = 16
+
     batch, head_num = q.shape[0], q.shape[1]
     kv_group_num = q.shape[1] // k_buffer.shape[-2]
 
@@ -496,6 +500,11 @@ def _decode_grouped_att_m_fwd(
         # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
         extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
         num_stages = 1
+    elif not is_hip_ and BLOCK_DMODEL >= 1024:
+        # Avoid shared memory overflow on NVIDIA when BLOCK_DMODEL is large,
+        # e.g. non-MLA D_QK=576 gives BLOCK_DMODEL=1024, which with BLOCK_H=16
+        # exceeds the 101376-byte limit.
+        num_stages = 1
 
     _fwd_grouped_kernel_stage1[grid](
         q,
diff --git a/vllm/v1/attention/selector.py b/vllm/v1/attention/selector.py
index a328b1830103..d98de6966f52 100644
--- a/vllm/v1/attention/selector.py
+++ b/vllm/v1/attention/selector.py
@@ -12,7 +12,6 @@
 from vllm.utils.import_utils import resolve_obj_by_qualname
 from vllm.v1.attention.backend import AttentionBackend, AttentionType
 from vllm.v1.attention.backends.registry import (
-    MAMBA_TYPE_TO_BACKEND_MAP,
     MambaAttentionBackendEnum,
 )
 
@@ -138,7 +137,7 @@ def _cached_get_attn_backend(
 
 
 def get_mamba_attn_backend(
-    mamba_type: str,
+    mamba_type: MambaAttentionBackendEnum,
 ) -> type[AttentionBackend]:
     """Select which mamba attention backend to use and lazily import it."""
     return _cached_get_mamba_attn_backend(mamba_type)
@@ -146,21 +145,11 @@ def get_mamba_attn_backend(
 
 @cache
 def _cached_get_mamba_attn_backend(
-    mamba_type: str,
+    mamba_type: MambaAttentionBackendEnum,
 ) -> type[AttentionBackend]:
-    assert mamba_type and isinstance(mamba_type, str)
+    assert mamba_type and isinstance(mamba_type, MambaAttentionBackendEnum)
 
-    selected_backend = None
-    try:
-        backend_name = MAMBA_TYPE_TO_BACKEND_MAP[mamba_type]
-        selected_backend = MambaAttentionBackendEnum[backend_name]
-    except KeyError as e:
-        raise ValueError(
-            f"Invalid mamba attention backend type: '{mamba_type}'. Valid "
-            f"types are: {list(MAMBA_TYPE_TO_BACKEND_MAP.keys())}"
-        ) from e
-
-    mamba_attn_backend = selected_backend.get_class()
+    mamba_attn_backend = mamba_type.get_class()
     if envs.VLLM_BATCH_INVARIANT and not mamba_attn_backend.supports_batch_invariance():
         raise RuntimeError(
             "VLLM batch_invariant mode is not supported for "
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 8aaeb3970079..39e2f966adbe 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1681,6 +1681,8 @@ def add_request(self, request: Request) -> None:
                 request.streaming_queue = deque()
             self._enqueue_waiting_request(request)
             self.requests[request.request_id] = request
+            if self.connector is not None:
+                self.connector.on_new_request(request)
             if self.log_stats:
                 request.record_event(EngineCoreEventType.QUEUED)
 
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 8172ead08319..f6c80055b651 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -122,6 +122,12 @@ class EngineCoreRequest(
     reasoning_ended: bool | None = None
     reasoning_parser_kwargs: dict[str, Any] | None = None
 
+    # If True, the request should be added to the scheduler's waiting queue
+    # and immediately aborted, so connector-side cleanup runs via the standard
+    # request_finished hook. Used to free P-side prefill blocks when a
+    # KV-transfer request is rejected on the D node before engine admission.
+    abort_immediately: bool = False
+
     @property
     def params(self) -> SamplingParams | PoolingParams:
         """Return the processed params (sampling or pooling)."""
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 02ed2917c7d6..32cc3c0d2d05 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -720,6 +720,33 @@ async def abort(
         if self.log_requests:
             logger.info("Aborted request(s) %s.", ",".join(request_ids))
 
+    async def notify_kv_transfer_request_rejected(
+        self,
+        request_id: str,
+        kv_transfer_params: dict[str, Any],
+        *,
+        data_parallel_rank: int | None = None,
+    ) -> None:
+        """Submit a pre-aborted request so the connector's request_finished
+        hook runs to free any pre-admission KV-transfer resources (e.g. NIXL
+        prefill blocks pinned on the P node)."""
+        request = EngineCoreRequest(
+            request_id=request_id,
+            prompt_token_ids=[0],
+            mm_features=None,
+            sampling_params=SamplingParams(
+                max_tokens=1,
+                extra_args={"kv_transfer_params": dict(kv_transfer_params)},
+            ),
+            pooling_params=None,
+            arrival_time=time.time(),
+            lora_request=None,
+            cache_salt=None,
+            data_parallel_rank=data_parallel_rank,
+            abort_immediately=True,
+        )
+        await self.engine_core.add_request_async(request)
+
     async def pause_generation(
         self,
         *,
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 11c5ee19a664..07d3cca05188 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -344,6 +344,10 @@ def add_request(self, request: Request, request_wave: int = 0):
             )
 
         self.scheduler.add_request(request)
+        if request.abort_immediately:
+            # Immediately abort so the connector's request_finished hook runs
+            # to free any pre-admission KV-transfer resources.
+            self.abort_requests([request.request_id])
 
     def abort_requests(self, request_ids: list[str]):
         """Abort requests from the scheduler."""
@@ -2001,6 +2005,8 @@ def __init__(
         vllm_config.parallel_config.data_parallel_index = dp_rank
         vllm_config.parallel_config.data_parallel_rank_local = local_dp_rank
 
+        self._set_nixl_side_channel_host()
+
         # Set CUDA_VISIBLE_DEVICES as early as possible in actor life cycle
         # NOTE: in MP we set CUDA_VISIBLE_DEVICES at process creation time,
         # and this cannot be done in the same way for Ray because:
@@ -2020,6 +2026,16 @@ def __init__(
         # of ray.
         self._set_visible_devices(vllm_config, local_dp_rank)
 
+    @staticmethod
+    def _set_nixl_side_channel_host():
+        import ray
+
+        # The driver-side value is excluded from Ray actor env propagation.
+        # Fill in an actor-local default while preserving explicit overrides.
+        os.environ.setdefault(
+            "VLLM_NIXL_SIDE_CHANNEL_HOST", ray.util.get_node_ip_address()
+        )
+
     def _set_visible_devices(self, vllm_config: VllmConfig, local_dp_rank: int):
         from vllm.platforms import current_platform
 
diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py
index a28c1366dd84..28df7bbc718d 100644
--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -27,6 +27,7 @@
 from vllm.utils.system_utils import get_mp_context
 from vllm.v1.engine.coordinator import DPCoordinator
 from vllm.v1.executor import Executor
+from vllm.v1.executor.ray_utils import WORKER_SPECIFIC_ENV_VARS
 from vllm.v1.utils import get_engine_client_zmq_addr, shutdown
 
 if TYPE_CHECKING:
@@ -356,7 +357,10 @@ def __init__(
         self.local_engine_actors: list[ray.ActorHandle] = []
         self.remote_engine_actors: list[ray.ActorHandle] = []
 
-        env_vars_list = get_env_vars_to_copy(destination=actor_class.__name__)
+        env_vars_list = get_env_vars_to_copy(
+            destination=actor_class.__name__,
+            exclude_vars=WORKER_SPECIFIC_ENV_VARS,
+        )
         self.env_vars_dict = {
             name: os.environ[name] for name in env_vars_list if name in os.environ
         }
diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
index 1541b24deaaf..9083b9195912 100644
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -33,6 +33,7 @@
 WORKER_SPECIFIC_ENV_VARS: set[str] = {
     "VLLM_HOST_IP",
     "VLLM_HOST_PORT",
+    "VLLM_NIXL_SIDE_CHANNEL_HOST",
     "LOCAL_RANK",
     "CUDA_VISIBLE_DEVICES",
     "HIP_VISIBLE_DEVICES",
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 19438fb1e42d..cf50dbff179a 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -16,6 +16,7 @@
 from vllm.logger import init_logger
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import get_dtype_size, nvfp4_kv_cache_full_dim
+from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -532,7 +533,7 @@ class MambaSpec(KVCacheSpec):
     shapes: tuple[tuple[int, ...], ...]
     dtypes: tuple[torch.dtype]
     page_size_padded: int | None = None
-    mamba_type: str = "mamba2"
+    mamba_type: MambaAttentionBackendEnum = MambaAttentionBackendEnum.MAMBA2
     mamba_cache_mode: str = "none"
     num_speculative_blocks: int = 0
 
diff --git a/vllm/v1/kv_offload/base.py b/vllm/v1/kv_offload/base.py
index 3d403ea50837..fed47cf5993a 100644
--- a/vllm/v1/kv_offload/base.py
+++ b/vllm/v1/kv_offload/base.py
@@ -147,22 +147,24 @@ def prepare_load(
         """
         pass
 
-    def touch(self, keys: Collection[OffloadKey]):
+    def touch(self, keys: Collection[OffloadKey], req_context: ReqContext):
         """
         Mark the given blocks as recently used.
         This could in practice mean moving them to the end of an LRU list.
 
         Args:
             keys: the keys identifying the blocks.
+            req_context: per-request context (e.g. kv_transfer_params).
         """
         return
 
-    def complete_load(self, keys: Collection[OffloadKey]):
+    def complete_load(self, keys: Collection[OffloadKey], req_context: ReqContext):
         """
         Marks previous blocks that were prepared to load as done loading.
 
         Args:
             keys: the keys identifying the blocks.
+            req_context: per-request context (e.g. kv_transfer_params).
         """
         return
 
@@ -189,7 +191,12 @@ def prepare_store(
         """
         pass
 
-    def complete_store(self, keys: Collection[OffloadKey], success: bool = True):
+    def complete_store(
+        self,
+        keys: Collection[OffloadKey],
+        req_context: ReqContext,
+        success: bool = True,
+    ):
         """
         Marks blocks which were previously prepared to be stored, as stored.
         Following this call, the blocks become loadable.
@@ -198,6 +205,7 @@ def complete_store(self, keys: Collection[OffloadKey], success: bool = True):
 
         Args:
             keys: the keys identifying the blocks.
+            req_context: per-request context (e.g. kv_transfer_params).
             success: whether the blocks were stored successfully.
         """
         return
diff --git a/vllm/v1/kv_offload/cpu/gpu_worker.py b/vllm/v1/kv_offload/cpu/gpu_worker.py
index c8cd0ad3b46c..2da3f1038e96 100644
--- a/vllm/v1/kv_offload/cpu/gpu_worker.py
+++ b/vllm/v1/kv_offload/cpu/gpu_worker.py
@@ -313,10 +313,22 @@ def transfer_async(self, job_id: int, transfer_spec: TransferSpec) -> bool:
             last_event = last_transfer.end_event
             # assure job will start only after the previous one completes
             stream.wait_event(last_event)
+        # CPU->GPU reads from host pinned memory, which is never written
+        # by a concurrent GPU stream, so CU_MEMCPY_SRC_ACCESS_ORDER_ANY is
+        # safe and lets the driver pipeline source reads. GPU->CPU reads
+        # from the live GPU KV cache, which the compute stream keeps
+        # writing; we must keep STREAM ordering so source reads are gated
+        # by the transfer stream's wait_stream(compute) barrier.
+        is_src_access_order_any = not self.gpu_to_cpu
         with torch.cuda.stream(stream):
             start_event.record(stream)
             if num_copy_ops > 0:
-                ops.swap_blocks_batch(batch_src, batch_dst, batch_sizes)
+                ops.swap_blocks_batch(
+                    batch_src,
+                    batch_dst,
+                    batch_sizes,
+                    is_src_access_order_any=is_src_access_order_any,
+                )
             end_event.record(stream)
 
         self._transfer_events[job_id] = end_event
diff --git a/vllm/v1/kv_offload/cpu/manager.py b/vllm/v1/kv_offload/cpu/manager.py
index 80bcb568f99a..9751f616dcf4 100644
--- a/vllm/v1/kv_offload/cpu/manager.py
+++ b/vllm/v1/kv_offload/cpu/manager.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections import OrderedDict
 from collections.abc import Collection, Iterable
 from typing import Literal
 
@@ -37,6 +38,8 @@ def __init__(
         num_blocks: int,
         cache_policy: Literal["lru", "arc"] = "lru",
         enable_events: bool = False,
+        store_threshold: int = 1,
+        max_tracker_size: int = 64_000,
     ):
         self.medium: str = CPULoadStoreSpec.medium()
         self._num_blocks: int = num_blocks
@@ -50,6 +53,13 @@ def __init__(
                 f"Supported: {list(_CACHE_POLICIES)}"
             )
         self._policy: CachePolicy = policy_cls(cache_capacity=num_blocks)
+        self.store_threshold: int = store_threshold
+        self.max_tracker_size: int = max_tracker_size
+
+        # Lookup count per key; ordered so the LRU entry can be evicted in O(1).
+        self.counts: OrderedDict[OffloadKey, int] | None = (
+            OrderedDict() if store_threshold >= 2 else None
+        )
 
     # --- block pool ---
 
@@ -85,6 +95,14 @@ def _get_load_store_spec(
     # --- OffloadingManager interface ---
 
     def lookup(self, key: OffloadKey, req_context: ReqContext) -> bool | None:
+        if self.counts is not None:
+            # Reuse tracking: count lookups per key in an LRU-ordered table.
+            count = self.counts.pop(key, 0)
+            # A new key evicts the LRU entry when the table is full; skip the
+            # eviction on an empty table (max_tracker_size < 1) to avoid KeyError.
+            if not count and self.counts and len(self.counts) >= self.max_tracker_size:
+                self.counts.popitem(last=False)
+            self.counts[key] = count + 1
         block = self._policy.get(key)
         if block is None:
             return False
@@ -106,10 +124,12 @@ def prepare_load(
             blocks.append(block)
         return self._get_load_store_spec(keys, blocks)
 
-    def touch(self, keys: Collection[OffloadKey]) -> None:
+    def touch(self, keys: Collection[OffloadKey], req_context: ReqContext) -> None:
         self._policy.touch(keys)
 
-    def complete_load(self, keys: Collection[OffloadKey]) -> None:
+    def complete_load(
+        self, keys: Collection[OffloadKey], req_context: ReqContext
+    ) -> None:
         for key in keys:
             block = self._policy.get(key)
             assert block is not None, f"Block {key!r} not found"
@@ -121,6 +141,8 @@ def prepare_store(
         keys: Collection[OffloadKey],
         req_context: ReqContext,
     ) -> PrepareStoreOutput | None:
+        if self.counts is not None:
+            keys = [k for k in keys if self.counts.get(k, 0) >= self.store_threshold]
         # filter out blocks that are already stored
         keys_to_store = [k for k in keys if self._policy.get(k) is None]
 
@@ -172,7 +194,10 @@ def prepare_store(
         )
 
     def complete_store(
-        self, keys: Collection[OffloadKey], success: bool = True
+        self,
+        keys: Collection[OffloadKey],
+        req_context: ReqContext,
+        success: bool = True,
     ) -> None:
         stored_keys: list[OffloadKey] = []
 
diff --git a/vllm/v1/kv_offload/cpu/spec.py b/vllm/v1/kv_offload/cpu/spec.py
index 54046d98f452..754c60f1321e 100644
--- a/vllm/v1/kv_offload/cpu/spec.py
+++ b/vllm/v1/kv_offload/cpu/spec.py
@@ -15,7 +15,6 @@
 from vllm.v1.kv_offload.cpu.common import CPULoadStoreSpec
 from vllm.v1.kv_offload.cpu.gpu_worker import CpuGpuOffloadingHandlers
 from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager
-from vllm.v1.kv_offload.reuse_manager import FilterReusedOffloadingManager
 from vllm.v1.kv_offload.worker.worker import OffloadingHandler
 
 
@@ -61,25 +60,21 @@ def get_manager(self) -> OffloadingManager:
                 kv_events_config is not None and kv_events_config.enable_kv_cache_events
             )
 
+            # store_threshold: how many times a block must appear in lookup()
+            # before it is eligible for CPU offloading.  Values < 2 disable
+            # filtering (a threshold of 1 equals no filter; 0 is the default).
+            store_threshold = int(self.extra_config.get("store_threshold", 0))
+
+            # Maximum entries in the internal tracker's LRU table.
+            max_tracker_size = int(self.extra_config.get("max_tracker_size", 64_000))
+
             self._manager = CPUOffloadingManager(
                 num_blocks=self.num_blocks,
                 cache_policy=self.eviction_policy,  # type: ignore[arg-type]
                 enable_events=enable_events,
+                store_threshold=store_threshold,
+                max_tracker_size=max_tracker_size,
             )
-
-            # store_threshold: how many times a block must appear in lookup()
-            # before it is eligible for CPU offloading.  Values < 2 disable
-            # filtering (a threshold of 1 equals no filter; 0 is the default).
-            store_threshold = int(self.extra_config.get("store_threshold", 0))
-            if store_threshold >= 2:
-                max_tracker_size = int(
-                    self.extra_config.get("max_tracker_size", 64_000)
-                )
-                self._manager = FilterReusedOffloadingManager(
-                    backing=self._manager,
-                    store_threshold=store_threshold,
-                    max_tracker_size=max_tracker_size,
-                )
         return self._manager
 
     def get_handlers(
diff --git a/vllm/v1/kv_offload/reuse_manager.py b/vllm/v1/kv_offload/reuse_manager.py
deleted file mode 100644
index 6cb0a5f7591c..000000000000
--- a/vllm/v1/kv_offload/reuse_manager.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Reuse-frequency gating for CPU KV-cache offload stores.
-
-FilterReusedOffloadingManager — OffloadingManager decorator that skips
-    storing blocks that have not yet been seen enough times.
-"""
-
-from collections import OrderedDict
-from collections.abc import Collection, Iterable
-
-from vllm.v1.kv_offload.base import (
-    LoadStoreSpec,
-    OffloadingEvent,
-    OffloadingManager,
-    OffloadKey,
-    PrepareStoreOutput,
-    ReqContext,
-)
-
-
-class FilterReusedOffloadingManager(OffloadingManager):
-    """An :class:`OffloadingManager` decorator that skips storing blocks
-    whose reuse frequency is below *store_threshold*.
-
-    All methods are delegated to the *backing* manager.  Two methods are
-    intercepted:
-
-    * ``prepare_store`` — filters out keys that have not yet
-    * ``lookup`` — records the visited key in an internal LRU
-      counter, then delegates to the backing manager.
-      crossed the threshold *before* calling the backing
-      ``prepare_store``.
-
-    Args:
-        backing: The underlying ``OffloadingManager`` to delegate to.
-        store_threshold: A block must be seen at least this many times in
-            ``lookup()`` before it is eligible for offloading.  Must be >= 2
-            (a value of 1 would be equivalent to no filtering).
-        max_tracker_size: Maximum entries in the internal tracker's LRU table.
-    """
-
-    def __init__(
-        self,
-        backing: OffloadingManager,
-        store_threshold: int = 2,
-        max_tracker_size: int = 64_000,
-    ):
-        if store_threshold < 2:
-            raise ValueError(
-                "FilterReusedOffloadingManager store_threshold must be >= 2, "
-                f"got {store_threshold}"
-            )
-        if max_tracker_size < 1:
-            raise ValueError(
-                "FilterReusedOffloadingManager max_tracker_size must be >= 1, "
-                f"got {max_tracker_size}"
-            )
-        self._backing = backing
-        self.store_threshold = store_threshold
-        self.max_tracker_size = max_tracker_size
-        # Ordered so we can evict the LRU entry in O(1).
-        self.counts: OrderedDict[OffloadKey, int] = OrderedDict()
-
-    # ------------------------------------------------------------------
-    # Intercepted methods
-    # ------------------------------------------------------------------
-
-    def lookup(self, key: OffloadKey, req_context: ReqContext) -> bool | None:
-        """Record the key, then delegate lookup to backing manager."""
-        if key in self.counts:
-            self.counts.move_to_end(key)
-            self.counts[key] += 1
-        else:
-            if len(self.counts) >= self.max_tracker_size:
-                self.counts.popitem(last=False)  # evict LRU
-            self.counts[key] = 1
-        return self._backing.lookup(key, req_context)
-
-    def prepare_store(
-        self, keys: Collection[OffloadKey], req_context: ReqContext
-    ) -> PrepareStoreOutput | None:
-        """Filter out blocks below threshold, then delegate to backing.
-
-        Filtering is evaluated *before* calling the backing manager's
-        ``prepare_store`` so that blocks that would be skipped do not
-        consume any CPU offload capacity.
-        """
-        eligible = [
-            key for key in keys if self.counts.get(key, 0) >= self.store_threshold
-        ]
-
-        # Passing an empty list is intentional and safe — CPUOffloadingManager
-        # handles it correctly, returning a PrepareStoreOutput with empty lists.
-        # Delegate to the backing manager with only the eligible keys.
-        return self._backing.prepare_store(eligible, req_context)
-
-    # ------------------------------------------------------------------
-    # Delegated methods
-    # ------------------------------------------------------------------
-
-    def prepare_load(
-        self, keys: Collection[OffloadKey], req_context: ReqContext
-    ) -> LoadStoreSpec:
-        return self._backing.prepare_load(keys, req_context)
-
-    def touch(self, keys: Collection[OffloadKey]) -> None:
-        return self._backing.touch(keys)
-
-    def complete_load(self, keys: Collection[OffloadKey]) -> None:
-        return self._backing.complete_load(keys)
-
-    def complete_store(
-        self, keys: Collection[OffloadKey], success: bool = True
-    ) -> None:
-        return self._backing.complete_store(keys, success)
-
-    def take_events(self) -> Iterable[OffloadingEvent]:
-        return self._backing.take_events()
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 738a68c83680..0d435deb3f05 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -76,6 +76,7 @@ def __init__(
         resumable: bool = False,
         reasoning_ended: bool | None = None,
         reasoning_parser_kwargs: dict[str, Any] | None = None,
+        abort_immediately: bool = False,
     ) -> None:
         self.request_id = request_id
         self.client_index = client_index
@@ -182,6 +183,10 @@ def __init__(
         # None entry in the queue means finished.
         self.streaming_queue: deque[StreamingUpdate | None] | None = None
 
+        # If True, request should be aborted immediately after being added to
+        # the scheduler so the connector's request_finished hook runs.
+        self.abort_immediately = abort_immediately
+
     @classmethod
     def from_engine_core_request(
         cls,
@@ -206,6 +211,7 @@ def from_engine_core_request(
             resumable=request.resumable,
             reasoning_ended=request.reasoning_ended,
             reasoning_parser_kwargs=request.reasoning_parser_kwargs,
+            abort_immediately=request.abort_immediately,
         )
 
     def append_output_token_ids(
diff --git a/vllm/v1/sample/ops/bad_words.py b/vllm/v1/sample/ops/bad_words.py
index 56972e517980..62ea430ac694 100644
--- a/vllm/v1/sample/ops/bad_words.py
+++ b/vllm/v1/sample/ops/bad_words.py
@@ -23,7 +23,8 @@ def _apply_bad_words_single_batch(
         assert len(actual_prefix) == len(expected_prefix)
 
         if actual_prefix == expected_prefix:
-            logits[last_token_id] = _SMALLEST_LOGIT
+            # Assign to slice to avoid cpu->gpu sync.
+            logits[last_token_id : last_token_id + 1] = _SMALLEST_LOGIT
 
 
 def apply_bad_words(
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index a77eafba2556..d4ec2a2ddcab 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -175,27 +175,30 @@ def gather_specific_token_logprobs(
 
         # Find max number of tokens across all requests
         max_num_tokens = max(len(tids) for tids in logprob_token_ids.values())
+        pin = self.pin_memory
 
-        # Create padded token_ids tensor: [batch_size, max_num_tokens + 1]
-        # +1 for sampled token in first position
-        token_ids_tensor = torch.zeros(
-            batch_size, max_num_tokens + 1, dtype=torch.int64, device=device
+        # Build the padded token_ids and valid_mask matrices on pinned CPU,
+        # then upload non-blocking.
+        token_ids_cpu = torch.zeros(
+            batch_size, max_num_tokens + 1, dtype=torch.int64, pin_memory=pin
         )
-        token_ids_tensor[:, 0] = sampled  # First column is sampled token
-
         # Create mask for valid positions (True = valid, False = padded)
-        valid_mask = torch.zeros(
-            batch_size, max_num_tokens + 1, dtype=torch.bool, device=device
+        valid_mask_cpu = torch.zeros(
+            batch_size, max_num_tokens + 1, dtype=torch.bool, pin_memory=pin
         )
-        valid_mask[:, 0] = True  # Sampled token is always valid
-
-        # Fill in token IDs for each request
+        valid_mask_cpu[:, 0] = True  # Sampled token is always valid
         for req_idx, token_ids in logprob_token_ids.items():
             num_tokens = len(token_ids)
-            token_ids_tensor[req_idx, 1 : num_tokens + 1] = torch.tensor(
-                token_ids, dtype=torch.int64, device=device
+            token_ids_cpu[req_idx, 1 : num_tokens + 1] = torch.as_tensor(
+                token_ids, dtype=torch.int64
             )
-            valid_mask[req_idx, 1 : num_tokens + 1] = True
+            valid_mask_cpu[req_idx, 1 : num_tokens + 1] = True
+
+        token_ids_tensor = token_ids_cpu.to(device, non_blocking=True)
+        valid_mask = valid_mask_cpu.to(device, non_blocking=True)
+        # Sampled token in column 0 — fill on-device from the sampled GPU
+        # tensor so we don't need to D2H + re-upload.
+        token_ids_tensor[:, 0] = sampled
 
         # Compute logprobs using the fused Triton kernel (log_softmax + gather)
         logprobs = compute_token_logprobs(logits, token_ids_tensor)
diff --git a/vllm/v1/spec_decode/llm_base_proposer.py b/vllm/v1/spec_decode/llm_base_proposer.py
index 08ed9798ed6b..cc113025c129 100644
--- a/vllm/v1/spec_decode/llm_base_proposer.py
+++ b/vllm/v1/spec_decode/llm_base_proposer.py
@@ -1168,6 +1168,7 @@ def load_model(self, target_model: nn.Module) -> None:
             # handle multimodality
             assert hasattr(target_model, "config")
             if self.get_model_name(target_model) in [
+                "Cohere2VisionForConditionalGeneration",
                 "Exaone4_5_ForConditionalGeneration",
                 "GlmOcrForConditionalGeneration",
                 "HunYuanVLForConditionalGeneration",
diff --git a/vllm/v1/spec_decode/ngram_proposer_gpu.py b/vllm/v1/spec_decode/ngram_proposer_gpu.py
index eb24a9c933e2..7759d5c32f60 100644
--- a/vllm/v1/spec_decode/ngram_proposer_gpu.py
+++ b/vllm/v1/spec_decode/ngram_proposer_gpu.py
@@ -18,6 +18,7 @@
     VllmConfig,
 )
 from vllm.forward_context import set_forward_context
+from vllm.utils.torch_utils import async_tensor_h2d
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.utils import record_function_or_nullcontext
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -569,8 +570,8 @@ def update_ngram_gpu_tensors_incremental(
             reorder_dst.append(curr_idx)
 
     if reorder_src:
-        src_tensor = torch.tensor(reorder_src, dtype=torch.long, device=device)
-        dst_tensor = torch.tensor(reorder_dst, dtype=torch.long, device=device)
+        src_tensor = async_tensor_h2d(reorder_src, dtype=torch.long, device=device)
+        dst_tensor = async_tensor_h2d(reorder_dst, dtype=torch.long, device=device)
 
         temp_token_ids = token_ids_gpu_tensor[src_tensor].clone()
         temp_num_tokens = num_tokens_no_spec_gpu[src_tensor].clone()
diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py
index 64f4e59031e6..e7c6d81a9929 100644
--- a/vllm/v1/worker/dp_utils.py
+++ b/vllm/v1/worker/dp_utils.py
@@ -43,11 +43,13 @@ def _run_ar(
     dp_size = parallel_config.data_parallel_size
     dp_rank = parallel_config.data_parallel_rank
     device, group = _get_device_and_group(parallel_config)
-    tensor = torch.zeros(4, dp_size, device=device, dtype=torch.int32)
-    tensor[0][dp_rank] = orig_num_tokens_per_ubatch
-    tensor[1][dp_rank] = padded_num_tokens_per_ubatch
-    tensor[2][dp_rank] = 1 if should_ubatch else 0
-    tensor[3][dp_rank] = cudagraph_mode
+    # Populate this rank's contribution on CPU to reduce GPU syncs.
+    tensor_cpu = torch.zeros(4, dp_size, dtype=torch.int32)
+    tensor_cpu[0][dp_rank] = orig_num_tokens_per_ubatch
+    tensor_cpu[1][dp_rank] = padded_num_tokens_per_ubatch
+    tensor_cpu[2][dp_rank] = 1 if should_ubatch else 0
+    tensor_cpu[3][dp_rank] = cudagraph_mode
+    tensor = tensor_cpu.to(device, non_blocking=True)
     dist.all_reduce(tensor, group=group)
     return tensor
 
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index b81bd0dc59fc..6485444e6a2f 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -712,15 +712,26 @@ def add_requests(self, scheduler_output: SchedulerOutput) -> None:
             self.sampler.apply_staged_writes()
 
     def update_requests(self, scheduler_output: SchedulerOutput) -> None:
-        # Add new blocks for the existing requests.
+        # Add new blocks and update num_computed_tokens for the existing requests.
         reqs = scheduler_output.scheduled_cached_reqs
-        for req_new_block_ids, req_id in zip(reqs.new_block_ids, reqs.req_ids):
+        num_computed_tokens_np = self.req_states.num_computed_tokens_np
+        for req_id, num_computed_tokens, req_new_block_ids in zip(
+            reqs.req_ids, reqs.num_computed_tokens, reqs.new_block_ids
+        ):
+            req_index = self.req_states.req_id_to_index[req_id]
+            num_computed_tokens_np[req_index] = num_computed_tokens
             if req_new_block_ids is not None:
-                req_index = self.req_states.req_id_to_index[req_id]
                 self.block_tables.append_block_ids(
                     req_index, req_new_block_ids, overwrite=False
                 )
 
+        # Update num_computed_prefill_tokens.
+        np.minimum(
+            self.req_states.num_computed_tokens_np,
+            self.req_states.prefill_len.np,
+            out=self.req_states.num_computed_prefill_tokens,
+        )
+
     def prepare_inputs(
         self, scheduler_output: SchedulerOutput, batch_desc: BatchExecutionDescriptor
     ) -> InputBatch:
@@ -787,10 +798,7 @@ def prepare_inputs(
         async_copy_to_gpu(query_start_loc_np, out=self.input_buffers.query_start_loc)
         query_start_loc_np = query_start_loc_np[: num_reqs_padded + 1]
         query_start_loc = self.input_buffers.query_start_loc[: num_reqs_padded + 1]
-        is_prefilling_np = (
-            self.req_states.num_computed_prefill_tokens[idx_mapping_np]
-            < self.req_states.prefill_len.np[idx_mapping_np]
-        )
+        is_prefilling_np = self.req_states.is_prefilling(idx_mapping_np)
 
         # Get prefill tokens if any.
         if np.any(is_prefilling_np):
@@ -973,18 +981,6 @@ def postprocess(
             self.req_states.total_len.gpu,
         )
 
-        # Update the number of computed prefill tokens.
-        idx_mapping_np = input_batch.idx_mapping_np
-        computed_prefill = self.req_states.num_computed_prefill_tokens
-        computed_prefill[idx_mapping_np] += input_batch.num_scheduled_tokens
-        np.minimum(
-            computed_prefill, self.req_states.prefill_len.np, out=computed_prefill
-        )
-        # Advance the CPU mirror optimistically (assume all scheduled accepted).
-        self.req_states.num_computed_tokens_np[idx_mapping_np] += (
-            input_batch.num_scheduled_tokens
-        )
-
         self.model_state.postprocess_state(input_batch, num_sampled)
 
     @torch.inference_mode()
@@ -1357,18 +1353,6 @@ def postprocess_pool(self, input_batch: InputBatch) -> None:
             input_batch.query_start_loc,
         )
 
-        # Update the number of computed prefill tokens.
-        idx_mapping_np = input_batch.idx_mapping_np
-        computed_prefill = self.req_states.num_computed_prefill_tokens
-        computed_prefill[idx_mapping_np] += input_batch.num_scheduled_tokens
-        np.minimum(
-            computed_prefill, self.req_states.prefill_len.np, out=computed_prefill
-        )
-        # Advance the CPU mirror optimistically (assume all scheduled accepted).
-        self.req_states.num_computed_tokens_np[idx_mapping_np] += (
-            input_batch.num_scheduled_tokens
-        )
-
     def shutdown(self) -> None:
         """Release GPU tensors (model weights, KV caches, workspace) so that
         memory is reclaimable when running in the same process."""
diff --git a/vllm/v1/worker/gpu/sample/logprob.py b/vllm/v1/worker/gpu/sample/logprob.py
index 7530337fcd12..cf24c186e93a 100644
--- a/vllm/v1/worker/gpu/sample/logprob.py
+++ b/vllm/v1/worker/gpu/sample/logprob.py
@@ -124,9 +124,13 @@ def compute_topk_logprobs(
         # tokens where applicable.
         assert logprob_token_ids_state is not None
         assert expanded_idx_mapping is not None
-        topk_indices = None
+
         if num_logprobs > 0:
-            topk_indices = torch.topk(logits, num_logprobs, dim=-1).indices
+            topk_token_ids = torch.topk(logits, num_logprobs, dim=-1).indices
+            topk_token_ids = topk_token_ids.to(torch.int32)
+        else:
+            # This tensor is only used as an int32 pointer; its data is not accessed.
+            topk_token_ids = logprob_token_ids_state.token_ids.gpu
 
         num_cols = max(num_logprobs, max_per_req_token_ids)
         logprob_token_ids = sampled_token_ids.new_zeros((batch_size, 1 + num_cols))
@@ -137,8 +141,8 @@ def compute_topk_logprobs(
             valid_mask,
             valid_mask.stride(0),
             sampled_token_ids,
-            topk_indices if topk_indices is not None else logprob_token_ids,
-            topk_indices.stride(0) if topk_indices is not None else 0,
+            topk_token_ids,
+            topk_token_ids.stride(0),
             expanded_idx_mapping,
             logprob_token_ids_state.num_token_ids.gpu,
             logprob_token_ids_state.token_ids.gpu,
@@ -202,14 +206,12 @@ def _fill_logprob_token_ids_kernel(
         # Override topk with per-request custom tokens.
         src = per_req_token_ids_ptr + req_state_idx * per_req_token_ids_stride
         valid = col < num_custom
-        # per_req_token_ids is int32; output is int64.
-        tokens = tl.load(src + col, mask=valid, other=0).to(tl.int64)
     else:
         # Fill with topk indices (no-op when NUM_TOPK == 0).
         src = topk_indices_ptr + batch_idx * topk_indices_stride
         valid = col < NUM_TOPK
-        tokens = tl.load(src + col, mask=valid, other=0)
 
+    tokens = tl.load(src + col, mask=valid, other=0).to(tl.int64)
     tl.store(tid_base + col, tokens, mask=valid)
     tl.store(mask_base + col, tl.full([PADDED_COLS], 1, tl.int1), mask=valid)
 
diff --git a/vllm/v1/worker/gpu/sample/penalties.py b/vllm/v1/worker/gpu/sample/penalties.py
index 057517479b51..2cfa8b0aec40 100644
--- a/vllm/v1/worker/gpu/sample/penalties.py
+++ b/vllm/v1/worker/gpu/sample/penalties.py
@@ -283,8 +283,10 @@ def bincount(
     output_bin_counts: torch.Tensor,
     max_prefill_len: int,
 ) -> None:
-    prompt_bin_mask[expanded_idx_mapping] = 0
-    output_bin_counts[expanded_idx_mapping] = 0
+    # Use index_fill_ instead of `tensor[idx] = 0` to avoid sync.
+    idx_long = expanded_idx_mapping.long()
+    prompt_bin_mask.index_fill_(0, idx_long, 0)
+    output_bin_counts.index_fill_(0, idx_long, 0)
     num_tokens = expanded_idx_mapping.shape[0]
     BLOCK_SIZE = 1024
     num_blocks = triton.cdiv(max_prefill_len, BLOCK_SIZE)
diff --git a/vllm/v1/worker/gpu/states.py b/vllm/v1/worker/gpu/states.py
index 6268ea0ba673..cdd7286fa56e 100644
--- a/vllm/v1/worker/gpu/states.py
+++ b/vllm/v1/worker/gpu/states.py
@@ -104,9 +104,8 @@ def add_request(
         self.num_computed_prefill_tokens[req_idx] = num_computed_tokens
         self.num_computed_tokens_np[req_idx] = num_computed_tokens
         self.num_computed_tokens.stage_write_elem(req_idx, num_computed_tokens)
-        self.num_computed_tokens_np[req_idx] = num_computed_tokens
 
-        if num_computed_tokens > 0 and num_computed_tokens <= prefill_len:
+        if 0 < num_computed_tokens <= prefill_len:
             # For PD disagg or resumed requests: set last_sampled to the last
             # computed token so the first decode step gets the right input_id.
             # For fresh prefill requests (num_computed_tokens == 0) the tensor
@@ -134,8 +133,8 @@ def remove_request(self, req_id: str) -> bool:
         self.free_indices.append(req_idx)
         return True
 
-    def any_prefills(self, idx_mapping_np: np.ndarray) -> bool:
-        return np.any(
+    def is_prefilling(self, idx_mapping_np: np.ndarray) -> np.ndarray:
+        return (
             self.num_computed_prefill_tokens[idx_mapping_np]
             < self.prefill_len.np[idx_mapping_np]
         )
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index bef78130680d..df061255b3a5 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -995,16 +995,23 @@ def _init_model_kwargs(self):
         if len(token_type_id_requests) == 0:
             return model_kwargs
 
-        seq_lens = self.seq_lens[:num_reqs]
+        # Build ids on CPU using the CPU-resident upper bound for seq_lens;
+        # `torch.arange(seq_lens[i])` with a GPU scalar would force a sync.
+        seq_lens_cpu = self.optimistic_seq_lens_cpu[:num_reqs].tolist()
         token_type_ids = []
 
         for i in range(num_reqs):
-            pos = token_type_id_requests.get(i, seq_lens[i])
-            ids = (torch.arange(seq_lens[i]) >= pos).int()
+            seq_len_i = seq_lens_cpu[i]
+            pos = token_type_id_requests.get(i, seq_len_i)
+            ids = (torch.arange(seq_len_i) >= pos).int()
             token_type_ids.append(ids)
 
-        model_kwargs["token_type_ids"] = torch.concat(token_type_ids).to(
-            device=self.device
+        token_type_ids_cpu = torch.empty(
+            sum(seq_lens_cpu), dtype=torch.int32, pin_memory=self.pin_memory
+        )
+        torch.cat(token_type_ids, out=token_type_ids_cpu)
+        model_kwargs["token_type_ids"] = token_type_ids_cpu.to(
+            device=self.device, non_blocking=True
         )
         return model_kwargs
 
@@ -2715,10 +2722,9 @@ def _prepare_kv_sharing_fast_prefill(
         # There might have leftover indices in logits_indices[num_logits:]
         # from previous iterations, whose values may be greater than the
         # batch size in the current iteration. To ensure indices are always
-        # valid, we fill the padded indices with the last index.
-        self.kv_sharing_fast_prefill_logits_indices[num_logits:].fill_(
-            logits_indices[-1].item()
-        )
+        # valid, fill the padded indices with the last index. Broadcast the
+        # scalar GPU-side to avoid a D2H sync on `.item()`.
+        self.kv_sharing_fast_prefill_logits_indices[num_logits:] = logits_indices[-1]
         # Dispatch for the decoder portion of the model.
         _, batch_desc = self.cudagraph_dispatcher.dispatch(
             num_logits, invalid_modes={CUDAGraphMode.FULL}
@@ -3129,7 +3135,7 @@ def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
 
         return tuple(tasks)
 
-    def sync_and_slice_intermediate_tensors(
+    def sync_and_gather_intermediate_tensors(
         self,
         num_tokens: int,
         intermediate_tensors: IntermediateTensors | None,
@@ -3140,24 +3146,23 @@ def sync_and_slice_intermediate_tensors(
         tp = self.vllm_config.parallel_config.tensor_parallel_size
         is_rs = is_residual_scattered_for_sp(self.vllm_config, num_tokens)
 
-        # When sequence parallelism is enabled, the "residual" tensor is sharded
-        # across tensor parallel ranks, so each rank only needs its own slice.
+        # When sequence parallelism is enabled, the "residual" tensor is
+        # sharded across TP ranks. All-gather it here because downstream
+        # QKV + Attention needs the full residual before the SP split point.
         if sync_self:
             assert intermediate_tensors is not None
             for k, v in intermediate_tensors.items():
                 is_scattered = k == "residual" and is_rs
-                copy_len = num_tokens // tp if is_scattered else num_tokens
-                self.intermediate_tensors[k][:copy_len].copy_(
-                    v[:copy_len], non_blocking=True
+                if is_scattered:
+                    local_len = num_tokens // tp
+                    v = get_tp_group().all_gather(v[:local_len], dim=0)
+
+                self.intermediate_tensors[k][:num_tokens].copy_(
+                    v[:num_tokens], non_blocking=True
                 )
 
         return IntermediateTensors(
-            {
-                k: v[: num_tokens // tp]
-                if k == "residual" and is_rs
-                else v[:num_tokens]
-                for k, v in self.intermediate_tensors.items()
-            }
+            {k: v[:num_tokens] for k, v in self.intermediate_tensors.items()}
         )
 
     def eplb_step(self, is_dummy: bool = False, is_profile: bool = False) -> None:
@@ -3373,7 +3378,7 @@ def _preprocess(
             intermediate_tensors = None
         else:
             assert intermediate_tensors is not None
-            intermediate_tensors = self.sync_and_slice_intermediate_tensors(
+            intermediate_tensors = self.sync_and_gather_intermediate_tensors(
                 num_input_tokens, intermediate_tensors, True
             )
 
@@ -5589,7 +5594,7 @@ def _dummy_run(
                         )
                     )
 
-                intermediate_tensors = self.sync_and_slice_intermediate_tensors(
+                intermediate_tensors = self.sync_and_gather_intermediate_tensors(
                     num_tokens_padded, None, False
                 )
 
@@ -6154,7 +6159,8 @@ def capture_model(self) -> int:
                 "Skipping CUDA graph capture. To turn on CUDA graph capture, "
                 "ensure `cudagraph_mode` was not manually set to `NONE`"
             )
-            self.init_routed_experts_capturer()
+            if self.model_config.enable_return_routed_experts:
+                self.init_routed_experts_capturer()
             return 0
 
         # Initialize encoder CUDA graph manager if enabled.
@@ -6193,7 +6199,8 @@ def capture_model(self) -> int:
         # address is baked into the graph.  Do NOT call this inside
         # _capture_cudagraphs() -- creating the capturer twice replaces
         # the device buffer, causing graphs to write to a dead buffer.
-        self.init_routed_experts_capturer()
+        if self.model_config.enable_return_routed_experts:
+            self.init_routed_experts_capturer()
 
         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
@@ -6989,6 +6996,10 @@ def init_routed_experts_capturer(self):
             "Initializing routed experts capturer, enable_return_routed_experts: %s",
             self.model_config.enable_return_routed_experts,
         )
+        if not self.model_config.enable_return_routed_experts:
+            self.routed_experts_initialized = False
+            return
+
         from vllm.distributed import get_tp_group
 
         if hasattr(self.model_config.hf_text_config, "n_shared_experts"):